In [1]:
import pandas as pd
from scipy import stats 
import numpy as np
import json
import sys

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from collections import defaultdict

#change to appropriate path
results_path = "/Users/mark/Desktop/wiki_v4/"



## Load Article Title by Year 

In [2]:
#load Wikipedia First Link Network 


with open(results_path + "fln.json") as f:
    fln_dict = json.load(f)
fln_df = pd.DataFrame.from_dict(fln_dict, orient='index')
fln_df.index.name = 'article'
fln_df.columns = ['first link']
fln_df = fln_df.reset_index()

In [3]:
#load Categories
data_path = "/Users/mark/Dropbox/Math/Complex_Systems/research/classifying-ideas/historical_wikipedia/data/"
with open(data_path + "categories_for_words.json") as f:
    categories_dict = json.load(f)

In [4]:
#create list of categories
categories_set = set([])
for category_group in categories_dict.values():
    categories_set = categories_set.union(set(category_group))

categories_list = list(categories_set)

In [5]:
#load word with years json

with open("../data/words_with_years.json") as dw:
    word_years_dict = json.load(dw)
    #default dict value of 2015
    word_years_dict = defaultdict(lambda: 2015, word_years_dict)

### Compute year for article

In [6]:
def get_article_year(title):
    """
    returns the earliest year
    the article could have appeared
    by computing the latest first 
    appearance of the words in the title
    """
    years = []
    for word in title.split():
        years.append(word_years_dict[word.lower().strip("()")])
    years.sort()
    if years:
        return years[-1]
    else:
        return 2015

In [7]:
#apply function to dataframe (runtime ~3 min)
fln_df['year'] = fln_df['article'].apply(get_article_year)

## Compute Category Rank by Decade

* a word may appear in more than one category
* a word appearing more than once in the title is not double counted in rank

In [8]:
def get_article_categories(title):
    """
    returns a dictionary of categories and their frequency 
    based on the words in the title 
        formatted as a dictionary: category --> frequency
        words appearing more than once in the title are not double counted
    """
    categories = defaultdict(int)
    try:
        for word in title.split():
            #eliminate duplicates 
            word_categories = set(categories_dict.get(word.lower().strip("()"), []))
            for category in word_categories:
                categories[category] += 1
        if categories:
            return categories
        else:
            return False
    except AttributeError:
        return False
        
len_categories = len(list(categories_set))

def get_categories_freq_list(title):
    """
    returns a list of frequency count for each of the 371 categories
    based on the title of the given article
    """
    categories = get_article_categories(title)
    freq_list = np.zeros(len_categories)
    # confirm categories aren't empty
    if categories:
        for category, count in categories.items():
            index = categories_list.index(category)
            freq_list[index] += count
    return freq_list
        

In [9]:
get_article_categories("apple computer the technology")

defaultdict(int,
            {'Ability': 1,
             'Expectation': 1,
             'Farming': 1,
             'Knowledge': 1,
             'Linguistics': 1,
             'Naming': 1,
             'Number': 1,
             'Particular plants': 1,
             'Study of work': 1})

### Compute Category Frequency by article

In [9]:
fln_df.head()

Unnamed: 0,article,first link,year
0,Varagavan,Tavush,2015
1,Potolemy,Ptolemy,2015
2,Australia–Hungary relations,Foreign relations of Australia,2015
3,My Life in the Bush of Ghosts (novel),Africa,2015
4,Russian monitor Rusalka,Imperial Russian Navy,2015


# Sample from articles

In [10]:
fln_df[:1000].head()

Unnamed: 0,article,first link,year
0,Varagavan,Tavush,2015
1,Potolemy,Ptolemy,2015
2,Australia–Hungary relations,Foreign relations of Australia,2015
3,My Life in the Bush of Ghosts (novel),Africa,2015
4,Russian monitor Rusalka,Imperial Russian Navy,2015


In [11]:
categories_df = pd.DataFrame.from_items(fln_df[:50000]["article"].map(get_categories_freq_list).items(), 
                        columns=categories_list, orient='index')

In [12]:
pd.concat([fln_df[:50000], categories_df], axis=1)

Unnamed: 0,article,first link,year,Educational administration,Inhabitant,Order,Kind/sort,Means of travel,Absence of movement,Armed encounter,...,Textiles,Enquiry,Wholeness,Aspects of faith,Inhabiting temporarily,Worship,Change,Entertainment,Pertaining to animal body,Virtue
0,Varagavan,Tavush,2015,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Potolemy,Ptolemy,2015,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Australia–Hungary relations,Foreign relations of Australia,2015,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,My Life in the Bush of Ghosts (novel),Africa,2015,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Russian monitor Rusalka,Imperial Russian Navy,2015,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,The Motion Lounge,"Williamsburg, Brooklyn",2015,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
6,Sam querrey,Sam Querrey,2015,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"All Souls Church, Halifax",redundant church,2015,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Idaho Potato Commission,Idaho,2015,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,FK Sloboda Mrkonjic Grad,FK Sloboda Mrkonjić Grad,2015,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
articles_by_category_df = pd.concat([fln_df[:50000], categories_df], axis=1)

In [14]:
articles_by_category_df.to_json("/Users/mark/Desktop/temp_data/sample_articles_by_category.json")

## Sample by Category and Corresponding list of articles

In [15]:
def get_article_categories_list(title):
    """
    returns a list of categories (not accounting for frequency)
    based on the words in the title 
        words appearing more than once in the title are not double counted
    """
    article_categories = set([])
    try:
        for word in title.split():
            #eliminate duplicates 
            word_categories = set(categories_dict.get(word.lower().strip("()"), []))
            article_categories = article_categories.union(word_categories)
        if article_categories:
            return list(article_categories)
        else:
            return []
    except AttributeError:
        return []

In [16]:
get_article_categories_list("technology apple")

['Expectation',
 'Farming',
 'Linguistics',
 'Study of work',
 'Particular plants',
 'Ability',
 'Naming',
 'Knowledge']

In [17]:
categories_df = pd.DataFrame(categories_list, columns=["category"]).set_index("category")

In [18]:
categories_df.head()

Educational administration
Inhabitant
Order
Kind/sort
Means of travel


In [20]:
combined_df = fln_df[:50000]
combined_df["categories_list"] = pd.Series(fln_df[:50000]["article"].map(get_article_categories_list))
# create column of empty list for tuples
categories_df["list_of_article_year_tuples"] = np.empty((len(categories_df), 0)).tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [21]:
combined_df.head()

Unnamed: 0,article,first link,year,categories_list
0,Varagavan,Tavush,2015,[]
1,Potolemy,Ptolemy,2015,[]
2,Australia–Hungary relations,Foreign relations of Australia,2015,"[Sexual, Relationship]"
3,My Life in the Bush of Ghosts (novel),Africa,2015,"[Place of education, Taking, Legal possession,..."
4,Russian monitor Rusalka,Imperial Russian Navy,2015,"[Memory, Broadcasting, Navigation, Safety, Equ..."


In [None]:
def get_category_article_and_year(df_row):
    if df_row["categories_list"]:
        categories_and_tuple = []
        for category in df_row["categories_list"]:
            categories_and_tuple.append((category, (df_row["article"], df_row["year"])))
        return categories_and_tuple
    else:
        return []
            
            

values = combined_df.apply(get_category_article_and_year, axis=1)

values.head()

0                                                   []
1                                                   []
2    [(Sexual, (Australia–Hungary relations, 2015))...
3    [(Place of education, (My Life in the Bush of ...
4    [(Memory, (Russian monitor Rusalka, 2015)), (B...
dtype: object

In [None]:
for categories_values in values:
    for category_tuple in categories_values:
        category_value = categories_df.loc[category_tuple[0]]["list_of_article_year_tuples"]
        category_value.append(category_tuple[1])

In [None]:
categories_df.head()

In [None]:
# write to disk
categories_df.to_json("/Users/mark/Desktop/temp_data/categories_sample_articles.json")

#schema:
    # category: [article1, year1], [article2, year2]

### Check whether all categories have at least one article

In [None]:
for c in categories_df["list_of_article_year_tuples"]:
    if len(c) == 0:
        print(c)

### only six categories are missing article entries