In [1]:
import pandas as pd
from scipy import stats 
import numpy as np
import json
import sys

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from collections import defaultdict

# Directory that holds the first-link-network results (fln.json is read from it).
# Change to the appropriate path on your machine; keep the trailing slash,
# because file names are appended to it by plain string concatenation.
results_path = "/Users/mark/Desktop/wiki_v4/"



## Load Article Titles and Assign Years

In [2]:
# Load the Wikipedia First Link Network: a JSON object whose keys are
# article titles and whose values become the 'first link' column.
with open(results_path + "fln.json") as f:
    fln_dict = json.load(f)

# One row per article; move the titles out of the index into an 'article'
# column and label the single data column.
fln_df = pd.DataFrame.from_dict(fln_dict, orient='index')
fln_df.index.name = 'article'
fln_df = fln_df.reset_index()
fln_df.columns = ['article', 'first link']

In [3]:
# Load the word --> categories lookup from the data directory.
data_path = "/Users/mark/Dropbox/Math/Complex_Systems/research/classifying-ideas/historical_wikipedia/data/"
categories_json_path = data_path + "categories_for_words.json"
with open(categories_json_path) as f:
    categories_dict = json.load(f)

In [4]:
#create list of categories
# Collect every distinct category named by any word.
categories_set = set()
for category_group in categories_dict.values():
    categories_set.update(category_group)

# Sort so that each category keeps the same position on every run: iterating
# a set of strings yields an arbitrary order that can differ between kernel
# sessions, which would silently permute any vector indexed by categories_list.
categories_list = sorted(categories_set)

In [5]:
# Load the word --> year lookup; any word missing from the file
# falls back to a default year of 2015.
with open("../data/words_with_years.json") as dw:
    word_years_dict = defaultdict(lambda: 2015, json.load(dw))

### Compute year for article

In [6]:
def get_article_year(title, word_years=None, default_year=2015):
    """
    returns the earliest year
    the article could have appeared
    by computing the latest first 
    appearance of the words in the title

    title: article title; it is split on whitespace and each word is
        lower-cased and stripped of surrounding parentheses before lookup
    word_years: optional mapping word --> year of first appearance;
        defaults to the notebook-level word_years_dict
    default_year: year used for words missing from the mapping, and
        returned when the title has no words or is not a string

    lookups go through .get() so unknown words are never inserted into
    the mapping (indexing a defaultdict stores every missing key)
    """
    lookup = word_years_dict if word_years is None else word_years
    try:
        words = title.split()
    except AttributeError:
        # not a string (e.g. None/NaN): same guard as get_article_categories
        return default_year
    years = [lookup.get(word.lower().strip("()"), default_year) for word in words]
    # the article cannot predate the newest word in its title
    return max(years) if years else default_year

In [7]:
# Assign every article its year, one title at a time (runtime ~3 min).
article_years = fln_df['article'].apply(get_article_year)
fln_df['year'] = article_years

## Compute Category Rank by Decade

* a word may appear in more than one category
* a word appearing more than once in the title is not double counted in rank

In [8]:
def get_article_categories(title, categories_by_word=None):
    """
    returns a dictionary of categories and their frequency 
    based on the words in the title 
        formatted as a dictionary: category --> frequency
        words appearing more than once in the title are not double counted

    title: article title; it is split on whitespace and each word is
        lower-cased and stripped of surrounding parentheses before lookup
    categories_by_word: optional mapping word --> list of categories;
        defaults to the notebook-level categories_dict

    returns False (not an empty dictionary) when no word in the title has
    a category, or when the title is not a string
    """
    lookup = categories_dict if categories_by_word is None else categories_by_word
    categories = defaultdict(int)
    try:
        seen_words = set()
        for raw_word in title.split():
            word = raw_word.lower().strip("()")
            # count each distinct word once, however often the title repeats it
            if word in seen_words:
                continue
            seen_words.add(word)
            #eliminate duplicates within this word's own category list
            for category in set(lookup.get(word, [])):
                categories[category] += 1
    except AttributeError:
        # title is not a string (no .split), e.g. None/NaN
        return False
    return categories if categories else False
        
# number of distinct categories = length of every frequency vector
len_categories = len(categories_set)

# category --> position in categories_list, built once so that each lookup is
# a dict access instead of a linear categories_list.index() scan per category
category_positions = {category: position for position, category in enumerate(categories_list)}

def get_categories_freq_list(title):
    """
    returns a numpy array of length len_categories holding the frequency
    count for each category, based on the title of the given article

    position i of the array corresponds to categories_list[i];
    the array is all zeros when no word in the title has a category
    """
    categories = get_article_categories(title)
    freq_list = np.zeros(len_categories)
    # get_article_categories returns False when nothing matched
    if categories:
        for category, count in categories.items():
            freq_list[category_positions[category]] += count
    return freq_list
        

In [9]:
# Spot-check: show the category --> frequency mapping for a sample title.
get_article_categories("apple computer the technology")

defaultdict(int,
            {'Ability': 1,
             'Expectation': 1,
             'Farming': 1,
             'Knowledge': 1,
             'Linguistics': 1,
             'Naming': 1,
             'Number': 1,
             'Particular plants': 1,
             'Study of work': 1})

### Compute Category Frequency by article

In [10]:
# Preview the first rows, including the newly added 'year' column.
fln_df.head()

Unnamed: 0,article,first link,year
0,Adam Cardinal Maida,Adam Maida,2015
1,Kuźnica Strobińska,village,2015
2,Nitol botnet,botnet,2015
3,Aghdash,Aqdash,2015
4,Witchcraft Act 1735,Parliament of Great Britain,2015
