In [None]:
import pandas as pd
from scipy import stats 
import numpy as np
import json

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import mpld3

from collections import defaultdict

# Directory holding the first-link-network results ("fln.json" is read from here).
# Machine-specific absolute path: change it to the appropriate location before running.
# Keep the trailing slash -- file names are appended by plain string concatenation.
results_path = "/Users/mark/Desktop/wiki_v4/"



In [None]:
# Load the Wikipedia First Link Network (FLN) into a two-column DataFrame.
# presumably fln.json maps each article title to the title of its first link -- TODO confirm


with open(results_path + "fln.json") as f:
    fln_dict = json.load(f)
# orient='index': the JSON keys become the row index, the values the single data column.
fln_df = pd.DataFrame.from_dict(fln_dict, orient='index')
fln_df.index.name = 'article'
fln_df.columns = ['first link']
# Move the titles out of the index so that 'article' is an ordinary column.
fln_df = fln_df.reset_index()

In [None]:
# Load the word -> categories lookup consulted by get_article_category().
# presumably each key is a lower-case word and each value a list of category names -- TODO confirm
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
data_path = "/Users/mark/Dropbox/Math/Complex_Systems/research/classifying-ideas/historical_wikipedia/data/"
with open(data_path + "categories_for_words.json") as f:
    categories_dict = json.load(f)

## FLN through Time

In [None]:
# Load the word -> year lookup consulted by get_article_year().
# NOTE(review): this path is relative to the notebook's working directory,
# unlike the absolute paths used for the other inputs.

with open("data/words_with_years.json") as dw:
    word_years_dict = json.load(dw)
    # Wrap in a defaultdict so that words absent from the file map to 2015.
    # Indexing a missing word also stores it in the dict with that default.
    word_years_dict = defaultdict(lambda: 2015, word_years_dict)

In [None]:
def get_article_year(title):
    """
    Estimate the earliest year an article could have appeared.

    Every whitespace-separated word of the title is lower-cased, stripped
    of leading/trailing parentheses and looked up in ``word_years_dict``.
    The latest of those years is returned, because a title cannot predate
    the newest word it contains.

    Parameters
    ----------
    title : str
        Article title.

    Returns
    -------
    The largest year found among the title's words. A word that is missing
    from ``word_years_dict`` counts as 2015, and 2015 is also returned when
    the title contains no words at all.
    """
    # Use .get() instead of indexing: when word_years_dict is a defaultdict,
    # indexing inserts every unknown word into it as a side effect, so the
    # lookup table would keep growing while this runs over all titles.
    years = [
        word_years_dict.get(word.lower().strip("()"), 2015)
        for word in title.split()
    ]
    if not years:
        return 2015
    return max(years)

In [None]:
# Estimate a year for every article title and store it in a new 'year' column
# (runtime ~3 min)
fln_df['year'] = fln_df['article'].apply(get_article_year)

In [None]:
# Preview the first rows of the frame.
fln_df.head()

### FLN in 1890

In [10]:
# Articles whose estimated year is strictly before 1890.
fln_df[fln_df['year'] < 1890]

Unnamed: 0,article,first link,year
9,Reveille (mascot),Reveille (dog),1881
17,Below Zero,Below Zero (1930 film),1604
33,Patella (genus),Patella (gastropod),1671
39,Forest Lawn Cemetery (Cathedral City),mausoleum,1587
46,Basal metazoa,"Animal#Ctenophora, Porifera, Placozoa, Cnidari...",1874
50,Nuclear fuels,Nuclear fuel,1846
53,First School War,Belgium,1205
69,Royal Hunt,progressive metal,1375
76,Flux Magazine (US),Flux Magazine,1828
81,Real Yoga,yoga,1820


## Categories

In [7]:
def get_article_category(title):
    """
    Collect the categories attached to the words of an article title.

    Each whitespace-separated word is lower-cased, stripped of
    leading/trailing parentheses and looked up in ``categories_dict``.

    Returns
    -------
    A ``defaultdict(int)`` of the form ``category: frequency``, where the
    frequency is the number of title words tagged with that category.
    When no word of the title has any category, the list ``["None"]`` is
    returned instead of a dictionary.
    """
    tally = defaultdict(int)
    for token in title.split():
        lookup_key = token.lower().strip("()")
        # set() collapses repeated tags so one word counts a category once
        for tag in set(categories_dict.get(lookup_key, [])):
            tally[tag] += 1
    if not tally:
        return ["None"]
    return tally

def rank_categories(year):
    """
    Rank the categories of the FLN as it stood before ``year``.

    Only the rows of ``fln_df`` whose 'year' is strictly smaller than
    ``year`` are considered. An article adds one to every category that
    ``get_article_category`` reports for its title, however many of its
    words carry that category.

    Parameters
    ----------
    year : int
        Cutoff year.

    Returns
    -------
    A ``defaultdict(int)`` mapping each category to its rank, i.e. the
    number of articles with that category tag.
    """
    tally = defaultdict(int)
    before_cutoff = fln_df['year'] < year

    for title in fln_df.loc[before_cutoff, 'article']:
        for tag in get_article_category(title):
            tally[tag] += 1

    return tally

### Categories in 1890

In [15]:
%%timeit
# Time one full ranking pass for the 1890 cutoff; the returned dict is discarded.
rank_categories(1890)

1 loops, best of 3: 47.7 s per loop


### Categories by decade

In [17]:
# Category counts for every decade cutoff, built in a single pass.
#
# categories_decades[cutoff] maps each category to the number of articles with
# year < cutoff -- the same result as calling rank_categories(cutoff) for every
# cutoff, but each article's categories are derived once instead of once per
# cutoff (the per-cutoff version took ~30min).

decades = range(1000, 2020, 10)
cutoffs = np.array(list(decades))  # ascending, as np.searchsorted requires

# For each article, the index of the first cutoff strictly greater than its
# year; side='right' makes a year equal to a cutoff fall into the next one,
# matching the strict `year < cutoff` filter. An index of len(cutoffs) means
# the article is not before any cutoff.
first_slot = np.searchsorted(cutoffs, fln_df['year'].values, side='right')

# new_counts[i]: counts contributed by the articles first included at cutoffs[i]
new_counts = [defaultdict(int) for slot_index in range(len(cutoffs))]
for article, slot in zip(fln_df['article'], first_slot):
    if slot == len(cutoffs):
        continue
    for category in get_article_category(article):
        new_counts[slot][category] += 1

# An article included at one cutoff is included at every later one, so the
# count for a cutoff is the running total of the contributions up to it.
categories_decades = {}
running_total = defaultdict(int)
for year, fresh in zip(decades, new_counts):
    for category, count in fresh.items():
        running_total[category] += count
    # snapshot: later cutoffs must not modify this decade's counts
    categories_decades[year] = running_total.copy()

In [18]:
# Persist the per-decade category counts as JSON.
# NOTE: json.dump writes the integer decade keys as strings, so they are str
# (not int) when the file is loaded back.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.

temp_store_path = "/Users/mark/Desktop/temp_data/"
with open(temp_store_path + "categories_decades.json", "w") as f:
    json.dump(categories_decades, f)

In [23]:
# Pivot the per-decade counts into one time series per category:
#   category -> {year: frequency}

categories_decades_transformed = defaultdict(dict)

for year in decades:
    for category, frequency in categories_decades[year].items():
        categories_decades_transformed[category][year] = frequency
        

In [24]:
# Inspect the transformed mapping (category -> {year: frequency}); the output is long.
categories_decades_transformed

defaultdict(dict,
            {'A language': {1000: 270,
              1010: 270,
              1020: 270,
              1030: 270,
              1040: 271,
              1050: 271,
              1060: 271,
              1070: 271,
              1080: 271,
              1090: 271,
              1100: 271,
              1110: 272,
              1120: 272,
              1130: 276,
              1140: 276,
              1150: 276,
              1160: 307,
              1170: 307,
              1180: 491,
              1190: 494,
              1200: 494,
              1210: 949,
              1220: 950,
              1230: 1467,
              1240: 1541,
              1250: 1575,
              1260: 1755,
              1270: 1758,
              1280: 1804,
              1290: 1818,
              1300: 2996,
              1310: 5100,
              1320: 5228,
              1330: 5850,
              1340: 6224,
              1350: 6681,
              1360: 6790,
              1370: 7213,
   

In [25]:
# Write the transformed data to disk as JSON.
# NOTE: json.dump writes the integer year keys of the inner dicts as strings.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
temp_store_path = "/Users/mark/Desktop/temp_data/"
with open(temp_store_path + "categories_decades_transformed.json", "w") as f:
    json.dump(categories_decades_transformed, f)

In [30]:
# Years recorded for one example category.
list(categories_decades_transformed['Administration of justice'].keys())

[1280,
 1540,
 1030,
 1800,
 1290,
 1550,
 1040,
 1810,
 1300,
 1560,
 1050,
 1820,
 1310,
 1570,
 1060,
 1830,
 1320,
 1580,
 1070,
 1840,
 1330,
 1590,
 1080,
 1850,
 1340,
 1600,
 1090,
 1860,
 1350,
 1610,
 1100,
 1870,
 1360,
 1620,
 1110,
 1880,
 1370,
 1630,
 1120,
 1890,
 1380,
 1640,
 1130,
 1900,
 1390,
 1650,
 1140,
 1910,
 1400,
 1660,
 1150,
 1920,
 1410,
 1670,
 1160,
 1930,
 1420,
 1680,
 1170,
 1940,
 1430,
 1690,
 1180,
 1950,
 1440,
 1700,
 1190,
 1960,
 1450,
 1710,
 1200,
 1970,
 1460,
 1720,
 1210,
 1980,
 1470,
 1730,
 1220,
 1990,
 1480,
 1740,
 1230,
 2000,
 1490,
 1750,
 1240,
 2010,
 1500,
 1760,
 1250,
 1510,
 1000,
 1770,
 1260,
 1520,
 1010,
 1780,
 1270,
 1530,
 1020,
 1790]