In [1]:
import pandas as pd
from scipy import stats 
import numpy as np
import json

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import mpld3

from collections import defaultdict

# Directory the First Link Network file (fln.json) is read from.
# Machine-specific absolute path: change to the appropriate path before running.
results_path = "/Users/mark/Desktop/wiki_v4/"



In [2]:
# Load the Wikipedia First Link Network (FLN).
# fln.json is parsed into a dict and reshaped into a two-column frame:
# the dict keys become 'article', the dict values become 'first link'.

# Decode explicitly as UTF-8 so that any non-ASCII characters in the file
# do not depend on the platform's default locale encoding.
with open(results_path + "fln.json", encoding="utf-8") as f:
    fln_dict = json.load(f)

# orient='index' puts every dict key on its own row (keys form the index)
fln_df = pd.DataFrame.from_dict(fln_dict, orient='index')
fln_df.index.name = 'article'
fln_df.columns = ['first link']
# move the article titles out of the index into a regular column
fln_df = fln_df.reset_index()

In [3]:
# Load the word -> categories lookup (read by get_article_category).
# NOTE(review): data_path is a machine-specific absolute path, while the
# words-with-years file in this notebook is read from "../data/" — confirm
# whether both files live in the same directory.
data_path = "/Users/mark/Dropbox/Math/Complex_Systems/research/classifying-ideas/historical_wikipedia/data/"

# Decode explicitly as UTF-8 rather than relying on the platform's default
# locale encoding.
with open(data_path + "categories_for_words.json", encoding="utf-8") as f:
    categories_dict = json.load(f)

## FLN through Time

In [5]:
# Load the word -> year lookup from JSON.

# Decode explicitly as UTF-8 rather than relying on the platform's default
# locale encoding.
with open("../data/words_with_years.json", encoding="utf-8") as dw:
    word_years_dict = json.load(dw)

# Wrap the lookup so that any word missing from the file maps to 2015.
word_years_dict = defaultdict(lambda: 2015, word_years_dict)

In [6]:
def get_article_year(title):
    """
    Return the earliest year in which an article with this title
    could have appeared.

    The title is split on whitespace; each word is lower-cased, stripped
    of leading/trailing parentheses and looked up in ``word_years_dict``.
    The result is the latest of those years.

    Parameters
    ----------
    title : str
        Article title.

    Returns
    -------
    The largest year found among the title's words. Words missing from
    ``word_years_dict`` count as 2015, and a title with no words
    returns 2015.
    """
    # .get() instead of indexing: word_years_dict is a defaultdict, and
    # indexing it would insert every unknown word as a new key, growing
    # the shared lookup while this function is applied over all articles.
    years = [
        word_years_dict.get(word.lower().strip("()"), 2015)
        for word in title.split()
    ]
    return max(years, default=2015)

In [7]:
# Add a 'year' column holding get_article_year(title) for every article
# (runtime ~3 min)
fln_df['year'] = fln_df['article'].map(get_article_year)

In [8]:
# Preview the first rows to check the 'year' column
fln_df.head()

Unnamed: 0,article,first link,year
0,Fire Creek (disambiguation),Fire Creek,1827
1,"Confidence, California",Unincorporated area,2015
2,Lukas Wilaschek,Andrey Mishin,2015
3,Sir Thomas Cookes,"Sir Thomas Cookes, 2nd Baronet",2015
4,PICMG 2.2,PICMG,2015


### FLN in 1890

In [9]:
# Articles whose year is strictly before 1890
fln_df[fln_df['year'] < 1890]

Unnamed: 0,article,first link,year
0,Fire Creek (disambiguation),Fire Creek,1827
25,All I Want is You and You and You,All I Want Is You... and You... and You...,1523
36,Age hardening,Precipitation hardening,1630
39,Glory hole (sexual slang),public toilet,1651
40,Ripuarian (language),Ripuarian language,1781
49,Glider (band),Jim Bogios,1200
50,John Fen,John Fenn (priest),1572
62,Martin Matador,MGM-1 Matador,1674
68,Defiance Records,record label,1581
69,All Points West Festival,All Points West Music & Arts Festival,1838


## Categories

In [10]:
def get_article_category(title):
    """
    Collect the categories attached to the words of an article title.

    Each whitespace-separated word is lower-cased, stripped of
    leading/trailing parentheses and looked up in ``categories_dict``.

    Returns
    -------
    A mapping ``category -> number of title words carrying it`` when at
    least one word has a category, otherwise the list ``["None"]``.
    """
    tally = defaultdict(int)
    for token in title.split():
        key = token.lower().strip("()")
        # set(): a category listed more than once for the same word
        # still counts only once for that word
        for label in set(categories_dict.get(key, [])):
            tally[label] += 1
    if not tally:
        return ["None"]
    return tally

def rank_categories(year):
    """
    Rank categories over the articles that precede a given year.

    Only rows of ``fln_df`` whose 'year' is strictly less than ``year``
    are considered. Every such article adds 1 to each distinct category
    returned for its title by ``get_article_category``.

    Parameters
    ----------
    year : int
        Exclusive upper bound on the article year.

    Returns
    -------
    A mapping ``category -> number of articles tagged with it``.
    """
    is_earlier = fln_df['year'] < year
    titles = fln_df.loc[is_earlier, 'article']

    tag_counts = defaultdict(int)
    for title in titles:
        # iterating yields the category names only, so each category is
        # counted once per article
        for tag in get_article_category(title):
            tag_counts[tag] += 1

    return tag_counts

### Categories in 1890

In [11]:
%%timeit
# Time one ranking pass for the 1890 cut-off (articles with year < 1890)
rank_categories(1890)

1 loops, best of 3: 38.5 s per loop


### Categories by decade

In [12]:
# One category ranking per decade cut-off: 1000, 1010, ..., 2010.
# Each entry counts the articles whose year is strictly below the cut-off.
# runtime ~30min (rank_categories is run from scratch for every cut-off)

decades = range(1000, 2020, 10)

categories_decades = {year: rank_categories(year) for year in decades}

In [13]:
# Save the per-decade rankings as JSON.
# JSON object keys are strings, so the integer years are written as text.

temp_store_path = "/Users/mark/Desktop/temp_data/"
with open(temp_store_path + "categories_decades.json", "w") as f:
    f.write(json.dumps(categories_decades))

In [14]:
# Pivot the rankings from  year -> {category: freq}
#                      to  category -> {year: freq}

categories_decades_transformed = defaultdict(dict)

for year in decades:
    for category, freq in categories_decades[year].items():
        categories_decades_transformed[category][year] = freq
        

In [15]:
# Inspect the pivoted mapping: category -> {year: frequency}
categories_decades_transformed

defaultdict(dict,
            {'A language': {1000: 4165,
              1010: 4165,
              1020: 4165,
              1030: 4165,
              1040: 4165,
              1050: 4165,
              1060: 4165,
              1070: 4165,
              1080: 4165,
              1090: 4165,
              1100: 4165,
              1110: 4174,
              1120: 4174,
              1130: 4180,
              1140: 4181,
              1150: 4181,
              1160: 4371,
              1170: 4371,
              1180: 4706,
              1190: 4746,
              1200: 4746,
              1210: 5212,
              1220: 5212,
              1230: 6319,
              1240: 6376,
              1250: 6394,
              1260: 6663,
              1270: 6669,
              1280: 6785,
              1290: 6787,
              1300: 8381,
              1310: 10479,
              1320: 10675,
              1330: 11212,
              1340: 11600,
              1350: 12190,
              1360: 12297,


In [16]:
# Save the transformed (category -> {year: freq}) data to disk as JSON
temp_store_path = "/Users/mark/Desktop/temp_data/"
with open(temp_store_path + "categories_decades_transformed.json", "w") as f:
    f.write(json.dumps(categories_decades_transformed))

In [17]:
# List the decade cut-offs recorded for one example category
list(categories_decades_transformed['Administration of justice'].keys())

[1280,
 1540,
 1030,
 1800,
 1290,
 1550,
 1040,
 1810,
 1300,
 1560,
 1050,
 1820,
 1310,
 1570,
 1060,
 1830,
 1320,
 1580,
 1070,
 1840,
 1330,
 1590,
 1080,
 1850,
 1340,
 1600,
 1090,
 1860,
 1350,
 1610,
 1100,
 1870,
 1360,
 1620,
 1110,
 1880,
 1370,
 1630,
 1120,
 1890,
 1380,
 1640,
 1130,
 1900,
 1390,
 1650,
 1140,
 1910,
 1400,
 1660,
 1150,
 1920,
 1410,
 1670,
 1160,
 1930,
 1420,
 1680,
 1170,
 1940,
 1430,
 1690,
 1180,
 1950,
 1440,
 1700,
 1190,
 1960,
 1450,
 1710,
 1200,
 1970,
 1460,
 1720,
 1210,
 1980,
 1470,
 1730,
 1220,
 1990,
 1480,
 1740,
 1230,
 2000,
 1490,
 1750,
 1240,
 2010,
 1500,
 1760,
 1250,
 1510,
 1000,
 1770,
 1260,
 1520,
 1010,
 1780,
 1270,
 1530,
 1020,
 1790]