In [52]:
import pandas as pd
from scipy import stats 
import numpy as np
import json
import sys

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from collections import defaultdict

#change to appropriate path
results_path = "/Users/mark/Desktop/wiki_v4/"

## Load Article Title by Year 

In [3]:
#load Wikipedia First Link Network 


with open(results_path + "fln.json") as f:
    fln_dict = json.load(f)
fln_df = pd.DataFrame.from_dict(fln_dict, orient='index')
fln_df.index.name = 'article'
fln_df.columns = ['first link']
fln_df = fln_df.reset_index()
fln_df = fln_df.set_index("article")

In [4]:
#load Categories
data_path = "/Users/mark/Dropbox/Math/Complex_Systems/research/classifying-ideas/historical_wikipedia/data/"
with open(data_path + "categories_for_words.json") as f:
    categories_dict = json.load(f)

In [89]:
#create list of categories
categories_set = set([])
for category_group in categories_dict.values():
    categories_set = categories_set.union(set(category_group))

categories_list = list(categories_set)

In [6]:
#load word with years json

with open("../data/words_with_years.json") as dw:
    word_years_dict = json.load(dw)
    #default dict value of 2015
    word_years_dict = defaultdict(lambda: 2015, word_years_dict)

In [7]:
def get_article_year(title):
    """
    returns the earliest year
    the article could have appeared
    by computing the latest first 
    appearance of the words in the title
    """
    years = []
    for word in title.split():
        years.append(word_years_dict[word.lower().strip("()")])
    years.sort()
    if years:
        return years[-1]
    else:
        return 2015

In [8]:
#apply function to dataframe (runtime ~3 min)
fln_df['year'] = fln_df['article'].apply(get_article_year)

## Compute Category Rank by Decade

* a word may appear in more than one category
* a word appearing more than once in the title is not double counted in rank

In [111]:
def get_article_categories(title):
    """
    returns a dictionary of categories and their frequency 
    based on the words in the title 
        formatted as a dictionary: category --> frequency
        words appearing more than once in the title are not double counted
    """
    categories = defaultdict(int)
    for word in title.split():
        #eliminate duplicates 
        word_categories = set(categories_dict.get(word.lower().strip("()"), []))
        for category in word_categories:
            categories[category] += 1
    if categories:
        return categories
    else:
        return False

def get_category_freq_list(title):
    """
    returns a list of frequency count for each of the 371 categories
    based on the title of the given article
    """
    categories = get_article_categories(title)
    freq_list = [0]*len(categories_list)
    # confirm categories aren't empty
    if categories:
        for category, count in categories.items():
            index = categories_list.index(category)
            freq_list[index] += count
    return freq_list
        

In [112]:
get_article_categories("apple computer the technology")

defaultdict(int,
            {'Ability': 1,
             'Expectation': 1,
             'Farming': 1,
             'Knowledge': 1,
             'Linguistics': 1,
             'Naming': 1,
             'Number': 1,
             'Particular plants': 1,
             'Study of work': 1})

In [113]:
for k, v in get_article_categories("apple computer the technology").items():
    print(k)

Ability
Study of work
Number
Particular plants
Naming
Linguistics
Knowledge
Expectation
Farming


In [114]:
get_category_freq_list("apple computer the technology")

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,


In [115]:
fln_df.head()

Unnamed: 0_level_0,first link,year
article,Unnamed: 1_level_1,Unnamed: 2_level_1
Swimming at the 1984 Summer Olympics – Women's 200 metre freestyle,freestyle swimming,2015
WUND,WUND-FM,10
Daniele Vocaturo,chess,2015
Monserrat Montserrat,Montserrat,2015
Eric Genotte,2003 Baghdad DHL attempted shootdown incident,2015


In [178]:
pd.DataFrame([x for x in fln_df[:10].index.map(get_category_freq_list)])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,366,367,368,369,370,371,372,373,374,375
0,0,0,0,0,0,0,0,1,2,0,...,0,0,0,0,1,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1]:
categories_count = pd.DataFrame([x for x in fln_df[:10].index.map(get_category_freq_list)])

NameError: name 'pd' is not defined

In [180]:
fln_df.head().reset_index()

Unnamed: 0,article,first link,year
0,Swimming at the 1984 Summer Olympics – Women's...,freestyle swimming,2015
1,WUND,WUND-FM,10
2,Daniele Vocaturo,chess,2015
3,Monserrat Montserrat,Montserrat,2015
4,Eric Genotte,2003 Baghdad DHL attempted shootdown incident,2015


In [182]:
fln_df.head()

Unnamed: 0_level_0,first link,year
article,Unnamed: 1_level_1,Unnamed: 2_level_1
Swimming at the 1984 Summer Olympics – Women's 200 metre freestyle,freestyle swimming,2015
WUND,WUND-FM,10
Daniele Vocaturo,chess,2015
Monserrat Montserrat,Montserrat,2015
Eric Genotte,2003 Baghdad DHL attempted shootdown incident,2015


In [184]:
test_head = pd.concat([fln_df.head().reset_index(), pd.DataFrame([x for x in fln_df[:10].index.map(get_category_freq_list)])], axis=1)
test_head

Unnamed: 0,article,first link,year,0,1,2,3,4,5,6,...,366,367,368,369,370,371,372,373,374,375
0,Swimming at the 1984 Summer Olympics – Women's...,freestyle swimming,2015.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
1,WUND,WUND-FM,10.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Daniele Vocaturo,chess,2015.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Monserrat Montserrat,Montserrat,2015.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Eric Genotte,2003 Baghdad DHL attempted shootdown incident,2015.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
categories_rank = defaultdict(int)

for article in fln_df['article']:
    article_categories = get_article_category(article)
    for category in article_categories:
        categories_rank[category] += 1

In [15]:
import operator
sorted(categories_rank.items(), key=operator.itemgetter(1))[-10:]

[('Food', 1477729),
 ('Inhabited place', 1479564),
 ('Relationship', 1592215),
 ('Navigation', 1649311),
 ('Rule/government', 1673423),
 ('Farming', 1747774),
 ('Sport', 1885773),
 ('Relative position', 2168997),
 ('The Arts', 2764063),
 ('None', 4303937)]

In [27]:
def rank_categories(fln_df):
    """
    returns a default dict of categories and their rank 
    based on a given dataframe of articles
    """
    categories_rank = defaultdict(int)

    for article in fln_df['article']:
        article_categories = get_article_category(article)
        for category in article_categories:
            categories_rank[category] += 1
    return categories_rank
    

In [47]:
fln_df.head()

Unnamed: 0,article,first link,year
0,Swimming at the 1984 Summer Olympics – Women's...,freestyle swimming,2015
1,WUND,WUND-FM,10
2,Daniele Vocaturo,chess,2015
3,Monserrat Montserrat,Montserrat,2015
4,Eric Genotte,2003 Baghdad DHL attempted shootdown incident,2015


* note many articles have a date of 2015
    * in our analysis of decades: 2020 includes all articles

In [65]:
for i in range(102000):
    print(i, end='\r')



In [66]:
# runtime ~25min

# yields: 
    # year --> category1 --> rank
            # catgoery 2 --> rank
        
decades = range(1000, 2030, 10)

categories_by_decade = {}
for year in decades:
    # show progress
    print(year, end="\r")
    categories_by_decade[year] = rank_categories(fln_df[fln_df["year"] < year])



KeyboardInterrupt: 

In [None]:
# started 2:02pm
# runtime ~

# yields: 
    # category1 --> year --> rank
                # year --> rank
    # category2 --> year --> rank
        
decades = range(1000, 2030, 10)

category_by_decades = {}
for year in decades:
    # show progress
    print(year, end="\r")
    
    categories_ranked = defaultdict(int)
    articles_in_year = fln_df[fln_df["year"] < year]
    
    for article in articles_in_year:
        article_categories = get_article_category(article)
        for category in article_categories:
            # increment category count in decade
            if category in category_by_decades:
                category_by_decades[category][year] = category_by_decades[category].get(year, 0) + 1
            else:
                category_by_decades[category] = {year: 1}
            #categories_ranked[category] += 1
    #save ranked categories to dictionary
    categories_by_decade[year] = categories_ranked