In [1]:
import pandas as pd
from scipy import stats 
import numpy as np
import json
import sys

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from collections import defaultdict

#change to appropriate path
results_path = "/Users/mark/Desktop/wiki_v4/"



## Load Article Title by Year 

In [2]:
#load Wikipedia First Link Network 


with open(results_path + "fln.json") as f:
    fln_dict = json.load(f)
fln_df = pd.DataFrame.from_dict(fln_dict, orient='index')
fln_df.index.name = 'article'
fln_df.columns = ['first link']
fln_df = fln_df.reset_index()

In [3]:
#load Categories
data_path = "/Users/mark/Dropbox/Math/Complex_Systems/research/classifying-ideas/historical_wikipedia/data/"
with open(data_path + "categories_for_words.json") as f:
    categories_dict = json.load(f)

In [4]:
#create list of categories
categories_set = set([])
for category_group in categories_dict.values():
    categories_set = categories_set.union(set(category_group))

categories_list = list(categories_set)

In [5]:
#load word with years json

with open("../data/words_with_years.json") as dw:
    word_years_dict = json.load(dw)
    #default dict value of 2015
    word_years_dict = defaultdict(lambda: 2015, word_years_dict)

### Compute year for article

In [6]:
def get_article_year(title):
    """
    returns the earliest year
    the article could have appeared
    by computing the latest first 
    appearance of the words in the title
    """
    years = []
    for word in title.split():
        years.append(word_years_dict[word.lower().strip("()")])
    years.sort()
    if years:
        return years[-1]
    else:
        return 2015

In [7]:
#apply function to dataframe (runtime ~3 min)
fln_df['year'] = fln_df['article'].apply(get_article_year)

## Compute Category Rank by Decade

* a word may appear in more than one category
* a word appearing more than once in the title is not double counted in rank

In [8]:
def get_article_categories(title):
    """
    returns a dictionary of categories and their frequency 
    based on the words in the title 
        formatted as a dictionary: category --> frequency
        words appearing more than once in the title are not double counted
    """
    categories = defaultdict(int)
    try:
        for word in title.split():
            #eliminate duplicates 
            word_categories = set(categories_dict.get(word.lower().strip("()"), []))
            for category in word_categories:
                categories[category] += 1
        if categories:
            return categories
        else:
            return False
    except AttributeError:
        return False
        
len_categories = len(list(categories_set))

def get_category_freq_list(title):
    """
    returns a list of frequency count for each of the 371 categories
    based on the title of the given article
    """
    categories = get_article_categories(title)
    freq_list = np.zeros(len_categories)
    # confirm categories aren't empty
    if categories:
        for category, count in categories.items():
            index = categories_list.index(category)
            freq_list[index] += count
    return freq_list
        

In [9]:
get_article_categories("apple computer the technology")

defaultdict(int,
            {'Ability': 1,
             'Expectation': 1,
             'Farming': 1,
             'Knowledge': 1,
             'Linguistics': 1,
             'Naming': 1,
             'Number': 1,
             'Particular plants': 1,
             'Study of work': 1})

### Compute Category Frequency by article

In [10]:
fln_df.head()

Unnamed: 0,article,first link,year
0,Iceland–UK relations,Iceland–United Kingdom relations,2015
1,Aaja Nachle,Bollywood films of 2007,2015
2,Tom Arthur,Wales,2015
3,Alessandro Silva Perreira,Alessandro Silva Pereira,2015
4,Trapp family,Georg Johannes von Trapp,2015


In [11]:
fln_df.shape

(11277534, 3)

In [12]:
fln_df.iloc[3]["article"]

'Alessandro Silva Perreira'

In [13]:
range(fln_df.shape[0])

range(0, 11277534)

10 loops, best of 3: 151 ms per loop


In [61]:
%%timeit
pd.DataFrame.from_items(fln_df[:1000]["article"].map(get_category_freq_list).items(), 
                        columns=categories_list, orient='index')

10 loops, best of 3: 147 ms per loop


In [None]:
# runtime ~ (started 8:53pm)
test_categories_df = pd.DataFrame.from_records(fln_df.index.map(get_category_freq_list), 
                                          index=fln_df.index, columns=categories_list, coerce_float=False)

In [15]:
# runtime ~3min
test_single_df = pd.DataFrame(fln_df.index.map(get_category_freq_list))

In [17]:
test_single_df.head()

Unnamed: 0,0
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [26]:
%%timeit
test_single_df[0][:10000].apply(lambda x: x[0])

The slowest run took 6.76 times longer than the fastest. This could mean that an intermediate result is being cached 
100 loops, best of 3: 6.43 ms per loop


In [27]:
test_single_df[0].apply(lambda x: x[0])

0           0
1           0
2           0
3           0
4           0
5           0
6           0
7           0
8           0
9           0
10          0
11          0
12          0
13          0
14          0
15          0
16          0
17          0
18          0
19          0
20          0
21          0
22          0
23          0
24          0
25          0
26          0
27          0
28          0
29          0
           ..
11277504    0
11277505    0
11277506    0
11277507    0
11277508    0
11277509    0
11277510    0
11277511    0
11277512    0
11277513    0
11277514    0
11277515    0
11277516    0
11277517    0
11277518    0
11277519    0
11277520    0
11277521    0
11277522    0
11277523    0
11277524    0
11277525    0
11277526    0
11277527    0
11277528    0
11277529    0
11277530    0
11277531    0
11277532    0
11277533    0
Name: 0, dtype: float64

In [52]:
type(test_single_df[0][3])

numpy.ndarray

In [61]:
pd.DataFrame.from_records(test_single_df[0][:1000])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,366,367,368,369,370,371,372,373,374,375
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
%%timeit
pd.DataFrame.from_records(test_single_df[0][:1000].values)

10 loops, best of 3: 116 ms per loop


In [43]:
%%timeit
categories_df = pd.DataFrame()

for i, category in enumerate(categories_list):
    categories_df[category] = test_single_df[:1000][0].apply(lambda x: x[i])

1 loops, best of 3: 498 ms per loop


Unnamed: 0,0
0,"[[0.0], [0.0]]"
1,"[[0.0], [0.0]]"
2,"[[0.0], [0.0]]"
3,"[[0.0], [0.0]]"
4,"[[0.0], [0.0]]"
5,"[[0.0], [0.0]]"
6,"[[0.0], [0.0]]"
7,"[[0.0], [0.0]]"
8,"[[0.0], [0.0]]"
9,"[[0.0], [0.0]]"


In [39]:
categories_df.head()

Unnamed: 0,Ability,Unfashionableness,Legal obligation,Jurisprudence,Direction,Warriors collectively,Social event,Warrior,Armed forces,Locomotive,...,Impact,Love,The Arts,Indication,Speech,Nations,Social attitudes,Endeavour,Lack of power/authority,"Atmosphere, weather"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
%%timeit
pd.DataFrame([x for x in fln_df[:1000].index.map(get_category_freq_list)])

10 loops, best of 3: 112 ms per loop


In [24]:
t_df = pd.DataFrame(fln_df[:1000].index.map(get_category_freq_list))

In [60]:
# iteratively write data to csv

path = "/Users/mark/Desktop/temp_data/"

with open(path + "articles_rows_by_columns_categories.txt", "a") as myfile:
 
    n = 0
    stop = 100000

    for count in yield_category_counts(fln_df):
        n += 1
        if n % int(stop/10) == 0:
            print(n / stop)
        myfile.write(",".join([str(x) for x in count]))
        if n > stop:
            break
        



0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0


In [None]:
pd.DataFrame.from_csv(path + "articles_rows_by_columns_categories.txt")

In [None]:
# FAILED (TOO MUCH MEMORY)
# runtime ~ (started 6:10)
    # warning: eats up a lot of memory
categories_count_df = pd.DataFrame([x for x in fln_df.index.map(get_category_freq_list)])
categories_count_df.columns = categories_list

In [None]:
# join category count to dataframe

pd.concat(fln_df, categories_count_df, axis=1)

In [None]:
# write dataframe to disk 