In [1]:
import pandas as pd
from scipy import stats 
import numpy as np
import json
import sys

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from collections import defaultdict

#change to appropriate path
results_path = "/Users/mark/Desktop/wiki_v4/"



## Load Article Title by Year 

In [2]:
#load Wikipedia First Link Network 


with open(results_path + "fln.json") as f:
    fln_dict = json.load(f)
fln_df = pd.DataFrame.from_dict(fln_dict, orient='index')
fln_df.index.name = 'article'
fln_df.columns = ['first link']
fln_df = fln_df.reset_index()

In [3]:
#load Categories
data_path = "/Users/mark/Dropbox/Math/Complex_Systems/research/classifying-ideas/historical_wikipedia/data/"
with open(data_path + "categories_for_words.json") as f:
    categories_dict = json.load(f)

In [4]:
#create list of categories
categories_set = set([])
for category_group in categories_dict.values():
    categories_set = categories_set.union(set(category_group))

categories_list = list(categories_set)

In [5]:
#load word with years json

with open("../data/words_with_years.json") as dw:
    word_years_dict = json.load(dw)
    #default dict value of 2015
    word_years_dict = defaultdict(lambda: 2015, word_years_dict)

### Compute year for article

In [6]:
def get_article_year(title):
    """
    returns the earliest year
    the article could have appeared
    by computing the latest first 
    appearance of the words in the title
    """
    years = []
    for word in title.split():
        years.append(word_years_dict[word.lower().strip("()")])
    years.sort()
    if years:
        return years[-1]
    else:
        return 2015

In [7]:
#apply function to dataframe (runtime ~3 min)
fln_df['year'] = fln_df['article'].apply(get_article_year)

## Compute Category Rank by Decade

* a word may appear in more than one category
* a word appearing more than once in the title is not double counted in rank

In [20]:
def get_article_categories(title):
    """
    returns a dictionary of categories and their frequency 
    based on the words in the title 
        formatted as a dictionary: category --> frequency
        words appearing more than once in the title are not double counted
    """
    categories = defaultdict(int)
    try:
        for word in title.split():
            #eliminate duplicates 
            word_categories = set(categories_dict.get(word.lower().strip("()"), []))
            for category in word_categories:
                categories[category] += 1
        if categories:
            return categories
        else:
            return False
    except AttributeError:
        return False
        
len_categories = len(list(categories_set))

def get_category_freq_list(title):
    """
    returns a list of frequency count for each of the 371 categories
    based on the title of the given article
    """
    categories = get_article_categories(title)
    freq_list = np.zeros(len_categories)
    # confirm categories aren't empty
    if categories:
        for category, count in categories.items():
            index = categories_list.index(category)
            freq_list[index] += count
    return freq_list
        

In [21]:
get_article_categories("apple computer the technology")

defaultdict(int,
            {'Ability': 1,
             'Expectation': 1,
             'Farming': 1,
             'Knowledge': 1,
             'Linguistics': 1,
             'Naming': 1,
             'Number': 1,
             'Particular plants': 1,
             'Study of work': 1})

### Compute Category Frequency by article

In [10]:
fln_df.head()

Unnamed: 0,article,first link,year
0,"Kangavar, East Azerbaijan",Kolah Boz-e Sharqi Rural District,2015
1,Noah Ponzer,Sandy Hook Elementary School shooting,2015
2,List of colonial governors of South Carolina,Province of South Carolina,2015
3,Jacobsdorf,Municipalities of Germany,2015
4,Battle of flarchheim,Battle of Flarchheim,2015


In [11]:
fln_df.shape

(11277534, 3)

In [21]:
fln_df.iloc[3]["article"]

'Earnestown, Upper Canada'

In [24]:
range(fln_df.shape[0])

range(0, 11277534)

In [13]:
def yield_category_counts(fln_df):
    num_rows = range(fln_df.shape[0])
    for i in num_rows:
        article = fln_df.iloc[i]["article"]
        yield get_category_freq_list(article)
        


In [38]:
categories_list

['The occult',
 'Of/pertaining to events/occurrences',
 'Aspects of emotion',
 'Wrongdoing',
 'Difficulty',
 'Military equipment',
 'Frequency',
 'Transfer of property',
 'Inhabitant',
 'Armed forces',
 'Moral evil',
 'Lack of power/authority',
 'Enquiry',
 'Working',
 'People collectively',
 'Earth science',
 'Suffering',
 'Reading',
 'Love',
 'Supply',
 'Particular time',
 'Industry',
 'Failure/lack of success',
 'Region of the earth',
 'Buying',
 'Necessity',
 'Removing from dwelling',
 'Trader',
 'Chemistry',
 'Motion in specific manner',
 'Transport',
 'War',
 'Wholeness',
 'Compassion',
 'Fashionableness',
 'Drill/training',
 'Animals collectively',
 'Punishment',
 'Memory',
 'In a ripening manner',
 'Knowledge',
 'Hearing/noise',
 'Restlessness',
 'Pride',
 'Attention',
 'Merchandise',
 'Ethnicities',
 'By family relationships',
 'Trading conditions',
 'Languages of the world',
 'Variety/species',
 'Office',
 'Ill-health',
 'Customs/values/civilization',
 'Relationship',
 'By na

In [39]:
pd.DataFrame(np.random.randn(10, 5))

Unnamed: 0,0,1,2,3,4
0,-0.166146,0.705149,0.557564,0.884386,1.985566
1,-1.530212,1.773549,0.276405,0.957895,0.401078
2,-1.674192,1.441262,0.924968,0.117663,2.155317
3,1.212631,0.925633,0.551006,-0.485107,-0.997685
4,-1.383375,1.375042,0.384287,-0.503566,1.620084
5,-0.166672,-0.168069,-0.73749,-0.320284,0.497272
6,-1.471701,0.406183,0.111683,0.55417,0.049797
7,1.26841,1.875699,0.418347,-0.108458,-1.854208
8,-0.366696,0.357469,1.191215,-1.162609,-1.005563
9,0.22445,-0.191218,1.06547,1.312666,-0.674643


In [44]:
fln_df[:1000]["article"]

0                              Kangavar, East Azerbaijan
1                                            Noah Ponzer
2           List of colonial governors of South Carolina
3                                             Jacobsdorf
4                                   Battle of flarchheim
5                                        Marco II Sanudo
6                                Kedzierzyn-Kozle County
7                                        Lake Nettilling
8                                 London Buses route 499
9                                         Cucamonga Peak
10                                  The Bryan Adams band
11                                        Ashley Feraude
12                                          Prescott, MA
13                                        Debra Lawrence
14                       Texas Board of Criminal Justice
15                                          U-ka Saegusa
16                                     Carlos Blackaller
17                             

10 loops, best of 3: 151 ms per loop


In [41]:
# try adding zeros to dataframe to see if it's simply a memory issue

pd.DataFrame(fln_df[:1000]["article"].map(get_category_freq_list), columns=categories_list)

Unnamed: 0,The occult,Of/pertaining to events/occurrences,Aspects of emotion,Wrongdoing,Difficulty,Military equipment,Frequency,Transfer of property,Inhabitant,Armed forces,...,Types of laws,Reptiles,Hunting,Means of travel,Chordates,Exercise of authority,Victory,Relinquishing,Doing,Attack with aircraft


In [61]:
%%timeit
pd.DataFrame.from_items(fln_df[:1000]["article"].map(get_category_freq_list).items(), 
                        columns=categories_list, orient='index')

10 loops, best of 3: 147 ms per loop


In [65]:
# runtime ~3min
test_single_df = pd.DataFrame(fln_df.index.map(get_category_freq_list))

In [66]:
test_single_df

Unnamed: 0,0
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
test_df.apply(lambda x: x[0])

In [62]:
%%timeit
pd.DataFrame([x for x in fln_df[:1000].index.map(get_category_freq_list)])

10 loops, best of 3: 112 ms per loop


In [24]:
t_df = pd.DataFrame(fln_df[:1000].index.map(get_category_freq_list))

In [36]:
t_df.apply(lambda x: pd.DataFrame([i for i in x]))

TypeError: 'numpy.int64' object is not iterable

In [60]:
# iteratively write data to csv

path = "/Users/mark/Desktop/temp_data/"

with open(path + "articles_rows_by_columns_categories.txt", "a") as myfile:
 
    n = 0
    stop = 100000

    for count in yield_category_counts(fln_df):
        n += 1
        if n % int(stop/10) == 0:
            print(n / stop)
        myfile.write(",".join([str(x) for x in count]))
        if n > stop:
            break
        



0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0


In [None]:
pd.DataFrame.from_csv(path + "articles_rows_by_columns_categories.txt")

In [None]:
# FAILED (TOO MUCH MEMORY)
# runtime ~ (started 6:10)
    # warning: eats up a lot of memory
categories_count_df = pd.DataFrame([x for x in fln_df.index.map(get_category_freq_list)])
categories_count_df.columns = categories_list

In [None]:
# join category count to dataframe

pd.concat(fln_df, categories_count_df, axis=1)

In [None]:
# write dataframe to disk 