In [1]:
import pandas as pd
from scipy import stats 
import numpy as np
import json

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from collections import defaultdict

# Directory holding the analysis result files (fln.json is read from here);
# change to the appropriate path on your machine. Keep the trailing slash:
# file names are appended to this value by plain string concatenation.
results_path = "/Users/mark/Desktop/wiki_v4/"



In [2]:
# Load the Wikipedia First Link Network.
# The encoding is given explicitly so that non-ASCII article titles decode the
# same way on every platform instead of depending on the locale default.
with open(results_path + "fln.json", encoding="utf-8") as f:
    fln_dict = json.load(f)

# One row per key of the JSON object: the keys become the 'article' column
# and the associated values the 'first link' column.
fln_df = pd.DataFrame.from_dict(fln_dict, orient='index')
fln_df.index.name = 'article'
fln_df.columns = ['first link']
fln_df = fln_df.reset_index()

In [3]:
# Load the word -> list-of-categories lookup used to tag article titles.
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
data_path = "/Users/mark/Dropbox/Math/Complex_Systems/research/classifying-ideas/historical_wikipedia/data/"
# Explicit encoding: do not rely on the platform's default text encoding
# when decoding the JSON file.
with open(data_path + "categories_for_words.json", encoding="utf-8") as f:
    categories_dict = json.load(f)

In [4]:
# Number of keys (words) in the category lookup.
len(categories_dict)

384440

In [32]:
# Inspect the raw category list stored for a sample word.
categories_dict['apple']

['Particular plants', 'Farming', 'Expectation']

In [5]:
# Raw lists may repeat the same category more than once (see the output).
categories_dict['computer']

['Number', 'Number']

In [33]:
# A word with many entries: the list spans many categories, with repeats.
categories_dict['mouse']

['Ill-health',
 'Ill-health',
 'Invertebrates',
 'Invertebrates',
 'Mammals',
 'Mammals',
 'Food',
 'Hunting',
 'Hearing/noise',
 'Hearing/noise',
 'Creation',
 'Extension in space',
 'Extension in space',
 'Extension in space',
 'Extension in space',
 'Endeavour',
 'Endeavour',
 'Number',
 'Number',
 'Understanding',
 'Enquiry',
 'Importance',
 'Importance',
 'Love',
 'Love',
 'Love',
 'Fear',
 'Taking']

In [34]:
# Another sample word; one category ('Clothing') dominates the raw list.
categories_dict['hat']

['Ill-health',
 'Growing/that grows',
 'Particular plants',
 'Food',
 'Clothing',
 'Clothing',
 'Clothing',
 'Clothing',
 'Clothing',
 'Clothing',
 'Clothing',
 'Clothing',
 'Clothing',
 'Clothing',
 'Suffering',
 'Love',
 'Office',
 'Office',
 'Artefacts',
 'Artefacts',
 'Artefacts',
 'Position/job',
 'Working']

In [35]:
def get_article_category(title):
    """
    Count, per category, how many words of an article title carry it.

    The title is split on whitespace; each word is lower-cased, stripped of
    leading/trailing parentheses and looked up in ``categories_dict``.
    A word contributes at most 1 to any given category, even when the
    lookup lists that category several times for the word.

    Returns a ``defaultdict(int)`` mapping category name -> number of title
    words tagged with it, or the list ``["None"]`` when no word of the
    title is found in the lookup (including an empty title).
    """
    tally = defaultdict(int)
    for token in title.split():
        lookup_key = token.lower().strip("()")
        # set(): collapse repeated categories within a single word
        for label in set(categories_dict.get(lookup_key, [])):
            tally[label] += 1
    # Callers iterate over the result, so the fallback is a one-element list.
    return tally if tally else ["None"]

In [36]:
# Sanity check on a multi-word title: every category in the output has count 1.
get_article_category('apple computer the technology')

defaultdict(int,
            {'Ability': 1,
             'Expectation': 1,
             'Farming': 1,
             'Knowledge': 1,
             'Linguistics': 1,
             'Naming': 1,
             'Number': 1,
             'Particular plants': 1,
             'Study of work': 1})

In [37]:
# Second sanity check on a three-word title.
get_article_category('coffee beans production')

defaultdict(int,
            {'Administration of justice': 1,
             'Causation': 1,
             'Colour': 1,
             'Creation': 1,
             'Drink': 1,
             'Duration': 1,
             'Extension in space': 1,
             'Food': 1,
             'Industry': 1,
             'Linguistics': 1,
             'Management of money': 1,
             'Manifestation': 1,
             'Manner of action': 1,
             'Number': 1,
             'Particular plants': 1,
             'The Arts': 1})

In [47]:
# Longer title: categories shared by several words reach counts above 1
# (e.g. 'Advantage': 2 in the output).
get_article_category('economic production surplus trade')

defaultdict(int,
            {'Administration of justice': 1,
             'Advantage': 2,
             'Air/space travel': 1,
             'Aspects of travel': 1,
             'Atmosphere, weather': 1,
             'Attack with aircraft': 1,
             'Behaviour': 1,
             'Causation': 1,
             'Change': 1,
             'Control': 1,
             'Creation': 1,
             'Duration': 1,
             'Extension in space': 1,
             'Hostilities at sea': 1,
             'Indication': 1,
             'Industry': 1,
             'Journalism': 1,
             'Judgement, decision': 1,
             'Law enforcement': 1,
             'Linguistics': 1,
             'Management of money': 2,
             'Manifestation': 1,
             'Manner of action': 2,
             'Means of travel': 1,
             'Merchandise': 1,
             'Moral evil': 1,
             'Navigation': 1,
             'Number': 1,
             'Office': 1,
             'Progressive motion': 

In [40]:
# Title whose words overlap on one category ('The Arts': 2 in the output).
get_article_category('linguistic anthropology discovery')

defaultdict(int,
            {'Deity': 1,
             'Discovery': 1,
             'Endeavour': 1,
             'Enquiry': 1,
             'Linguistics': 1,
             'Manifestation': 1,
             'Record': 1,
             'Relative position': 1,
             'Science of mankind': 1,
             'Sight/vision': 1,
             'Testing': 1,
             'The Arts': 2})

In [42]:
# Distinct categories of a single word, for comparison with the title counts above.
set(categories_dict['discovery'])

{'Discovery',
 'Endeavour',
 'Enquiry',
 'Manifestation',
 'Record',
 'Relative position',
 'Sight/vision',
 'Testing',
 'The Arts'}

## Most Frequently Appearing Tags in Today's Wikipedia

In [64]:
from collections import Counter

# runtime ~3min
# For every category, count the number of articles whose title is tagged with
# it. Each article adds exactly 1 per category it carries, independent of how
# many of its title words share that category.
categories_rank = defaultdict(int)

for article in fln_df['article']:
    # Iterating the returned value yields the category names only.
    for category in get_article_category(article):
        categories_rank[category] += 1

In [69]:
# Print the 50 categories with the highest article counts, largest first.
ranked = sorted(categories_rank.items(), key=lambda pair: pair[1], reverse=True)
for category, count in ranked[:50]:
    print(category, count)

N 4303937
o 4303937
e 4303937
n 4303937
The Arts 2764063
Relative position 2168997
Sport 1885773
Farming 1747774
Rule/government 1673423
Navigation 1649311
Relationship 1592215
Inhabited place 1479564
Food 1477729
Number 1467172
The body 1424885
Creation 1380219
Taking 1346398
Ill-health 1246438
Entertainment 1231272
Knowledge 1225008
Wholeness 1155924
Equipment 1100943
Motion in a certain direction 1095963
Particular time 1095262
Behaviour 1056998
Social class 1026757
Mammals 1015503
Quantity 972826
Means of travel 970587
Clothing 967941
Physics 942587
Industry 925279
Prosperity 901348
Order 900348
Kind/sort 897476
Land 887623
Drink 885607
Shape 878487
Healing/cure 878482
Biology 854386
Materials 847538
Hunting 842056
The universe 821289
Social relations 819644
Belief 817549
Punishment 814638
Office 786396
Textiles 780597
Printing 770296
Transport 758426


In [70]:
# fix "None" category
# The ranking output above contains the single-character keys "N", "o", "n"
# and "e" with identical counts -- presumably the "None" fallback was once
# iterated character by character. Fold those stray keys back into "None".
#
# The count is taken from the stray keys themselves rather than hard-coded,
# and pop() with a default is used instead of del, so this cell is safe to
# re-run and is a no-op when the stray keys are absent.
stray_counts = [categories_rank.pop(char, 0) for char in "None"]
if any(stray_counts):
    # Each stray key was incremented once per uncategorised article, so any
    # one of them (they are equal) is the article count for "None".
    categories_rank["None"] = categories_rank.get("None", 0) + max(stray_counts)

In [71]:
# Persist the category -> article-count mapping as JSON.
# NOTE(review): absolute, user-specific output directory; adjust as needed.
temp_store_path = "/Users/mark/Desktop/temp_data/"
categories_rank_file = temp_store_path + "categories_rank.json"
with open(categories_rank_file, "w") as out_file:
    json.dump(categories_rank, out_file)