In [60]:
import pandas as pd 
import json 
import os
import random
from tqdm import tqdm 
import pickle 

In [61]:
def write_pickle(file_name, content):
    """Serialize ``content`` into ``file_name`` using pickle.

    Parameters
    ----------
    file_name : str
        Destination path. Missing parent directories are created.
    content : object
        Any picklable object.
    """
    parent_dir = os.path.dirname(file_name)
    # dirname is '' for a bare file name and os.makedirs('') raises
    # FileNotFoundError, so only create folders when there is a folder part.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, 'wb') as handle:
        pickle.dump(content, handle)

In [62]:
def read_pickle(file_name):
    """Load and return the object stored in the pickle file ``file_name``.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(file_name, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [63]:
def write_json(file_name, content):
    """Write ``content`` to ``file_name`` as JSON (sorted keys, 4-space indent).

    Parameters
    ----------
    file_name : str
        Destination path. Missing parent directories are created.
    content : object
        Any JSON-serializable object.
    """
    parent_dir = os.path.dirname(file_name)
    # dirname is '' for a bare file name and os.makedirs('') raises
    # FileNotFoundError, so only create folders when there is a folder part.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, 'w') as outfile:
        json.dump(content, outfile, sort_keys=True, indent=4)

In [64]:
# Run cell
def read_json_simple(file_name):
    """Parse the JSON document stored in ``file_name`` and return it unchanged.

    Object keys stay strings, exactly as ``json.load`` produces them.
    """
    with open(file_name) as source:
        return json.load(source)

In [65]:
# Run cell
def read_json(file_name):
    """Parse the JSON document in ``file_name``, converting object keys to int.

    Every decoded JSON object is passed through ``jsonKeys2int``, so a key
    that is not an integer literal makes ``int()`` raise ``ValueError``.
    """
    with open(file_name) as source:
        return json.load(source, object_hook=jsonKeys2int)

In [66]:
# Run cell
def jsonKeys2int(x):
    """Return a copy of dict ``x`` with every key converted by ``int()``.

    Anything that is not a dict is returned unchanged. Used as the
    ``object_hook`` of ``json.load`` in ``read_json``.
    """
    if not isinstance(x, dict):
        return x
    return {int(key): value for key, value in x.items()}

In [67]:
"""
Load the dataset and rename its columns.
"""
# Read the tab-separated file into a DataFrame.
df = pd.read_csv("wikigraph_reduced.csv", sep='\t') 
# Assign the three column labels positionally; pandas raises if the file
# does not have exactly three columns.
df.columns = ["Edges",'Source', 'Target']

In [68]:
# Preview the first rows to check the new column names.
df.head()

Unnamed: 0,Edges,Source,Target
0,796,95,1185516
1,909,108,1059989
2,910,108,1062426
3,911,108,1161925
4,1141,134,541222


In [69]:
"""
Save the renamed dataset to a CSV file.
"""
# DataFrame.to_csv does not create missing folders, and none of the cells
# shown before this one creates data/, so make sure it exists first.
os.makedirs('data', exist_ok=True)
# index=False keeps the DataFrame index out of the file.
df.to_csv('data/dataset.csv', index=False)

In [70]:
"""
Taking the file wiki-topcats-page-names.txt as input, build a dictionary whose keys are the
page numbers and whose values are the page names.
"""

pages = {}
# The context manager closes the file as soon as parsing ends.
with open("wiki-topcats-page-names.txt", "r") as p:
    for pag in tqdm(p):
        # Split once: the first token is the page number, the rest is the name.
        tokens = pag.split()
        if not tokens:
            # Skip blank lines instead of failing with IndexError.
            continue
        # Re-joining with single spaces drops the newline and collapses
        # repeated whitespace inside the name.
        pages[int(tokens[0])] = ' '.join(tokens[1:])


1791489it [00:05, 300114.83it/s]


In [71]:
# Persist the page-number -> page-name dictionary to data/pages.pkl.
write_pickle('data/pages.pkl', pages)


In [74]:
# Display the page dictionary.
# NOTE(review): `pag` is only bound to the dictionary by the cell shown right
# after this one (executed earlier, as In [73]). On a top-to-bottom run `pag`
# would still be the last line read by the parsing loop.
pag

{0: 'Chiasmal syndrome',
 1: 'Kleroterion',
 2: 'Pinakion',
 3: 'LyndonHochschildSerre spectral sequence',
 4: "Zariski's main theorem",
 5: 'FultonHansen connectedness theorem',
 6: "Cayley's ruled cubic surface",
 7: 'Annulus theorem',
 8: "Bing's recognition theorem",
 9: 'BochnerMartinelli formula',
 10: 'BergmanWeil formula',
 11: 'Menallen Township, Pennsylvania',
 12: 'Missouri Route 117',
 13: 'Jadwin, Missouri',
 14: 'Gladden, Missouri',
 15: 'Missouri Route 119',
 16: 'Missouri Route 68',
 17: 'Sligo, Missouri',
 18: 'Lower Parker School',
 19: 'Lecoma, Missouri',
 20: 'Doss, Missouri',
 21: 'Boss, Missouri',
 22: 'Vulcan, Missouri',
 23: 'Glover, Missouri',
 24: 'Missouri Route 114',
 25: 'Missouri Route 12',
 26: 'Missouri Route 80',
 27: 'Missouri Route 75',
 28: 'Missouri Route 91',
 29: 'Missouri Route 147',
 30: 'Missouri Route 149',
 31: 'Goldsberry, Missouri',
 32: 'Missouri Route 162',
 33: 'Missouri Route 172',
 34: 'Missouri Route 245',
 35: 'Missouri Route 273',
 

In [73]:
# Reload the page dictionary from the pickle written earlier. This rebinds
# `pag`, which was previously the loop variable of the parsing cell.
pag  = read_pickle('data/pages.pkl')

In [36]:
"""
Save the first dictionary (page number -> page name) as JSON.
"""
# JSON object keys are always strings, so the integer page numbers are
# written out as strings.
write_json('data/pages.json', pages)

In [37]:
# Read the JSON file back; read_json converts the keys to int again.
voc = read_json('data/pages.json')

In [75]:
"""
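Build 3 dicts from the categories file (one category per line):
1. categories_inv has as key an integer counting the lines of the file from 0,
and as value the name of the category on that line.
2. inverted has the same integer key and as value the list of article numbers that appear in that category.
3. categories has as key the category name and as value the same list of article numbers.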
"""
categories_inv = {}  # line index -> category name
inverted = {}        # line index -> list of page numbers
categories = {}      # category name -> list of page numbers
count = 0
# The context manager closes the file as soon as parsing ends.
with open("wiki-topcats-categories.txt", "r") as cat:
    for i in tqdm(cat):
        # Split once and reuse the tokens for both the name and the page list.
        fields = i.split()
        if not fields:
            # Skip blank lines instead of failing with IndexError.
            continue
        # Drop the first 9 characters and the last one of the leading token
        # (presumably a "Category:" prefix and a trailing ";" -- verify
        # against the file).
        category = fields[0][9:-1]
        page_in_cat = list(map(int, fields[1:]))
        categories_inv[count] = category
        # `inverted` and `categories` share the same list object.
        inverted[count] = page_in_cat
        categories[category] = page_in_cat
        count += 1

17364it [00:02, 8527.08it/s]


In [76]:
# Persist the category-name -> page-list dictionary to data/categories.pkl.
write_pickle('data/categories.pkl', categories)

In [77]:

# Reload the dictionary from the pickle that was just written.
categories  = read_pickle('data/categories.pkl')

In [79]:
"""
Each page gets the list of the names of the categories it is linked to.
"""
cat_per_pages = {}  # page number -> list of category names
# The context manager closes the file as soon as parsing ends.
with open("wiki-topcats-categories.txt", "r") as cat:
    for i in tqdm(cat):
        # Split once and reuse the tokens for both the name and the page list.
        fields = i.split()
        if not fields:
            # Skip blank lines instead of failing with IndexError.
            continue
        # Drop the first 9 characters and the last one of the leading token
        # (presumably a "Category:" prefix and a trailing ";" -- verify
        # against the file).
        category = fields[0][9:-1]
        page_in_cat = list(map(int, fields[1:]))

        for x in page_in_cat:
            # Create the list on first sight of the page, then append in
            # place; no need to copy the list for every new category.
            cat_per_pages.setdefault(x, []).append(category)

17364it [00:17, 974.94it/s] 


In [80]:
# Display the page -> category-names mapping.
cat_per_pages

{301: ['Buprestoidea'],
 302: ['Buprestoidea'],
 303: ['Buprestoidea', 'Insect_families'],
 304: ['Buprestoidea'],
 305: ['Buprestoidea'],
 306: ['Buprestoidea'],
 307: ['Buprestoidea'],
 308: ['Buprestoidea'],
 309: ['Buprestoidea'],
 310: ['Buprestoidea'],
 311: ['Buprestoidea'],
 312: ['Buprestoidea'],
 313: ['Buprestoidea'],
 314: ['Buprestoidea'],
 315: ['Buprestoidea'],
 316: ['Buprestoidea'],
 317: ['Buprestoidea'],
 318: ['Buprestoidea'],
 319: ['Buprestoidea'],
 320: ['Buprestoidea'],
 321: ['Buprestoidea'],
 322: ['Buprestoidea'],
 323: ['Buprestoidea'],
 324: ['Buprestoidea'],
 325: ['Buprestoidea'],
 326: ['Buprestoidea'],
 327: ['Buprestoidea'],
 328: ['Buprestoidea'],
 329: ['Buprestoidea'],
 330: ['Buprestoidea'],
 331: ['Buprestoidea'],
 332: ['Buprestoidea'],
 333: ['Buprestoidea'],
 334: ['Buprestoidea'],
 335: ['Buprestoidea'],
 336: ['Buprestoidea'],
 337: ['Buprestoidea'],
 338: ['Buprestoidea'],
 339: ['Buprestoidea'],
 340: ['Buprestoidea'],
 341: ['Buprestoidea'

In [81]:
"""
Reduce the cat_per_pages dictionary
so that every page is linked to exactly one category, picked at random among its categories.

"""
# A dedicated, seeded generator makes the random pick repeatable across
# kernel restarts without touching the global `random` state.
category_rng = random.Random(0)
one_cat_per_pages = {}  # page number -> one-element list with the chosen category
for key, elem in tqdm(cat_per_pages.items()):
    # choices(..., k=1) returns a one-element list; that shape is kept because
    # the next processing cell reads the category as elem[0].
    one_cat_per_pages[key] = category_rng.choices(elem, k = 1)


100%|██████████| 1791489/1791489 [00:09<00:00, 189780.98it/s]


In [82]:
# Display the page -> single-category mapping.
one_cat_per_pages

{301: ['Buprestoidea'],
 302: ['Buprestoidea'],
 303: ['Buprestoidea'],
 304: ['Buprestoidea'],
 305: ['Buprestoidea'],
 306: ['Buprestoidea'],
 307: ['Buprestoidea'],
 308: ['Buprestoidea'],
 309: ['Buprestoidea'],
 310: ['Buprestoidea'],
 311: ['Buprestoidea'],
 312: ['Buprestoidea'],
 313: ['Buprestoidea'],
 314: ['Buprestoidea'],
 315: ['Buprestoidea'],
 316: ['Buprestoidea'],
 317: ['Buprestoidea'],
 318: ['Buprestoidea'],
 319: ['Buprestoidea'],
 320: ['Buprestoidea'],
 321: ['Buprestoidea'],
 322: ['Buprestoidea'],
 323: ['Buprestoidea'],
 324: ['Buprestoidea'],
 325: ['Buprestoidea'],
 326: ['Buprestoidea'],
 327: ['Buprestoidea'],
 328: ['Buprestoidea'],
 329: ['Buprestoidea'],
 330: ['Buprestoidea'],
 331: ['Buprestoidea'],
 332: ['Buprestoidea'],
 333: ['Buprestoidea'],
 334: ['Buprestoidea'],
 335: ['Buprestoidea'],
 336: ['Buprestoidea'],
 337: ['Buprestoidea'],
 338: ['Buprestoidea'],
 339: ['Buprestoidea'],
 340: ['Buprestoidea'],
 341: ['Buprestoidea'],
 342: ['Bupresto

In [83]:
"""
Finally, we create a last dictionary that takes us back to the initial state, i.e. 
the name of the category and the pages (integers) in that category. 
This time, each page is linked to a single category.
"""

categories_red = {}  # category name -> list of page numbers assigned to it
for key, elem in tqdm(one_cat_per_pages.items()):
    # elem is a one-element list holding the category chosen for this page.
    # Append to the category's list in place: copying the whole list before
    # each append makes the loop quadratic in the size of large categories.
    categories_red.setdefault(elem[0], []).append(key)


100%|██████████| 1791489/1791489 [04:55<00:00, 6070.61it/s]  


In [84]:
# Persist the reduced category -> pages dictionary to data/categories_red.pkl.
write_pickle('data/categories_red.pkl', categories_red)

In [85]:
# Reload the dictionary from the pickle that was just written.
categories_red  = read_pickle('data/categories_red.pkl')

In [86]:
# Display the reduced category -> pages mapping.
categories_red

{'Buprestoidea': [301,
  302,
  303,
  304,
  305,
  306,
  307,
  308,
  309,
  310,
  311,
  312,
  313,
  314,
  315,
  316,
  317,
  318,
  319,
  320,
  321,
  322,
  323,
  324,
  325,
  326,
  327,
  328,
  329,
  330,
  331,
  332,
  333,
  334,
  335,
  336,
  337,
  338,
  339,
  340,
  341,
  342,
  343,
  344,
  345,
  346,
  347,
  348,
  349,
  350,
  351,
  352,
  353,
  354,
  355,
  356,
  357,
  358,
  359,
  360,
  361,
  362,
  363,
  364,
  365,
  366,
  367,
  368,
  369,
  370,
  371,
  372,
  373,
  374,
  375,
  376,
  377,
  378,
  379,
  380,
  381,
  382,
  383,
  384,
  385,
  386,
  387,
  388,
  389,
  390,
  391,
  392,
  393,
  394,
  395,
  396,
  397,
  398,
  399,
  400,
  401,
  402,
  403,
  404,
  405,
  406,
  407,
  408,
  409,
  410,
  411,
  412,
  413,
  414,
  415,
  416,
  417,
  418,
  419,
  420,
  421,
  422,
  423,
  424,
  425,
  426,
  427,
  428,
  429,
  430,
  431,
  432,
  433,
  434,
  435,
  436,
  437,
  438,
  439,
  440,
  44