# Pre -Processing 

In [1]:
import pandas as pd 
import os
import random
from tqdm import tqdm 
import pickle 
from functions import *

In [2]:
"""
Renaming the columns dataset
"""
df = pd.read_csv("wikigraph_reduced.csv", sep='\t') 
df.columns = ["Edges",'Source', 'Target']

In [3]:
df.head(5)

Unnamed: 0,Edges,Source,Target
0,796,95,1185516
1,909,108,1059989
2,910,108,1062426
3,911,108,1161925
4,1141,134,541222


In [4]:
"""
save renamed dataset into a csv file
"""
df.to_csv('data/dataset.csv', index=False)

In [5]:
"""
Looping over the txt file named "wiki-topcats-page-names.txt", 
we build a dictionary with key an integer and as value the name (as string) the name of the page. 
This index allows to work directly with page indexes.
"""
# keys: number of the page (article)
# values: name of the page (article)
p = open("wiki-topcats-page-names.txt", "r")
pages = {}
for pag in tqdm(p): 
    list_ = pag.split()[1:]
    aux = ' '.join(list_)
    pages[int(pag.split()[0])] = aux

1791489it [00:06, 265641.24it/s]


In [18]:
pages

{0: 'Chiasmal syndrome',
 1: 'Kleroterion',
 2: 'Pinakion',
 3: 'LyndonHochschildSerre spectral sequence',
 4: "Zariski's main theorem",
 5: 'FultonHansen connectedness theorem',
 6: "Cayley's ruled cubic surface",
 7: 'Annulus theorem',
 8: "Bing's recognition theorem",
 9: 'BochnerMartinelli formula',
 10: 'BergmanWeil formula',
 11: 'Menallen Township, Pennsylvania',
 12: 'Missouri Route 117',
 13: 'Jadwin, Missouri',
 14: 'Gladden, Missouri',
 15: 'Missouri Route 119',
 16: 'Missouri Route 68',
 17: 'Sligo, Missouri',
 18: 'Lower Parker School',
 19: 'Lecoma, Missouri',
 20: 'Doss, Missouri',
 21: 'Boss, Missouri',
 22: 'Vulcan, Missouri',
 23: 'Glover, Missouri',
 24: 'Missouri Route 114',
 25: 'Missouri Route 12',
 26: 'Missouri Route 80',
 27: 'Missouri Route 75',
 28: 'Missouri Route 91',
 29: 'Missouri Route 147',
 30: 'Missouri Route 149',
 31: 'Goldsberry, Missouri',
 32: 'Missouri Route 162',
 33: 'Missouri Route 172',
 34: 'Missouri Route 245',
 35: 'Missouri Route 273',
 

In [6]:
"""
usage of write/read pickle to save and read without re-run the cose
"""

write_pickle('data/pages.pkl', pages)

In [7]:
pages = read_pickle('data/pages.pkl')

In this step, we work with categories in two main steps: 
1. Since the provided dataset has been reduced from the entire one taking into account only pages belonging to categories with a length between 5000 and 30000, we considered categories within this margin.
2. Since each page (node) in the graph must belong to a single category, we created a structure in which each page in the graph (dictionary key) contains (as values) a list with all the categories to which that page belongs. 

In [8]:
"""
keys: number of the page (article)
values: [list of categories]
"""

cat = open("wiki-topcats-categories.txt", "r")
cat_per_pages = {}
for i in tqdm(cat):
    category = i.split()[0][9:-1]
    page_in_cat = list(map(int, i.split()[1:]))

    if len(page_in_cat) >5000 and len(page_in_cat) < 30000:
        for x in page_in_cat:
            if x not in cat_per_pages:
                a= []
                a.append(category)
                cat_per_pages[x]=a
            else: 
                aux = list(cat_per_pages[x])
                aux.append(category)
                cat_per_pages[x]= aux

17364it [00:03, 4865.59it/s]


This is the structure of the dictionary 

In [9]:
cat_per_pages

{22860: ['English_footballers', 'Association_football_defenders'],
 28411: ['English_footballers',
  'Association_football_defenders',
  'Year_of_birth_missing'],
 28961: ['English_footballers'],
 28979: ['English_footballers'],
 29264: ['English_footballers',
  'The_Football_League_players',
  'Association_football_defenders'],
 29573: ['English_footballers', 'The_Football_League_players'],
 29582: ['English_footballers',
  'The_Football_League_players',
  'Year_of_death_missing'],
 30896: ['English_footballers'],
 30900: ['English_footballers', 'Association_football_midfielders'],
 31902: ['English_footballers'],
 33973: ['English_footballers',
  'The_Football_League_players',
  'Association_football_forwards'],
 34042: ['English_footballers', 'Association_football_defenders'],
 41141: ['English_footballers'],
 43573: ['English_footballers', 'The_Football_League_players'],
 48582: ['English_footballers', 'The_Football_League_players'],
 48583: ['English_footballers',
  'The_Football_

In [10]:
write_pickle('data/categories.pkl', cat_per_pages)

In [11]:
categories = read_pickle('data/categories.pkl', )

As mentioned above, since a page must belong to only one category, the above structure allows us to choose only one category per page by choosing one randomly. 

In [12]:
"""
keys: number of the page (article)
values: category chosen at random and referring to that page
"""

one_cat_per_pages = {}
for key, elem in tqdm(cat_per_pages.items()):
    one_cat_per_pages[key] = random.choices(elem, k = 1)

100%|██████████| 149794/149794 [00:01<00:00, 148266.72it/s]


In [13]:
write_pickle('data/one_cat_per_pages.pkl', one_cat_per_pages)

In [14]:
one_cat_per_pages = read_pickle('data/one_cat_per_pages.pkl')

Finally, we create a last dictionary that takes us back to the initial state, i.e. 
the name of the category and the pages (integers) in that category. 
This time, each page is linked to a single category.

In [15]:
"""
keys: category chosen at random 
values: number of the pages (article) referring to the category
"""

categories_red = {}
for key, elem in tqdm(one_cat_per_pages.items()):

    if elem[0] not in categories_red:
            a= []
            a.append(key)
            categories_red[elem[0]]=a
    else: 
            aux = list(categories_red[elem[0]])
            aux.append(key)
            categories_red[elem[0]]= aux


100%|██████████| 149794/149794 [00:08<00:00, 17653.33it/s]


In [19]:
categories_red

{'English_footballers': [22860,
  28961,
  28979,
  29573,
  30896,
  30900,
  31902,
  41141,
  43573,
  48582,
  48718,
  48730,
  48922,
  54184,
  72482,
  72493,
  72496,
  72518,
  72522,
  72526,
  72528,
  72529,
  72530,
  72532,
  72538,
  72549,
  72567,
  72587,
  72589,
  72590,
  72591,
  72592,
  72593,
  72595,
  72607,
  72609,
  72613,
  72616,
  72617,
  72628,
  72630,
  72652,
  72654,
  72655,
  72656,
  72657,
  72658,
  72660,
  72661,
  72663,
  72664,
  72665,
  72668,
  72680,
  72681,
  72683,
  72685,
  72694,
  72695,
  72696,
  72698,
  72700,
  72701,
  72702,
  72703,
  72705,
  72706,
  72709,
  72710,
  72711,
  72712,
  72713,
  72714,
  72715,
  72716,
  72717,
  72718,
  72727,
  72728,
  72735,
  72736,
  72737,
  72738,
  72739,
  72742,
  72745,
  72746,
  72749,
  72750,
  72751,
  72752,
  72754,
  72756,
  72759,
  72760,
  72761,
  72764,
  72765,
  72768,
  72769,
  72772,
  72773,
  72774,
  72775,
  72776,
  72778,
  72779,
  72780,
  727

In [16]:
write_pickle('data/categories_red.pkl', categories_red)

In [17]:
categories_red = read_pickle('data/categories_red.pkl')