In [1]:
import json
import os
from collections import Counter

import networkx as nx
import pandas as pd
from tqdm import tqdm

In [18]:
annotations_path = '../out/annotations.csv'

df = pd.read_csv(annotations_path, header=None)
df.columns = ['source', 'target']

In [19]:
df.head()

Unnamed: 0,source,target
0,AWK,Język_programowania
1,AWK,Plik
2,AWK,System_operacyjny
3,AWK,Unix
4,AWK,Tablica_asocjacyjna


In [20]:
wiki_pages_dir = '../out/extracted_pages/'

wiki_pages = []
pages_category_dict = {}
for json_file in tqdm(list(os.listdir(wiki_pages_dir))):
    with open(wiki_pages_dir+json_file, 'r') as f:
        try:
            wiki_page = json.load(f)

            wiki_page_title = wiki_page['url'].split('/')[-1]
            wiki_pages.append(wiki_page_title)

            wiki_page_main_category = wiki_page['main_category']
            pages_category_dict[wiki_page_title] = wiki_page_main_category
        except UnicodeDecodeError:
            pass

100%|██████████| 75793/75793 [00:06<00:00, 11028.01it/s]


In [21]:
len(wiki_pages) - len(set(wiki_pages))

143

In [22]:
df.shape

(1086032, 2)

In [23]:
wiki_pages_set = set(wiki_pages)

df = df[
    (df['source'].isin(wiki_pages_set)) &
    (df['target'].isin(wiki_pages_set))
]

In [24]:
df.shape

(347655, 2)

In [12]:
G = nx.from_pandas_edgelist(df)

In [14]:
print(f'Number of nodes: {G.number_of_nodes()}')
print(f'Number of edges: {G.number_of_edges()}')

conn_comp = list(nx.connected_components(G))
print(f'Number of connected components: {len(conn_comp)}')

conn_comp_sizes = []
for comp in sorted(conn_comp):
    conn_comp_sizes.append(len(comp))

N = 10
print(f'{N} biggest connected components sizes: {conn_comp_sizes[:N]}')

Number of nodes: 68414
Number of edges: 310081
Number of connected components: 297
10 biggest connected components sizes: [67756, 3, 3, 2, 2, 2, 1, 2, 2, 1]


In [30]:
biggest_conn_copm = sorted(list(nx.connected_components(G)))[0]
bcc_main_categories = list(map(lambda x: pages_category_dict[x], list(biggest_conn_copm)))

Counter(bcc_main_categories)

Counter({'Astronomia': 25463,
         'Informatyka': 4551,
         'Chemia': 2422,
         'Matematyka': 7261,
         'Biologia': 19030,
         'Psychologia': 4399,
         'Fizyka': 4630})