In [80]:
import json
import os
from collections import Counter

import networkx as nx
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np


tqdm.pandas()

In [38]:
df = pd.read_csv('../out/wiki_pages_lemmas.csv', index_col=0)

In [39]:
df.head()

Unnamed: 0,title,text,category,noun_lemmas
0,Mirosław_Miller,Mirosław Miller – Dyrektor Międzynarodowego La...,Chemia,"['mirosława', 'miller', 'dyrektor', 'laborator..."
1,Chimerokształtne,"Chimerokształtne, chimery, przerazy (Chimaerif...",Biologia,"['chimera', 'przeraza', 'chimaeriformes', 'rzą..."
2,(2855)_Bastian,(2855) Bastian (1931 TB2) – planetoida z grupy...,Astronomia,"['bastian', 'tb2', 'planetoida', 'grupa', 'pas..."
3,Cryptocephalus_celtibericus,"""Cryptocephalus celtibericus"" – gatunek chrząs...",Biologia,"['cryptocephalus', 'celtibericus', 'gatunek', ..."
4,Język_maszynowy,"Język maszynowy, kod maszynowy – zestaw rozkaz...",Matematyka,"['język', 'koda', 'zestaw', 'rozkaz', 'proceso..."


## All nouns in graph

In [7]:
G = nx.Graph()

In [16]:
for i in tqdm(range(len(df))):
    row = df.iloc[i]
    lemmas = row['noun_lemmas'][1:-1].replace('\'', '').split(', ')
    for lemma in lemmas:
        G.add_edge(row['title'], lemma)

100%|██████████| 75792/75792 [00:40<00:00, 1874.29it/s]


In [17]:
print(f'Number of nodes: {G.number_of_nodes()}')
print(f'Number of edges: {G.number_of_edges()}')

conn_comp = list(nx.connected_components(G))
print(f'Number of connected components: {len(conn_comp)}')

conn_comp_sizes = []
for comp in sorted(conn_comp):
    conn_comp_sizes.append(len(comp))

N = 10
print(f'{N} biggest connected components sizes: {conn_comp_sizes[:N]}')

Number of nodes: 359640
Number of edges: 3632871
Number of connected components: 30
10 biggest connected components sizes: [359574, 3, 2, 2, 2, 4, 2, 3, 2, 2]


## Top common nouns in category

In [40]:
df['noun_lemmas'] = df.progress_apply(lambda row: row['noun_lemmas'][1:-1].replace('\'', '').split(', '), axis=1)

100%|██████████| 75792/75792 [00:03<00:00, 22267.37it/s]


### top 1000 words

In [59]:
top_k = 1000

In [60]:
top_nouns = {}
categories = list(df['category'].unique())
print(len(categories), 'categories')

7 categories


In [61]:
for cat in categories:
    noun_lists = df[df['category'] == cat]['noun_lemmas']
    noun_list = []
    for l in tqdm(noun_lists):
        noun_list = noun_list + l
    
    noun_list = Counter(noun_list).most_common()[0:top_k]
    noun_list = list(map(lambda x: x[0], noun_list))

    top_nouns[cat] = noun_list

100%|██████████| 2775/2775 [00:10<00:00, 255.41it/s]
100%|██████████| 23482/23482 [13:24<00:00, 29.20it/s]
100%|██████████| 25640/25640 [06:23<00:00, 66.92it/s]
100%|██████████| 8082/8082 [01:42<00:00, 78.50it/s]
100%|██████████| 5123/5123 [00:42<00:00, 121.51it/s]
100%|██████████| 4955/4955 [00:41<00:00, 120.73it/s]
100%|██████████| 5735/5735 [01:00<00:00, 95.46it/s]


In [67]:
for key, value in top_nouns.items():
    print()
    print(key)
    print()
    print(value[0:100])


Chemia

['rok', 'lek', 'badanie', 'grupa', 'związka', 'działanie', 'chemia', 'kwas', 'uniwersytet', 'to', 'praca', 'wirus', 'nauka', 'czas', 'co', 'dno', 'profesor', 'reakcja', 'substancja', 'dawka', '%', 'nagroda', 'komórka', 'białko', 'proces', 'przypadek', 'receptor', 'związek', 'wydział', 'choroba', 'mechanizm', 'rna', 'członek', 'instytut', 'układ', 'dziedzina', 'cząsteczka', 'postać', 'wynik', 'polska', 'woda', 'metoda', 'stopień', 'of', 'nazwa', 'studium', 'synteza', 'stężenie', 'temperatura', 'stan', 'raz', 'nobel', 'akademia', 'atom', 'człowiek', 'wojna', 'chemik', 'pacjent', 'efekt', 'biochemia', 'właściwość', 'preparat', 'struktura', 'organizm', 'osoba', 'zasada', 'okres', 'zastosowanie', 'funkcja', 'roztwór', 'objaw', 'środek', 'wpływ', 'university', 'miejsce', 'produkt', 'aktywność', 'doktór', 'forma', 'godzina', 'kwasy', 'zaburzenie', 'materiał', 'enzym', 'ilość', 'wzgląd', 'tytuł', 'część', 'produkcja', 'inhibitor', 'życiorys', 'sposób', 'krew', 'wieko', 'dzień', 'zakła

In [62]:
G = nx.Graph()

In [63]:
for i in tqdm(range(len(df))):
    row = df.iloc[i]
    for lemma in row['noun_lemmas']:
        if lemma in top_nouns[row['category']]:
            G.add_edge(row['title'], lemma)

100%|██████████| 75792/75792 [01:23<00:00, 906.45it/s]


In [64]:
print(f'Number of nodes: {G.number_of_nodes()}')
print(f'Number of edges: {G.number_of_edges()}')

conn_comp = list(nx.connected_components(G))
print(f'Number of connected components: {len(conn_comp)}')

conn_comp_sizes = []
for comp in sorted(conn_comp):
    conn_comp_sizes.append(len(comp))

N = 10
print(f'{N} biggest connected components sizes: {conn_comp_sizes[:N]}')

Number of nodes: 78359
Number of edges: 2168382
Number of connected components: 2
10 biggest connected components sizes: [78308, 51]


In [72]:
i = 0
titles = set(df['title'])
for node in tqdm(list(G.nodes())):
    if node in titles:
        i += 1

print()
print(i, 'articles in graph')
print(G.number_of_nodes() - i, 'words in graph')

100%|██████████| 78359/78359 [00:00<00:00, 1921850.78it/s]
75240 articles in graph
3119 words in graph



In [84]:
degrees = np.array([G.degree(n) for n in G.nodes() if G.degree(n)])
print('Min', np.min(degrees))
print('Max', np.max(degrees))
print('Mean', np.mean(degrees))
print('Median', np.median(degrees))

Min 1
Max 46369
Mean 55.34481042381858
Median 19.0


In [85]:
[n for n in G.nodes() if G.degree(n) == 46369][0]

'rok'

### top 10 000 words

In [86]:
top_k = 10000

categories = list(df['category'].unique())
for cat in categories:
    noun_lists = df[df['category'] == cat]['noun_lemmas']
    noun_list = []
    for l in tqdm(noun_lists):
        noun_list = noun_list + l
    
    noun_list = Counter(noun_list).most_common()[0:top_k]
    noun_list = list(map(lambda x: x[0], noun_list))

    top_nouns[cat] = noun_list

print()
print()
G = nx.Graph()
for i in tqdm(range(len(df))):
    row = df.iloc[i]
    for lemma in row['noun_lemmas']:
        if lemma in top_nouns[row['category']]:
            G.add_edge(row['title'], lemma)

print()
print()
print(f'Number of nodes: {G.number_of_nodes()}')
print(f'Number of edges: {G.number_of_edges()}')
conn_comp = list(nx.connected_components(G))
print(f'Number of connected components: {len(conn_comp)}')
conn_comp_sizes = []
for comp in sorted(conn_comp):
    conn_comp_sizes.append(len(comp))
N = 10
print(f'{N} biggest connected components sizes: {conn_comp_sizes[:N]}')

print()
print()
i = 0
titles = set(df['title'])
for node in tqdm(list(G.nodes())):
    if node in titles:
        i += 1
print()
print(i, 'articles in graph')
print(G.number_of_nodes() - i, 'words in graph')

print()
print()
degrees = np.array([G.degree(n) for n in G.nodes() if G.degree(n)])
print('Min', np.min(degrees))
print('Max', np.max(degrees))
print('Mean', np.mean(degrees))
print('Median', np.median(degrees))

100%|██████████| 2775/2775 [00:10<00:00, 252.32it/s]
100%|██████████| 23482/23482 [13:18<00:00, 29.42it/s]
100%|██████████| 25640/25640 [06:26<00:00, 66.28it/s]
100%|██████████| 8082/8082 [01:44<00:00, 77.69it/s]
100%|██████████| 5123/5123 [00:42<00:00, 119.55it/s]
100%|██████████| 4955/4955 [00:43<00:00, 114.58it/s]
100%|██████████| 5735/5735 [01:02<00:00, 91.09it/s]
  0%|          | 19/75792 [00:00<06:57, 181.57it/s]

100%|██████████| 75792/75792 [07:05<00:00, 177.96it/s]


Number of nodes: 111069
Number of edges: 3100925
100%|██████████| 111069/111069 [00:00<00:00, 1692253.75it/s]Number of connected components: 1
10 biggest connected components sizes: [111069]



75596 articles in graph
35473 words in graph



Min 1
Max 46369
Mean 55.83781253094923
Median 20.0


### top 100 000 words

In [87]:
top_k = 100000

categories = list(df['category'].unique())
for cat in categories:
    noun_lists = df[df['category'] == cat]['noun_lemmas']
    noun_list = []
    for l in tqdm(noun_lists):
        noun_list = noun_list + l
    
    noun_list = Counter(noun_list).most_common()[0:top_k]
    noun_list = list(map(lambda x: x[0], noun_list))

    top_nouns[cat] = noun_list

print()
print()
G = nx.Graph()
for i in tqdm(range(len(df))):
    row = df.iloc[i]
    for lemma in row['noun_lemmas']:
        if lemma in top_nouns[row['category']]:
            G.add_edge(row['title'], lemma)

print()
print()
print(f'Number of nodes: {G.number_of_nodes()}')
print(f'Number of edges: {G.number_of_edges()}')
conn_comp = list(nx.connected_components(G))
print(f'Number of connected components: {len(conn_comp)}')
conn_comp_sizes = []
for comp in sorted(conn_comp):
    conn_comp_sizes.append(len(comp))
N = 10
print(f'{N} biggest connected components sizes: {conn_comp_sizes[:N]}')

print()
print()
i = 0
titles = set(df['title'])
for node in tqdm(list(G.nodes())):
    if node in titles:
        i += 1
print()
print(i, 'articles in graph')
print(G.number_of_nodes() - i, 'words in graph')

print()
print()
degrees = np.array([G.degree(n) for n in G.nodes() if G.degree(n)])
print('Min', np.min(degrees))
print('Max', np.max(degrees))
print('Mean', np.mean(degrees))
print('Median', np.median(degrees))

100%|██████████| 2775/2775 [00:11<00:00, 234.14it/s]
100%|██████████| 23482/23482 [14:40<00:00, 26.66it/s]
100%|██████████| 25640/25640 [06:35<00:00, 64.83it/s]
100%|██████████| 8082/8082 [01:38<00:00, 81.78it/s]
100%|██████████| 5123/5123 [00:39<00:00, 128.33it/s]
100%|██████████| 4955/4955 [00:39<00:00, 126.41it/s]
100%|██████████| 5735/5735 [00:57<00:00, 99.58it/s]
  0%|          | 16/75792 [00:00<08:56, 141.29it/s]

100%|██████████| 75792/75792 [20:36<00:00, 61.28it/s]


Number of nodes: 335767
Number of edges: 3604250
100%|██████████| 335767/335767 [00:00<00:00, 2208235.59it/s]Number of connected components: 30
10 biggest connected components sizes: [335701, 3, 2, 2, 2, 4, 2, 3, 2, 2]



75649 articles in graph
260118 words in graph



Min 1
Max 46369
Mean 21.468756608004956
Median 2.0
