In [None]:
import ast
import glob
import itertools
import pickle
import zipfile
from os.path import join
from typing import List

import matplotlib.pyplot as plt
import pandas as pd
from google.colab import drive
import tensorflow as tf
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data as data_utils
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
import seaborn as sn
from gensim.models import KeyedVectors

In [None]:
# Register tqdm with pandas so that the `progress_apply` call used further
# down in this notebook is available on Series/DataFrame objects.
tqdm.pandas()

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; every data/model path below lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


## Data loading

In [None]:
# Read the wiki_pages_lemmas table from the shared drive; the first CSV column
# becomes the DataFrame index.
lemmas_csv_path = "/content/drive/Shareddrives/GCN-PROJECT/wiki_pages_lemmas.csv"
data = pd.read_csv(lemmas_csv_path, index_col=0)

In [None]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,title,text,category,noun_lemmas
0,Mirosław_Miller,Mirosław Miller – Dyrektor Międzynarodowego La...,Chemia,"['mirosława', 'miller', 'dyrektor', 'laborator..."
1,Chimerokształtne,"Chimerokształtne, chimery, przerazy (Chimaerif...",Biologia,"['chimera', 'przeraza', 'chimaeriformes', 'rzą..."
2,(2855)_Bastian,(2855) Bastian (1931 TB2) – planetoida z grupy...,Astronomia,"['bastian', 'tb2', 'planetoida', 'grupa', 'pas..."
3,Cryptocephalus_celtibericus,"""Cryptocephalus celtibericus"" – gatunek chrząs...",Biologia,"['cryptocephalus', 'celtibericus', 'gatunek', ..."
4,Język_maszynowy,"Język maszynowy, kod maszynowy – zestaw rozkaz...",Matematyka,"['język', 'koda', 'zestaw', 'rozkaz', 'proceso..."


In [None]:
# (rows, columns) of the table as loaded, before incomplete rows are dropped.
data.shape

(75792, 4)

In [None]:
# Drop every row that has a missing value in any column.
# The result is rebound to `data` instead of using inplace=True, so the frame
# object shown by the earlier cells is not mutated behind the reader's back.
data = data.dropna()

Convert the `noun_lemmas` column from its string form (e.g. `"['a', 'b']"`) into real Python lists.

In [None]:
def _parse_lemma_list(raw):
    """Turn one stringified lemma list from the CSV back into a Python list.

    Args:
        raw: The cell value. Normally a string such as "['a', 'b']"; a value
            that is already a list is returned unchanged so that the cell can
            be re-run on the same kernel without failing.

    Returns:
        A list of lemma strings. An empty list literal ("[]") yields [] rather
        than [''], and quotes or commas inside a lemma are preserved instead of
        being stripped or used as split points.
    """
    if isinstance(raw, list):
        return raw

    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a valid Python literal; handled by the fallback below.
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return list(parsed)

    # Fallback for malformed cells: the original naive split, minus the empty
    # tokens it used to produce.
    return [token for token in raw[1:-1].replace('\'', '').split(', ') if token]


data['noun_lemmas'] = data['noun_lemmas'].progress_apply(_parse_lemma_list)

100%|██████████| 75443/75443 [00:01<00:00, 42006.30it/s]


In [None]:
# Sanity check: the first entry of the column should now be a list, not a string.
type(data['noun_lemmas'].iloc[0])

list

In [None]:
# Collect the per-row lemma lists, then chain them into one flat list that
# holds every lemma occurrence (duplicates included).
noun_lemmas_list = data['noun_lemmas'].to_list()

flat_list = list(itertools.chain.from_iterable(noun_lemmas_list))

In [None]:
# Total number of lemma occurrences across all rows (duplicates included).
len(flat_list)

5824370

In [None]:
# Deduplicate the lemmas.
# NOTE(review): a set of strings has no stable iteration order across kernel
# restarts, so anything built by iterating over it (the embedding dicts and the
# row order of the tables saved below) may be ordered differently on each run.
unique_noun_lemmas = set(flat_list)

In [None]:
# Number of distinct lemmas.
len(unique_noun_lemmas)

284013

## Embedding

### word2vec model

In [None]:
# Unpack the zipped word2vec model from the shared drive into the current
# working directory.
model_zip_file = '/content/drive/Shareddrives/GCN-PROJECT/models/word2vec.zip'
directory_to_extract_to = './'

with zipfile.ZipFile(model_zip_file, mode='r') as archive:
    archive.extractall(path=directory_to_extract_to)

In [None]:
# Load the word2vec vectors from the file extracted in the previous cell
# (path is relative to the working directory used for extraction).
word2vec = KeyedVectors.load('word2vec/word2vec_100_3_polish.bin')

In [None]:
# Length of the vector returned for a sample word, i.e. the embedding size.
len(word2vec.get_vector('pies'))

100

In [None]:
# Look up each distinct lemma in the word2vec model.
# Hits go into `embeddings` (lemma -> vector); lemmas for which the lookup
# raises KeyError are collected in `not_found_lemmas`.
embeddings = {}
not_found_lemmas = []

for lemma in tqdm(unique_noun_lemmas):
    try:
        vector = word2vec.get_vector(lemma)
    except KeyError:
        not_found_lemmas.append(lemma)
    else:
        embeddings[lemma] = vector

100%|██████████| 284013/284013 [00:00<00:00, 292441.81it/s]


In [None]:
# Number of lemmas for which word2vec returned a vector.
len(embeddings)

174800

In [None]:
# Peek at ten of the lemmas that received a vector.
list(embeddings.keys())[:10]

['laurenta',
 'sterrewacht',
 'przełożenie',
 'rubenidów',
 'karamoja',
 'multiplexer',
 'diorella',
 'naftol',
 'tapsus',
 'axi']

In [None]:
# Number of lemmas the word2vec lookup failed for.
len(not_found_lemmas)

109213

In [None]:
# Sample of lemmas without a word2vec vector, to see what kind of tokens are missed.
not_found_lemmas[:10]

['',
 'karitskaya',
 'nosaxa',
 'rl10',
 'rn10',
 'radionowela',
 'preserve">orcad',
 'liocranum',
 'j04414489+2301513',
 '”receptorów']

### Save word2vec embeddings to CSV

In [None]:
# Two-column table built from the current `embeddings` dict: one row per lemma,
# with the lemma text in 'lemma' and its vector in 'vec'.
df = pd.DataFrame(
    {
        'lemma': list(embeddings),
        'vec': list(embeddings.values()),
    }
)

In [None]:
# Preview the lemma/vector table before writing it out.
df.head()

Unnamed: 0,lemma,vec
0,laurenta,"[-2.999931, -0.708544, -1.311219, 1.821917, 0...."
1,sterrewacht,"[-0.218208, 0.070714, 0.048131, 0.260631, -0.0..."
2,przełożenie,"[0.391039, -0.547801, 0.859111, 0.311708, 1.24..."
3,rubenidów,"[0.175287, 0.267407, -0.228604, 0.011897, 0.05..."
4,karamoja,"[0.003889, 0.009073, 0.058901, -0.098975, 0.04..."


In [None]:
# Write the word2vec lemma/vector table to the shared drive (index column included).
# NOTE(review): each 'vec' cell presumably holds a numpy array, so the CSV will
# contain its text representation rather than separate numeric columns —
# confirm that whatever reads this file can parse that form without loss.
vec_file = '/content/drive/Shareddrives/GCN-PROJECT/word2vec_lemmas.csv'

df.to_csv(vec_file)

### fastText model

In [None]:
# Unpack the zipped fastText model from the shared drive into the current
# working directory (rebinds the variables used for the word2vec archive).
model_zip_file = '/content/drive/Shareddrives/GCN-PROJECT/models/fasttext_v2.zip'
directory_to_extract_to = './'

with zipfile.ZipFile(model_zip_file, mode='r') as archive:
    archive.extractall(path=directory_to_extract_to)

In [None]:
# Load the fastText model from the file extracted in the previous cell.
# NOTE(review): the following cells access `fastText.wv`, so the loaded object
# is presumably a full model rather than bare keyed vectors — confirm.
fastText = KeyedVectors.load('fasttext_100_3_polish.bin')

In [None]:
# Length of the vector returned for a sample word, i.e. the embedding size.
len(fastText.wv['pies'])

100

In [None]:
# Look up each distinct lemma in the fastText model.
# This rebinds `embeddings` and `not_found_lemmas`, replacing the word2vec
# results held under the same names.
embeddings = {}
not_found_lemmas = []

# Resolve `.wv` once, outside the try block. If the loaded object has no `.wv`
# attribute the cell now fails immediately, instead of the AttributeError
# handler below silently filing every single lemma under `not_found_lemmas`.
fasttext_vectors = fastText.wv

for lemma in tqdm(unique_noun_lemmas):
    try:
        vector = fasttext_vectors[lemma]
    except (KeyError, AttributeError):
        # Either exception raised by the lookup itself is treated as
        # "no vector available for this lemma".
        not_found_lemmas.append(lemma)
    else:
        embeddings[lemma] = vector

100%|██████████| 284013/284013 [00:01<00:00, 163388.60it/s]


In [None]:
# Number of lemmas for which fastText returned a vector.
len(embeddings)

177370

In [None]:
# Peek at ten of the lemmas that received a fastText vector.
list(embeddings.keys())[:10]

['laurenta',
 'sterrewacht',
 'przełożenie',
 'rubenidów',
 'karamoja',
 'multiplexer',
 'diorella',
 'naftol',
 'tapsus',
 'axi']

In [None]:
# Number of lemmas the fastText lookup failed for.
len(not_found_lemmas)

106643

In [None]:
# Sample of lemmas without a fastText vector.
not_found_lemmas[:10]

['',
 'karitskaya',
 'nosaxa',
 'rl10',
 'rn10',
 'radionowela',
 'preserve">orcad',
 'liocranum',
 'j04414489+2301513',
 '”receptorów']

### Save fastText embeddings to CSV

In [None]:
# Two-column table built from the current `embeddings` dict: one row per lemma,
# with the lemma text in 'lemma' and its vector in 'vec'. Rebinds `df`.
df = pd.DataFrame(
    {
        'lemma': list(embeddings),
        'vec': list(embeddings.values()),
    }
)

In [None]:
# Preview the lemma/vector table before writing it out.
df.head()

Unnamed: 0,lemma,vec
0,laurenta,"[0.76456106, -2.068957, -1.6155088, -1.5636369..."
1,sterrewacht,"[-0.35970423, -1.2159524, -1.1942011, 0.820352..."
2,przełożenie,"[0.41044968, -3.8949878, -3.6449554, 4.757927,..."
3,rubenidów,"[2.432582, -1.5145886, -1.1150995, 0.33890077,..."
4,karamoja,"[2.4278355, -2.9679384, -0.6736491, 1.914125, ..."


In [None]:
# Write the fastText lemma/vector table to the shared drive (index column included).
# NOTE(review): each 'vec' cell presumably holds a numpy array, so the CSV will
# contain its text representation rather than separate numeric columns —
# confirm that whatever reads this file can parse that form without loss.
vec_file = '/content/drive/Shareddrives/GCN-PROJECT/fasttext_lemmas.csv'

df.to_csv(vec_file)