In [1]:
import json
import os

from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import torch
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from src.word_to_embedding import WordToEmbedding

# Single seed used for the train/test split and for PyTorch's random number generator.
RANDOM_STATE = 42

# Seed PyTorch as well: the layer weights are initialised randomly, so without
# this a "Restart & Run All" trains a different model every time.
torch.manual_seed(RANDOM_STATE)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Shared embedding engine; the datasets built below call its `get_embedding(word)` for every item.
w2e = WordToEmbedding()

In [3]:
def generate_label_mapping(data: pd.DataFrame):
    """Build the two-way mapping between origin labels and integer class ids.

    The labels are the distinct values of the 'произход' column of ``data``.
    Ids are assigned in sorted label order, so the same data always produces
    the same mapping. Iterating a plain ``set`` of strings (the previous
    approach) can yield a different order in every interpreter session, which
    made saved models and saved mappings impossible to reproduce.

    Args:
        data: Frame containing a 'произход' column.

    Returns:
        A ``(label_to_id, id_to_label)`` tuple of dicts that are inverses of
        each other; ids run from 0 to ``number_of_labels - 1``.
    """
    # key=str keeps the sort well-defined even if the column mixes value types.
    labels = sorted(set(data['произход']), key=str)

    label_to_id = {label: index for index, label in enumerate(labels)}
    id_to_label = {index: label for label, index in label_to_id.items()}

    return label_to_id, id_to_label

In [4]:
class LoanwordsDataset(Dataset):
    """Map-style dataset that yields one word with its origin label per item.

    Each row of ``data`` must provide the columns 'дума' (the word) and
    'произход' (its origin label). The word is turned into an embedding by
    ``embeddeing_engine.get_embedding`` and the label into its integer id via
    ``label_to_id``.
    """

    def __init__(self, data: pd.DataFrame, label_to_id: dict[str, int], embeddeing_engine: WordToEmbedding):
        """Store the frame, the label mapping and the embedding engine.

        Args:
            data: Frame with the columns 'дума' and 'произход'.
            label_to_id: Mapping from origin label to integer class id.
            embeddeing_engine: Object exposing ``get_embedding(word)``. The
                parameter name keeps its original spelling because callers
                pass it by keyword.
        """
        self.__data = data
        self.__len = self.__data.shape[0]

        self.__label_mapping = label_to_id
        self.__embeddeing_engine = embeddeing_engine

    def __len__(self) -> int:
        """Return the number of rows in the underlying frame."""
        return self.__len

    def __getitem__(self, index) -> tuple[str, object, str, int]:
        """Return ``(word, word_tensor, origin, origin_id)`` for row ``index``.

        ``word_tensor`` is the engine's 2-D embedding reshaped from
        ``(rows, dim)`` to ``(rows, 1, dim)``, so iterating over it yields one
        ``(1, dim)`` slice per row (presumably one row per syllable - confirm
        against ``WordToEmbedding``). ``origin_id`` is a plain int, not a tensor.
        """
        entry = self.__data.iloc[index]

        word = entry['дума']
        origin = entry['произход']
        word_tensor = self.__embeddeing_engine.get_embedding(word)
        origin_id = self.__label_mapping[origin]

        # Insert a singleton middle axis: (rows, dim) -> (rows, 1, dim).
        rows, dim = word_tensor.shape[0], word_tensor.shape[1]
        word_tensor = word_tensor.reshape((rows, 1, dim))

        return word, word_tensor, origin, origin_id


In [5]:
class LoanwordClassifier(nn.Module):
    """Recurrent classifier that reads a word one embedding at a time.

    At every step the current input is concatenated with the previous hidden
    state, passed through two tanh layers with additive hidden-state terms,
    and projected to log-probabilities over the output classes.

    The layers live on the notebook-level ``device``. The attribute names are
    intentionally unchanged: Python name-mangles them into the ``state_dict``
    keys, so renaming them would make previously saved checkpoints unloadable.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        """Create the layers and move them to ``device``.

        Args:
            input_size: Size of one input embedding.
            hidden_size: Size of the recurrent hidden state.
            output_size: Number of classes.
        """
        super().__init__()
        self.__hidden_size = hidden_size

        self.__i2h = nn.Linear(input_size + self.__hidden_size, self.__hidden_size)
        self.__h2h = nn.Linear(hidden_size, hidden_size)
        self.__h2h_2 = nn.Linear(hidden_size, hidden_size)
        self.__h2o = nn.Linear(hidden_size, output_size)
        self.__softmax = nn.LogSoftmax(dim=1)

        # One move for the whole module instead of one per layer.
        self.to(device)

    def forward(self, input_tensor, hidden):
        """Run a single recurrent step.

        Args:
            input_tensor: Input of shape ``(batch, input_size)``; moved to
                ``device`` if it is not already there.
            hidden: Previous hidden state of shape ``(batch, hidden_size)``.

        Returns:
            ``(output, hidden)``: log-probabilities over the classes and the
            new hidden state, both on ``device``.
        """
        # Inputs may arrive on the CPU (e.g. from a DataLoader); everything
        # computed from them below is already on the right device.
        input_tensor = input_tensor.to(device)
        hidden = hidden.to(device)

        combined = torch.cat((input_tensor, hidden), 1)
        # torch.tanh is the same operation as F.tanh, which older PyTorch
        # releases flag as deprecated.
        hidden = torch.tanh(self.__i2h(combined) + self.__h2h(hidden))
        hidden = torch.tanh(hidden + self.__h2h_2(hidden))
        output = self.__softmax(self.__h2o(hidden))

        return output, hidden

    def init_hidden(self):
        """Return a zero hidden state of shape ``(1, hidden_size)`` on ``device``."""
        # Allocating directly on the device avoids a host-to-device copy for every word.
        return torch.zeros(1, self.__hidden_size, device=device)

In [6]:
def category_from_output(output, id_to_label):
    """Translate model output into the highest-scoring label.

    Args:
        output: Score tensor; the best entry along its last dimension of the
            first row is selected.
        id_to_label: Mapping from class id to label.

    Returns:
        ``(label, class_id)`` of the top-scoring class.
    """
    # topk yields (values, indices); only the indices matter here.
    best_indices = output.topk(1)[1]
    class_id = best_indices[0].item()
    label = id_to_label[class_id]
    return label, class_id

In [7]:
def train_epoch(model: LoanwordClassifier, train_dataloader, loss_fn, id_to_label, learning_rate=0.005):
    """Run one pass over the training data with a manual SGD update per word.

    Only the first item of every batch is used (``word_tensor[0]``), so the
    dataloader is expected to yield batches of size 1.

    Args:
        model: Classifier to train; updated in place.
        train_dataloader: Yields ``(word, word_tensor, label, label_tensor)``.
        loss_fn: Callable taking ``(output, target)`` and returning the loss.
        id_to_label: Unused here; kept so existing callers keep working.
        learning_rate: Step size of the gradient-descent update.

    Returns:
        Mean loss per batch over the epoch.

    Raises:
        ValueError: If a word has no embeddings to feed to the model.
    """
    sum_loss = 0
    for word, word_tensor, label, label_tensor in tqdm(train_dataloader):
        word_tensor = word_tensor[0]
        if len(word_tensor) == 0:
            # Without this check `output` below would be undefined, or would
            # still hold the previous word's output.
            raise ValueError(f"No embeddings to train on for word {word!r}")

        hidden = model.init_hidden()
        model.zero_grad()

        # Feed the word step by step; only the final output is scored.
        for syllable_embedding in word_tensor:
            output, hidden = model(syllable_embedding, hidden)

        loss = loss_fn(output.to(device), label_tensor.to(device))
        loss.backward()

        # Plain SGD step. no_grad keeps the update out of the autograd graph
        # without going through `.data`; parameters that received no gradient
        # are skipped instead of raising.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:
                    p.add_(p.grad, alpha=-learning_rate)

        sum_loss += loss.item()

    return sum_loss / len(train_dataloader)

In [8]:
# Train & Test loop
def train_loop(model: LoanwordClassifier, train_dataloader, epochs, loss_fn, id_to_label, learning_rate=0.005):
    """Train ``model`` for ``epochs`` passes, printing the mean loss after each pass.

    All arguments other than ``epochs`` are forwarded unchanged to ``train_epoch``.
    """
    for epoch in range(epochs):
        epoch_loss = train_epoch(
            model=model,
            train_dataloader=train_dataloader,
            loss_fn=loss_fn,
            id_to_label=id_to_label,
            learning_rate=learning_rate,
        )
        print(f"Epoch: {epoch}, loss: {epoch_loss:>7f}")

In [9]:
def test_loop(model: LoanwordClassifier, test_dataloader, id_to_label):
    """Predict a class id for every item of ``test_dataloader``.

    Only the first item of every batch is used (``word_tensor[0]``), so the
    dataloader is expected to yield batches of size 1.

    Args:
        model: Trained classifier.
        test_dataloader: Yields ``(word, word_tensor, label, label_tensor)``.
        id_to_label: Mapping from class id to label.

    Returns:
        ``(predicted_ids, actual_ids)``. ``predicted_ids`` holds plain ints;
        ``actual_ids`` holds the ``label_tensor`` objects exactly as the
        dataloader yielded them, so convert them before comparing.

    Raises:
        ValueError: If a word has no embeddings to feed to the model.
    """
    predicted_ids = []
    actual_ids = []

    # Inference only: skip building autograd graphs to save time and memory.
    with torch.no_grad():
        for word, word_tensor, label, label_tensor in tqdm(test_dataloader):
            word_tensor = word_tensor[0]
            if len(word_tensor) == 0:
                # Otherwise the previous word's output would be reported as
                # this word's prediction.
                raise ValueError(f"No embeddings to classify for word {word!r}")

            hidden = model.init_hidden()

            for syllable_embedding in word_tensor:
                output, hidden = model(syllable_embedding, hidden)

            _, predicted_label_id = category_from_output(output, id_to_label)
            predicted_ids.append(predicted_label_id)
            actual_ids.append(label_tensor)

    return predicted_ids, actual_ids

In [10]:
def get_distribution(train_data, label_to_id):
    """Return the relative frequency of every label, ordered by class id.

    The result always has one entry per label in ``label_to_id``. A label that
    does not occur in ``train_data`` (possible when the mapping was built from
    the full dataset but only the training split is passed in) gets frequency
    0.0. It used to be dropped, which produced a vector that was too short and
    shifted against the class ids.

    Args:
        train_data: Frame with a 'произход' column.
        label_to_id: Mapping from label to integer class id.

    Returns:
        1-D float32 tensor of length ``len(label_to_id)``.

    Raises:
        KeyError: If ``train_data`` contains a label missing from ``label_to_id``.
    """
    counts = train_data['произход'].value_counts(normalize=True)

    # Keep failing loudly for labels the mapping does not know.
    for label in counts.index:
        if label not in label_to_id:
            raise KeyError(label)

    labels_by_id = sorted(label_to_id, key=label_to_id.get)
    weights = [float(counts.get(label, 0.0)) for label in labels_by_id]
    return torch.tensor(weights, dtype=torch.float32)

## 5k

In [11]:
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere as-is.
path = '/mnt/d/Projects/masters-thesis/data/loanwords_5k_bg_words.csv'
data = pd.read_csv(path)
# The label mapping is built from the full dataset, before it is split.
label_to_id, id_to_label = generate_label_mapping(data)

# Split with the original labels; no test_size is given, so scikit-learn's default split applies.
train_data, test_data = train_test_split(data, random_state=RANDOM_STATE)


# DataLoader defaults are used: batches of one item, no shuffling.
train_dataset = LoanwordsDataset(train_data, label_to_id, embeddeing_engine=w2e)
train_dataloader = DataLoader(train_dataset)

test_dataset = LoanwordsDataset(test_data, label_to_id, embeddeing_engine=w2e)
test_dataloader = DataLoader(test_dataset)

# One output unit per origin label.
model = LoanwordClassifier(input_size=512, hidden_size=1024, output_size=len(label_to_id))
# The training-split label frequencies are passed as per-class weights of the loss.
# NOTE(review): this weights frequent classes more heavily - confirm that this,
# rather than inverse-frequency weighting, is intended.
distribution = get_distribution(train_data, label_to_id).to(device)
loss_fn = nn.NLLLoss(distribution)

In [12]:
# Number of distinct origin labels, i.e. the classifier's output size.
len(label_to_id)

15

In [13]:
# Shape of the first training item's embedding after the dataset's reshape; the last axis is the model's input_size.
train_dataset[0][1].shape

(4, 1, 512)

In [21]:
# Train the 5k model; the mean loss is printed after every epoch.
train_loop(model, train_dataloader, epochs=15, loss_fn=loss_fn, id_to_label=id_to_label)

100%|██████████| 7625/7625 [01:01<00:00, 123.59it/s]


Epoch: 0, loss: 1.439886


100%|██████████| 7625/7625 [00:56<00:00, 136.16it/s]


Epoch: 1, loss: 1.219642


100%|██████████| 7625/7625 [00:45<00:00, 169.22it/s]


Epoch: 2, loss: 1.098222


100%|██████████| 7625/7625 [01:02<00:00, 122.73it/s]


Epoch: 3, loss: 0.998513


100%|██████████| 7625/7625 [01:07<00:00, 113.58it/s]


Epoch: 4, loss: 0.921239


100%|██████████| 7625/7625 [01:17<00:00, 98.84it/s] 


Epoch: 5, loss: 0.882490


100%|██████████| 7625/7625 [01:06<00:00, 114.50it/s]


Epoch: 6, loss: 0.808037


100%|██████████| 7625/7625 [01:09<00:00, 109.47it/s]


Epoch: 7, loss: 0.759556


100%|██████████| 7625/7625 [01:06<00:00, 115.05it/s]


Epoch: 8, loss: 0.756344


100%|██████████| 7625/7625 [01:03<00:00, 120.19it/s]


Epoch: 9, loss: 0.705568


100%|██████████| 7625/7625 [01:07<00:00, 113.63it/s]


Epoch: 10, loss: 0.734255


100%|██████████| 7625/7625 [01:10<00:00, 108.69it/s]


Epoch: 11, loss: 0.741421


100%|██████████| 7625/7625 [01:11<00:00, 106.46it/s]


Epoch: 12, loss: 0.708927


100%|██████████| 7625/7625 [01:11<00:00, 106.20it/s]


Epoch: 13, loss: 0.689930


100%|██████████| 7625/7625 [01:12<00:00, 105.67it/s]

Epoch: 14, loss: 0.638702





In [22]:
# Persist the trained 5k model together with both label mappings.
today_date = datetime.today().strftime('%Y-%m-%d')

# Create the output directory if needed, so the trained model is not lost to a
# missing folder at save time.
os.makedirs('models', exist_ok=True)

torch.save(model.state_dict(), f'models/classifier-{today_date}-1024hidden-15epochs-5kbgwords.pth')

# JSON object keys are always strings: the integer ids come back as str when this file is loaded.
id_to_label_json = json.dumps(id_to_label)
with open(f'models/id-to-label-{today_date}-1024hidden-15epochs-5kbgwords.json', 'w') as fp:
    fp.write(id_to_label_json)


# NOTE(review): unlike the id-to-label file this name has no `.json` extension;
# left unchanged because other code may load the file by this exact name.
label_to_id_json = json.dumps(label_to_id)
with open(f'models/label-to-id-{today_date}-1024hidden-15epochs-5kbgwords', 'w') as fp:
    fp.write(label_to_id_json)

## 13k

In [23]:
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere as-is.
path = '/mnt/d/Projects/masters-thesis/data/loanwords_13k_bg_words.csv'
data = pd.read_csv(path)
# The label mapping is built from the full dataset, before it is split.
label_to_id, id_to_label = generate_label_mapping(data)

# Split with the original labels; no test_size is given, so scikit-learn's default split applies.
train_data, test_data = train_test_split(data, random_state=RANDOM_STATE)


# DataLoader defaults are used: batches of one item, no shuffling.
train_dataset = LoanwordsDataset(train_data, label_to_id, embeddeing_engine=w2e)
train_dataloader = DataLoader(train_dataset)

test_dataset = LoanwordsDataset(test_data, label_to_id, embeddeing_engine=w2e)
test_dataloader = DataLoader(test_dataset)

model = LoanwordClassifier(input_size=512, hidden_size=1024, output_size=len(label_to_id))
# The training-split label frequencies are passed as per-class weights of the loss.
# NOTE(review): this weights frequent classes more heavily - confirm that is intended.
distribution = get_distribution(train_data, label_to_id).to(device)
loss_fn = nn.NLLLoss(distribution)

# Create the output directory before the long training run, so a missing
# folder fails fast instead of losing the trained model at save time.
os.makedirs('models', exist_ok=True)

train_loop(model, train_dataloader, epochs=15, loss_fn=loss_fn, id_to_label=id_to_label)

today_date = datetime.today().strftime('%Y-%m-%d')

torch.save(model.state_dict(), f'models/classifier-{today_date}-1024hidden-15epochs-13kbgwords.pth')

# JSON object keys are always strings: the integer ids come back as str when this file is loaded.
id_to_label_json = json.dumps(id_to_label)
with open(f'models/id-to-label-{today_date}-1024hidden-15epochs-13kbgwords.json', 'w') as fp:
    fp.write(id_to_label_json)


# NOTE(review): unlike the id-to-label file this name has no `.json` extension;
# left unchanged because other code may load the file by this exact name.
label_to_id_json = json.dumps(label_to_id)
with open(f'models/label-to-id-{today_date}-1024hidden-15epochs-13kbgwords', 'w') as fp:
    fp.write(label_to_id_json)

100%|██████████| 14327/14327 [02:15<00:00, 106.11it/s]


Epoch: 0, loss: 0.930788


100%|██████████| 14327/14327 [02:14<00:00, 106.28it/s]


Epoch: 1, loss: 0.797522


100%|██████████| 14327/14327 [02:05<00:00, 114.21it/s]


Epoch: 2, loss: 0.727542


100%|██████████| 14327/14327 [02:00<00:00, 119.33it/s]


Epoch: 3, loss: 0.669049


100%|██████████| 14327/14327 [01:55<00:00, 124.43it/s]


Epoch: 4, loss: 0.637900


100%|██████████| 14327/14327 [01:54<00:00, 125.22it/s]


Epoch: 5, loss: 0.599897


100%|██████████| 14327/14327 [01:55<00:00, 123.64it/s]


Epoch: 6, loss: 0.583261


100%|██████████| 14327/14327 [01:56<00:00, 123.49it/s]


Epoch: 7, loss: 0.564660


100%|██████████| 14327/14327 [01:55<00:00, 124.32it/s]


Epoch: 8, loss: 0.548564


100%|██████████| 14327/14327 [01:54<00:00, 125.32it/s]


Epoch: 9, loss: 0.548488


100%|██████████| 14327/14327 [01:55<00:00, 124.15it/s]


Epoch: 10, loss: 0.545569


100%|██████████| 14327/14327 [01:57<00:00, 121.58it/s]


Epoch: 11, loss: 0.539187


100%|██████████| 14327/14327 [01:58<00:00, 120.87it/s]


Epoch: 12, loss: 0.525178


100%|██████████| 14327/14327 [02:01<00:00, 118.35it/s]


Epoch: 13, loss: 0.529139


100%|██████████| 14327/14327 [01:56<00:00, 122.48it/s]

Epoch: 14, loss: 0.528721





## 30k

In [24]:
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere as-is.
path = '/mnt/d/Projects/masters-thesis/data/loanwords_30k_bg_words.csv'
data = pd.read_csv(path)
# The label mapping is built from the full dataset, before it is split.
label_to_id, id_to_label = generate_label_mapping(data)

# Split with the original labels; no test_size is given, so scikit-learn's default split applies.
train_data, test_data = train_test_split(data, random_state=RANDOM_STATE)


# DataLoader defaults are used: batches of one item, no shuffling.
train_dataset = LoanwordsDataset(train_data, label_to_id, embeddeing_engine=w2e)
train_dataloader = DataLoader(train_dataset)

test_dataset = LoanwordsDataset(test_data, label_to_id, embeddeing_engine=w2e)
test_dataloader = DataLoader(test_dataset)

model = LoanwordClassifier(input_size=512, hidden_size=1024, output_size=len(label_to_id))
# The training-split label frequencies are passed as per-class weights of the loss.
# NOTE(review): this weights frequent classes more heavily - confirm that is intended.
distribution = get_distribution(train_data, label_to_id).to(device)
loss_fn = nn.NLLLoss(distribution)

# Create the output directory before the long training run, so a missing
# folder fails fast instead of losing the trained model at save time.
os.makedirs('models', exist_ok=True)

train_loop(model, train_dataloader, epochs=15, loss_fn=loss_fn, id_to_label=id_to_label)

today_date = datetime.today().strftime('%Y-%m-%d')

torch.save(model.state_dict(), f'models/classifier-{today_date}-1024hidden-15epochs-30kbgwords.pth')

# JSON object keys are always strings: the integer ids come back as str when this file is loaded.
id_to_label_json = json.dumps(id_to_label)
with open(f'models/id-to-label-{today_date}-1024hidden-15epochs-30kbgwords.json', 'w') as fp:
    fp.write(id_to_label_json)


# NOTE(review): unlike the id-to-label file this name has no `.json` extension;
# left unchanged because other code may load the file by this exact name.
label_to_id_json = json.dumps(label_to_id)
with open(f'models/label-to-id-{today_date}-1024hidden-15epochs-30kbgwords', 'w') as fp:
    fp.write(label_to_id_json)

100%|██████████| 27032/27032 [02:40<00:00, 167.97it/s]


Epoch: 0, loss: 0.588038


100%|██████████| 27032/27032 [02:36<00:00, 173.20it/s]


Epoch: 1, loss: 0.510282


100%|██████████| 27032/27032 [02:35<00:00, 173.54it/s]


Epoch: 2, loss: 0.470376


100%|██████████| 27032/27032 [02:33<00:00, 176.33it/s]


Epoch: 3, loss: 0.442017


100%|██████████| 27032/27032 [02:31<00:00, 178.10it/s]


Epoch: 4, loss: 0.428195


100%|██████████| 27032/27032 [02:31<00:00, 178.60it/s]


Epoch: 5, loss: 0.408192


100%|██████████| 27032/27032 [02:29<00:00, 180.27it/s]


Epoch: 6, loss: 0.385043


100%|██████████| 27032/27032 [02:30<00:00, 179.76it/s]


Epoch: 7, loss: 0.403255


100%|██████████| 27032/27032 [02:30<00:00, 180.19it/s]


Epoch: 8, loss: 0.385706


100%|██████████| 27032/27032 [02:29<00:00, 180.51it/s]


Epoch: 9, loss: 0.370143


100%|██████████| 27032/27032 [02:29<00:00, 180.95it/s]


Epoch: 10, loss: 0.387335


100%|██████████| 27032/27032 [02:29<00:00, 180.55it/s]


Epoch: 11, loss: 0.379751


100%|██████████| 27032/27032 [02:29<00:00, 180.79it/s]


Epoch: 12, loss: 0.390908


100%|██████████| 27032/27032 [02:30<00:00, 180.02it/s]


Epoch: 13, loss: 0.382018


100%|██████████| 27032/27032 [02:30<00:00, 179.87it/s]

Epoch: 14, loss: 0.371013



