In [1]:
import json

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import torch


from collections import Counter

from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from src.word_to_embedding import WordToEmbedding

RANDOM_STATE = 42
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
w2e = WordToEmbedding()

  if isinstance(row[0], str):
  word = row[0].strip()
  if isinstance(row[1], str):
  transcription = row[1].strip()


In [None]:
class LoanwordsDataset(Dataset):
    def __init__(self, data: pd.DataFrame, label_to_id: dict[str, int], embeddeing_engine: WordToEmbedding):
        self.__data = data
        self.__len = self.__data.shape[0]
        
        self.__label_mapping = label_to_id
        self.__embeddeing_engine = embeddeing_engine


    def __len__(self):
        return self.__len

    def __getitem__(self, index) -> tuple[str, str]:
        entry = self.__data.iloc[index]

        word = entry['дума']
        origin = entry['произход']
        word_tensor = self.__embeddeing_engine.get_embedding(word)
        origin_tensor = self.__label_mapping[origin]

        word_tensor_shape = word_tensor.shape
        word_tensor = word_tensor.reshape((word_tensor_shape[0], 1, word_tensor_shape[1]))

        return word, word_tensor, origin, origin_tensor


In [3]:
class LoanwordClassifier(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        super().__init__()
        self.__hidden_size = hidden_size

        self.__i2h = nn.Linear(input_size + self.__hidden_size, self.__hidden_size).to(device)
        self.__h2o = nn.Linear(hidden_size, output_size).to(device)
        self.__softmax = nn.LogSoftmax(dim=1).to(device)

    def forward(self, input_tensor, hidden):
        combined = torch.cat((input_tensor.to(device), hidden.to(device)), 1).to(device)
        hidden = self.__i2h(combined).to(device)
        output = self.__h2o(hidden).to(device)
        output = self.__softmax(output).to(device)

        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, self.__hidden_size)

In [5]:
def category_from_output(output, id_to_label):
    top_n, top_i = output.topk(1)

    category_i = top_i[0].item()
    return id_to_label[category_i], category_i

In [4]:
def predict(word, model, id_to_label):
    word_tensor = torch.Tensor(w2e.get_embedding(word))

    word_tensor_shape = word_tensor.shape
    word_tensor = word_tensor.reshape((word_tensor_shape[0], 1, word_tensor_shape[1]))

    hidden = model.init_hidden()

    for syllable_embedding in word_tensor:
        output, hidden = model(syllable_embedding, hidden)
    
    # predicted_label, predicted_label_id = category_from_output(output, id_to_label)
    
    print(output.shape)
    probabilities, label_ids = output.topk(5)

    return {id_to_label[label_id.item()]: probability.item() for probability, label_id in zip(probabilities[0], label_ids[0])}

In [7]:
path = 'data/loanwords.csv'
data = pd.read_csv(path)

with open('models/label-to-id-2024-01-24-1024hidden-10epochs.pth') as fp:
    label_to_id = json.load(fp)

id_to_label = {_id: label for label, _id in label_to_id.items()}

model = LoanwordClassifier(input_size=512, hidden_size=1024, output_size=len(label_to_id))

state_dict_path = 'models/classifier-2024-01-24-1024hidden-10epochs.pth'
model.load_state_dict(torch.load(state_dict_path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [8]:
predict('рахат', model, id_to_label)

torch.Size([1, 17])


{'bg': -0.2701360285282135,
 'fr': -2.4236598014831543,
 'el': -2.6807174682617188,
 'de': -3.7396557331085205,
 'la': -4.143362522125244}