## Word prediction

In [1]:
from src.loanword_predictor import LoanwordPredictor
from src.word_to_embedding import WordToEmbedding

In [2]:
# Build the embedding helper, then the classifier wrapper around it.
# All three artifact file names share the "2024-02-06-1024hidden-10epochs" suffix,
# so they are presumably from the same training run — keep them in sync when swapping models.
w2e = WordToEmbedding()
predictor = LoanwordPredictor(
    word_to_embedding=w2e,
    classifier_state_dict_path='models/classifier-2024-02-06-1024hidden-10epochs.pth',
    id_to_label_path='models/id-to-label-2024-02-06-1024hidden-10epochs.json',
    label_to_id_path='models/label-to-id-2024-02-06-1024hidden-10epochs.json',
)

In [7]:
# Classify a single word; the recorded output below maps each label to a probability.
predictor.predict('юзър')

{'bg': 0.6859366972949508,
 'en': 0.11518884078887988,
 'el': 0.06588915630261272,
 'la': 0.03677257948203205,
 'fr': 0.0339133863456033}

## Sentence tokenization

In [11]:
from nltk.tokenize import word_tokenize

In [13]:
def tokenize_sentence(sentence: str) -> list[str]:
    """Tokenize a sentence with NLTK's ``word_tokenize`` and lowercase every token.

    Args:
        sentence: The raw sentence text.

    Returns:
        The tokens in their original order, each converted to lowercase.
    """
    lowered_tokens = []
    for token in word_tokenize(sentence):
        lowered_tokens.append(token.lower())
    return lowered_tokens

In [14]:
# Sample sentence; it is reused by the loanword analysis cell further down.
sentence = 'Бързата кафява лисица прескочи новия юзър'
tokenize_sentence(sentence)

['бързата', 'кафява', 'лисица', 'прескочи', 'новия', 'юзър']

## All together

In [22]:
def loanwords_analysis(
    sentence: str,
    threshold: float = 0.8,
    native_label: str = 'bg',
) -> list[tuple[str, dict[str, float]]]:
    """Find the words in a sentence that the classifier does not confidently label as native.

    Each token produced by ``tokenize_sentence`` is passed to ``predictor.predict``.
    A token is reported when the probability assigned to ``native_label`` is
    strictly below ``threshold``.

    Args:
        sentence: The sentence to analyse.
        threshold: Cut-off for the native-label probability; tokens scoring
            strictly below it are reported. Defaults to 0.8.
        native_label: The label treated as "not a loanword". Defaults to 'bg'.

    Returns:
        A list of ``(word, probabilities)`` tuples in sentence order, where
        ``probabilities`` is the mapping returned by ``predictor.predict``.
    """
    candidates = []
    for word in tokenize_sentence(sentence):
        probabilities = predictor.predict(word)
        # The recorded outputs show only a handful of labels per word, so the
        # native label may be missing from the mapping (presumably a top-k cut —
        # TODO confirm). Treat a missing label as probability 0.0 instead of
        # raising KeyError.
        if probabilities.get(native_label, 0.0) < threshold:
            candidates.append((word, probabilities))
    return candidates

In [23]:
import json
# ensure_ascii=False keeps the Cyrillic words readable instead of \uXXXX escapes;
# each (word, probabilities) tuple is serialized as a two-element JSON array.
json.dumps(loanwords_analysis(sentence), ensure_ascii=False)

'[["новия", {"bg": 0.716601178121964, "el": 0.1774362767505004, "tr": 0.03090380324062624, "en": 0.025659734668319542, "fr": 0.01685414778646352}], ["юзър", {"bg": 0.6859366972949508, "en": 0.11518884078887988, "el": 0.06588915630261272, "la": 0.03677257948203205, "fr": 0.0339133863456033}]]'