# Предобработка текста

In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from __future__ import print_function
import numpy as np
import random
import sys
import io
import os
import re

# 1 Стемминг

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
# Fetch the NLTK resources used by the cells below. Newer NLTK releases look up
# the tokenizer and the tagger under the 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' names, older ones under the original names,
# so both are requested. nltk.download() is expected to report (not raise on)
# a package name the installed index does not know — TODO confirm.
for resource in ('wordnet',
                 'punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Stem several inflected forms of the same verb and print each stem.
words = ["learn", "learning", "learned", "learns"]
ps = PorterStemmer()
for stem in map(ps.stem, words):
    print(stem)

learn
learn
learn
learn


In [None]:
# Tokenize a sentence and stem every token (punctuation tokens included).
sentence = "Dear students, You should try very hard to master machine learning!"
words = nltk.word_tokenize(sentence)
ps = PorterStemmer()
word_list = [ps.stem(token) for token in words]
print("Список слов из предложения:\n", word_list)

Список слов из предложения:
 ['dear', 'student', ',', 'you', 'should', 'tri', 'veri', 'hard', 'to', 'master', 'machin', 'learn', '!']


# 2 Лемматизация

In [None]:
from nltk.stem import WordNetLemmatizer 

In [None]:
# Create the WordNet lemmatizer and lemmatize a few single words
# (no POS argument is passed here).
lemmatizer = WordNetLemmatizer()
for sample in ("classes", "women", "crying"):
    print(lemmatizer.lemmatize(sample))

class
woman
cry


In [None]:
# A sample sentence to work with
sentence = "Bad students were expelled from the institute"

# Tokenize it, i.e. split it into words
word_list = nltk.word_tokenize(sentence)
print("Список слов из предложения:\n", word_list)

# Lemmatize every token and join the lemmas back into a single string
lemmatized_output = ' '.join(map(lemmatizer.lemmatize, word_list))
print("Предложение из лемматизированных слов:\n", lemmatized_output)

Список слов из предложения:
 ['Bad', 'students', 'were', 'expelled', 'from', 'the', 'institute']
Предложение из лемматизированных слов:
 Bad student were expelled from the institute


In [None]:
# The same word can have several lemmas depending on meaning / part of speech / context:
# lemmatize "stripes" first as a verb, then as a noun.
for pos in ('v', 'n'):
    print(lemmatizer.lemmatize("stripes", pos))

strip
stripe


Можно получить [теги](https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk) токена предобученным алгоритмом.

In [None]:
# Tag a single word, then every token of the sample sentence.
for token_list in (['women'], nltk.word_tokenize(sentence)):
    print(nltk.pos_tag(token_list))

[('women', 'NNS')]
[('Bad', 'JJ'), ('students', 'NNS'), ('were', 'VBD'), ('expelled', 'VBN'), ('from', 'IN'), ('the', 'DT'), ('institute', 'NN')]


In [None]:
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Return the WordNet POS constant for `word`, as accepted by lemmatize().

    The word is tagged on its own (a one-element token list), and the first
    letter of the resulting tag selects the WordNet category. Tags starting
    with anything other than J, N, V or R fall back to NOUN.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

# 1. Create the lemmatizer
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize a single word together with its detected POS tag
word = 'women'
word_pos = get_wordnet_pos(word)
print(lemmatizer.lemmatize(word, word_pos))

# 3. Lemmatize every token of the sentence together with its detected POS tag
#    (get_wordnet_pos tags each token on its own, without the sentence context)
sentence_lemmas = []
for w in nltk.word_tokenize(sentence):
    sentence_lemmas.append(lemmatizer.lemmatize(w, get_wordnet_pos(w)))
print(sentence_lemmas)

# 4 Bag of words

## 4.1 Via python

In [None]:
def vectorize(tokens, vocabulary=None):
    '''Turn a tokenized sentence into a bag-of-words count vector.

    Args:
        tokens: list of the words of one sentence.
        vocabulary: ordered iterable of words that defines the vector
            positions. When omitted, the notebook-level ``filtered_vocab``
            is used.

    Returns:
        A list with one entry per vocabulary word: the number of times that
        word occurs in ``tokens`` (0 when it does not occur).
    '''
    from collections import Counter

    if vocabulary is None:
        # Resolved at call time, so the cell defining filtered_vocab may run
        # after this definition, as long as it runs before the first call.
        vocabulary = filtered_vocab

    # Count all tokens in one pass instead of rescanning the sentence per word.
    counts = Counter(tokens)
    return [counts[w] for w in vocabulary]

In [None]:
def unique(sequence):
    '''Return the items of `sequence` as a list without repetitions.

    The first occurrence of every item is kept and the original order is
    preserved (a plain set() would not keep the order).
    '''
    seen = set()
    ordered = []
    for item in sequence:
        if item in seen:
            continue
        seen.add(item)
        ordered.append(item)
    return ordered

In [None]:
# Stop words to exclude from the vocabulary (nltk also provides a ready-made list)
stopwords = ["to", "is", "a"]

# Special characters to exclude (a regular expression would work as well)
special_char = [",", ":", " ", ";", ".", "?"]

# The corpus: in our case just two sentences
string1 = "Welcome to Great Learning , Now start learning"
string2 = "Learning is a good practice"

# Convert both sentences to lower case
string1, string2 = (text.lower() for text in (string1, string2))

# Split the sentences into whitespace-separated tokens and show them
tokens1, tokens2 = (text.split() for text in (string1, string2))
for tokens in (tokens1, tokens2):
    print(tokens)

['welcome', 'to', 'great', 'learning', ',', 'now', 'start', 'learning']
['learning', 'is', 'a', 'good', 'practice']


In [None]:
# Build the vocabulary: every distinct token of both sentences, in first-seen order
vocab = unique([*tokens1, *tokens2])
print(vocab)

['welcome', 'to', 'great', 'learning', ',', 'now', 'start', 'is', 'a', 'good', 'practice']


In [None]:
# Filter the vocabulary: keep only words that are neither stop words nor special characters
filtered_vocab = [w for w in vocab if not (w in stopwords or w in special_char)]
print(filtered_vocab)

['welcome', 'great', 'learning', 'now', 'start', 'good', 'practice']


In [None]:
# Convert each sentence into its count vector, then show both vectors
vector1 = vectorize(tokens1)
vector2 = vectorize(tokens2)
print(vector1, vector2, sep="\n")

[1, 1, 2, 1, 1, 0, 0]
[0, 0, 1, 0, 0, 1, 1]


## 4.2 Via sklearn

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
sentence_1 = "Welcome to Great Learning , Now start learning"
sentence_2 = "Learning is a good practice"

# Unigram counts, with sklearn's built-in English stop-word list applied
CountVec = CountVectorizer(stop_words='english', ngram_range=(1, 1))

# Learn the vocabulary and count the terms of both sentences
Count_data = CountVec.fit_transform([sentence_1, sentence_2])

# One row per sentence, one column per vocabulary term
feature_names = CountVec.get_feature_names_out()
cv_dataframe = pd.DataFrame(Count_data.toarray(), columns=feature_names)
print(cv_dataframe)

   good  great  learning  practice  start  welcome
0     0      1         2         0      1        1
1     1      0         1         1      0        0


# 5 N-grams

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
sentence_1 = "This is a good job.I will not miss it for anything"
sentence_2 = "This is not good at all"

# Count unigrams and bigrams (no stop-word list is passed this time)
CountVec = CountVectorizer(ngram_range=(1, 2))

# Learn the n-gram vocabulary and count the n-grams of both sentences
Count_data = CountVec.fit_transform([sentence_1, sentence_2])

# One row per sentence, one column per n-gram
feature_names = CountVec.get_feature_names_out()
cv_dataframe = pd.DataFrame(Count_data.toarray(), columns=feature_names)
print(cv_dataframe)

   all  anything  at  at all  for  for anything  good  good at  good job  is  \
0    0         1   0       0    1             1     1        0         1   1   
1    1         0   1       1    0             0     1        1         0   1   

   ...  job will  miss  miss it  not  not good  not miss  this  this is  will  \
0  ...         1     1        1    1         0         1     1        1     1   
1  ...         0     0        0    1         1         0     1        1     0   

   will not  
0         1  
1         0  

[2 rows x 25 columns]


# 6 TF-IDF

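$TF(i,j)=\frac{n(i,j)}{\sum_{k} n(k,j)}$, где $n(i,j)$ — число вхождений слова $i$ в документ $j$, а сумма в знаменателе берётся по всем словам документа $j$.

$IDF(i)=\log\frac{N}{df(i)}$, где $N$ — число документов в корпусе, а $df(i)$ — число документов, содержащих слово $i$.

$\text{TF-IDF}(i,j)=TF(i,j)\cdot IDF(i)$

Замечание: `TfidfVectorizer` из sklearn считает IDF немного иначе (при `smooth_idf=False` это $\ln\frac{N}{df(i)}+1$) и нормирует каждую строку по L2, поэтому значения в таблице ниже не совпадают с прямым расчётом по этим формулам, а слова, встречающиеся в обоих документах, не обнуляются.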

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
sentence_1 = "This is a good job.I will not miss it for anything"
sentence_2 = "This is not good at all"

# Configure TF-IDF: unigrams only, IDF weighting on, IDF smoothing off
tf_idf_vec = TfidfVectorizer(ngram_range=(1, 1), use_idf=True, smooth_idf=False)

# Learn the vocabulary and compute the TF-IDF weights of both sentences
tf_idf_data = tf_idf_vec.fit_transform([sentence_1, sentence_2])

# One row per sentence, one column per vocabulary term
feature_names = tf_idf_vec.get_feature_names_out()
tf_idf_dataframe = pd.DataFrame(tf_idf_data.toarray(), columns=feature_names)
print(tf_idf_dataframe)

        all  anything        at       for      good        is        it  \
0  0.000000  0.367724  0.000000  0.367724  0.217184  0.217184  0.367724   
1  0.542701  0.000000  0.542701  0.000000  0.320528  0.320528  0.000000   

        job      miss       not      this      will  
0  0.367724  0.367724  0.217184  0.217184  0.367724  
1  0.000000  0.000000  0.320528  0.320528  0.000000  


# 7 Word2vec

Установим полезную утилиту Python для парсинга веб-страниц. Она поможет нам достать статью Википедии, с которой мы будем работать.

In [None]:
!pip install beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


И еще установим библиотеку для синтаксического анализа HTML страниц.

In [None]:
!pip install lxml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import bs4 as bs
import urllib.request
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Статья, с которой будем работать, конечно, про искусственный интеллект:) 

In [None]:
# Download the article. An explicit User-Agent is sent because Wikipedia may
# reject the default urllib one (TODO confirm), and the `with` block closes the
# HTTP response as soon as the body has been read.
request = urllib.request.Request(
    'https://en.wikipedia.org/wiki/Artificial_intelligence',
    headers={'User-Agent': 'Mozilla/5.0 (NLP seminar notebook)'})
with urllib.request.urlopen(request) as scraped_data:
    # At this point the article is still raw HTML
    article = scraped_data.read()

# Parse the HTML into a navigable tree
parsed_article = bs.BeautifulSoup(article, 'lxml')

# Wikipedia keeps the article text inside <p> tags; pull the text out of each
# paragraph and concatenate it in a single join instead of repeated `+=`
paragraphs = parsed_article.find_all('p')
article_text = "".join(p.text for p in paragraphs)

Убедимся, что текст без HTML кодов.

In [None]:
article_text[0:300]

'\nArtificial intelligence (AI) is intelligence—perceiving, synthesizing, and inferring information—demonstrated by machines, as opposed to intelligence displayed by non-human animals and humans. Example tasks in which this is done include speech recognition, computer vision, translation between (natu'

Супер!

Дальше сделаем [предобработку текста](https://tproger.ru/translations/regular-expression-python/):

In [None]:
# Convert the text to lower case
processed_article = article_text.lower()

# Replace everything that is not a Latin letter (digits, punctuation, ...) with a space
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)

# Collapse runs of whitespace into a single space
processed_article = re.sub(r'\s+', ' ', processed_article)

# Split the article into sentences. Punctuation is already gone, so only one
# "sentence" comes out, but the step is still needed: word_tokenize below must
# receive whole sentence strings, otherwise iterating over the bare string
# would treat single letters as words.
all_sentences = nltk.sent_tokenize(processed_article)

# Split every sentence into words
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Drop words that carry no information (prepositions, articles, etc.).
# The stop-word list is loaded once into a set, so it is not requested again
# for every word and each membership test is a constant-time lookup.
from nltk.corpus import stopwords
english_stopwords = set(stopwords.words('english'))
all_words = [[w for w in sentence_words if w not in english_stopwords]
             for sentence_words in all_words]

Посмотрим первые несколько слов нашего "чистого" списка.

In [None]:
all_words[0][0:5]

['artificial', 'intelligence', 'ai', 'intelligence', 'perceiving']

Теперь нужно установить gensim — библиотеку обработки естественного языка. С её помощью можно обрабатывать тексты и работать с векторными моделями слов, такими как Word2Vec (в нашем случае).

In [None]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from gensim.models import Word2Vec

Создадим модель Word2Vec с использованием статьи в Википедии, которую мы скопировали.

min_count=2 показывает, что в модель Word2Vec будут входить только те слова, которые встречаются в корпусе как минимум 2 раза.

In [None]:
# Train Word2Vec on the tokenized article, ignoring words seen fewer than 2 times
word2vec = Word2Vec(sentences=all_words, min_count=2)



Посмотрим уникальные слова, которые в статье встречаются как минимум дважды.

Небольшое пояснение:

Вы можете увидеть рядом со словом что-то подобное: `<gensim.models.keyedvectors.Vocab object at 0x00000282FDC67100>`. Структура KeyedVectors представляет собой отображение между ключами и векторными представлениями слов. Каждый вектор идентифицируется своим ключом поиска, чаще всего коротким строковым токеном. Это выглядит как {str => 1D numpy array}.

In [None]:
# model.wv is the model attribute that stores the individual keyed vectors.
# gensim 4 exposes the word -> index mapping as `key_to_index`; older releases
# exposed the vocabulary as `vocab`. Use whichever the installed version has.
keyed_vectors = word2vec.wv
if hasattr(keyed_vectors, "key_to_index"):
    vocab = keyed_vectors.key_to_index
else:
    vocab = keyed_vectors.vocab
print(vocab)
print(len(vocab))

{'artificial': <gensim.models.keyedvectors.Vocab object at 0x7f7a67a7cb80>, 'intelligence': <gensim.models.keyedvectors.Vocab object at 0x7f7a67a7c880>, 'ai': <gensim.models.keyedvectors.Vocab object at 0x7f7a67a7c520>, 'inferring': <gensim.models.keyedvectors.Vocab object at 0x7f7a67b459d0>, 'information': <gensim.models.keyedvectors.Vocab object at 0x7f7a72731280>, 'demonstrated': <gensim.models.keyedvectors.Vocab object at 0x7f7a727312e0>, 'machines': <gensim.models.keyedvectors.Vocab object at 0x7f7a72731340>, 'displayed': <gensim.models.keyedvectors.Vocab object at 0x7f7a727313a0>, 'non': <gensim.models.keyedvectors.Vocab object at 0x7f7a72731400>, 'human': <gensim.models.keyedvectors.Vocab object at 0x7f7a72731460>, 'humans': <gensim.models.keyedvectors.Vocab object at 0x7f7a727314c0>, 'example': <gensim.models.keyedvectors.Vocab object at 0x7f7a72731520>, 'tasks': <gensim.models.keyedvectors.Vocab object at 0x7f7a72731580>, 'done': <gensim.models.keyedvectors.Vocab object at 0x7

Посмотрим векторное представление какого-нибудь слова

In [None]:
# Look up the learned vector of one word; print it followed by its length
v1 = word2vec.wv['artificial']
print(v1, len(v1), sep="\n")

[-0.00127061  0.00391928  0.00260979 -0.00226705 -0.00655883  0.00167293
 -0.00179178  0.00187041 -0.0018538  -0.00559112 -0.00338695  0.00370871
 -0.0043751   0.00205897 -0.00078524 -0.00019658  0.00281066 -0.0036659
  0.00122854  0.00126454 -0.00274061  0.00282936  0.00451001  0.00231109
 -0.00143916 -0.00358173 -0.00091683 -0.00313363 -0.00096833  0.00685154
  0.00505011  0.0014541  -0.00561344  0.00083294  0.00658775  0.00112949
 -0.00083542  0.00249885 -0.00298281  0.00097355  0.00465556  0.0042497
  0.00600116  0.00088228  0.00396256  0.00406945  0.00504912  0.00500488
 -0.00259289 -0.00036783  0.00596993  0.00358329 -0.00173795  0.00273071
 -0.00016877 -0.00380678 -0.00027344  0.00190332  0.00522052 -0.00342142
 -0.0044152  -0.00064577 -0.00198505 -0.00350446  0.00276139 -0.00430836
  0.00034945  0.00493928  0.00134138  0.00241355 -0.00731503 -0.00024653
  0.00355664  0.00074065 -0.00098417 -0.00502845  0.00013254  0.00040826
  0.00282404  0.00285917 -0.0034858  -0.00032966  0.0

И найдём наиболее похожие слова, а также увидим их индексы подобия.

In [None]:
# Words most similar to "intelligence", each paired with its similarity score
sim_words = word2vec.wv.most_similar(positive='intelligence')
sim_words

[('many', 0.4657021164894104),
 ('artificial', 0.4557076692581177),
 ('simulated', 0.417771577835083),
 ('input', 0.3955671191215515),
 ('defined', 0.3902028203010559),
 ('human', 0.388611376285553),
 ('ai', 0.3771279454231262),
 ('intelligent', 0.363665372133255),
 ('used', 0.35614627599716187),
 ('inputs', 0.3519206941127777)]

# 8 [Word2vec with PyTorch](https://towardsdatascience.com/word2vec-with-pytorch-implementing-original-paper-2cd7040120b0)
[paper](https://arxiv.org/abs/1301.3781)

## prepare model

In [None]:
import torch.nn as nn
import torch 
# Number of context tokens on each side of the target: collate_cbow slides a
# window of CBOW_N_WORDS * 2 + 1 token ids and pops the middle one as the target.
CBOW_N_WORDS = 4 
# collate_cbow truncates every text to at most this many token ids.
MAX_SEQUENCE_LENGTH = 256 
# Passed as embedding_dim to nn.Embedding and as in_features to nn.Linear in CBOW_Model.
EMBED_DIMENSION = 300 
# Passed as max_norm to nn.Embedding in CBOW_Model.
EMBED_MAX_NORM = 1 

In [None]:
class CBOW_Model(nn.Module):
  """CBOW network: an embedding table followed by a single linear layer.

  Args:
    vocab_size: number of tokens; used as the size of the embedding table
      and as the number of output scores.
  """

  def __init__(self, vocab_size: int):
    super().__init__()
    # Layers are declared in the same order as before: embeddings, then linear.
    self.embeddings = nn.Embedding(
      vocab_size, EMBED_DIMENSION, max_norm=EMBED_MAX_NORM)
    self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

  def forward(self, inputs_):
    """Embed the input ids, average over axis 1 and apply the linear layer."""
    embedded = self.embeddings(inputs_)
    averaged = embedded.mean(axis=1)
    return self.linear(averaged)
# NOTE(review): at this point in the notebook `vocab` was last assigned from the
# gensim model (word2vec.wv.vocab); the torchtext vocabulary built by
# build_vocab() only appears in the "prepare corpus" cells further down, so this
# line presumably has to be executed after them — TODO confirm the cell order.
CBOW_model = CBOW_Model(len(vocab.get_stoi()))

## prepare corpus

In [None]:
from torchtext.data.utils import get_tokenizer

In [None]:
def get_english_tokenizer():
    """
    Return the torchtext "basic_english" tokenizer configured for English.

    Documentation:
    https://pytorch.org/text/stable/_modules/torchtext/data/utils.html#get_tokenizer
    """
    return get_tokenizer("basic_english", language="en")

In [None]:
# Notebook-level tokenizer instance
tokenizer = get_english_tokenizer()

In [None]:
!pip install torchdata

In [None]:
from torchtext.datasets import WikiText2
import torchdata

In [None]:
def get_data_iterator(ds_type, data_dir="./gdrive/My Drive/Colab Notebooks/Seminar2_07_NLP_intro"):
    """Load one split of WikiText2 and convert it to a map-style dataset.

    Args:
        ds_type: split name passed to WikiText2 as ``split`` (the notebook
            uses 'train').
        data_dir: directory passed to WikiText2 as ``root``; defaults to the
            seminar folder used throughout the notebook.

    Returns:
        The result of ``to_map_style_dataset`` applied to the chosen split.
    """
    # Imported locally: the visible notebook cells never bring this name into scope.
    from torchtext.data import to_map_style_dataset

    data_iter = WikiText2(root=data_dir, split=(ds_type))
    return to_map_style_dataset(data_iter)

In [None]:
# Training split, loaded as a map-style dataset
data_iterator = get_data_iterator('train')

In [None]:
from torchtext.vocab import build_vocab_from_iterator 
MIN_WORD_FREQUENCY = 50

In [None]:
def build_vocab(data_iter, tokenizer):
  """Build a torchtext vocabulary from the tokenized texts of `data_iter`.

  Tokens occurring fewer than MIN_WORD_FREQUENCY times are left out; the
  "<unk>" special token is added and its index is set as the default index.
  """
  tokenized_texts = map(tokenizer, data_iter)
  vocab = build_vocab_from_iterator(
    tokenized_texts,
    min_freq=MIN_WORD_FREQUENCY,
    specials=["<unk>"])
  unk_index = vocab["<unk>"]
  vocab.set_default_index(unk_index)
  return vocab

In [None]:
# build_vocab() requires both the dataset and the tokenizer: use the training
# split and the tokenizer created in the cells above.
vocab = build_vocab(data_iterator, tokenizer)

In [None]:
def collate_cbow(batch, text_pipeline):
  """Collate raw texts into CBOW (context, target) training pairs.

  Every text is converted to token ids with `text_pipeline`. Texts shorter
  than one full window (CBOW_N_WORDS * 2 + 1 ids) are skipped; longer ones are
  truncated to MAX_SEQUENCE_LENGTH ids when that constant is truthy. A window
  is then slid over the ids: the middle id becomes the target and the
  remaining CBOW_N_WORDS * 2 ids become the context.

  Args:
    batch: iterable of raw texts.
    text_pipeline: callable mapping a text to a list of token ids.

  Returns:
    (batch_input, batch_output): long tensors holding the context ids of
    every window and the matching target ids.
  """
  window = CBOW_N_WORDS * 2 + 1
  batch_input, batch_output = [], []
  for text in batch:
    text_tokens_ids = text_pipeline(text)
    if len(text_tokens_ids) < window:
      continue
    if MAX_SEQUENCE_LENGTH:
      text_tokens_ids = text_tokens_ids[:MAX_SEQUENCE_LENGTH]
    # The windowing is deliberately outside the truncation branch, so samples
    # are still produced when MAX_SEQUENCE_LENGTH is disabled (0 / None).
    for idx in range(len(text_tokens_ids) - window + 1):
      token_id_sequence = text_tokens_ids[idx : idx + window]
      # pop() removes the middle id (the target); what is left is the context
      output = token_id_sequence.pop(CBOW_N_WORDS)
      batch_input.append(token_id_sequence)
      batch_output.append(output)

  batch_input = torch.tensor(batch_input, dtype=torch.long)
  batch_output = torch.tensor(batch_output, dtype=torch.long)
  return batch_input, batch_output

In [None]:
def get_dataloader_and_vocab(
    model_name, ds_name, ds_type, data_dir, batch_size, shuffle, vocab=None):
    """Create a DataLoader for one dataset split together with its vocabulary.

    Args:
        model_name: "cbow" or "skipgram"; selects the collate function.
        ds_name: kept for interface compatibility; not forwarded, because
            get_data_iterator() in this notebook takes the split name as its
            only positional argument.
        ds_type: split name forwarded to get_data_iterator().
        data_dir: kept for interface compatibility; not forwarded (see ds_name).
        batch_size: number of raw texts per batch.
        shuffle: whether the DataLoader shuffles the texts.
        vocab: existing vocabulary to reuse; when falsy a new one is built
            from the loaded split.

    Returns:
        (dataloader, vocab)

    Raises:
        ValueError: if `model_name` is neither "cbow" nor "skipgram".
    """
    # Imported locally: the visible notebook cells never import these names.
    from functools import partial
    from torch.utils.data import DataLoader

    # Validate the model name before the (potentially slow) dataset loading.
    if model_name == "cbow":
        collate_fn = collate_cbow
    elif model_name == "skipgram":
        # NOTE(review): collate_skipgram is not defined in the visible part of
        # this notebook; it has to be provided before this branch can be used.
        collate_fn = collate_skipgram
    else:
        raise ValueError("Choose model from: cbow, skipgram")

    data_iter = get_data_iterator(ds_type)
    tokenizer = get_english_tokenizer()

    if not vocab:
        vocab = build_vocab(data_iter, tokenizer)

    # text -> tokens -> token ids
    text_pipeline = lambda x: vocab(tokenizer(x))

    dataloader = DataLoader(
        data_iter,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=partial(collate_fn, text_pipeline=text_pipeline))

    return dataloader, vocab

## train model

In [None]:
import os
import numpy as np
import json
import torch


class Trainer:
    """Main class for model training.

    Runs a train/validation loop for a fixed number of epochs, records the
    mean loss of every epoch in ``self.loss`` and can write checkpoints, the
    final model and the loss history into ``model_dir`` (created on demand).
    """

    def __init__(
        self,
        model,
        epochs,
        train_dataloader,
        train_steps,
        val_dataloader,
        val_steps,
        checkpoint_frequency,
        criterion,
        optimizer,
        lr_scheduler,
        device,
        model_dir,
        model_name,
    ):
        """Store the training configuration and move the model to `device`.

        Args:
            model: module to train.
            epochs: number of epochs run by train().
            train_dataloader: iterable of (inputs, labels) training batches.
            train_steps: a training epoch stops after this many batches; a
                value that never equals the batch counter (e.g. None) lets the
                epoch consume the whole loader.
            val_dataloader: iterable of (inputs, labels) validation batches.
            val_steps: same as `train_steps`, for validation.
            checkpoint_frequency: save a checkpoint every this many epochs;
                a falsy value disables checkpointing.
            criterion: loss function called as criterion(outputs, labels).
            optimizer: optimizer updating the model parameters.
            lr_scheduler: scheduler stepped once per epoch.
            device: device the model and the batches are moved to.
            model_dir: directory that receives checkpoints, model and losses.
            model_name: stored on the instance; not used by the methods here.
        """
        self.model = model
        self.epochs = epochs
        self.train_dataloader = train_dataloader
        self.train_steps = train_steps
        self.val_dataloader = val_dataloader
        self.val_steps = val_steps
        self.criterion = criterion
        self.optimizer = optimizer
        self.checkpoint_frequency = checkpoint_frequency
        self.lr_scheduler = lr_scheduler
        self.device = device
        self.model_dir = model_dir
        self.model_name = model_name

        # Mean loss per epoch, appended by _train_epoch / _validate_epoch
        self.loss = {"train": [], "val": []}
        self.model.to(self.device)

    def train(self):
        """Run all epochs: train, validate, report, step the scheduler, checkpoint."""
        for epoch in range(self.epochs):
            self._train_epoch()
            self._validate_epoch()
            print(
                "Epoch: {}/{}, Train Loss={:.5f}, Val Loss={:.5f}".format(
                    epoch + 1,
                    self.epochs,
                    self.loss["train"][-1],
                    self.loss["val"][-1],
                )
            )

            self.lr_scheduler.step()

            if self.checkpoint_frequency:
                self._save_checkpoint(epoch)

    def _train_epoch(self):
        """Run one training epoch and append its mean batch loss to self.loss["train"]."""
        self.model.train()
        running_loss = []

        for i, batch_data in enumerate(self.train_dataloader, 1):
            inputs = batch_data[0].to(self.device)
            labels = batch_data[1].to(self.device)

            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            running_loss.append(loss.item())

            # Batches are counted from 1, so this stops after train_steps batches
            if i == self.train_steps:
                break

        epoch_loss = np.mean(running_loss)
        self.loss["train"].append(epoch_loss)

    def _validate_epoch(self):
        """Run one validation pass (no gradients) and append its mean loss to self.loss["val"]."""
        self.model.eval()
        running_loss = []

        with torch.no_grad():
            for i, batch_data in enumerate(self.val_dataloader, 1):
                inputs = batch_data[0].to(self.device)
                labels = batch_data[1].to(self.device)

                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

                running_loss.append(loss.item())

                if i == self.val_steps:
                    break

        epoch_loss = np.mean(running_loss)
        self.loss["val"].append(epoch_loss)

    def _ensure_model_dir(self):
        """Create `self.model_dir` (and missing parents) if it does not exist yet."""
        os.makedirs(self.model_dir, exist_ok=True)

    def _save_checkpoint(self, epoch):
        """Save model checkpoint to `self.model_dir` directory"""
        epoch_num = epoch + 1
        if epoch_num % self.checkpoint_frequency == 0:
            # Make sure a missing directory cannot abort a running training
            self._ensure_model_dir()
            model_path = "checkpoint_{}.pt".format(str(epoch_num).zfill(3))
            model_path = os.path.join(self.model_dir, model_path)
            torch.save(self.model, model_path)

    def save_model(self):
        """Save final model to `self.model_dir` directory"""
        self._ensure_model_dir()
        model_path = os.path.join(self.model_dir, "model.pt")
        torch.save(self.model, model_path)

    def save_loss(self):
        """Save train/val loss as json file to `self.model_dir` directory"""
        self._ensure_model_dir()
        loss_path = os.path.join(self.model_dir, "loss.json")
        with open(loss_path, "w") as fp:
            json.dump(self.loss, fp)

In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

def get_optimizer_class():
    """Return the optimizer class used for training (Adam)."""
    return optim.Adam
    
def get_lr_scheduler(optimizer, total_epochs: int, verbose: bool = True):
    """
    Scheduler to linearly decrease learning rate, 
    so that learning rate after the last epoch is 0.

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        total_epochs: number of epochs over which the multiplier falls
            from 1 to 0.
        verbose: forwarded to LambdaLR when the installed PyTorch still
            accepts that argument; ignored otherwise.

    Returns:
        A LambdaLR scheduler with multiplier (total_epochs - epoch) / total_epochs.
    """
    import inspect

    def lr_lambda(epoch):
        return (total_epochs - epoch) / total_epochs

    # `verbose` is deprecated in recent PyTorch releases and may be absent
    # altogether, so it is only passed when the constructor declares it.
    if "verbose" in inspect.signature(LambdaLR.__init__).parameters:
        return LambdaLR(optimizer, lr_lambda=lr_lambda, verbose=verbose)
    return LambdaLR(optimizer, lr_lambda=lr_lambda)

## test model

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import torch
import sys
sys.path.append("./gdrive/My Drive/Colab Notebooks/Seminar2_07_NLP_intro")
sys.path.append("./gdrive/My Drive/Colab Notebooks/Seminar2_07_NLP_intro/utils")

import utils
import constants

from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Directory (relative to the current working directory) holding the saved files
folder = "./gdrive/My Drive/Colab Notebooks/Seminar2_07_NLP_intro"
# Use the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): both files are loaded as whole pickled Python objects (the code
# below calls methods on them directly); unpickling can execute arbitrary code,
# so only load files from a trusted source. Recent PyTorch releases may also
# need weights_only=False for such files — TODO confirm.
vocab = torch.load(f"{folder}/vocab.pt")
model = torch.load(f"{folder}/model.pt", map_location=device)

In [None]:
# Display the loaded vocabulary's mapping (presumably token -> index — confirm);
# note that this prints the whole mapping.
vocab.get_stoi()

In [None]:
# embedding from first model layer
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

(4099, 300)

In [None]:
# Display the embedding matrix before normalization
embeddings

In [None]:
def get_top_similar(word: str, topN: int = 10):
    """Return the `topN` vocabulary words closest to `word`.

    Closeness is the dot product between rows of the notebook-level
    `embeddings_norm` matrix. The highest-ranked entry is skipped
    (presumably the query word itself — confirm).

    Args:
        word: query word, looked up in the notebook-level `vocab`.
        topN: number of neighbours to return.

    Returns:
        A dict mapping each neighbour word to its score, best first, or None
        (after printing a message) when `word` maps to id 0, which is treated
        here as the out-of-vocabulary id.
    """
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return None

    # Column vector of the query word, so one matrix product scores every word
    query = embeddings_norm[word_id]
    query = query.reshape(len(query), 1)
    dists = (embeddings_norm @ query).flatten()

    ranked_ids = np.argsort(-dists)
    topN_ids = ranked_ids[1 : topN + 1]

    return {vocab.lookup_token(sim_word_id): dists[sim_word_id]
            for sim_word_id in topN_ids}

In [None]:
# Print the nearest neighbours of "mother" with their scores
similar_to_mother = get_top_similar("mother")
for word, sim in similar_to_mother.items():
    print("{}: {:.3f}".format(word, sim))

father: 0.842
husband: 0.802
daughter: 0.780
wife: 0.771
brother: 0.759
son: 0.743
friend: 0.725
parents: 0.703
followers: 0.646
isabella: 0.633


# Ссылки
- [Тэги](https://medium.com/@muddaprince456/categorizing-and-pos-tagging-with-nltk-python-28f2bc9312c3)
- [Больше про лингвистическую предобработку текстов](https://www.mygreatlearning.com/blog/natural-language-processing-tutorial/)
- [Статья про TF-IDF](https://m.habr.com/ru/company/Voximplant/blog/446738/)