# Предобработка текста

In [1]:
!pip install nltk




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from __future__ import print_function
import numpy as np
import random
import sys
import io
import os
import re

# 1 Stemming, (removing and replacing suffixes to get to the root of the word) [WordNet](https://wordnet.princeton.edu/)

In [2]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
nltk.download('wordnet')  
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Safuan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Safuan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Safuan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
words= ["learn", "learning", "learned", "learns"]
ps =  PorterStemmer()
for w in words:
    rootWord=ps.stem(w)
    print(rootWord)

learn
learn
learn
learn


In [4]:
sentence="Dear students, You should try very hard to master machine learning!"
words = nltk.word_tokenize(sentence)
ps = PorterStemmer()
word_list = []
for w in words:
    rootWord=ps.stem(w)
    word_list.append(rootWord)
print("Список слов из предложения:\n", word_list)

Список слов из предложения:
 ['dear', 'student', ',', 'you', 'should', 'tri', 'veri', 'hard', 'to', 'master', 'machin', 'learn', '!']


# 2 Lemmatization

In [5]:
from nltk.stem import WordNetLemmatizer 

In [6]:
# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()
# Lemmatize Single Word
print(lemmatizer.lemmatize("classes"))
print(lemmatizer.lemmatize("women"))
print(lemmatizer.lemmatize("crying"))

class
woman
cry


In [7]:
# Напишем какое-нибудь предложение
sentence = "Bad students were expelled from the institute"

# Сделаем его токенизацию, то есть разобьем на слова
word_list = nltk.word_tokenize(sentence)
print("Список слов из предложения:\n", word_list)

# Сделаем леммитизацию каждого слова
lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list]) #Ваш код здесь
print("Предложение из лемматизированных слов:\n", lemmatized_output)

Список слов из предложения:
 ['Bad', 'students', 'were', 'expelled', 'from', 'the', 'institute']
Предложение из лемматизированных слов:
 Bad student were expelled from the institute


In [8]:
# Иногда одно и то же слово может иметь несколько лемм в зависимости от значения/части речи/контекста
print(lemmatizer.lemmatize("stripes", 'v'))  
print(lemmatizer.lemmatize("stripes", 'n')) 

strip
stripe


Можно получить [теги](https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk) токена предобученным алгоритмом.

In [9]:
print(nltk.pos_tag(['women']))
print(nltk.pos_tag(nltk.word_tokenize(sentence)))

[('women', 'NNS')]
[('Bad', 'JJ'), ('students', 'NNS'), ('were', 'VBD'), ('expelled', 'VBN'), ('from', 'IN'), ('the', 'DT'), ('institute', 'NN')]


In [10]:
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize Single Word with the appropriate POS tag
word = 'women'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# 3. Lemmatize a Sentence with the appropriate POS tag
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

woman
['Bad', 'student', 'be', 'expel', 'from', 'the', 'institute']


# 3 Bag of words

## 3.1 Via python

In [11]:
def vectorize(tokens, filtered_vocab):
    ''' This function takes list of words in a sentence as input 
    and returns a vector of size of filtered_vocab.It puts 0 if the 
    word is not present in tokens and count of token if present.'''
    vector=[]
    for w in filtered_vocab:
        vector.append(tokens.count(w)) # Ваш код здесь
    return vector

In [12]:
def unique(sequence):
    '''This functions returns a list in which the order remains 
    same and no item repeats.Using the set() function does not 
    preserve the original ordering'''
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))] # Ваш код здесь

In [13]:
#create a list of stopwords.You can import stopwords from nltk too
stopwords=["to","is","a"]

#list of special characters.You can use regular expressions too
special_char=[",",":"," ",";",".","?"]

#Write the sentences in the corpus,in our case, just two 
string1="Welcome to Great Learning , Now start learning"
string2="Learning is a good practice"

#convert them to lower case
# Ваш код здесь
string1=string1.lower()
string2=string2.lower()

#split the sentences into tokens
# Ваш код здесь
tokens1=string1.split()
tokens2=string2.split()
print(tokens1)
print(tokens2)

['welcome', 'to', 'great', 'learning', ',', 'now', 'start', 'learning']
['learning', 'is', 'a', 'good', 'practice']


In [14]:
#create a vocabulary list
vocab=unique(tokens1+tokens2) # Ваш код здесь
print(vocab)

['welcome', 'to', 'great', 'learning', ',', 'now', 'start', 'is', 'a', 'good', 'practice']


In [15]:
#filter the vocabulary list
filtered_vocab=[]
for w in vocab: 
    if w not in stopwords and w not in special_char: 
        filtered_vocab.append(w)
print(filtered_vocab)

['welcome', 'great', 'learning', 'now', 'start', 'good', 'practice']


In [16]:
#convert sentences into vectors
vector1=vectorize(tokens1, filtered_vocab)
print(vector1)
vector2=vectorize(tokens2, filtered_vocab)
print(vector2)

[1, 1, 2, 1, 1, 0, 0]
[0, 0, 1, 0, 0, 1, 1]


## 3.2 Via sklearn

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
sentence_1="Welcome to Great Learning , Now start learning"
sentence_2="Learning is a good practice"
 
CountVec = CountVectorizer(ngram_range=(1,1), stop_words='english')
#transform
Count_data = CountVec.fit_transform([sentence_1,sentence_2])
 
#create dataframe
cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
cv_dataframe

Unnamed: 0,good,great,learning,practice,start,welcome
0,0,1,2,0,1,1
1,1,0,1,1,0,0


# 4 N-grams

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
sentence_1="This is a good job.I will not miss it for anything"
sentence_2="This is not good at all" 
 
CountVec = CountVectorizer(ngram_range=(1,2))
#transform
Count_data = CountVec.fit_transform([sentence_1, sentence_2])
 
#create dataframe
cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
cv_dataframe

Unnamed: 0,all,anything,at,at all,for,for anything,good,good at,good job,is,...,job will,miss,miss it,not,not good,not miss,this,this is,will,will not
0,0,1,0,0,1,1,1,0,1,1,...,1,1,1,1,0,1,1,1,1,1
1,1,0,1,1,0,0,1,1,0,1,...,0,0,0,1,1,0,1,1,0,0


# 5 TF-IDF

 * **Term Frequency (tf):** gives us the frequency of the word in each document in the corpus.

## $tf(t,d)= f_{t,d}$

 * **Inveres Document Frequency (idf):** used to calculate the weight of rare words across all documents in the corpus. The words that occur rarely in the corpus have a high IDF score.

## $idf(t,D) = \log\frac{N}{|\{d \in D \space : \space t \in d\}|}$

N: total number of documents in the corpus N=|D|

$|\{d \in D \space : \space t \in d\}|$ : number of documents where the term t appears

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
sentence_1="This is a good job.I will not miss it for anything"
sentence_2="This is not good at all"
 
#define tf-idf
tf_idf_vec = TfidfVectorizer(use_idf=True, 
                        smooth_idf=False,  
                        ngram_range=(1,1))
#transform
tf_idf_data = tf_idf_vec.fit_transform([sentence_1,sentence_2])
 
#create dataframe
tf_idf_dataframe=pd.DataFrame(tf_idf_data.toarray(),columns=tf_idf_vec.get_feature_names_out())
tf_idf_dataframe

Unnamed: 0,all,anything,at,for,good,is,it,job,miss,not,this,will
0,0.0,0.367724,0.0,0.367724,0.217184,0.217184,0.367724,0.367724,0.367724,0.217184,0.217184,0.367724
1,0.542701,0.0,0.542701,0.0,0.320528,0.320528,0.0,0.0,0.0,0.320528,0.320528,0.0


# 6 Word2vec

Установим полезную утилиту Python для парсинга веб-страниц. Она поможет нам достать статью Векипедии, с которой мы будем работать.

In [39]:
#!pip install beautifulsoup4

И еще установим библиотеку для синтаксического анализа HTML страниц.

In [38]:
#!pip install lxml

In [20]:
import bs4 as bs
import urllib.request
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Safuan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Статья, с которой будем работать, конечно, про искусственный интеллект:) 

In [21]:
# Загрузим статью по ссылке
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')

# Сдесь статья еще в куче с HTML кодами
article = scraped_data.read()

# А теперь статья приняла нормальный вид
parsed_article = bs.BeautifulSoup(article,'lxml')

# Википедия хранит текстовое содержимое статьи внутри p-тегов. Вытащим текст
paragraphs = parsed_article.find_all('p')
article_text = ""
for p in paragraphs:
    article_text += p.text

In [22]:
article_text

'\nArtificial intelligence (AI) is the intelligence of machines or software, as opposed to the intelligence of living beings, primarily of humans. It is a field of study in computer science that develops and studies intelligent machines. Such machines may be called AIs.\nAI technology is widely used throughout industry, government, and science. Some high-profile applications are: advanced web search engines (e.g., Google Search), recommendation systems (used by YouTube, Amazon, and Netflix), interacting via human speech (such as Google Assistant, Siri, and Alexa), self-driving cars (e.g., Waymo), generative and creative tools (ChatGPT and AI art), and superhuman play and analysis in strategy games (such as chess and Go).[1]\nAlan Turing was the first person to conduct substantial research in the field that he called machine intelligence.[2] Artificial intelligence was founded as an academic discipline in 1956.[3] The field went through multiple cycles of optimism,[4][5] followed by per

Убедимся, что текст без HTML кодов.

In [23]:
article_text[0:300]

'\nArtificial intelligence (AI) is the intelligence of machines or software, as opposed to the intelligence of living beings, primarily of humans. It is a field of study in computer science that develops and studies intelligent machines. Such machines may be called AIs.\nAI technology is widely used th'

Супер!

Дальше сделаем [предобработку текста](https://tproger.ru/translations/regular-expression-python/):

In [24]:
# Удаляем прописные буквы
processed_article = article_text.lower()

# Удаляем все цифры, специальные символы
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)
  
# Удаляем лишние пробелы из текста
processed_article = re.sub(r'\s+', ' ', processed_article)

# Делим статью на предложения
# Хоть оно и получается одно, но это нужно, чтобы word_tokenize воспринимал слова как слова, а не буквы как слова
all_sentences = nltk.sent_tokenize(processed_article)

# Теперь разделим на слова
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Удаляем слова, не дающие никакой информации (предлоги, артикли и тд)
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

Посмотрим первые несколько слов нашего "чистого" списка.

In [25]:
all_words[0][0:5]

['artificial', 'intelligence', 'ai', 'intelligence', 'machines']

Теперь нужно установить библиотеку обработки естественного языка. С её помощью можно обрабатывать тексты, работать с векторными моделями слов, такими как Word2Vec (в нашем случае).

In [26]:
#!pip install gensim

In [27]:
from gensim.models import Word2Vec

Создадим модель Word2Vec с использованием статьи в Википедии, которую мы скопировали.

min_count=2 показывает, что в модель Word2Vec будут входить только те слова, которые встречаются в корпусе как минимум 2 раза.

In [29]:
word2vec = Word2Vec(all_words, min_count=2)

Посмотри уникальные слова, которые в статье встреачются как минимум дважды.

In [31]:
# word2vec.wv.vocab

In [32]:
word2vec.wv.get_vecattr('data', 'count')

32

In [33]:
# model.wv - свойство модели, в котором хранятся отдельные ключевые векторы
vocab = word2vec.wv.key_to_index
print(vocab)
print(len(vocab))

{'ai': 0, 'intelligence': 1, 'learning': 2, 'machine': 3, 'used': 4, 'artificial': 5, 'human': 6, 'data': 7, 'research': 8, 'use': 9, 'problems': 10, 'many': 11, 'knowledge': 12, 'networks': 13, 'neural': 14, 'may': 15, 'also': 16, 'search': 17, 'problem': 18, 'deep': 19, 'using': 20, 'researchers': 21, 'field': 22, 'including': 23, 'systems': 24, 'computer': 25, 'reasoning': 26, 'could': 27, 'would': 28, 'technology': 29, 'decision': 30, 'applications': 31, 'possible': 32, 'general': 33, 'agent': 34, 'machines': 35, 'turing': 36, 'solve': 37, 'people': 38, 'make': 39, 'based': 40, 'large': 41, 'programs': 42, 'however': 43, 'world': 44, 'called': 45, 'information': 46, 'program': 47, 'models': 48, 'risk': 49, 'system': 50, 'mind': 51, 'training': 52, 'first': 53, 'risks': 54, 'example': 55, 'early': 56, 'specific': 57, 'real': 58, 'theory': 59, 'network': 60, 'algorithms': 61, 'processing': 62, 'include': 63, 'two': 64, 'ability': 65, 'decisions': 66, 'goals': 67, 'humans': 68, 'model

Посмотрим векторное представление какого-нибудь слова

In [34]:
v1 = word2vec.wv['artificial']
print(v1)
print(len(v1))

[-1.27615221e-02  7.94245210e-03 -1.37751224e-04 -9.65321343e-03
 -8.47052131e-03 -1.04167564e-02  6.54519722e-03  1.67750809e-02
 -9.14493669e-03 -7.65495375e-03 -9.83972009e-03 -1.00525720e-02
 -5.83345117e-03 -8.20181798e-03 -2.95580900e-03 -1.41269499e-02
 -5.42640639e-03 -1.25182196e-02 -2.59664282e-03 -1.25190690e-02
 -6.16802135e-03  1.51578768e-03  9.59798321e-03  5.74179972e-03
 -1.05462922e-02 -3.50345811e-03 -2.20064260e-03 -1.50226112e-02
 -3.32895713e-03  7.24068144e-03  1.09319873e-02 -6.34557661e-03
 -3.57294152e-03 -1.07739223e-02 -2.49360339e-03  1.62150934e-02
  3.56590259e-03  2.35201602e-04  2.90052703e-04 -1.45161888e-02
  9.96037386e-03 -1.10945934e-02 -8.65617860e-03 -3.11640510e-03
  3.51230078e-03 -4.53706924e-03 -3.03987367e-03  6.14359230e-03
  3.24374647e-03  2.34050537e-03  4.76721814e-03 -1.37085225e-02
  8.15279223e-03  8.41270760e-03 -1.45753687e-02  5.55189839e-03
  1.23527413e-02 -8.65587033e-03 -1.62908249e-02 -6.25332631e-03
  1.16521167e-02  7.59392

И найдем все похожие слова. Так же увидим их индексы подобия

In [37]:
sim_words = word2vec.wv.most_similar('intelligence')
sim_words

[('ai', 0.5116602778434753),
 ('artificial', 0.45599961280822754),
 ('may', 0.4539526700973511),
 ('many', 0.4473116993904114),
 ('problem', 0.44670185446739197),
 ('networks', 0.4371168613433838),
 ('mind', 0.4301697015762329),
 ('position', 0.4225851595401764),
 ('could', 0.41936811804771423),
 ('computing', 0.41504523158073425)]

# 7 [Word2vec with PyTorch](https://towardsdatascience.com/word2vec-with-pytorch-implementing-original-paper-2cd7040120b0)
[paper](https://arxiv.org/abs/1301.3781)

## prepare corpus

In [38]:
from torchtext.data.utils import get_tokenizer

In [39]:
def get_english_tokenizer():
    """
    Documentation:
    https://pytorch.org/text/stable/_modules/torchtext/data/utils.html#get_tokenizer
    """
    tokenizer = get_tokenizer("basic_english", language="en")
    return tokenizer

In [40]:
tokenizer = get_english_tokenizer()

In [41]:
tokenizer('hello')

['hello']

In [42]:
from torchtext.datasets import WikiText2, WikiText103, PennTreebank
from torchtext.data import to_map_style_dataset
import torchdata
from torch.utils.data import DataLoader
from functools import partial 
import torch

In [43]:
def get_data_iterator(ds_name, ds_type, data_dir):
    data_iter = PennTreebank(root=data_dir, split=(ds_type))
    data_iter = to_map_style_dataset(data_iter)
    return data_iter

In [44]:
data_iterator = get_data_iterator(ds_name=None,
                                  ds_type='train', 
                                  data_dir="../data/",
                                 )

In [45]:
for i in data_iterator[0:5]:
    print(i,'\n')

aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 

pierre <unk> N years old will join the board as a nonexecutive director nov. N 

mr. <unk> is chairman of <unk> n.v. the dutch publishing group 

rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate 

a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported 



## [build_vocab_from_iterator](https://pytorch.org/text/stable/vocab.html#build-vocab-from-iterator), also showed how to build vocab from txt file

In [54]:
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
import torch 

MIN_WORD_FREQUENCY = 50

CBOW_N_WORDS = 4 
MAX_SEQUENCE_LENGTH = 256 
EMBED_DIMENSION = 300 
EMBED_MAX_NORM = 1 

## [Dataset styles](https://glaringlee.github.io/data.html#iterable-style-datasets)

In [55]:
def build_vocab(data_iter, tokenizer):
    vocab = build_vocab_from_iterator(
        map(tokenizer, data_iter),
        specials=["<unk>"],
        min_freq=MIN_WORD_FREQUENCY,
        #max_tokens=10,
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab

In [56]:
vocab = build_vocab(data_iterator, tokenizer)

In [57]:
vocab['data']

453

In [58]:
def collate_cbow(batch, text_pipeline):
    batch_input, batch_output = [], []
    for text in batch:
        text_tokens_ids = text_pipeline(text)
        #print(text_tokens_ids)
        if len(text_tokens_ids) < CBOW_N_WORDS * 2 + 1:
            continue

        if MAX_SEQUENCE_LENGTH:
            text_tokens_ids = text_tokens_ids[:MAX_SEQUENCE_LENGTH]

        for idx in range(len(text_tokens_ids) - CBOW_N_WORDS * 2):
            token_id_sequence = text_tokens_ids[idx : (idx + CBOW_N_WORDS * 2 + 1)]
            output = token_id_sequence.pop(CBOW_N_WORDS)
            input_ = token_id_sequence
            batch_input.append(input_)
            batch_output.append(output)

    batch_input = torch.tensor(batch_input, dtype=torch.long)
    batch_output = torch.tensor(batch_output, dtype=torch.long)
    return batch_input, batch_output

In [59]:
data_iterator[0:5]

['aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter',
 'pierre <unk> N years old will join the board as a nonexecutive director nov. N',
 'mr. <unk> is chairman of <unk> n.v. the dutch publishing group',
 'rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate',
 'a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported']

In [60]:
collate_cbow(data_iterator[0:5], lambda x: vocab(tokenizer(x)))[0][35]

tensor([395,   7, 339, 141,   0, 659,   0, 956])

In [61]:
collate_cbow(data_iterator[0:5], lambda x: vocab(tokenizer(x)))[1][35]

tensor(3)

In [62]:
data_iterator[0:5][0]

'aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter'

In [63]:
(lambda x: vocab(tokenizer(x)))(data_iterator[0:5][0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [64]:
def get_dataloader_and_vocab(
    model_name, ds_name, ds_type, data_dir, batch_size, shuffle, vocab=None):

    data_iter = get_data_iterator(ds_name, ds_type, data_dir)
    tokenizer = get_english_tokenizer()

    if not vocab:
        vocab = build_vocab(data_iter, tokenizer)
        
    text_pipeline = lambda x: vocab(tokenizer(x))

    if model_name == "cbow":
        collate_fn = collate_cbow
    elif model_name == "skipgram":
        collate_fn = collate_skipgram
    else:
        raise ValueError("Choose model from: cbow, skipgram")

    dataloader = DataLoader(
        data_iter,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=partial(collate_fn, text_pipeline=text_pipeline))
    
    return dataloader, vocab

In [104]:
train_dataloader, vocab = get_dataloader_and_vocab(
    model_name="cbow",
    ds_name=None,
    ds_type='train',
    data_dir="./",
    batch_size=10,
    shuffle=True,       
)

val_dataloader = get_dataloader_and_vocab(
    model_name="cbow",
    ds_name=None,
    ds_type='valid',
    data_dir="./",
    batch_size=10,
    shuffle=False,
    vocab=vocab
)[0]

In [66]:
for i in train_dataloader:
    print(i[0].shape)

torch.Size([143, 8])
torch.Size([172, 8])
torch.Size([180, 8])
torch.Size([125, 8])
torch.Size([102, 8])
torch.Size([98, 8])
torch.Size([152, 8])
torch.Size([112, 8])
torch.Size([135, 8])
torch.Size([133, 8])
torch.Size([165, 8])
torch.Size([158, 8])
torch.Size([119, 8])
torch.Size([109, 8])
torch.Size([112, 8])
torch.Size([99, 8])
torch.Size([137, 8])
torch.Size([140, 8])
torch.Size([60, 8])
torch.Size([151, 8])
torch.Size([132, 8])
torch.Size([85, 8])
torch.Size([176, 8])
torch.Size([123, 8])
torch.Size([104, 8])
torch.Size([112, 8])
torch.Size([93, 8])
torch.Size([154, 8])
torch.Size([105, 8])
torch.Size([144, 8])
torch.Size([152, 8])
torch.Size([94, 8])
torch.Size([146, 8])
torch.Size([142, 8])
torch.Size([134, 8])
torch.Size([156, 8])
torch.Size([206, 8])
torch.Size([142, 8])
torch.Size([166, 8])
torch.Size([153, 8])
torch.Size([166, 8])
torch.Size([144, 8])
torch.Size([126, 8])
torch.Size([203, 8])
torch.Size([105, 8])
torch.Size([203, 8])
torch.Size([153, 8])
torch.Size([141, 8]

torch.Size([169, 8])
torch.Size([113, 8])
torch.Size([115, 8])
torch.Size([135, 8])
torch.Size([112, 8])
torch.Size([198, 8])
torch.Size([171, 8])
torch.Size([108, 8])
torch.Size([127, 8])
torch.Size([212, 8])
torch.Size([138, 8])
torch.Size([130, 8])
torch.Size([112, 8])
torch.Size([99, 8])
torch.Size([178, 8])
torch.Size([181, 8])
torch.Size([128, 8])
torch.Size([114, 8])
torch.Size([157, 8])
torch.Size([148, 8])
torch.Size([110, 8])
torch.Size([102, 8])
torch.Size([130, 8])
torch.Size([174, 8])
torch.Size([127, 8])
torch.Size([129, 8])
torch.Size([92, 8])
torch.Size([133, 8])
torch.Size([119, 8])
torch.Size([136, 8])
torch.Size([132, 8])
torch.Size([84, 8])
torch.Size([150, 8])
torch.Size([144, 8])
torch.Size([170, 8])
torch.Size([112, 8])
torch.Size([174, 8])
torch.Size([150, 8])
torch.Size([182, 8])
torch.Size([146, 8])
torch.Size([161, 8])
torch.Size([161, 8])
torch.Size([100, 8])
torch.Size([177, 8])
torch.Size([139, 8])
torch.Size([130, 8])
torch.Size([120, 8])
torch.Size([198,

torch.Size([124, 8])
torch.Size([124, 8])
torch.Size([60, 8])
torch.Size([160, 8])
torch.Size([177, 8])
torch.Size([105, 8])
torch.Size([151, 8])
torch.Size([142, 8])
torch.Size([118, 8])
torch.Size([163, 8])
torch.Size([186, 8])
torch.Size([117, 8])
torch.Size([113, 8])
torch.Size([95, 8])
torch.Size([77, 8])
torch.Size([121, 8])
torch.Size([152, 8])
torch.Size([171, 8])
torch.Size([129, 8])
torch.Size([121, 8])
torch.Size([62, 8])
torch.Size([129, 8])
torch.Size([135, 8])
torch.Size([165, 8])
torch.Size([113, 8])
torch.Size([143, 8])
torch.Size([123, 8])
torch.Size([111, 8])
torch.Size([137, 8])
torch.Size([115, 8])
torch.Size([116, 8])
torch.Size([160, 8])
torch.Size([138, 8])
torch.Size([109, 8])
torch.Size([130, 8])
torch.Size([177, 8])
torch.Size([167, 8])
torch.Size([150, 8])
torch.Size([113, 8])
torch.Size([119, 8])
torch.Size([226, 8])
torch.Size([135, 8])
torch.Size([113, 8])
torch.Size([94, 8])
torch.Size([134, 8])
torch.Size([87, 8])
torch.Size([129, 8])
torch.Size([89, 8])

torch.Size([166, 8])
torch.Size([161, 8])
torch.Size([177, 8])
torch.Size([110, 8])
torch.Size([147, 8])
torch.Size([150, 8])
torch.Size([111, 8])
torch.Size([168, 8])
torch.Size([165, 8])
torch.Size([218, 8])
torch.Size([160, 8])
torch.Size([191, 8])
torch.Size([199, 8])
torch.Size([97, 8])
torch.Size([164, 8])
torch.Size([168, 8])
torch.Size([181, 8])
torch.Size([106, 8])
torch.Size([125, 8])
torch.Size([92, 8])
torch.Size([144, 8])
torch.Size([179, 8])
torch.Size([117, 8])
torch.Size([130, 8])
torch.Size([147, 8])
torch.Size([118, 8])
torch.Size([125, 8])
torch.Size([139, 8])
torch.Size([166, 8])
torch.Size([157, 8])
torch.Size([94, 8])
torch.Size([155, 8])
torch.Size([143, 8])
torch.Size([130, 8])
torch.Size([143, 8])
torch.Size([156, 8])
torch.Size([120, 8])
torch.Size([126, 8])
torch.Size([121, 8])
torch.Size([121, 8])
torch.Size([86, 8])
torch.Size([167, 8])
torch.Size([147, 8])
torch.Size([121, 8])
torch.Size([90, 8])
torch.Size([142, 8])
torch.Size([95, 8])
torch.Size([101, 8]

torch.Size([137, 8])
torch.Size([162, 8])
torch.Size([140, 8])
torch.Size([128, 8])
torch.Size([107, 8])
torch.Size([103, 8])
torch.Size([153, 8])
torch.Size([165, 8])
torch.Size([146, 8])
torch.Size([124, 8])
torch.Size([152, 8])
torch.Size([166, 8])
torch.Size([134, 8])
torch.Size([149, 8])
torch.Size([162, 8])
torch.Size([97, 8])
torch.Size([175, 8])
torch.Size([174, 8])
torch.Size([168, 8])
torch.Size([195, 8])
torch.Size([131, 8])
torch.Size([192, 8])
torch.Size([113, 8])
torch.Size([139, 8])
torch.Size([125, 8])
torch.Size([176, 8])
torch.Size([198, 8])
torch.Size([188, 8])
torch.Size([157, 8])
torch.Size([100, 8])
torch.Size([153, 8])
torch.Size([151, 8])
torch.Size([208, 8])
torch.Size([165, 8])
torch.Size([135, 8])
torch.Size([155, 8])
torch.Size([107, 8])
torch.Size([94, 8])
torch.Size([173, 8])
torch.Size([166, 8])
torch.Size([137, 8])
torch.Size([138, 8])
torch.Size([116, 8])
torch.Size([164, 8])
torch.Size([128, 8])
torch.Size([147, 8])
torch.Size([119, 8])
torch.Size([92,

torch.Size([177, 8])
torch.Size([134, 8])
torch.Size([145, 8])
torch.Size([159, 8])
torch.Size([137, 8])
torch.Size([117, 8])
torch.Size([121, 8])
torch.Size([146, 8])
torch.Size([144, 8])
torch.Size([84, 8])
torch.Size([136, 8])
torch.Size([177, 8])
torch.Size([138, 8])
torch.Size([157, 8])
torch.Size([105, 8])
torch.Size([204, 8])
torch.Size([181, 8])
torch.Size([129, 8])
torch.Size([124, 8])
torch.Size([132, 8])
torch.Size([88, 8])
torch.Size([155, 8])
torch.Size([117, 8])
torch.Size([123, 8])
torch.Size([131, 8])
torch.Size([147, 8])
torch.Size([178, 8])
torch.Size([111, 8])
torch.Size([123, 8])
torch.Size([131, 8])
torch.Size([134, 8])
torch.Size([159, 8])
torch.Size([143, 8])
torch.Size([152, 8])
torch.Size([194, 8])
torch.Size([120, 8])
torch.Size([166, 8])
torch.Size([208, 8])
torch.Size([192, 8])
torch.Size([134, 8])
torch.Size([167, 8])
torch.Size([99, 8])
torch.Size([116, 8])
torch.Size([139, 8])
torch.Size([91, 8])
torch.Size([106, 8])
torch.Size([165, 8])
torch.Size([217, 

torch.Size([165, 8])
torch.Size([139, 8])
torch.Size([171, 8])
torch.Size([158, 8])
torch.Size([94, 8])
torch.Size([193, 8])
torch.Size([104, 8])
torch.Size([183, 8])
torch.Size([142, 8])
torch.Size([171, 8])
torch.Size([116, 8])
torch.Size([83, 8])
torch.Size([137, 8])
torch.Size([55, 8])
torch.Size([125, 8])
torch.Size([214, 8])
torch.Size([118, 8])
torch.Size([169, 8])
torch.Size([157, 8])
torch.Size([138, 8])
torch.Size([182, 8])
torch.Size([108, 8])
torch.Size([127, 8])
torch.Size([210, 8])
torch.Size([105, 8])
torch.Size([169, 8])
torch.Size([110, 8])
torch.Size([168, 8])
torch.Size([95, 8])
torch.Size([117, 8])
torch.Size([129, 8])
torch.Size([217, 8])
torch.Size([126, 8])
torch.Size([121, 8])
torch.Size([159, 8])
torch.Size([205, 8])
torch.Size([105, 8])
torch.Size([90, 8])
torch.Size([181, 8])
torch.Size([145, 8])
torch.Size([114, 8])
torch.Size([136, 8])
torch.Size([140, 8])
torch.Size([145, 8])
torch.Size([103, 8])
torch.Size([132, 8])
torch.Size([214, 8])
torch.Size([119, 8

torch.Size([127, 8])
torch.Size([161, 8])
torch.Size([142, 8])
torch.Size([153, 8])
torch.Size([118, 8])
torch.Size([111, 8])
torch.Size([125, 8])
torch.Size([96, 8])
torch.Size([117, 8])
torch.Size([130, 8])
torch.Size([125, 8])
torch.Size([108, 8])
torch.Size([177, 8])
torch.Size([115, 8])
torch.Size([137, 8])
torch.Size([166, 8])
torch.Size([156, 8])
torch.Size([81, 8])
torch.Size([144, 8])
torch.Size([129, 8])
torch.Size([138, 8])
torch.Size([175, 8])
torch.Size([99, 8])


## prepare model. [nn.Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)

In [67]:
emb = nn.Embedding(
      num_embeddings=1980,
      embedding_dim=EMBED_DIMENSION,
      max_norm=EMBED_MAX_NORM)

In [68]:
emb(torch.randint(0, 10, (1483,8))).shape

torch.Size([1483, 8, 300])

In [69]:
list(emb.parameters())[0].shape

torch.Size([1980, 300])

In [70]:
list(nn.Linear(in_features=1980, out_features=300).parameters())[0].shape

torch.Size([300, 1980])

In [71]:
nn.Linear(in_features=300, out_features=1980)(torch.rand([1483, 8, 300])).shape

torch.Size([1483, 8, 1980])

In [72]:
class CBOW_Model(nn.Module):
    def __init__(self, vocab_size: int):
        super(CBOW_Model, self).__init__()
        self.embeddings = nn.Embedding(
          num_embeddings=vocab_size,
          embedding_dim=EMBED_DIMENSION,
          max_norm=EMBED_MAX_NORM)

        self.linear = nn.Linear(
          in_features=EMBED_DIMENSION,
          out_features=vocab_size)
    
    def forward(self, inputs_):
        x = self.embeddings(inputs_)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x
CBOW_model = CBOW_Model(len(vocab))

# 8 Inference

In [73]:
import sys

from sklearn.manifold import TSNE
import plotly.graph_objects as go

#sys.path.append("../utils_word2vec//")
sys.path.append("../")
import utils_word2vec
from utils_word2vec.model import CBOW_Model

In [83]:
folder = "weights/cbow_PennTreebank"
device = 'cpu' #torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_weights = torch.load(f"../{folder}/last.pt", map_location=device)['model_state_dict']
vocab = torch.load(f"../{folder}/vocab.pt")

In [84]:
torch.load(f"../{folder}/last.pt", map_location=device)

{'epoch': 29,
 'model_state_dict': OrderedDict([('embeddings.weight',
               tensor([[ 2.2117e-02,  3.4843e-02,  5.4009e-02,  ...,  9.6949e-02,
                        -2.3338e-01,  2.5701e-02],
                       [-4.3491e-01, -4.2265e-01, -4.5659e-01,  ..., -2.4201e-02,
                         8.4041e-02,  4.4036e-02],
                       [-2.2154e-02, -2.6811e-02, -3.7039e-02,  ...,  7.5641e-02,
                         1.7327e-02, -2.7635e-01],
                       ...,
                       [ 7.4347e-02,  7.0348e-02,  7.3009e-02,  ..., -2.9095e-03,
                        -1.1632e-05, -5.8459e-02],
                       [-2.1222e-03,  3.4748e-04,  4.0864e-02,  ...,  2.2888e-03,
                        -7.4401e-02, -4.6504e-04],
                       [ 6.4868e-02,  6.5909e-02,  5.0916e-02,  ...,  2.0050e-02,
                        -1.2801e-01, -5.3838e-02]])),
              ('linear.weight',
               tensor([[-1.5808e-01, -8.4531e-01, -2.6967e-01,  ..., 

In [85]:
len(vocab)

1980

In [86]:
model = CBOW_Model(len(vocab))
model.load_state_dict(model_weights)

<All keys matched successfully>

In [87]:
# embedding from first model layer
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization, like a vektor
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)

norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

(1980, 300)

# Visualization with t-SNE

In [88]:
# get embeddings
embeddings_df = pd.DataFrame(embeddings)

# t-SNE transform
tsne = TSNE(n_components=2)
embeddings_df_trans = tsne.fit_transform(embeddings_df)
embeddings_df_trans = pd.DataFrame(embeddings_df_trans)

# get token order
embeddings_df_trans.index = vocab.get_itos()

# if token is a number
is_numeric = embeddings_df_trans.index.str.isnumeric()

In [89]:
color = np.where(is_numeric, "green", "black")
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=embeddings_df_trans[0],
        y=embeddings_df_trans[1],
        mode="text",
        text=embeddings_df_trans.index,
        textposition="middle center",
        textfont=dict(color=color),
    )
)
fig.show()
fig.write_html("../word2vec_visualization.html")

# Find Similar Words

In [90]:
def get_top_similar(word: str, topN: int = 10):
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings_norm[word_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : topN + 1]

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = vocab.lookup_token(sim_word_id)
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

In [91]:
for word, sim in get_top_similar("germany").items():
    print("{}: {:.3f}".format(word, sim))

german: 0.700
coast: 0.482
britain: 0.384
bay: 0.358
change: 0.294
countries: 0.281
golden: 0.270
communist: 0.264
days: 0.258
words: 0.256


# Vector Equations

In [92]:
emb1 = embeddings[vocab["king"]]
emb2 = embeddings[vocab["man"]]
emb3 = embeddings[vocab["woman"]]

emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** (1 / 2)
emb4 = emb4 / emb4_norm

emb4 = np.reshape(emb4, (len(emb4), 1))
dists = np.matmul(embeddings_norm, emb4).flatten()

top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print("{}: {:.3f}".format(vocab.lookup_token(word_id), dists[word_id]))

<unk>: 0.624
woman: 0.482
authorities: 0.258
scandal: 0.239
particular: 0.237


# Pretrained Embedding in new model

In [93]:
model_weights

OrderedDict([('embeddings.weight',
              tensor([[ 2.2117e-02,  3.4843e-02,  5.4009e-02,  ...,  9.6949e-02,
                       -2.3338e-01,  2.5701e-02],
                      [-4.3491e-01, -4.2265e-01, -4.5659e-01,  ..., -2.4201e-02,
                        8.4041e-02,  4.4036e-02],
                      [-2.2154e-02, -2.6811e-02, -3.7039e-02,  ...,  7.5641e-02,
                        1.7327e-02, -2.7635e-01],
                      ...,
                      [ 7.4347e-02,  7.0348e-02,  7.3009e-02,  ..., -2.9095e-03,
                       -1.1632e-05, -5.8459e-02],
                      [-2.1222e-03,  3.4748e-04,  4.0864e-02,  ...,  2.2888e-03,
                       -7.4401e-02, -4.6504e-04],
                      [ 6.4868e-02,  6.5909e-02,  5.0916e-02,  ...,  2.0050e-02,
                       -1.2801e-01, -5.3838e-02]])),
             ('linear.weight',
              tensor([[-1.5808e-01, -8.4531e-01, -2.6967e-01,  ..., -6.4379e-02,
                       -1.0722e+00, -

In [94]:
embedding = nn.Embedding.from_pretrained(model_weights['embeddings.weight'].to('cpu'))
# Get embeddings for index 1
input = torch.LongTensor([1, 131, 1, 1421])
embedding(input).shape

torch.Size([4, 300])

In [95]:
class TextRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, n_layers=1):
        super(TextRNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding.from_pretrained(model_weights['embeddings.weight'].to(device))
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, self.n_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.hidden_size, self.input_size)
        
    def forward(self, x, hidden):
        x = self.embedding(x)#.squeeze(2)
        out, (ht1, ct1) = self.lstm(x, hidden)
        out = self.dropout(out)
        x = self.fc(out)
        return x, ht1, ct1
    
    def init_hidden(self, batch_size=1):
        return (torch.rand(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device),
               torch.rand(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device))

In [96]:
model_weights['embeddings.weight'].shape[0]

1980

In [97]:
model = TextRNN(
    input_size=model_weights['embeddings.weight'].shape[0], # vocab size
    hidden_size=2048,
    embedding_size=model_weights['embeddings.weight'].shape[1],
    n_layers=1
).to(device)

In [98]:
nn.Embedding.from_pretrained(model_weights['embeddings.weight'].to(device))(torch.rand(25, 10).type(torch.LongTensor).to(device)).shape

torch.Size([25, 10, 300])

In [99]:
hidden = model.init_hidden(batch_size=10)
x_, ht1_, ct1_ = model(torch.rand(25, 10).type(torch.LongTensor).to(device), hidden)

In [100]:
hidden[0]

tensor([[[0.4199, 0.4613, 0.0229,  ..., 0.9226, 0.8242, 0.8559],
         [0.3335, 0.6394, 0.3707,  ..., 0.1336, 0.6542, 0.8546],
         [0.8112, 0.1724, 0.1724,  ..., 0.2210, 0.0894, 0.4193],
         ...,
         [0.1283, 0.4750, 0.5972,  ..., 0.7048, 0.1682, 0.9665],
         [0.1306, 0.2690, 0.5901,  ..., 0.1916, 0.6004, 0.8933],
         [0.6346, 0.1849, 0.2876,  ..., 0.8644, 0.8094, 0.8827]]],
       requires_grad=True)

In [101]:
x_.shape

torch.Size([25, 10, 1980])

In [102]:
ht1_.shape

torch.Size([1, 10, 2048])

In [103]:
ct1_.shape

torch.Size([1, 10, 2048])

## [Saving in Google Collab](https://stackoverflow.com/questions/64808087/how-do-i-save-files-from-google-colab-to-google-drive)