# Предобработка текста

In [1]:
!pip install nltk



In [2]:
from __future__ import print_function
import numpy as np
import random
import sys
import io
import os
import re

# 1 Stemming, (removing and replacing suffixes to get to the root of the word) [WordNet](https://wordnet.princeton.edu/)

In [3]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [4]:
words= ["learn", "learning", "learned", "learns"]
ps =  PorterStemmer()
for w in words:
    rootWord=ps.stem(w)
    print(rootWord)

learn
learn
learn
learn


In [5]:
sentence="Dear students, You should try very hard to master machine learning!"
words = nltk.word_tokenize(sentence)
ps = PorterStemmer()
word_list = []
for w in words:
    rootWord=ps.stem(w)
    word_list.append(rootWord)
print("Список слов из предложения:\n", word_list)

Список слов из предложения:
 ['dear', 'student', ',', 'you', 'should', 'tri', 'veri', 'hard', 'to', 'master', 'machin', 'learn', '!']


# 2 Lemmatization

In [6]:
from nltk.stem import WordNetLemmatizer

In [7]:
# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()
# Lemmatize Single Word
print(lemmatizer.lemmatize("classes"))
print(lemmatizer.lemmatize("women"))
print(lemmatizer.lemmatize("crying"))

class
woman
cry


In [8]:
# Напишем какое-нибудь предложение
sentence = "Bad students were expelled from the institute"

# Сделаем его токенизацию, то есть разобьем на слова
word_list = nltk.word_tokenize(sentence)
print("Список слов из предложения:\n", word_list)

# Сделаем леммитизацию каждого слова
lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list]) #Ваш код здесь
print("Предложение из лемматизированных слов:\n", lemmatized_output)

Список слов из предложения:
 ['Bad', 'students', 'were', 'expelled', 'from', 'the', 'institute']
Предложение из лемматизированных слов:
 Bad student were expelled from the institute


In [9]:
# Иногда одно и то же слово может иметь несколько лемм в зависимости от значения/части речи/контекста
print(lemmatizer.lemmatize("stripes", 'v'))
print(lemmatizer.lemmatize("stripes", 'n'))

strip
stripe


Можно получить [теги](https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk) токена предобученным алгоритмом.

In [10]:
print(nltk.pos_tag(['women']))
print(nltk.pos_tag(nltk.word_tokenize(sentence)))

[('women', 'NNS')]
[('Bad', 'JJ'), ('students', 'NNS'), ('were', 'VBD'), ('expelled', 'VBN'), ('from', 'IN'), ('the', 'DT'), ('institute', 'NN')]


In [11]:
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize Single Word with the appropriate POS tag
word = 'women'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# 3. Lemmatize a Sentence with the appropriate POS tag
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

woman
['Bad', 'student', 'be', 'expel', 'from', 'the', 'institute']


# 3 Bag of words

## 3.1 Via python

In [12]:
def vectorize(tokens, filtered_vocab):
    ''' This function takes list of words in a sentence as input
    and returns a vector of size of filtered_vocab.It puts 0 if the
    word is not present in tokens and count of token if present.'''
    vector=[]
    for w in filtered_vocab:
        vector.append(tokens.count(w)) # Ваш код здесь
    return vector

In [13]:
def unique(sequence):
    '''This functions returns a list in which the order remains
    same and no item repeats.Using the set() function does not
    preserve the original ordering'''
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))] # Ваш код здесь

In [14]:
#create a list of stopwords.You can import stopwords from nltk too
stopwords=["to","is","a"]

#list of special characters.You can use regular expressions too
special_char=[",",":"," ",";",".","?"]

#Write the sentences in the corpus,in our case, just two
string1="Welcome to Great Learning , Now start learning"
string2="Learning is a good practice"

#convert them to lower case
# Ваш код здесь
string1=string1.lower()
string2=string2.lower()

#split the sentences into tokens
# Ваш код здесь
tokens1=string1.split()
tokens2=string2.split()
print(tokens1)
print(tokens2)

['welcome', 'to', 'great', 'learning', ',', 'now', 'start', 'learning']
['learning', 'is', 'a', 'good', 'practice']


In [15]:
#create a vocabulary list
vocab=unique(tokens1+tokens2) # Ваш код здесь
print(vocab)

['welcome', 'to', 'great', 'learning', ',', 'now', 'start', 'is', 'a', 'good', 'practice']


In [16]:
#filter the vocabulary list
filtered_vocab=[]
for w in vocab:
    if w not in stopwords and w not in special_char:
        filtered_vocab.append(w)
print(filtered_vocab)

['welcome', 'great', 'learning', 'now', 'start', 'good', 'practice']


In [17]:
#convert sentences into vectors
vector1=vectorize(tokens1, filtered_vocab)
print(vector1)
vector2=vectorize(tokens2, filtered_vocab)
print(vector2)

[1, 1, 2, 1, 1, 0, 0]
[0, 0, 1, 0, 0, 1, 1]


## 3.2 Via sklearn

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

sentence_1="Welcome to Great Learning , Now start learning"
sentence_2="Learning is a good practice"

CountVec = CountVectorizer(ngram_range=(1,1), stop_words='english')
#transform
Count_data = CountVec.fit_transform([sentence_1,sentence_2])

#create dataframe
cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
cv_dataframe

Unnamed: 0,good,great,learning,practice,start,welcome
0,0,1,2,0,1,1
1,1,0,1,1,0,0


# 4 N-grams

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

sentence_1="This is a good job.I will not miss it for anything"
sentence_2="This is not good at all"

CountVec = CountVectorizer(ngram_range=(1,2))
#transform
Count_data = CountVec.fit_transform([sentence_1, sentence_2])

#create dataframe
cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
cv_dataframe

Unnamed: 0,all,anything,at,at all,for,for anything,good,good at,good job,is,...,job will,miss,miss it,not,not good,not miss,this,this is,will,will not
0,0,1,0,0,1,1,1,0,1,1,...,1,1,1,1,0,1,1,1,1,1
1,1,0,1,1,0,0,1,1,0,1,...,0,0,0,1,1,0,1,1,0,0


# 5 TF-IDF

 * **Term Frequency (tf):** gives us the frequency of the word in each document in the corpus.

## $tf(t,d)= f_{t,d}$

 * **Inveres Document Frequency (idf):** used to calculate the weight of rare words across all documents in the corpus. The words that occur rarely in the corpus have a high IDF score.

## $idf(t,D) = \log\frac{N}{|\{d \in D \space : \space t \in d\}|}$

N: total number of documents in the corpus N=|D|

$|\{d \in D \space : \space t \in d\}|$ : number of documents where the term t appears

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

sentence_1="This is a good job.I will not miss it for anything"
sentence_2="This is not good at all"

#define tf-idf
tf_idf_vec = TfidfVectorizer(use_idf=True,
                        smooth_idf=False,
                        ngram_range=(1,1))
#transform
tf_idf_data = tf_idf_vec.fit_transform([sentence_1,sentence_2])

#create dataframe
tf_idf_dataframe=pd.DataFrame(tf_idf_data.toarray(),columns=tf_idf_vec.get_feature_names_out())
tf_idf_dataframe

Unnamed: 0,all,anything,at,for,good,is,it,job,miss,not,this,will
0,0.0,0.367724,0.0,0.367724,0.217184,0.217184,0.367724,0.367724,0.367724,0.217184,0.217184,0.367724
1,0.542701,0.0,0.542701,0.0,0.320528,0.320528,0.0,0.0,0.0,0.320528,0.320528,0.0


# 6 Word2vec

Установим полезную утилиту Python для парсинга веб-страниц. Она поможет нам достать статью Векипедии, с которой мы будем работать.

In [21]:
!pip install beautifulsoup4



И еще установим библиотеку для синтаксического анализа HTML страниц.

In [22]:
!pip install lxml



In [23]:
import bs4 as bs
import urllib.request
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Статья, с которой будем работать, конечно, про искусственный интеллект:)

In [24]:
# Загрузим статью по ссылке
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')

# Сдесь статья еще в куче с HTML кодами
article = scraped_data.read()

# А теперь статья приняла нормальный вид
parsed_article = bs.BeautifulSoup(article, 'lxml')

# Википедия хранит текстовое содержимое статьи внутри p-тегов. Вытащим текст
paragraphs = parsed_article.find_all('p')
article_text = ""
for p in paragraphs:
    article_text += p.text

In [25]:
article_text

'\nArtificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particularly computer systems. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.[1] Such machines may be called AIs.\nHigh-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go). However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being called AI because once something becomes useful enough and commo

Убедимся, что текст без HTML кодов.

In [26]:
article_text[0:300]

'\nArtificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particularly computer systems. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligenc'

Супер!

Дальше сделаем [предобработку текста](https://tproger.ru/translations/regular-expression-python/):

In [27]:
# Удаляем прописные буквы
processed_article = article_text.lower()

# Удаляем все цифры, специальные символы
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)

# Удаляем лишние пробелы из текста
processed_article = re.sub(r'\s+', ' ', processed_article)

# Делим статью на предложения
# Хоть оно и получается одно, но это нужно, чтобы word_tokenize воспринимал слова как слова, а не буквы как слова
all_sentences = nltk.sent_tokenize(processed_article)

# Теперь разделим на слова
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Удаляем слова, не дающие никакой информации (предлоги, артикли и тд)
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

Посмотрим первые несколько слов нашего "чистого" списка.

In [28]:
all_words[0][0:5]

['artificial', 'intelligence', 'ai', 'broadest', 'sense']

Теперь нужно установить библиотеку обработки естественного языка. С её помощью можно обрабатывать тексты, работать с векторными моделями слов, такими как Word2Vec (в нашем случае).

In [29]:
!pip install gensim



In [30]:
from gensim.models import Word2Vec

Создадим модель Word2Vec с использованием статьи в Википедии, которую мы скопировали.

min_count=2 показывает, что в модель Word2Vec будут входить только те слова, которые встречаются в корпусе как минимум 2 раза.

In [31]:
word2vec = Word2Vec(all_words, min_count=2)

Посмотри уникальные слова, которые в статье встреачются как минимум дважды.

In [32]:
word2vec.wv.get_vecattr('data', 'count')

56

In [33]:
# model.wv - свойство модели, в котором хранятся отдельные ключевые векторы
vocab = word2vec.wv.key_to_index
print(vocab)
print(len(vocab))

{'ai': 0, 'intelligence': 1, 'learning': 2, 'used': 3, 'data': 4, 'artificial': 5, 'machine': 6, 'human': 7, 'research': 8, 'use': 9, 'problems': 10, 'may': 11, 'many': 12, 'models': 13, 'power': 14, 'problem': 15, 'networks': 16, 'search': 17, 'knowledge': 18, 'also': 19, 'neural': 20, 'reasoning': 21, 'large': 22, 'systems': 23, 'deep': 24, 'applications': 25, 'including': 26, 'field': 27, 'researchers': 28, 'make': 29, 'nuclear': 30, 'language': 31, 'would': 32, 'generative': 33, 'computer': 34, 'machines': 35, 'google': 36, 'world': 37, 'logic': 38, 'developed': 39, 'us': 40, 'include': 41, 'called': 42, 'using': 43, 'decision': 44, 'agents': 45, 'people': 46, 'based': 47, 'solve': 48, 'information': 49, 'program': 50, 'general': 51, 'agent': 52, 'e': 53, 'goals': 54, 'programs': 55, 'however': 56, 'g': 57, 'companies': 58, 'possible': 59, 'could': 60, 'risks': 61, 'tools': 62, 'system': 63, 'first': 64, 'algorithms': 65, 'like': 66, 'u': 67, 'example': 68, 'early': 69, 'trained': 

Посмотрим векторное представление какого-нибудь слова

In [34]:
v1 = word2vec.wv['artificial']
print(v1)
print(len(v1))

[-0.01320602  0.01233964  0.00139635 -0.0074929  -0.00785062 -0.01714983
  0.00713619  0.02017214 -0.01170926 -0.00837566 -0.01211101 -0.01311092
 -0.00900075 -0.00747505 -0.00091838 -0.0146995  -0.00674595 -0.01471283
 -0.00188849 -0.01638971 -0.00308868  0.00531709  0.01062605  0.00219037
 -0.00798706 -0.00395909 -0.00454054 -0.01467931 -0.00413551  0.00728558
  0.01636832 -0.00619152 -0.00086232 -0.00934566 -0.00587021  0.02112619
  0.00594579 -0.00292679 -0.0004049  -0.01984783  0.00933695 -0.01200381
 -0.0068713  -0.00343285  0.00522817 -0.00784139 -0.0047174   0.00490179
  0.00517594  0.00301795  0.00567244 -0.01492036  0.00616187  0.00790542
 -0.01618155  0.00843499  0.01336621 -0.01014293 -0.0129839  -0.00634376
  0.01363927  0.00174839  0.00570953 -0.01238816 -0.00487577  0.00573421
  0.00288097  0.00806034 -0.01774592 -0.00084635 -0.00366637  0.00953168
  0.01384898  0.00049017  0.01075148  0.01190133 -0.01124797  0.00100717
 -0.0061954   0.00408266  0.00487698 -0.00666015 -0

И найдем все похожие слова. Так же увидим их индексы подобия

In [35]:
sim_words = word2vec.wv.most_similar('intelligence')
sim_words

[('ai', 0.717522919178009),
 ('may', 0.621457040309906),
 ('machine', 0.6090330481529236),
 ('artificial', 0.6074098944664001),
 ('research', 0.6030327081680298),
 ('used', 0.5975072979927063),
 ('problem', 0.5948125720024109),
 ('field', 0.5941661596298218),
 ('knowledge', 0.588356077671051),
 ('large', 0.583982527256012)]

# 7 [Word2vec with PyTorch](https://towardsdatascience.com/word2vec-with-pytorch-implementing-original-paper-2cd7040120b0)
[paper](https://arxiv.org/abs/1301.3781)

## prepare corpus

In [36]:
!pip install torchtext==0.17.1
!pip install torchdata==0.7.1

Collecting torchtext==0.17.1
  Downloading torchtext-0.17.1-cp311-cp311-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting torch==2.2.1 (from torchtext==0.17.1)
  Downloading torch-2.2.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchdata==0.7.1 (from torchtext==0.17.1)
  Downloading torchdata-0.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.1->torchtext==0.17.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.1->torchtext==0.17.1)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.1->torchtext==0.17.1)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.1->torcht

In [37]:
!pip install portalocker==2.8.2

Collecting portalocker==2.8.2
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2


In [38]:
from torchtext.data.utils import get_tokenizer

In [39]:
def get_english_tokenizer():
    """
    Documentation:
    https://pytorch.org/text/stable/_modules/torchtext/data/utils.html#get_tokenizer
    """
    tokenizer = get_tokenizer("basic_english", language="en")
    return tokenizer

In [40]:
tokenizer = get_english_tokenizer()

In [41]:
tokenizer('hello')

['hello']

In [42]:
from torchtext.datasets import WikiText2, WikiText103, PennTreebank
from torchtext.data import to_map_style_dataset
import torchdata
from torch.utils.data import DataLoader
from functools import partial
import torch

In [43]:
def get_data_iterator(ds_name, ds_type, data_dir):
    data_iter = PennTreebank(root=data_dir, split=(ds_type))
    data_iter = to_map_style_dataset(data_iter)
    return data_iter

In [44]:
data_iterator = get_data_iterator(ds_name=None,
                                  ds_type='train',
                                  data_dir="../data/",
                                 )

In [45]:
for i in data_iterator[0:5]:
    print(i,'\n')

aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 

pierre <unk> N years old will join the board as a nonexecutive director nov. N 

mr. <unk> is chairman of <unk> n.v. the dutch publishing group 

rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate 

a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported 



## [build_vocab_from_iterator](https://pytorch.org/text/stable/vocab.html#build-vocab-from-iterator), also showed how to build vocab from txt file

In [46]:
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
import torch

MIN_WORD_FREQUENCY = 50

CBOW_N_WORDS = 4
MAX_SEQUENCE_LENGTH = 256
EMBED_DIMENSION = 300
EMBED_MAX_NORM = 1

## [Dataset styles](https://glaringlee.github.io/data.html#iterable-style-datasets)

In [47]:
def build_vocab(data_iter, tokenizer):
    vocab = build_vocab_from_iterator(
        map(tokenizer, data_iter),
        specials=["<unk>"],
        min_freq=MIN_WORD_FREQUENCY,
        #max_tokens=10,
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab

In [48]:
vocab = build_vocab(data_iterator, tokenizer)

In [49]:
vocab['data']

453

In [51]:
def collate_cbow(batch, text_pipeline):
    batch_input, batch_output = [], []
    for text in batch:
        text_tokens_ids = text_pipeline(text)
        #print(text_tokens_ids)
        if len(text_tokens_ids) < CBOW_N_WORDS * 2 + 1:
            continue

        if MAX_SEQUENCE_LENGTH:
            text_tokens_ids = text_tokens_ids[:MAX_SEQUENCE_LENGTH]

        for idx in range(len(text_tokens_ids) - CBOW_N_WORDS * 2):
            token_id_sequence = text_tokens_ids[idx : (idx + CBOW_N_WORDS * 2 + 1)]
            output = token_id_sequence.pop(CBOW_N_WORDS)
            input_ = token_id_sequence
            batch_input.append(input_)
            batch_output.append(output)

    batch_input = torch.tensor(batch_input, dtype=torch.long)
    batch_output = torch.tensor(batch_output, dtype=torch.long)
    return batch_input, batch_output

In [52]:
data_iterator[0:5]

['aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter',
 'pierre <unk> N years old will join the board as a nonexecutive director nov. N',
 'mr. <unk> is chairman of <unk> n.v. the dutch publishing group',
 'rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate',
 'a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported']

In [53]:
collate_cbow(data_iterator[0:5], lambda x: vocab(tokenizer(x)))[0][35]

tensor([395,   7, 339, 141,   0, 659,   0, 956])

In [54]:
collate_cbow(data_iterator[0:5], lambda x: vocab(tokenizer(x)))[1][35]

tensor(3)

In [55]:
data_iterator[0:5][0]

'aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter'

In [56]:
(lambda x: vocab(tokenizer(x)))(data_iterator[0:5][0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [57]:
def get_dataloader_and_vocab(
    model_name, ds_name, ds_type, data_dir, batch_size, shuffle, vocab=None):

    data_iter = get_data_iterator(ds_name, ds_type, data_dir)
    tokenizer = get_english_tokenizer()

    if not vocab:
        vocab = build_vocab(data_iter, tokenizer)

    text_pipeline = lambda x: vocab(tokenizer(x))

    if model_name == "cbow":
        collate_fn = collate_cbow
    elif model_name == "skipgram":
        collate_fn = collate_skipgram
    else:
        raise ValueError("Choose model from: cbow, skipgram")

    dataloader = DataLoader(
        data_iter,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=partial(collate_fn, text_pipeline=text_pipeline))

    return dataloader, vocab

In [58]:
train_dataloader, vocab = get_dataloader_and_vocab(
    model_name="cbow",
    ds_name=None,
    ds_type='train',
    data_dir="./",
    batch_size=10,
    shuffle=True,
)

val_dataloader = get_dataloader_and_vocab(
    model_name="cbow",
    ds_name=None,
    ds_type='valid',
    data_dir="./",
    batch_size=10,
    shuffle=False,
    vocab=vocab
)[0]

In [59]:
for i in train_dataloader:
    print(i[0].shape)

torch.Size([135, 8])
torch.Size([83, 8])
torch.Size([97, 8])
torch.Size([122, 8])
torch.Size([83, 8])
torch.Size([71, 8])
torch.Size([124, 8])
torch.Size([126, 8])
torch.Size([141, 8])
torch.Size([198, 8])
torch.Size([152, 8])
torch.Size([89, 8])
torch.Size([122, 8])
torch.Size([122, 8])
torch.Size([197, 8])
torch.Size([151, 8])
torch.Size([183, 8])
torch.Size([148, 8])
torch.Size([156, 8])
torch.Size([164, 8])
torch.Size([152, 8])
torch.Size([174, 8])
torch.Size([154, 8])
torch.Size([128, 8])
torch.Size([125, 8])
torch.Size([184, 8])
torch.Size([110, 8])
torch.Size([137, 8])
torch.Size([105, 8])
torch.Size([164, 8])
torch.Size([105, 8])
torch.Size([133, 8])
torch.Size([196, 8])
torch.Size([93, 8])
torch.Size([183, 8])
torch.Size([124, 8])
torch.Size([153, 8])
torch.Size([52, 8])
torch.Size([222, 8])
torch.Size([105, 8])
torch.Size([163, 8])
torch.Size([104, 8])
torch.Size([194, 8])
torch.Size([153, 8])
torch.Size([110, 8])
torch.Size([75, 8])
torch.Size([182, 8])
torch.Size([231, 8])


## prepare model. [nn.Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)

In [60]:
emb = nn.Embedding(
      num_embeddings=1980,
      embedding_dim=EMBED_DIMENSION,
      max_norm=EMBED_MAX_NORM)

In [61]:
emb(torch.randint(0, 10, (1483,8))).shape

torch.Size([1483, 8, 300])

In [62]:
list(emb.parameters())[0].shape

torch.Size([1980, 300])

In [63]:
list(nn.Linear(in_features=1980, out_features=300).parameters())[0].shape

torch.Size([300, 1980])

In [64]:
nn.Linear(in_features=300, out_features=1980)(torch.rand([1483, 8, 300])).shape

torch.Size([1483, 8, 1980])

In [65]:
class CBOW_Model(nn.Module):
    def __init__(self, vocab_size: int):
        super(CBOW_Model, self).__init__()
        self.embeddings = nn.Embedding(
          num_embeddings=vocab_size,
          embedding_dim=EMBED_DIMENSION,
          max_norm=EMBED_MAX_NORM)

        self.linear = nn.Linear(
          in_features=EMBED_DIMENSION,
          out_features=vocab_size)

    def forward(self, inputs_):
        x = self.embeddings(inputs_)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x
CBOW_model = CBOW_Model(len(vocab))

In [68]:
import yaml
from train import train

with open("config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)
train(config)

Using device: cuda

Tesla T4
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB 

Vocabulary size: 1980
Epoch: 1/30, Train Loss=4.90762, Val Loss=4.70300
Learning rate: 0.02500
Epoch: 2/30, Train Loss=4.58918, Val Loss=4.61456
Learning rate: 0.02500
Epoch: 3/30, Train Loss=4.48782, Val Loss=4.50985
Learning rate: 0.01250
Epoch: 4/30, Train Loss=4.33023, Val Loss=4.39499
Learning rate: 0.01250
Epoch: 5/30, Train Loss=4.29568, Val Loss=4.37258
Learning rate: 0.01250
Epoch: 6/30, Train Loss=4.26610, Val Loss=4.37263
Learning rate: 0.00625
Epoch: 7/30, Train Loss=4.12840, Val Loss=4.27175
Learning rate: 0.00625
Epoch: 8/30, Train Loss=4.09704, Val Loss=4.26990
Learning rate: 0.00625
Epoch: 9/30, Train Loss=4.08275, Val Loss=4.24100
Learning rate: 0.00313
Epoch: 10/30, Train Loss=3.97518, Val Loss=4.18179
Learning rate: 0.00313
Epoch: 11/30, Train Loss=3.94935, Val Loss=4.17954
Learning rate: 0.00313
Epoch: 12/30, Train Loss=3.93803, Val Loss=4.16387
Learning rate: 0.00156
Epoch: 13/30, Trai

# 8 Inference

In [69]:
import sys

from sklearn.manifold import TSNE
import plotly.graph_objects as go

#sys.path.append("../utils_word2vec//")
sys.path.append("../")
import utils_word2vec
from utils_word2vec.model import CBOW_Model

In [71]:
folder = "weights/cbow_PennTreebank"
device = 'cpu' #torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_weights = torch.load(f"{folder}/last.pt", map_location=device)['model_state_dict']
vocab = torch.load(f"{folder}/vocab.pt")

In [72]:
torch.load(f"{folder}/last.pt", map_location=device)

{'epoch': 29,
 'model_state_dict': OrderedDict([('embeddings.weight',
               tensor([[-0.0390,  0.0989,  0.0485,  ...,  0.0443,  0.1089,  0.0137],
                       [ 0.5565, -0.0476, -0.4796,  ..., -0.0351, -0.0609, -0.0043],
                       [ 0.0100,  0.0238, -0.0381,  ...,  0.5360,  0.0793,  0.4658],
                       ...,
                       [-0.0671,  0.0843,  0.0548,  ..., -0.0198,  0.0581, -0.0215],
                       [-0.0197,  0.0228, -0.0080,  ..., -0.0171,  0.0345,  0.0101],
                       [-0.0590,  0.0415,  0.0571,  ...,  0.0323,  0.0688, -0.0165]])),
              ('linear.weight',
               tensor([[ 0.0716, -2.4243,  0.0864,  ..., -0.3383,  0.6644, -0.3412],
                       [-4.9431,  0.6099,  8.0234,  ...,  0.3507,  0.6666, -0.7633],
                       [-2.5649, -0.3303,  0.5491,  ...,  0.3228,  1.2617, -0.6941],
                       ...,
                       [ 0.0805, -1.2413,  0.2118,  ..., -2.5579,  0.0681,

In [73]:
len(vocab)

1980

In [74]:
model = CBOW_Model(len(vocab))
model.load_state_dict(model_weights)

<All keys matched successfully>

In [75]:
# embedding from first model layer
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization, like a vektor
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)

norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

(1980, 300)

# Visualization with t-SNE

In [76]:
# get embeddings
embeddings_df = pd.DataFrame(embeddings)

# t-SNE transform
tsne = TSNE(n_components=2)
embeddings_df_trans = tsne.fit_transform(embeddings_df)
embeddings_df_trans = pd.DataFrame(embeddings_df_trans)

# get token order
embeddings_df_trans.index = vocab.get_itos()

# if token is a number
is_numeric = embeddings_df_trans.index.str.isnumeric()

In [77]:
color = np.where(is_numeric, "green", "black")
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=embeddings_df_trans[0],
        y=embeddings_df_trans[1],
        mode="text",
        text=embeddings_df_trans.index,
        textposition="middle center",
        textfont=dict(color=color),
    )
)
fig.show()
fig.write_html("../word2vec_visualization.html")

# Find Similar Words

In [78]:
def get_top_similar(word: str, topN: int = 10):
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings_norm[word_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : topN + 1]

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = vocab.lookup_token(sim_word_id)
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

In [79]:
for word, sim in get_top_similar("germany").items():
    print("{}: {:.3f}".format(word, sim))

german: 0.701
coast: 0.493
bay: 0.382
britain: 0.374
golden: 0.306
texas: 0.284
days: 0.262
change: 0.258
countries: 0.253
europe: 0.252


# Vector Equations

In [80]:
emb1 = embeddings[vocab["king"]]
emb2 = embeddings[vocab["man"]]
emb3 = embeddings[vocab["woman"]]

emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** (1 / 2)
emb4 = emb4 / emb4_norm

emb4 = np.reshape(emb4, (len(emb4), 1))
dists = np.matmul(embeddings_norm, emb4).flatten()

top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print("{}: {:.3f}".format(vocab.lookup_token(word_id), dists[word_id]))

<unk>: 0.590
woman: 0.468
stop: 0.247
developed: 0.240
authorities: 0.236


# Pretrained Embedding in new model

In [81]:
model_weights

OrderedDict([('embeddings.weight',
              tensor([[-0.0390,  0.0989,  0.0485,  ...,  0.0443,  0.1089,  0.0137],
                      [ 0.5565, -0.0476, -0.4796,  ..., -0.0351, -0.0609, -0.0043],
                      [ 0.0100,  0.0238, -0.0381,  ...,  0.5360,  0.0793,  0.4658],
                      ...,
                      [-0.0671,  0.0843,  0.0548,  ..., -0.0198,  0.0581, -0.0215],
                      [-0.0197,  0.0228, -0.0080,  ..., -0.0171,  0.0345,  0.0101],
                      [-0.0590,  0.0415,  0.0571,  ...,  0.0323,  0.0688, -0.0165]])),
             ('linear.weight',
              tensor([[ 0.0716, -2.4243,  0.0864,  ..., -0.3383,  0.6644, -0.3412],
                      [-4.9431,  0.6099,  8.0234,  ...,  0.3507,  0.6666, -0.7633],
                      [-2.5649, -0.3303,  0.5491,  ...,  0.3228,  1.2617, -0.6941],
                      ...,
                      [ 0.0805, -1.2413,  0.2118,  ..., -2.5579,  0.0681, -0.7212],
                      [-2.5666,  1.33

In [82]:
embedding = nn.Embedding.from_pretrained(model_weights['embeddings.weight'].to('cpu'))
# Get embeddings for index 1
input = torch.LongTensor([1, 131, 1, 1421])
embedding(input).shape

torch.Size([4, 300])

In [83]:
class TextRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, n_layers=1):
        super(TextRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding.from_pretrained(model_weights['embeddings.weight'].to(device))
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, self.n_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.hidden_size, self.input_size)

    def forward(self, x, hidden):
        x = self.embedding(x)#.squeeze(2)
        out, (ht1, ct1) = self.lstm(x, hidden)
        out = self.dropout(out)
        x = self.fc(out)
        return x, ht1, ct1

    def init_hidden(self, batch_size=1):
        return (torch.rand(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device),
               torch.rand(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device))

In [84]:
model_weights['embeddings.weight'].shape[0]

1980

In [85]:
model = TextRNN(
    input_size=model_weights['embeddings.weight'].shape[0], # vocab size
    hidden_size=2048,
    embedding_size=model_weights['embeddings.weight'].shape[1],
    n_layers=1
).to(device)

In [86]:
nn.Embedding.from_pretrained(model_weights['embeddings.weight'].to(device))(torch.rand(25, 10).type(torch.LongTensor).to(device)).shape

torch.Size([25, 10, 300])

In [87]:
hidden = model.init_hidden(batch_size=10)
x_, ht1_, ct1_ = model(torch.rand(25, 10).type(torch.LongTensor).to(device), hidden)

In [88]:
hidden[0]

tensor([[[0.1753, 0.5835, 0.2117,  ..., 0.4554, 0.1665, 0.3903],
         [0.0383, 0.8671, 0.3282,  ..., 0.2124, 0.1378, 0.2522],
         [0.6444, 0.3134, 0.9524,  ..., 0.8805, 0.7446, 0.2255],
         ...,
         [0.5521, 0.5158, 0.4866,  ..., 0.1738, 0.7012, 0.9580],
         [0.5538, 0.2588, 0.0731,  ..., 0.2710, 0.0443, 0.0954],
         [0.7077, 0.2202, 0.8360,  ..., 0.9815, 0.4491, 0.0957]]],
       requires_grad=True)

In [89]:
x_.shape

torch.Size([25, 10, 1980])

In [90]:
ht1_.shape

torch.Size([1, 10, 2048])

In [91]:
ct1_.shape

torch.Size([1, 10, 2048])

## [Saving in Google Collab](https://stackoverflow.com/questions/64808087/how-do-i-save-files-from-google-colab-to-google-drive)