In [None]:
!wget -O quora.zip -qq --no-check-certificate "https://drive.google.com/uc?export=download&id=1ERtxpdWOgGQ3HOigqAMHTJjmOE_tWvoF"
!unzip quora.zip


Archive:  quora.zip
  inflating: train.csv               


In [None]:
import nltk
import string
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Word Embeddings 


Previously we talked about such models of vectors
![embeddings relations](https://www.tensorflow.org/images/linear-relationships.png)
*From [Vector Representations of Words, Tensorflow tutorial](https://www.tensorflow.org/tutorials/representation/word2vec)*

Today we start from simple word2vec models. We will start from pretrained ones and then take a look into how we can create an architecture based on Pytorch like this

## Simple model training

To start with we will work with [Quora Question Pairs at kaggle](https://www.kaggle.com/c/quora-question-pairs)
The dataset consists of question pairs and a label that says whether the two questions are duplicates or not.

In [None]:
quora_data = pd.read_csv('train.csv')

quora_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


### Standard text preprocessing:
- read texts
- delete punctuations
- lowercase
- tokenize

optional
- remove stop words
- lemmatization or stemming
- grammar check


### Preprocessing

Let's train Word2Vec from `gensim` on top of all the texts

In [None]:
# Normalise both question columns in one pass: fill missing values with an
# empty string, strip punctuation and cast to lowercase.
# The translation table is built once instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)

for column in ("question1", "question2"):
    quora_data[column] = (
        quora_data[column]
        .fillna('')
        .str.translate(punctuation_table)
        .str.lower()
    )

# Unique question texts across both columns, in order of first appearance.
texts = list(pd.concat([quora_data.question1, quora_data.question2]).unique())
texts[:10]

['what is the step by step guide to invest in share market in india',
 'what is the story of kohinoor kohinoor diamond',
 'how can i increase the speed of my internet connection while using a vpn',
 'why am i mentally very lonely how can i solve it',
 'which one dissolve in water quikly sugar salt methane and carbon di oxide',
 'astrology i am a capricorn sun cap moon and cap risingwhat does that say about me',
 'should i buy tiago',
 'how can i be a good geologist',
 'when do you use シ instead of し',
 'motorola company can i hack my charter motorolla dcx3400']

For tokenization it is easier to use `nltk` (it is faster than `spacy`, but can be worse in some situations; `spacy` is also the more production-oriented framework now).

In [None]:
word_tokenize(texts[0])

['what',
 'is',
 'the',
 'step',
 'by',
 'step',
 'guide',
 'to',
 'invest',
 'in',
 'share',
 'market',
 'in',
 'india']

Tokenizing all texts

In [None]:
tokenized_texts = [word_tokenize(text) for text in texts]

In [None]:
print([' '.join(row) for row in tokenized_texts[:2]])

['what is the step by step guide to invest in share market in india', 'what is the story of kohinoor kohinoor diamond']


Train model:

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized questions and keep only its word vectors (`.wv`).
model = Word2Vec(tokenized_texts, 
                 vector_size=32,      # embedding vector size
                 min_count=5,  # consider words that occurred at least 5 times
                 window=5).wv  # define context as a 5-word window around the target word

## Examine the model

In [None]:
model.get_vector('anything')

array([ 1.23159623e+00, -7.18023121e-01,  3.05932069e+00,  1.85257542e+00,
       -2.14819956e+00,  5.04982591e-01, -1.86287665e+00, -1.43480316e-01,
        8.75910223e-01, -3.49305534e+00, -1.41126350e-01,  2.68434763e+00,
        1.45996422e-01, -1.70843101e+00,  5.81411757e-02, -3.77825350e-01,
       -1.32196903e+00,  2.32053113e+00, -2.27879803e-03, -4.88962531e-01,
        1.63951969e+00,  9.02974367e-01, -2.26027846e+00, -1.07766354e+00,
       -1.28912723e+00, -1.18110085e+00,  2.21294069e+00,  5.16677320e-01,
        4.58270550e+00,  4.61535931e+00, -1.07698984e-01,  4.69572186e-01],
      dtype=float32)

Most similar words:

In [None]:
model.most_similar('bread')

[('rice', 0.9482910633087158),
 ('vodka', 0.9366668462753296),
 ('sauce', 0.9324812889099121),
 ('cheese', 0.9229483604431152),
 ('chocolate', 0.9228693842887878),
 ('butter', 0.922383725643158),
 ('beans', 0.9220638871192932),
 ('banana', 0.9174727201461792),
 ('pasta', 0.915195882320404),
 ('noodles', 0.9140670895576477)]

Or in this way:

In [None]:
model.most_similar(positive=['coder', 'money'], negative=['brain'])

[('parcels', 0.6623937487602234),
 ('photographer', 0.6605520248413086),
 ('millionaire', 0.6469765305519104),
 ('freelancer', 0.6467384099960327),
 ('trainer', 0.6361854672431946),
 ('tutor', 0.6272916197776794),
 ('discount', 0.6256202459335327),
 ('100k', 0.6223222613334656),
 ('aspiring', 0.615554690361023),
 ('trader', 0.6135827898979187)]

And of course operations:

In [None]:
model.most_similar([model.get_vector('politician') - model.get_vector('power') + model.get_vector('honesty')])

[('alia', 0.6672729849815369),
 ('presents', 0.651232898235321),
 ('romantic', 0.6454851627349854),
 ('farewell', 0.6206591725349426),
 ('punjabi', 0.6180728077888489),
 ('singer', 0.6153601408004761),
 ('teen', 0.6116803288459778),
 ('charming', 0.5998355150222778),
 ('sexy', 0.5970967411994934),
 ('seductive', 0.5927653312683105)]

## Model visualisation

Let's take a look on 1k most frequent words

In [None]:
words = model.index_to_key[:1000]

print(words[::100])

['the', 'up', 'top', 'white', 'america', 'post', 'blood', 'exams', 'sim', 'snapchat']


Constructing embedding matrix

In [None]:
word_vectors = model.vectors[[model.key_to_index[word] for word in words]]

### PCA Visualisation

In [None]:
from sklearn.decomposition import PCA


def get_pca_projection(word_vectors):
    """Fit a 2-component PCA on `word_vectors` and return the projected points."""
    return PCA(n_components=2).fit_transform(word_vectors)

In [None]:
word_vectors_pca = get_pca_projection(word_vectors)

Visualization function:

In [None]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Draw an interactive bokeh scatter plot of the points (x, y).

    `color` is either one colour name applied to every point or a sequence
    with one colour per point. Every extra keyword argument becomes a data
    column that is shown in the hover tooltip. The figure is displayed when
    `show` is true and is returned in either case.
    """
    output_notebook()

    # A single colour name is broadcast to all points.
    point_colors = [color] * len(x) if isinstance(color, str) else color

    columns = {'x': x, 'y': y, 'color': point_colors}
    columns.update(kwargs)
    source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=source)

    # One tooltip row per extra data column.
    hover_rows = [(column, "@" + column) for column in kwargs]
    fig.add_tools(bm.HoverTool(tooltips=hover_rows))

    if show:
        pl.show(fig)
    return fig

In [None]:
draw_vectors(word_vectors_pca[:, 0], word_vectors_pca[:, 1], token=words)

### TSNE

In [None]:
from sklearn.manifold import TSNE

def get_tsne_projection(word_vectors, random_state=42):
    """Embed `word_vectors` into 2-D with t-SNE and return the projected points.

    t-SNE is stochastic, so the seed is fixed by default to make the plot
    reproducible between runs. Pass `random_state=None` to get a different
    layout on every call.
    """
    tsne = TSNE(n_components=2, random_state=random_state)
    return tsne.fit_transform(word_vectors)

In [None]:
word_tsne = get_tsne_projection(word_vectors)
draw_vectors(word_tsne[:, 0], word_tsne[:, 1], color='green', token=words)

## Sentence embeddings

Now we will use pretrained model for ease of operations

In [None]:
import gensim.downloader as api

model = api.load('glove-twitter-100')



In [None]:
import gensim.downloader as api
api.info('glove-twitter-100')

{'num_records': 1193514,
 'file_size': 405932991,
 'base_dataset': 'Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/glove-twitter-100/__init__.py',
 'license': 'http://opendatacommons.org/licenses/pddl/',
 'parameters': {'dimension': 100},
 'description': 'Pre-trained vectors based on  2B tweets, 27B tokens, 1.2M vocab, uncased (https://nlp.stanford.edu/projects/glove/)',
 'preprocessing': 'Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i <fname> -o glove-twitter-100.txt`.',
 'read_more': ['https://nlp.stanford.edu/projects/glove/',
  'https://nlp.stanford.edu/pubs/glove.pdf'],
 'checksum': 'b04f7bed38756d64cf55b58ce7e97b15',
 'file_name': 'glove-twitter-100.gz',
 'parts': 1}

The easy way to obtain sentence embedding is to get mean embedding of words in sentence. Let's do it

In [None]:
def get_phrase_embedding(model, phrase):
    """Return a sentence vector for `phrase`.

    The phrase is lowercased and tokenized; the result is
    `model.get_mean_vector` over the tokens. A phrase that yields no tokens
    maps to a float32 zero vector of length `model.vector_size`.
    """
    tokens = word_tokenize(phrase.lower())
    if not tokens:
        return np.zeros([model.vector_size], dtype='float32')
    return model.get_mean_vector(tokens)

In [None]:
vector = get_phrase_embedding(model, "I'm very sure. This never happened to me before...")

vector.shape

(100,)

Creating vectors for every questions



In [None]:
text_vectors = np.array([get_phrase_embedding(model, phrase) for phrase in texts])

Let's find the questions that are closest to a given query

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_nearest(model, text_vectors, texts, query, k=10):
    """Return the `k` texts whose vectors are most similar to `query`.

    Args:
        model: word-vector model passed on to `get_phrase_embedding`.
        text_vectors: 2-D array with one embedding row per entry of `texts`.
        texts: sequence of texts aligned with `text_vectors`.
        query: raw query string.
        k: number of neighbours to return; clamped to `len(texts)`.

    Returns:
        Array of shape (1, k) holding the nearest texts, ordered from the
        most to the least similar by cosine similarity.
    """
    query_vector = get_phrase_embedding(model, query)
    sim_scores = cosine_similarity(text_vectors, query_vector.reshape(1, -1)).ravel()

    k = min(k, len(sim_scores))
    if k <= 0:
        return np.array([[]], dtype=str)

    if k < len(sim_scores):
        # Select the k best candidates in linear time ...
        candidates = np.argpartition(-sim_scores, k - 1)[:k]
    else:
        candidates = np.arange(len(sim_scores))
    # ... and sort only those, because argpartition leaves them unordered.
    top_idx = candidates[np.argsort(-sim_scores[candidates])]

    # Index the list directly: converting all of `texts` to a numpy array
    # would copy the whole corpus on every call.
    return np.array([[texts[i] for i in top_idx]])

In [None]:
texts[4]

'which one dissolve in water quikly sugar salt methane and carbon di oxide'

In [None]:
find_nearest(model, text_vectors, texts, 
             query="which one dissolve in water quikly sugar salt methane and carbon di oxide",
             k=10)

array([['carbon dioxide forms carbonic acid when added to water is co2 soluble in water why doesnt co2 form carbonic acid in the case of an acid rain',
        'why cant we convert salt water into pure drinking water in large scale',
        'is it safe to consume the salt water biproduct of bleach  hydrogen peroxide',
        'if water is split into hydrogen and oxygen how much of each gas is produced per liter of water processed',
        'which one dissolve in water quikly sugar salt methane and carbon di oxide',
        'why does some organic compounds dissolve in water',
        'what makes the fine salt dissolve faster in water',
        'the sugar and milk dissolve in water why',
        'does ice dissolve or melt in water',
        'why a drop of oil float in water']], dtype='<U1130')

In [None]:
results = find_nearest(model, text_vectors, texts, query="How do i enter the matrix?", k=10)


In [None]:
results

array([['do you live in the matrix why', 'how do i get to the dark web',
        'how do i get to download things on the dark web that i cannot find in the normal web',
        'what do i do to enter the line of event management',
        'i want to run my own startup i have the idea and plan but no team what do i do',
        'how do i download the mengtos designcode book',
        'in minecraft how do you make a book',
        'i have a great idea for a mobile app but i do not know how to code what should i do to make this idea a reality',
        'what can i do with this ipad', 'how do i use the greenify app']],
      dtype='<U1130')

In [None]:
find_nearest(model, text_vectors, texts, query="How does Trump?", k=10)

array([['why does trump tweet', 'trump wins what do you think',
        'what does donald trump think about israel',
        'who or what is donald trump really',
        'what does india think of donald trump',
        'would you do business with trump why why not',
        'what does donald trump think of china',
        'why is donald trump so successful in what he does',
        'what does donald trump think of india',
        'what do you think about donald trump']], dtype='<U1130')

In [None]:
find_nearest(model, text_vectors, texts, query="Why don't i ask a question myself?", k=10)

array([['why do you always answer a question with a question i dont or do i',
        'how do i ask a question on this',
        'how do you ask a good question', 'how do i downvote a question',
        'why do i ask this question', 'how do i ask a question',
        'how do you ask a question',
        'how do i ask a question on quora and what should i ask',
        'how do i ask someone on a date',
        'why do i have to ask a girl out why cant she ask me']],
      dtype='<U1130')

## Now we will look into embedding training process and more neural networks

Main idea is that we can predict word knowing its context:
![contexts](https://image.ibb.co/mnQ2uz/2018_09_17_21_07_08.png)
*From [cs224n, Lecture 2](http://web.stanford.edu/class/cs224n/lectures/lecture2.pdf)*

You can watch how it works here: [https://ronxin.github.io/wevi/](https://ronxin.github.io/wevi/).

# PyTorch basics

In [None]:
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
%matplotlib inline

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 

np.random.seed(42)

## Automatic differentiation

### Computational graphs

A computational graph is an easy way to calculate the gradients of complex functions.

For example with function

$$f = (x + y) \cdot z$$

would look like a graph

![graph](https://image.ibb.co/mWM0Lx/1_6o_Utr7_ENFHOK7_J4l_XJtw1g.png)  
*From [Backpropagation, Intuitions - CS231n](http://cs231n.github.io/optimization-2/)*


Let's see that example
First define function in Pytorch:

In [None]:
x = torch.tensor(-2., requires_grad=True)
y = torch.tensor(5., requires_grad=True)
z = torch.tensor(-4., requires_grad=True)

q = x + y
f = q * z

Next just ask to calc gradients

In [None]:
f.backward()

print('df/dz =', z.grad)
print('df/dx =', x.grad)
print('df/dy =', y.grad)

df/dz = tensor(3.)
df/dx = tensor(-4.)
df/dy = tensor(-4.)


The `backward()` method computes the gradients of `f` with respect to every tensor in its graph that was created with `requires_grad=True`.

Gradient tracking can also be switched on or off locally with context managers ([Locally disabling gradient computation](https://pytorch.org/docs/stable/autograd.html#locally-disabling-gradient-computation)):
```python
torch.autograd.no_grad()
torch.autograd.enable_grad()
torch.autograd.set_grad_enabled(mode)

```

In [None]:
with torch.autograd.no_grad():
    x = torch.tensor(-2., requires_grad=True)
    y = torch.tensor(5., requires_grad=True)
    q = x + y

z = torch.tensor(-4., requires_grad=True)
f = q * z

f.backward()

print('df/dz =', z.grad)
print('df/dx =', x.grad)
print('df/dy =', y.grad)

df/dz = tensor(3.)
df/dx = None
df/dy = None


More about autograd, can be found here: [Autograd mechanics](https://pytorch.org/docs/stable/notes/autograd.html).


Tensors contain data:

In [None]:
x.data

tensor(-2.)

Collected gradient:

In [None]:
x.grad

Function to calculate that gradient:

In [None]:
q.grad_fn

And more

In [None]:
x.type(), x.shape, x.device, x.layout

('torch.FloatTensor', torch.Size([]), device(type='cpu'), torch.strided)

## Word embeddings and PyTorch API

We will work with the same data and train our own word2vec model. We start with the same steps: reading and lowercasing the texts.

In [None]:
quora_data = pd.read_csv('train.csv')

quora_data.question1 = quora_data.question1.replace(np.nan, '', regex=True)
quora_data.question2 = quora_data.question2.replace(np.nan, '', regex=True)

texts = list(pd.concat([quora_data.question1, quora_data.question2]).unique())

tokenized_texts = [word_tokenize(text.lower()) for text in texts]

Collect word frequency counts:

In [None]:
from collections import Counter

# Minimum number of occurrences a token needs to get its own vocabulary entry.
MIN_COUNT = 5

words_counter = Counter(token for tokens in tokenized_texts for token in tokens)

# Index 0 is reserved for out-of-vocabulary tokens; the remaining ids are
# handed out in descending order of frequency.
word2index = {'<unk>': 0}
for word, count in words_counter.most_common():
    if count >= MIN_COUNT:
        word2index[word] = len(word2index)

# Inverse mapping: position in the list is the word's index.
index2word = sorted(word2index, key=word2index.get)

total_tokens = sum(map(len, tokenized_texts))
unknown_tokens = sum(token not in word2index for tokens in tokenized_texts for token in tokens)

print('Vocabulary size:', len(word2index))
print('Tokens count:', total_tokens)
print('Unknown tokens appeared:', unknown_tokens)
print('Most freq words:', index2word[1:21])

Vocabulary size: 28635
Tokens count: 6971163
Unknown tokens appeared: 123082
Most freq words: ['?', 'the', 'what', 'is', 'a', 'i', 'to', 'in', 'how', 'of', 'do', 'are', 'and', 'for', ',', 'can', 'you', 'why', 'it', 'my']


### Skip-Gram Word2vec

Simple model consists of 2 layers. Main idea to train embedding vectors in the way to predict context of words. 

For that reason probabilities are modelled in the following way $\{P(w_{c+j} \mid w_c): j = -k, \ldots, k,\ j \neq 0\}$, where $k$ is the context window size and $c$ is the index of the central word.

Model architecture will be: pair of matrices $U$ - embedding matrix, which we will use for tasks, and $V$ -output layer matrix.

For every word in dictionary we have referring row in $U$ and column in $V$.

![skip-gram](https://image.ibb.co/khFXu9/Skip_gram.png)

The word is transformed into its embedding - the row $u_c$. That embedding is then multiplied by $V$.

And we got scores $v_j^T u_c$ - similarity of word $j$ and initially passed word.

To make it more like probabilities we will use softmax function: $P(i) = \frac{e^{x_i}}{\sum_j e^{x_j}}$.

And cross entropy loss to make optimization:

$$-\sum_{-k \leq j \leq k, j \neq 0} \log \frac{\exp(v_{c+j}^T u_c)}{\sum_{i=1}^{|V|} \exp(v_i^T u_c)} \to \min_{U, V}.$$

In the end, the vector $u_c$ becomes closer to the vectors $v_{c+j}$ of the words in its context.

Let's create it.

#### Batch generation

First we need to collect contexts.

In [None]:
def build_contexts(tokenized_texts, window_size):
    """Collect a (central word, context words) pair for every token.

    The context of a token is made of up to `window_size` tokens on each
    side of it within the same text, in their original order and without
    the token itself. Near the edges of a text the context is shorter.
    """
    pairs = []
    for sentence in tokenized_texts:
        length = len(sentence)
        for position, center in enumerate(sentence):
            first = max(0, position - window_size)
            last = min(length, position + window_size + 1)
            neighbours = [sentence[j] for j in range(first, last) if j != position]
            pairs.append((center, neighbours))
    return pairs

In [None]:
contexts = build_contexts(tokenized_texts, window_size=2)

In [None]:
contexts[:5]

[('what', ['is', 'the']),
 ('is', ['what', 'the', 'step']),
 ('the', ['what', 'is', 'step', 'by']),
 ('step', ['is', 'the', 'by', 'step']),
 ('by', ['the', 'step', 'step', 'guide'])]

Cast words to their indices in the vocabulary

In [None]:
contexts = [(word2index.get(central_word, 0), [word2index.get(word, 0) for word in context]) 
            for central_word, context in contexts]

Implement batch generator for our network:

In [None]:
import random

def make_skip_gram_batchs_iter(contexts, window_size, num_skips, batch_size):
    """Endlessly yield skip-gram training batches.

    Args:
        contexts: list of (central word index, list of context word indices).
        window_size: context radius; only samples with a full window of
            2 * window_size context words are used.
        num_skips: number of context words sampled (without replacement)
            for every central word.
        batch_size: number of (input, label) pairs per batch; must be a
            multiple of `num_skips`.

    Yields:
        (batch_data, batch_labels): two lists of equal length. Following
        the skip-gram formulation P(w_{c+j} | w_c), `batch_data` holds the
        central word (repeated `num_skips` times) and `batch_labels` the
        sampled context words it has to predict.

    Raises:
        ValueError: if no sample has a full window and a known central word
            (the generator would otherwise spin forever without yielding).
    """
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * window_size

    # Keep only samples with a complete window whose central word is not <unk> (index 0).
    full_window = 2 * window_size
    usable = [(word, context) for word, context in contexts
              if len(context) == full_window and word != 0]
    if not usable:
        raise ValueError('no contexts with a full window and a known central word')

    # Every central word contributes num_skips pairs to a batch.
    centers_per_batch = batch_size // num_skips
    batchs_count = int(math.ceil(len(usable) / centers_per_batch))

    print('Initializing batchs generator with {} batchs per epoch'.format(batchs_count))

    while True:
        # Reshuffle at the start of every epoch.
        indices = np.arange(len(usable))
        np.random.shuffle(indices)

        for i in range(batchs_count):
            batch_indices = indices[i * centers_per_batch: (i + 1) * centers_per_batch]

            batch_data, batch_labels = [], []
            for data_ind in batch_indices:
                central_word, context = usable[data_ind]
                batch_data.extend([central_word] * num_skips)
                batch_labels.extend(random.sample(context, num_skips))

            yield batch_data, batch_labels

In [None]:
batch, labels = next(make_skip_gram_batchs_iter(contexts, window_size=2, num_skips=2, batch_size=32))

Initializing batchs generator with 295363 batchs per epoch


#### nn.Sequential

The simplest way to create a network is to use `nn.Sequential` from PyTorch

In [None]:
model = nn.Sequential(
    nn.Embedding(len(word2index), 32),
    nn.Linear(32, len(word2index))
)

In [None]:
model.cuda()

Sequential(
  (0): Embedding(28635, 32)
  (1): Linear(in_features=32, out_features=28635, bias=True)
)

or

In [None]:
device = torch.device("cuda")

model = model.to(device)

Create tensors on gpu:

In [None]:
batch = torch.cuda.LongTensor(batch)
labels = torch.cuda.LongTensor(labels)

calculating logits:

In [None]:
logits = model(batch)

 Loss function

In [None]:
loss_function = nn.CrossEntropyLoss().cuda() 

Calculate loss:

In [None]:
loss = loss_function(logits, labels)

And, finally, backprop!

In [None]:
loss.backward()

At the end we will start optimize that.

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.01) 

To start optimization we just call `step()`:

In [None]:
print(model[0].weight)

optimizer.step()

print(model[0].weight)

Parameter containing:
tensor([[ 1.5687,  0.8974, -0.6652,  ...,  0.3202,  0.7438, -2.4435],
        [-0.1911,  0.6675, -0.4920,  ...,  0.0348,  0.6454, -0.2675],
        [-1.1767, -0.0074,  1.3031,  ...,  2.5654,  1.9421, -0.9800],
        ...,
        [ 1.5167, -0.0539, -0.2989,  ...,  1.7295, -2.2647, -0.7563],
        [ 0.5362,  0.2104,  0.5932,  ...,  0.1006,  0.6059,  0.4281],
        [-1.7104, -1.0440, -0.4714,  ..., -1.0427,  0.4615,  1.6725]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([[ 1.5787,  0.8874, -0.6552,  ...,  0.3102,  0.7338, -2.4335],
        [-0.1811,  0.6575, -0.4820,  ...,  0.0248,  0.6554, -0.2575],
        [-1.1867, -0.0174,  1.2931,  ...,  2.5554,  1.9321, -0.9700],
        ...,
        [ 1.5167, -0.0539, -0.2989,  ...,  1.7295, -2.2647, -0.7563],
        [ 0.5362,  0.2104,  0.5932,  ...,  0.1006,  0.6059,  0.4281],
        [-1.7104, -1.0440, -0.4714,  ..., -1.0427,  0.4615,  1.6725]],
       device='cuda:0', requires_grad=True)


And the last part is to reset the gradients to zero!

In [None]:
optimizer.zero_grad()

#### Train loop

In [None]:
# Training configuration.
loss_every_nsteps = 1000  # report the running average loss every N steps
max_steps = 35000         # stop early, for time reasons

# Use the GPU when one is present and fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_function = nn.CrossEntropyLoss()

total_loss = 0
steps_in_window = 0  # number of losses accumulated since the last report
start_time = time.time()

for step, (batch, labels) in enumerate(make_skip_gram_batchs_iter(contexts, window_size=2, num_skips=4, batch_size=128)):
    # Build the index tensors directly on the target device.
    batch = torch.tensor(batch, dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.long, device=device)

    # Clear gradients left over from the previous step (or from earlier cells).
    optimizer.zero_grad()

    logits = model(batch)
    loss = loss_function(logits, labels)

    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    steps_in_window += 1

    if step != 0 and step % loss_every_nsteps == 0:
        # Divide by the number of losses actually summed: the first window
        # also contains step 0.
        print("Step = {}, Avg Loss = {:.4f}, Time = {:.2f}s".format(step, total_loss / steps_in_window,
                                                                    time.time() - start_time))
        total_loss = 0
        steps_in_window = 0
        start_time = time.time()

    if step > max_steps:
        break

Initializing batchs generator with 147682 batchs per epoch
Step = 1000, Avg Loss = 7.1603, Time = 6.58s
Step = 2000, Avg Loss = 7.1338, Time = 2.10s
Step = 3000, Avg Loss = 7.0778, Time = 2.07s
Step = 4000, Avg Loss = 7.0469, Time = 2.08s
Step = 5000, Avg Loss = 6.9427, Time = 2.10s
Step = 6000, Avg Loss = 6.9616, Time = 2.08s
Step = 7000, Avg Loss = 6.9131, Time = 2.26s
Step = 8000, Avg Loss = 6.9077, Time = 2.08s
Step = 9000, Avg Loss = 6.8822, Time = 2.07s
Step = 10000, Avg Loss = 6.8610, Time = 2.09s
Step = 11000, Avg Loss = 6.8066, Time = 2.08s
Step = 12000, Avg Loss = 6.8501, Time = 2.14s
Step = 13000, Avg Loss = 6.8460, Time = 2.23s
Step = 14000, Avg Loss = 6.8209, Time = 2.08s
Step = 15000, Avg Loss = 6.7862, Time = 2.07s
Step = 16000, Avg Loss = 6.7729, Time = 2.06s
Step = 17000, Avg Loss = 6.7772, Time = 2.06s
Step = 18000, Avg Loss = 6.7868, Time = 2.16s
Step = 19000, Avg Loss = 6.7819, Time = 2.19s
Step = 20000, Avg Loss = 6.7595, Time = 2.08s
Step = 21000, Avg Loss = 6.788

#### Result analysis

Let's get embeddings from GPU to numpy

In [None]:
embeddings = model[0].weight.cpu().data.numpy()

And check how it looks like

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def most_similar(embeddings, index2word, word2index, word, topn=10):
    """Return the `topn` vocabulary words closest to `word` by cosine similarity.

    Args:
        embeddings: 2-D array with one embedding row per vocabulary index.
        index2word: list mapping a vocabulary index to its word.
        word2index: dict mapping a word to its vocabulary index.
        word: query word; must be a key of `word2index`.
        topn: number of words to return (10 by default).

    Returns:
        List of words ordered from the most to the least similar. The query
        word itself is part of the ranking.

    Raises:
        KeyError: if `word` is not in `word2index`.
    """
    word_emb = embeddings[word2index[word]]

    similarities = cosine_similarity([word_emb], embeddings)[0]
    # argsort is ascending, so walk it backwards to get the best matches first.
    top_indices = np.argsort(similarities)[::-1][:topn]

    return [index2word[index] for index in top_indices]

most_similar(embeddings, index2word, word2index, 'warm')

['warm',
 'soluble',
 'droplets',
 'tap',
 'salts',
 'copper',
 'insoluble',
 'purified',
 'fresh',
 'meters']

### CBOW
Alternative model variant:

![](https://image.ibb.co/jnsW49/CBOW.png)

Here the central word is predicted from the *sum* of the vectors of its context words.

In [None]:
def make_cbow_batchs_iter(contexts, window_size, batch_size):
    """Endlessly yield batches of (context word, central word) pairs.

    Args:
        contexts: list of (central word index, list of context word indices).
        window_size: context radius; only samples with a full window of
            2 * window_size context words are used.
        batch_size: number of central words per batch.

    Yields:
        (batch_data, batch_labels): two flat lists of equal length. For each
        central word in the batch, `batch_data` receives its context words
        one after another and `batch_labels` the central word repeated once
        per context word. The context words are NOT grouped or summed here.

    Raises:
        ValueError: if no sample has a full window and a known central word
            (the generator would otherwise spin forever without yielding).
    """
    # Keep only samples with a complete window whose central word is not <unk> (index 0).
    full_window = 2 * window_size
    usable = [(word, context) for word, context in contexts
              if len(context) == full_window and word != 0]
    if not usable:
        raise ValueError('no contexts with a full window and a known central word')

    data = np.array([context for _, context in usable])
    labels = np.array([word for word, _ in usable])

    batchs_count = int(math.ceil(len(data) / batch_size))

    print('Initializing batchs generator with {} batchs per epoch'.format(batchs_count))

    while True:
        # Reshuffle at the start of every epoch.
        indices = np.arange(len(data))
        np.random.shuffle(indices)

        for i in range(batchs_count):
            # Slicing clamps at the end of `indices`, so the last batch may be shorter.
            batch_indices = indices[i * batch_size: (i + 1) * batch_size]

            # Row-major flattening keeps each sample's context words together,
            # and every label is repeated once per context word.
            batch_data = data[batch_indices].ravel().tolist()
            batch_labels = np.repeat(labels[batch_indices], full_window).tolist()

            yield batch_data, batch_labels

Better way to implement model in PyTorch is to inherit from nn.Module and create a class for the model:

```python
class MyNetModel(nn.Module):
    def __init__(self, *args, **kwargs):
        super(MyNetModel, self).__init__()
        <initialize layers>
        
    def forward(self, inputs):
        <apply layers>
        return final_output
```



In [None]:
class CBoWModel(nn.Module):
    """Two-layer word2vec model: an embedding lookup followed by a linear output layer.

    Args:
        vocab_size: number of words in the vocabulary (rows of the embedding
            table and size of the output layer).
        embedding_dim: dimensionality of the word embeddings.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Compute vocabulary logits.

        `inputs` is a LongTensor of word indices. A 1-D tensor of shape
        (N,) is treated as N independent words and gives logits of shape
        (N, vocab_size). When `inputs` has a context axis, e.g. shape
        (batch, context_size), the embeddings of each context are summed
        before the output layer, giving logits of shape (batch, vocab_size).
        """
        hidden = self.embeddings(inputs)
        if inputs.dim() > 1:
            # Continuous bag of words: collapse the context axis by summation.
            hidden = hidden.sum(dim=-2)
        output = self.out_layer(hidden)
        return output
      
# Use the GPU when one is present and fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CBoWModel(vocab_size=len(word2index), embedding_dim=32).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_function = nn.CrossEntropyLoss()

In [None]:
# Training configuration.
loss_every_nsteps = 1000  # report the running average loss every N steps
max_steps = 35000         # stop early, for time reasons

# Create the batches on whatever device the model already lives on.
model_device = next(model.parameters()).device

total_loss = 0
steps_in_window = 0  # number of losses accumulated since the last report
start_time = time.time()

for step, (batch, labels) in enumerate(make_cbow_batchs_iter(contexts, window_size=2, batch_size=128)):
    batch = torch.tensor(batch, dtype=torch.long, device=model_device)
    labels = torch.tensor(labels, dtype=torch.long, device=model_device)

    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    logits = model(batch)
    loss = loss_function(logits, labels)

    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    steps_in_window += 1

    if step != 0 and step % loss_every_nsteps == 0:
        # Divide by the number of losses actually summed: the first window
        # also contains step 0.
        print("Step = {}, Avg Loss = {:.4f}, Time = {:.2f}s".format(step, total_loss / steps_in_window,
                                                                    time.time() - start_time))
        total_loss = 0
        steps_in_window = 0
        start_time = time.time()

    if step > max_steps:
        break

Initializing batchs generator with 36921 batchs per epoch
Step = 1000, Avg Loss = 7.2742, Time = 10.31s
Step = 2000, Avg Loss = 6.8194, Time = 5.07s
Step = 3000, Avg Loss = 6.7297, Time = 5.23s
Step = 4000, Avg Loss = 6.6814, Time = 5.10s
Step = 5000, Avg Loss = 6.6010, Time = 5.13s
Step = 6000, Avg Loss = 6.5776, Time = 5.22s
Step = 7000, Avg Loss = 6.5421, Time = 5.07s
Step = 8000, Avg Loss = 6.5455, Time = 5.21s
Step = 9000, Avg Loss = 6.4976, Time = 6.27s
Step = 10000, Avg Loss = 6.5058, Time = 5.32s
Step = 11000, Avg Loss = 6.4811, Time = 5.58s
Step = 12000, Avg Loss = 6.4733, Time = 5.24s
Step = 13000, Avg Loss = 6.4663, Time = 5.09s
Step = 14000, Avg Loss = 6.4723, Time = 5.13s
Step = 15000, Avg Loss = 6.4515, Time = 6.50s
Step = 16000, Avg Loss = 6.4691, Time = 5.86s
Step = 17000, Avg Loss = 6.4604, Time = 5.19s
Step = 18000, Avg Loss = 6.4520, Time = 5.09s
Step = 19000, Avg Loss = 6.4625, Time = 5.22s
Step = 20000, Avg Loss = 6.4404, Time = 5.08s
Step = 21000, Avg Loss = 6.437

In [None]:
embeddings = model.embeddings.weight.cpu().data.numpy()

In [None]:
most_similar(embeddings, index2word, word2index, 'warm')

['warm',
 'lime',
 'lemon',
 'mineral',
 'dispenser',
 'sinned',
 'distilled',
 'winters',
 'tap',
 'tupperware']

# Recurrent neural networks

In [None]:
!pip install Unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 KB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Unidecode
Successfully installed Unidecode-1.3.6


In [None]:
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

_ = torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Names classification

In [None]:
!curl -O https://download.pytorch.org/tutorial/data.zip; unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2814k  100 2814k    0     0  13.1M      0 --:--:-- --:--:-- --:--:-- 13.0M
Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating

## Simple RNN

### Data preparation and train test split

In [None]:
data_dir = "./data/names"

# Map every language (file name without extension) to a 1-element label tensor.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# give each language the same class id on every run and every machine.
lang2label = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate(sorted(os.listdir(data_dir)))
}

In [None]:
char2idx = {letter: i for i, letter in enumerate(ascii_letters + " .,:;-'")}
num_letters = len(char2idx); num_letters

59

In [None]:
lang2label

{'Scottish': tensor([0]),
 'Polish': tensor([1]),
 'German': tensor([2]),
 'French': tensor([3]),
 'Japanese': tensor([4]),
 'Dutch': tensor([5]),
 'Spanish': tensor([6]),
 'Italian': tensor([7]),
 'Czech': tensor([8]),
 'Irish': tensor([9]),
 'English': tensor([10]),
 'Chinese': tensor([11]),
 'Arabic': tensor([12]),
 'Korean': tensor([13]),
 'Russian': tensor([14]),
 'Vietnamese': tensor([15]),
 'Greek': tensor([16]),
 'Portuguese': tensor([17])}

In [None]:
num_langs = len(lang2label)

In [None]:
def name2tensor(name):
    """One-hot encode a name, one character per time step.

    Args:
        name: string whose characters are looked up in ``char2idx``.

    Returns:
        Float tensor of shape ``(len(name), 1, num_letters)``; row ``t`` has a
        single 1 at the vocabulary index of the ``t``-th character.

    Raises:
        KeyError: if ``name`` contains a character missing from ``char2idx``.
    """
    one_hot = torch.zeros(len(name), 1, num_letters)
    for position, letter in enumerate(name):
        one_hot[position, 0, char2idx[letter]] = 1
    return one_hot

In [None]:
name2tensor("abc")

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]]])

In [None]:
tensor_names = []
target_langs = []
num_skipped = 0  # names dropped because they contain characters outside char2idx

# Sorted so the samples are collected in a deterministic order.
for file in sorted(os.listdir(data_dir)):
    lang = file.split(".")[0]
    label = lang2label[lang]
    # unidecode() below implies the files may contain non-ASCII characters, so
    # read them explicitly as UTF-8 instead of the platform default encoding.
    with open(os.path.join(data_dir, file), encoding="utf-8") as f:
        names = [unidecode(line.rstrip()) for line in f]
    for name in names:
        if not name:
            # A blank line would give a zero-length sequence that the models
            # below cannot process.
            continue
        try:
            encoded = name2tensor(name)
        except KeyError:
            # The name has a character missing from the vocabulary: skip it,
            # but keep count instead of swallowing the error silently.
            num_skipped += 1
            continue
        # Append to both lists together so names and labels stay aligned.
        tensor_names.append(encoded)
        target_langs.append(label)

print(f"Skipped {num_skipped} names with unsupported characters")

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on plain integer class ids. Passing the list of 1-element tensors
# forces scikit-learn to convert tensors through numpy.asarray, which is what
# produced the warnings previously shown under this cell.
stratify_labels = [label.item() for label in target_langs]

# Split sample indices (not the tensors themselves) into 90% train / 10% test,
# keeping the per-language proportions equal in both parts.
train_idx, test_idx = train_test_split(
    range(len(target_langs)),
    test_size=0.1,
    shuffle=True,
    stratify=stratify_labels,
    random_state=42,  # fixed seed so the split is reproducible across runs
)

# Each dataset entry is a (one-hot name tensor, label tensor) pair.
train_dataset = [(tensor_names[i], target_langs[i]) for i in train_idx]
test_dataset = [(tensor_names[i], target_langs[i]) for i in test_idx]

  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)


In [None]:
print(f"Train: {len(train_dataset)}")
print(f"Test: {len(test_dataset)}")

Train: 18063
Test: 2007


Main advantage of RNN - shared parameters:

![RNN types](http://karpathy.github.io/assets/rnn/diags.jpeg)

*From [(The Unreasonable Effectiveness of Recurrent Neural Networks)](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)*

First example is dense network. Others demonstrate processing of sequences of arbitrary length and generation of sequence of arbitrary length


The green blocks in every picture are the same weights. So, on the one hand we train a really deep network, while on the other hand the number of parameters stays bounded.

---
Let's write RNN!


![rnn-unrolled](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/RNN-unrolled.png)

*From [(Understanding LSTM Networks)](http://colah.github.io/posts/2015-08-Understanding-LSTMs)*

In our case we start with the following processing step:
$$h_t = tanh(W_h [h_{t-1}; x_t] + b_h)$$

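$h_{t-1}$ - hidden state from the previous step, $x_t$ - input vector. $[h_{t-1}; x_t]$ - simple concatenation. Note that the `SimpleRNN` class below concatenates in the order $[x_t; h_{t-1}]$ and uses a sigmoid instead of $tanh$ as the nonlinearity.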

We will check on simple task. Network will predict index of first element of the sequence

A batch has dimensions `(sequence_length, batch_size, input_size)`. All `RNN` modules in PyTorch work with this format by default.

We can change that with `batch_first`, if needed

In [None]:
class SimpleRNN(nn.Module):
    """Minimal recurrent cell that processes one time step per call.

    The input vector and the previous hidden state are concatenated and fed to
    two linear layers: one produces the next hidden state (squashed with a
    sigmoid), the other produces the output for the current step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        joint_size = input_size + hidden_size
        self.in2hidden = nn.Linear(joint_size, hidden_size)
        self.in2output = nn.Linear(joint_size, output_size)

    def forward(self, x, hidden_state):
        """Run a single step.

        Args:
            x: input for this step; concatenated with ``hidden_state`` along
                dimension 1.
            hidden_state: hidden state produced by the previous step (or by
                ``init_hidden`` for the first step).

        Returns:
            Tuple ``(output, hidden)``: the raw output of ``in2output`` and the
            new hidden state.
        """
        joint = torch.cat([x, hidden_state], dim=1)
        next_hidden = self.in2hidden(joint).sigmoid()
        step_output = self.in2output(joint)
        return step_output, next_hidden

    def init_hidden(self):
        """Return a fresh ``(1, hidden_size)`` state drawn with Kaiming-uniform init."""
        state = torch.empty(1, self.hidden_size)
        nn.init.kaiming_uniform_(state)  # fills ``state`` in place
        return state

## RNN Training

![bptt](https://image.ibb.co/cEYkw9/rnn_bptt_with_gradients.png)  
*From [Recurrent Neural Networks Tutorial, Part 3 – Backpropagation Through Time and Vanishing Gradients](http://www.wildml.com/2015/10/recurrent-neural-networks-tutorial-part-3-backpropagation-through-time-and-vanishing-gradients/)*

An RNN can 'forget' information. To understand why, take a look at [Backpropagation Through Time and Vanishing Gradients](http://www.wildml.com/2015/10/recurrent-neural-networks-tutorial-part-3-backpropagation-through-time-and-vanishing-gradients/) or at [Vanishing Gradients & LSTMs](http://harinisuresh.com/2016/10/09/lstms/).

One problem of RNNs is *gradient explosion*.

It appears when the weight matrix is such that it increases the norm of the gradient vector on the back pass. As a result, the norm of the gradient grows exponentially and it "explodes".

This can be solved by using gradient clipping: `nn.utils.clip_grad_norm_(rnn.parameters(), 1.)`.

In [None]:
# Hyperparameters for the hand-written SimpleRNN classifier.
hidden_size = 256  # size of the recurrent hidden state
learning_rate = 0.001  # step size passed to Adam below

# SimpleRNN(input_size, hidden_size, output_size): one input unit per
# vocabulary character and one output value per language.
model = SimpleRNN(num_letters, hidden_size, num_langs)
# The loss receives the model's raw outputs together with the class-id tensor.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
num_epochs = 2
print_interval = 3000

# Plain SGD-style loop: one name per optimisation step, reshuffled every epoch.
for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    steps_per_epoch = len(train_dataset)
    for step, (name_tensor, target) in enumerate(train_dataset, start=1):
        optimizer.zero_grad()

        # Feed the name one character at a time, carrying the hidden state
        # along; only the output after the last character enters the loss.
        state = model.init_hidden()
        for char_tensor in name_tensor:
            logits, state = model(char_tensor, state)
        loss = criterion(logits, target)

        loss.backward()
        # Clip the gradient norm to 1 to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        if step % print_interval == 0:
            progress = (
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{step}/{steps_per_epoch}], "
                f"Loss: {loss.item():.4f}"
            )
            print(progress)

Epoch [1/2], Step [3000/18063], Loss: 4.0323
Epoch [1/2], Step [6000/18063], Loss: 0.1284
Epoch [1/2], Step [9000/18063], Loss: 0.4120
Epoch [1/2], Step [12000/18063], Loss: 1.9984
Epoch [1/2], Step [15000/18063], Loss: 0.0063
Epoch [1/2], Step [18000/18063], Loss: 0.0025
Epoch [2/2], Step [3000/18063], Loss: 0.0000
Epoch [2/2], Step [6000/18063], Loss: 2.6498
Epoch [2/2], Step [9000/18063], Loss: 0.0498
Epoch [2/2], Step [12000/18063], Loss: 0.0412
Epoch [2/2], Step [15000/18063], Loss: 0.0000
Epoch [2/2], Step [18000/18063], Loss: 0.0171


### Evaluation

In [None]:
# Accuracy of the SimpleRNN on the held-out names.
model.eval()

num_samples = len(test_dataset)
num_correct = 0

with torch.no_grad():
    for name, label in test_dataset:
        # Run the whole name through the cell; the last output is the prediction.
        state = model.init_hidden()
        for char in name:
            output, state = model(char, state)
        pred = output.max(dim=1).indices
        num_correct += int((pred == label).item())

accuracy = num_correct / num_samples * 100
print(f"Accuracy: {accuracy:.4f}%")

Accuracy: 72.3966%


In [None]:
# Inverse of lang2label: integer class id -> language name.
label2lang = {label.item(): lang for lang, label in lang2label.items()}


def myrnn_predict(name):
    """Predict the language of ``name`` with the step-by-step RNN held in ``model``.

    The global ``model`` must expose ``init_hidden()`` and accept
    ``(char_tensor, hidden_state)`` per step, as ``SimpleRNN`` does.

    Args:
        name: non-empty string; it is encoded with ``name2tensor``, which
            raises ``KeyError`` for characters outside the vocabulary.

    Returns:
        The language name whose class has the largest output value.

    Raises:
        ValueError: if ``name`` is empty (there would be no step to read a
            prediction from).
    """
    if not name:
        raise ValueError("name must contain at least one character")
    tensor_name = name2tensor(name)

    # Remember the current mode so prediction does not silently flip a model
    # that was in eval mode back to training mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            hidden_state = model.init_hidden()
            for char in tensor_name:
                output, hidden_state = model(char, hidden_state)
            _, pred = torch.max(output, dim=1)
    finally:
        model.train(was_training)
    return label2lang[pred.item()]

In [None]:
myrnn_predict("Mike")

'English'

In [None]:
myrnn_predict("Qin")

'Chinese'

Another problem is *vanishing (fading) gradients*. It is the opposite of gradient explosion: the gradients decay exponentially on the backward pass. This problem is solved in more complex ways.

Namely, they use gate architectures.

Gate's idea is simple, but important, they are used not only in recurrent networks.

If you look at how our SimpleRNN works, you can see that each time the memory (i.e. $h_t$) is overwritten. I would like to be able to make this rewriting controllable: not discard some important information from the vector.

To do this, we will create a vector $g \in \{0,1\}^n$, which will say which cells $h_{t-1}$ are good, and which ones should be replaced with new values:
$$h_t = g \odot f(x_t, h_{t-1}) + (1 - g) \odot h_{t-1}.$$

For example:
$$
 \begin{bmatrix}
  8 \\
  11 \\
  3 \\
  7
 \end{bmatrix} =
 \begin{bmatrix}
  0 \\
  1 \\
  0 \\
  0
 \end{bmatrix}
 \odot
  \begin{bmatrix}
  7 \\
  11 \\
  6 \\
  5
 \end{bmatrix}
 +
  \begin{bmatrix}
  1 \\
  0 \\
  1 \\
  1
 \end{bmatrix}
 \odot
  \begin{bmatrix}
  8 \\
  5 \\
  3 \\
  7
 \end{bmatrix}
$$

To achieve differentiability, we will use a sigmoid: $\sigma(f(x_t, h_{t-1}))$.

As a result, the network itself will decide, by looking at the inputs, which cells of its memory should be overwritten and to what extent.

### LSTM

It seems that the first architecture to use this mechanism was LSTM (Long Short-Term Memory).

In it, $c_{t-1}$ is also added to $h_{t-1}$: $h_{t-1}$ is the same hidden state obtained in the previous step, and $c_{t -1}$ is a memory vector.

Schematically, it looks like this:
![](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)
*From [(Understanding LSTM Networks)](http://colah.github.io/posts/2015-08-Understanding-LSTMs)*

To begin with, we can calculate the new hidden state in the same way as before (let's denote it $\tilde c_{t}$):
$$\tilde c_{t} = tanh(W_h [h_{t-1}; x_t] + b_h)$$

In simple RNNs, we would simply overwrite the old hidden state with this value. Now, instead, we want to decide how much information we need from $c_{t-1}$ and how much from $\tilde c_{t}$.

Let's evaluate it with sigmoids:
$$f = \sigma(W_f [h_{t-1}; x_t] + b_f),$$
$$i = \sigma(W_i [h_{t-1}; x_t] + b_i).$$

The first is about how much you want to forget old information. The second is how interesting the new one is. Then
$$c_t = f \odot c_{t-1} + i \odot \tilde c_t.$$

We will also weigh the new hidden state:
$$o = \sigma(W_o [h_{t-1}; x_t] + b_o),$$
$$h_t = o \odot tanh(c_t).$$

One more picture:
![](https://image.ibb.co/e6HQUU/details.png)  
*From [Vanishing Gradients & LSTMs](http://harinisuresh.com/2016/10/09/lstms/)*

Why is the problem of fading gradients solved? Because look at the derivative $\frac{\partial c_t}{\partial c_{t-1}}$. It is proportional to the gate $f$. If $f=1$ - gradients flow unchanged. Otherwise the network itself learns when it wants to forget something.

It is highly recommended to read the article: [Understanding LSTM Networks](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) for more details and pictures.

Why did I write out these formulas? The main thing is to show how many more parameters you need to learn in LSTM compared to a regular RNN. Four times more!

Simple video for more understanding - [video how RNN forgets (bottom)](https://www.youtube.com/watch?v=mLxsbWAYIpw)

In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear classifier over the last time step.

    Args:
        num_layers: number of stacked LSTM layers.
        hidden_size: size of the LSTM hidden state.
        input_size: size of each input vector. Defaults to the notebook-level
            ``num_letters`` when omitted.
        output_size: number of output classes. Defaults to the notebook-level
            ``num_langs`` when omitted.
    """

    def __init__(self, num_layers, hidden_size, input_size=None, output_size=None):
        super().__init__()
        # Resolve the defaults lazily so the class can also be used without
        # the notebook globals when both sizes are given explicitly.
        if input_size is None:
            input_size = num_letters
        if output_size is None:
            output_size = num_langs

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Classify a sequence.

        Args:
            x: tensor in nn.LSTM's default sequence-first layout,
                ``(seq_len, batch, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)`` computed from the LSTM
            output at the last time step.
        """
        # The final (h_n, c_n) states are not needed; only per-step outputs are.
        output, _ = self.lstm(x)
        return self.fc(output[-1])

In [None]:
# NOTE: this rebinds the global `model` (previously the SimpleRNN instance), so
# any helper that reads the global, such as myrnn_predict, will see the LSTM
# from here on. `hidden_size` and `learning_rate` are reused from the
# SimpleRNN setup.
model = LSTMModel(num_layers=2, hidden_size=hidden_size)
# A new optimizer is created so that it tracks the new model's parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Same one-sample-per-step training scheme as before, but the LSTM consumes
# the whole name tensor in a single forward call.
for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    steps_per_epoch = len(train_dataset)
    for step, (name_tensor, target) in enumerate(train_dataset, start=1):
        optimizer.zero_grad()

        logits = model(name_tensor)
        loss = criterion(logits, target)

        loss.backward()
        optimizer.step()

        if step % print_interval == 0:
            progress = (
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{step}/{steps_per_epoch}], "
                f"Loss: {loss.item():.4f}"
            )
            print(progress)

Epoch [1/2], Step [3000/18063], Loss: 1.1911
Epoch [1/2], Step [6000/18063], Loss: 3.0587
Epoch [1/2], Step [9000/18063], Loss: 0.2908
Epoch [1/2], Step [12000/18063], Loss: 0.0504
Epoch [1/2], Step [15000/18063], Loss: 0.0003
Epoch [1/2], Step [18000/18063], Loss: 0.0108
Epoch [2/2], Step [3000/18063], Loss: 0.0008
Epoch [2/2], Step [6000/18063], Loss: 0.9084
Epoch [2/2], Step [9000/18063], Loss: 1.9612
Epoch [2/2], Step [12000/18063], Loss: 1.0311
Epoch [2/2], Step [15000/18063], Loss: 0.4979
Epoch [2/2], Step [18000/18063], Loss: 1.1666


In [None]:
# Accuracy of the LSTM on the held-out names.
model.eval()

# Recompute the sample count here rather than relying on a `num_samples`
# variable left over from an earlier evaluation cell.
num_samples = len(test_dataset)
num_correct = 0

with torch.no_grad():
    for name, label in test_dataset:
        output = model(name)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 79.5217%


In [None]:
def pytorch_predict(name):
    """Predict the language of ``name`` with the whole-sequence model in ``model``.

    The global ``model`` must accept the full name tensor in one call, as
    ``LSTMModel`` does.

    Args:
        name: non-empty string; it is encoded with ``name2tensor``, which
            raises ``KeyError`` for characters outside the vocabulary.

    Returns:
        The language name whose class has the largest output value.

    Raises:
        ValueError: if ``name`` is empty.
    """
    if not name:
        raise ValueError("name must contain at least one character")
    tensor_name = name2tensor(name)

    # Remember the current mode so prediction does not silently flip a model
    # that was in eval mode back to training mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(tensor_name)
            _, pred = torch.max(output, dim=1)
    finally:
        model.train(was_training)
    return label2lang[pred.item()]

In [None]:
pytorch_predict("Jake")

'English'

In [None]:
pytorch_predict("Sergei")

'Italian'

# Additional materials

Here is an implementation of the attention layer used in the transformer architecture. Study it and, if you wish, try to use it in a more sophisticated architecture.

Also you can try something from hugging face, i.e.
https://huggingface.co/distilbert-base-uncased

In [None]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are each projected to
    ``model_dimension * num_heads`` features, so every head works with a
    ``model_dimension``-sized slice; the concatenated heads are projected back
    to ``model_dimension``.

    Args:
        model_dimension: feature size of the input and of the output.
        num_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, model_dimension, num_heads=1, dropout=0.0):
        super().__init__()
        self.num_heads = num_heads

        self.dropout = nn.Dropout(dropout)
        self.q = nn.Linear(model_dimension, model_dimension * num_heads)
        self.k = nn.Linear(model_dimension, model_dimension * num_heads)
        self.v = nn.Linear(model_dimension, model_dimension * num_heads)
        self.outputs = nn.Linear(model_dimension * num_heads, model_dimension)

    @staticmethod
    def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, dropout=None, num_heads=1):
        """Scaled dot-product attention over sequence-first tensors.

        Args:
            q: queries of shape ``(seq_len, batch, embedding_dim)``.
            k: keys of shape ``(src_len, batch, embedding_dim)``.
            v: values of shape ``(src_len, batch, embedding_dim)``.
            dropout: optional callable applied to the attention weights.
            num_heads: number of heads ``embedding_dim`` is split into.

        Returns:
            Tensor of shape ``(seq_len, batch, embedding_dim)``.
        """
        seq_len, bsz, embedding_dim = q.size()
        head_dim = embedding_dim // num_heads

        # Split the feature axis into heads and fold the heads into the batch
        # axis: (len, batch, heads * head_dim) -> (batch * heads, len, head_dim).
        q = q.contiguous().view(seq_len, bsz * num_heads, head_dim).transpose(0, 1)
        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
        v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)

        score = torch.bmm(q, k.transpose(1, 2)) / np.sqrt(head_dim)
        weights = F.softmax(score, dim=-1)

        # Dropout goes on the normalised weights. Zeroing raw logits before
        # the softmax would not remove those positions, because a logit of 0
        # still receives a non-zero weight.
        if dropout is not None:
            weights = dropout(weights)

        context = torch.bmm(weights, v)
        # Undo the head folding: (batch * heads, len, head_dim) -> (len, batch, dim).
        return context.transpose(0, 1).contiguous().view(seq_len, bsz, embedding_dim)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: batch-first tensor ``(batch, seq_len, model_dimension)``; it is
                transposed to the sequence-first layout ``attention`` expects.

        Returns:
            Tensor with the same shape as ``x``.
        """
        q = self.q(x).contiguous().transpose(0, 1)
        k = self.k(x).contiguous().transpose(0, 1)
        v = self.v(x).contiguous().transpose(0, 1)
        y = self.attention(q, k, v, self.dropout, self.num_heads)
        y = y.contiguous().transpose(0, 1)
        return self.outputs(y)


class FeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> Dropout -> ReLU -> Linear.

    Args:
        model_dim: feature size of the input and of the output.
        linear_dim: width of the hidden layer.
        dropout: dropout probability applied to the hidden layer.
    """

    def __init__(self, model_dim, linear_dim=2048, dropout=0.0):
        super().__init__()
        self.linear_1 = nn.Linear(model_dim, linear_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(linear_dim, model_dim)

    def forward(self, x):
        """Map ``x`` (last dimension ``model_dim``) to a tensor of the same shape."""
        hidden = self.dropout(self.linear_1(x))
        return self.linear_2(F.relu(hidden))


class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence-first input.

    Even feature columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: feature size of the input (may be even or odd).
        dropout: dropout probability applied after adding the encodings.
        max_len: longest sequence length supported.
    """

    def __init__(self, d_model, dropout=0.1, max_len=200):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns,
        # so the last frequency is dropped for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # A buffer follows the module across devices but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add encodings to ``x`` of shape ``(seq_len, batch, d_model)`` and apply dropout."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)