# Embeddings: How Machines "Understand" Words

## Understanding vs. Reading Text

## Word Vectors

In [None]:
import torch
# One-hot vector over a 20,000-entry vocabulary: every position is 0
# except index 1152, which is set to 1.
o = torch.zeros(20000, dtype=torch.int32)
o[1152] = 1

In [None]:
# Embedding table: 20,000 vocabulary entries, each mapped to a
# 300-dimensional vector.
E = torch.nn.Embedding(num_embeddings=20000, embedding_dim=300)

# nn.Embedding is indexed by word id, not by a one-hot vector. Passing `o`
# directly would treat each of its 20,000 zeros/ones as an id and return
# 20,000 rows (row 0 or row 1) instead of the vector for the encoded word.
# Recover the id from the one-hot vector and look up that single row.
e = E(o.argmax())

In [None]:
# Display the embedding output computed in the previous cell.
e

### Word2Vec

### Embeddings in the Age of Transfer Learning

## Embeddings in Practice

### Preprocessing

In [None]:
import torch.nn.functional as F
import torch.nn as nn
from torch import optim

from torchtext import *
import torchtext

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# NOTE(review): TEXT and train are only defined in the "set up fields" cell
# further down, which also repeats this exact call. Run top-to-bottom, this
# line raises NameError, so it looks like a leftover cell — confirm and drop.
TEXT.build_vocab(train, vectors='glove.6B.100d')

In [None]:
# Set up the fields. Text is lower-cased and tokenized with spaCy; because
# include_lengths=True each batch's `text` is a (tokens, lengths) pair, and
# batch_first=False keeps the sequence dimension first.
TEXT = data.Field(
    lower=True,
    include_lengths=True,
    batch_first=False,
    tokenize='spacy',
)
LABEL = data.LabelField()

# Make the train/test splits of the IMDB dataset.
train, test = datasets.IMDB.splits(TEXT, LABEL)

# Build the vocabulary from the training split and attach pretrained vectors.
# Use 'glove.42B.300d' for greater accuracy or 'glove.6B.100d' for greater
# speed. (This note used to be a backslash-continued comment, which is a
# syntax error: a backslash cannot continue a comment.)
TEXT.build_vocab(train, vectors='glove.6B.100d')
LABEL.build_vocab(train)

# Make an iterator for each split. sort_within_batch=True orders every batch
# by length, which the model's pack_padded_sequence call relies on.
train_iter, test_iter = data.BucketIterator.splits(
    (train, test),
    batch_sizes=(128, 1024),
    device=dev,
    sort_within_batch=True,
    repeat=False,
)

### Model

In [None]:
class RNN_classifier(nn.Module):
    """Recurrent binary classifier built on pretrained word embeddings.

    Token indices are embedded, run through a (possibly multi-layer)
    ``nn.RNN``, and the final hidden state of the last layer is passed
    through a two-layer linear head that emits one logit per sequence.

    Args:
        embedding_size: Width of each word vector. It must match the width
            of the pretrained vectors in ``TEXT.vocab.vectors``, which are
            copied into the embedding layer.
        hidden_size: Number of features in the RNN hidden state.
        num_layers: Number of stacked RNN layers.
    """

    def __init__(self, embedding_size=100, hidden_size=512, num_layers=3):
        super().__init__()

        # Set up an embedding layer with the right dimensions and copy the
        # weights from the pretrained GloVe embeddings. The layer is created
        # on the default device; callers place the whole model with
        # ``model.to(device)``. A hard-coded ``.cuda()`` here would fail on
        # CPU-only machines.
        vocab = TEXT.vocab
        self.embed = nn.Embedding(len(vocab), embedding_size)
        self.embed.weight.data.copy_(vocab.vectors)

        # Set up a standard PyTorch RNN section with the right dimensions
        # and a variable number of layers.
        self.rnn = nn.RNN(embedding_size, hidden_size, num_layers)

        # Add a two-layer classification head with the right dimensions.
        # The final layer must output a single number.
        # NOTE(review): there is no nonlinearity between the two layers, so
        # the head is equivalent to one linear map; left unchanged to keep
        # the model's behaviour.
        self.classificationLayer1 = nn.Linear(hidden_size, 10)
        self.classificationLayer2 = nn.Linear(10, 1)

    def forward(self, input, lengths=None):
        """Return one logit per sequence in the batch.

        Args:
            input: Tensor of token indices laid out sequence-first
                (the RNN and the packing call both use ``batch_first=False``).
            lengths: Length of each sequence in the batch, as a tensor or a
                list. When given, the embedded batch is packed so the RNN
                skips padding; when ``None`` the batch is fed unpacked.

        Returns:
            A 1-D tensor of logits, one entry per sequence.
        """
        embed_input = self.embed(input)

        if lengths is None:
            rnn_input = embed_input
        else:
            if torch.is_tensor(lengths):
                # pack_padded_sequence requires a lengths tensor on the CPU.
                lengths = lengths.cpu()
            rnn_input = nn.utils.rnn.pack_padded_sequence(
                embed_input, lengths, batch_first=False
            )

        _, hidden = self.rnn(rnn_input)

        # hidden stacks the final state of every layer; keep the top layer,
        # which is (batch, hidden_size).
        x = hidden[-1]
        x = self.classificationLayer1(x)
        x = self.classificationLayer2(x)

        # Flatten (batch, 1) to (batch,).
        return x.view(-1)


In [None]:
# Instantiate a single-layer classifier with a 256-unit hidden state
# (smaller than the class defaults), then move it to the selected device.
model = RNN_classifier(num_layers=1, hidden_size=256)
model.to(dev)

In [None]:
# Sanity check: push only the first training batch through the model and
# print the shape of what comes back.
for first_batch in train_iter:
    tokens, token_lengths = first_batch.text
    print(model(tokens, token_lengths).shape)
    break

### Training

In [None]:
# Training configuration: number of passes over the training data, the
# optimizer, and a loss that consumes the model's raw logits directly.
epochs = 6
opt = optim.Adam(params=model.parameters(), lr=1e-4)
loss_func = F.binary_cross_entropy_with_logits

In [None]:
def get_metrics(model, test_data):
    """Compute the classification accuracy of ``model`` over ``test_data``.

    The model is switched to eval mode and left there; callers that keep
    training must call ``model.train()`` again.

    Args:
        model: Callable as ``model(text, text_lengths)`` returning one logit
            per example.
        test_data: Iterable of batches exposing ``text`` (a
            ``(tokens, lengths)`` pair) and ``label``.

    Returns:
        A scalar tensor holding the fraction of examples whose thresholded
        prediction (sigmoid > 0.5) equals the label, or NaN when
        ``test_data`` yields no examples.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch_data in test_data:
            text, text_lengths = batch_data.text
            logits = model(text, text_lengths)
            predicted_labels = (torch.sigmoid(logits) > 0.5).long()
            total += batch_data.label.size(0)
            correct += (predicted_labels == batch_data.label.long()).sum()

    if total == 0:
        # Nothing was evaluated: accuracy is undefined. Without this guard
        # `correct` is still the int 0 and `.float()` raises AttributeError.
        return torch.tensor(float("nan"))
    return correct.float() / total

In [None]:
from tqdm import tqdm_notebook as tqdm

# Train for `epochs` passes over the training iterator, reporting test
# accuracy after each pass.
for epoch in tqdm(range(epochs)):
    # get_metrics() at the end of the previous pass leaves the model in eval
    # mode, so switch back to training mode first.
    model.train()

    for batch in tqdm(train_iter):
        tokens, token_lengths = batch.text
        targets = batch.label.float()

        pred = model(tokens, token_lengths)
        loss = loss_func(pred, targets)

        # Backpropagate, take an optimizer step, then clear the gradients so
        # they do not accumulate into the next batch.
        loss.backward()
        opt.step()
        opt.zero_grad()

    # NOTE(review): with epochs = 6 this fires after the last pass's batches
    # have been processed, so the new rate is never used for training —
    # confirm the intended schedule.
    if epoch == 5:
        for param_group in opt.param_groups:
            param_group['lr'] = 3e-3

    accuracy = get_metrics(model, test_iter).cpu().numpy()
    print(f"Accuracy: {accuracy!s}")

### Validation

In [None]:
import spacy
# Load spaCy's English pipeline; predict_sentiment() uses its tokenizer.
# NOTE(review): the 'en' shortcut name is presumably only accepted by older
# spaCy releases — confirm against the installed version.
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    """Score a raw sentence with the trained classifier.

    The sentence is tokenized with spaCy, lower-cased to match how ``TEXT``
    was built, mapped to vocabulary indices, and fed to ``model`` as a
    batch of one.

    Args:
        model: Classifier callable as ``model(tokens, lengths)`` returning
            one logit per sequence; it is switched to eval mode.
        sentence: Raw text to classify.

    Returns:
        The sigmoid of the model's logit as a Python float, i.e. the
        predicted probability of the label encoded as 1. The notebook
        reports it as "positive" — which class maps to 1 depends on
        ``LABEL.vocab``; verify.

    Raises:
        ValueError: If ``sentence`` produces no tokens.
    """
    # based on:
    # https://github.com/bentrevett/pytorch-sentiment-analysis/blob/
    # master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb
    model.eval()

    # TEXT was created with lower=True, so the vocabulary only holds
    # lower-cased tokens; without lower-casing here, capitalised words would
    # all be looked up as unknown.
    tokenized = [tok.text.lower() for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    # Shape (seq_len, 1): a sequence-first batch with a single example.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(dev)
    # Lengths stay on the CPU, as pack_padded_sequence expects.
    length_tensor = torch.LongTensor([len(indexed)])

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [None]:
# Sample review used to try out the trained classifier.
review = """I like that Far From Home is trying something new and that its
humor  feels more real than the ironic cracks in most superhero movies.
I just wish its good pieces all came together more satisfyingly."""

print('Probability positive:')
# The call is the cell's last expression, so the notebook displays the
# returned value.
predict_sentiment(model, review)

## Embedding Things That Aren't Words

#### A Sonnet in The MIDI Protocol

### Some General Tips for Making Custom Embeddings

## Conclusion