# Assignment 8 (unfinished)

Develop a model for 20 news groups dataset. Select 20% of data for test set.  

Use metric learning with siamese networks and triplet loss.   
Use KNN and LSH (`annoy` library) for final prediction after the network was trained.

! Remember, that LSH gives you a set of neighbor candidates, for which you have to calculate distances to choose top-k nearest neighbors. 

Your quality = accuracy score

In [1]:
import pandas as pd
import numpy as np
import gensim
import spacy
import os
from pathlib import Path
from tqdm import tqdm_notebook

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset

from torchtext import data


SEED = 42
np.random.seed(SEED)

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from allennlp.modules.elmo import Elmo, batch_to_ids

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 2, dropout=0)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


___

**Data + train, validation, test split**

In [4]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [5]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='train')

In [6]:
X_train, X_val, y_train, y_val = train_test_split(newsgroups_train.data,
                                                  newsgroups_train.target,
                                                  test_size=0.2,
                                                  random_state=SEED,
                                                  shuffle=True)

X_test, y_test = newsgroups_test.data, newsgroups_test.target

[allennlp.modules.elmo.batch_to_ids()](https://allenai.github.io/allennlp-docs/api/allennlp.modules.elmo.html#allennlp.modules.elmo.batch_to_ids)

Converts a batch of **tokenized sentences** to a tensor representing the sentences with encoded characters (len(batch), max sentence length, max word length).

`batch_to_ids(batch:typing.List[typing.List[str]])`

In [7]:
sentences = ['First sentence.', 'Another one sentence.']

In [8]:
spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')

def tokenizer(sentence):
    return [[tok.lemma_ for tok in spacy_en.tokenizer(sentence) if tok.text.isalpha()]]

In [9]:
for i in sentences:
    print(tokenizer(i))

[['First', 'sentence']]
[['Another', 'one', 'sentence']]


[the 12th seminar](https://github.com/thedenaas/hse_seminars_2018/blob/master/seminar_12/more_embeddings.ipynb):

```def branch(self, x):
    x = self.elmo(x)['elmo_representations']
    x = tt.cat(x, dim=-1)
    x = x.mean(dim=1)
    x = self.fc(x)
    return x```

In [10]:
def elmo_embeddings(p, sentences):
    for sent in tqdm_notebook(sentences):
        tok_sent = tokenizer(sent)
        character_ids = batch_to_ids(tok_sent)
        embeddings = elmo(character_ids)['elmo_representations']
        x = tt.cat(embeddings, dim=-1)
        x = x.mean(dim=1)
        x = x.detach().numpy() # if x.numpy(): Can't call numpy() on Variable that requires grad. 
                               # Use var.detach().numpy() instead.
        
        with p.open('ab') as f:
            np.save(f, x)

        with p.open('rb') as f:
            fsz = os.fstat(f.fileno()).st_size
            out = np.load(f)
            while f.tell() < fsz:
                out = np.vstack((out, np.load(f)))
    
    return out

In [14]:
test_path = Path('test_embeddings.npy')
test_embeddings = elmo_embeddings(test_path, X_test[:5])

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [15]:
train_path = Path('train_embeddings.npy')
train_embeddings = elmo_embeddings(train_path, X_train[:5])

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [16]:
valid_path = Path('valid_embeddings.npy')
valid_embeddings = elmo_embeddings(valid_path, X_val[:5])

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [17]:
# train_path_full = Path('train_embeddings_full.npy')
# train_embeddings_full = elmo_embeddings(train_path_full, X_train[:5000])

In [18]:
# valid_path_full = Path('valid_embeddings_full.npy')
# valid_embeddings_full = elmo_embeddings(valid_path_full, X_val)

In [19]:
y_train_tt, y_val_tt, y_test_tt = tt.FloatTensor(y_train), tt.FloatTensor(y_val), tt.FloatTensor(y_test)

In [20]:
def get_triplets(data, label):
    
    # positive -- the encodings for the positive data (similar to anchor) --> y[curr] == y
    # negative -- the encodings for the negative data (different from anchor) --> y[curr] != y
    
    pos = []
    neg = []
    
    for idx in range(len((data))):
        positive = data[np.random.choice(np.where(label == label[idx])[0])]
        pos.append(positive)
        
        negative = data[np.random.choice(np.where(label != label[idx])[0])]
        neg.append(negative)

    return pos, neg

In [21]:
pos_train, neg_train = get_triplets(train_embeddings, y_train_tt[:5])

In [22]:
pos_val, neg_val = get_triplets(valid_embeddings, y_val_tt[:5])

In [23]:
pos_test, neg_test = get_triplets(test_embeddings, y_test_tt[:5])

In [24]:
batch_size = 32

train_loader = DataLoader(TensorDataset(tt.Tensor(train_embeddings), tt.FloatTensor(pos_train), tt.FloatTensor(neg_train)), batch_size=batch_size)
val_loader = DataLoader(TensorDataset(tt.Tensor(train_embeddings), tt.FloatTensor(pos_val), tt.FloatTensor(neg_val)), batch_size=batch_size)
test_loader = DataLoader(TensorDataset(tt.Tensor(test_embeddings), tt.FloatTensor(pos_test), tt.FloatTensor(neg_test)), batch_size=batch_size)

___

In [25]:
def _train_epoch(model, iterator, optimizer, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        loss = model(batch[0], batch[1], batch[2]).sum()
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            loss = model(batch[0], batch[1], batch[2]).sum()
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, epoch)
        valid_loss = _test_epoch(model, valid_iterator)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [26]:
### SEMINAR

def triplet_loss(anchor_embed, pos_embed, neg_embed):
    return F.cosine_similarity(anchor_embed, neg_embed) - F.cosine_similarity(anchor_embed, pos_embed).mean()

class MyModel(nn.Module):
    
    def __init__(self, elmo, criterion):
        super(MyModel, self).__init__()
        
        self.elmo = elmo
        self.criterion = criterion
        
        self.fc = nn.Linear(1024*2, 128)
        
        self.out = nn.Linear(128*3, 1)
        
    def branch(self, x):
        x = self.fc(x)
        return x
        
    def forward(self, anchor, pos, neg):
        anchor = self.branch(anchor)
        pos = self.branch(pos)
        neg = self.branch(neg)
        
        return triplet_loss(anchor, pos, neg)



# model = MyModel(elmo, nn.BCEWithLogitsLoss())

# optimizer = optim.Adam(model.parameters())

# nn_train(model, train_loader, val_loader, optimizer, n_epochs=2)

In [27]:
model = MyModel(elmo, nn.BCEWithLogitsLoss())

optimizer = optim.Adam(model.parameters())

nn_train(model, train_loader, val_loader, optimizer, n_epochs=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=1, style=ProgressStyle(description_width='initi…


validation loss 0.65779


HBox(children=(IntProgress(value=0, description='epoch 1', max=1, style=ProgressStyle(description_width='initi…


validation loss 1.27336


In [39]:
_train = tt.from_numpy(train_embeddings)
_train = model.branch(_train).detach().numpy()

In [54]:
# https://www.kaggle.com/nicw102168/nearest-neighbor-classification-with-annoy

import annoy

def test_annoy_knn(Ntrees=128):
    
    vector_length = 128
    t = annoy.AnnoyIndex(vector_length)
    X_train = model.branch(tt.from_numpy(train_embeddings)).detach().numpy()
    for i, v in enumerate(X_train):
        t.add_item(i, v)

    t.build(Ntrees)
    
    # error with shapes

#     y_hat = [y_train_tt[:5][t.get_nns_by_vector(v, 10)] for v in test_embeddings[:5]]
#     acc = metrics.accuracy_score(y_test_tt[:5], y_hat)
#     conf = metrics.confusion_matrix(y_test_tt[:5], y_hat)
#     return acc, conf

    return t

In [55]:
test_annoy_knn()

<annoy.Annoy at 0x7f0f50ebd1f0>

In [56]:
def save_model():
    model_name = 'assignment8.pt'
    tt.save(model, model_name)
    
    print('saving model as %s' % model_name)
    
save_model()

saving model as assignment8.pt
