# Assignment 8

Develop a model for 20 news groups dataset. Select 20% of data for test set.  

Use metric learning with siamese networks and triplet loss.   
Use KNN and LSH (`annoy` library) for final prediction after the network was trained.

! Remember, that LSH gives you a set of neighbor candidates, for which you have to calculate distances to choose top-k nearest neighbors. 

Your quality = accuracy score

In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from tqdm import tqdm_notebook

from sklearn import metrics
from sklearn.model_selection import train_test_split

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset

from torchtext import data

In [2]:
import spacy


spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')

def tokenizer(text): # create a tokenizer function
    return [tok.lemma_ for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]            

In [3]:
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

In [4]:
y_train = train.target
train = train.data

In [5]:
test, valid, y_test, y_valid = test.data[:len(test.data)//2], test.data[len(test.data)//2:], test.target[:len(test.data)//2], test.target[len(test.data)//2:]

#### Переводит тектсы с списки индексов

batch_to_idx из Elmo у меня не работает, все падает. Написала сама такую же функцию.

In [6]:
def get_ids(data):
    word2id = dict()
    id2word = dict()
    i = 1
    max_len = 0
    
    for dtype in data:
        for comment in dtype:
            tokens = tokenizer(comment)
            if len(tokens) > max_len:
                max_len = len(tokens)
            
            for t in tokens:
                if not t in word2id:
                    word2id[t] = i
                    id2word[i] = t
                    i += 1
                    
    return word2id, id2word, max_len

def batch_to_ids(dtype, word2id, max_len):
    list_of_ids = list()
    
    for comment in tqdm_notebook(dtype):
        comment_ids = list()
        tokens = tokenizer(comment)
        
        for t in tokens:
            comment_ids.append(word2id[t])
        while len(comment_ids) < max_len:
            comment_ids.append(0)
        list_of_ids.append(comment_ids)
        
    return list_of_ids

In [7]:
word2id, id2word, max_len = get_ids((train, valid, test))

In [8]:
test_id = batch_to_ids(test, word2id, max_len)
train_id = batch_to_ids(train, word2id, max_len)
valid_id = batch_to_ids(valid, word2id, max_len)

HBox(children=(IntProgress(value=0, max=3766), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11314), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3766), HTML(value='')))




#### Делаем тройки даных

In [9]:
def make_triplets(data, y):
    positive = list()
    negative = list()
    for i, comment in enumerate(data):
        positive.append(data[np.random.choice(np.where(y == y[i])[0])])
        negative.append(data[np.random.choice(np.where(y != y[i])[0])])
    
    return tt.tensor(positive), tt.tensor(negative)

In [10]:
batch_size = 32
positive, negative = make_triplets(test_id, y_test)
test_loader = DataLoader(TensorDataset(tt.tensor(test_id), positive, negative), batch_size=batch_size, drop_last=True)

positive, negative = make_triplets(train_id, y_train)
train_loader = DataLoader(TensorDataset(tt.tensor(train_id), positive, negative), batch_size=batch_size, drop_last=True)

positive, negative = make_triplets(valid_id, y_valid)
valid_loader = DataLoader(TensorDataset(tt.tensor(valid_id), positive, negative), batch_size=batch_size, drop_last=True)

#### Модель

In [56]:
def _train_epoch(model, iterator, optimizer, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        loss = model(batch[0], batch[1], batch[2])
        loss = tt.abs(tt.mean(loss))
        loss.backward()
        optimizer.step()
        
        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss
        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            loss = model(batch[0], batch[1], batch[2])
            loss = tt.abs(tt.mean(loss))
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, epoch)
        valid_loss = _test_epoch(model, valid_iterator)

        #valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [12]:
def triplet_loss(anchor_embed, pos_embed, neg_embed):
    return F.cosine_similarity(anchor_embed, neg_embed) - F.cosine_similarity(anchor_embed, pos_embed)

In [13]:
class Tripletnet(nn.Module):
    
    def __init__(self, vocab_size, embed_size, criterion):
        super(Tripletnet, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.criterion = criterion
        
        self.fc = nn.Linear(embed_size, 128)
        
        self.out = nn.Linear(128*3, 20)
        
    def branch(self, x):
        
        x = self.embedding(x)
        x = x.mean(dim=1)
        x = self.fc(x)
        return x
        
    def forward(self, anchor, pos, neg):
        
        anchor = self.branch(anchor)
        pos = self.branch(pos)
        neg = self.branch(neg)
        
        return triplet_loss(anchor, pos, neg)

#### Обучаем модель

In [57]:
model = Tripletnet(len(word2id), 100, nn.BCEWithLogitsLoss())

optimizer = optim.Adam(model.parameters())

nn_train(model, train_loader, valid_loader, optimizer, n_epochs=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=353), HTML(value='')))

validation loss 0.00000


HBox(children=(IntProgress(value=0, description='epoch 1', max=353), HTML(value='')))

validation loss 0.00000


#### Сохраняем модель

In [58]:
tt.save(model, 'model_ass8.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [6]:
model = tt.load('model_ass8.pt')

### Annoy

In [17]:
from annoy import AnnoyIndex
from scipy.spatial import cKDTree

#### KNN

In [59]:
f = 128
a = AnnoyIndex(f)
train_emb = list()

for i in tqdm_notebook(range(1, len(train_id))):    
    emb = model.branch(tt.tensor(train_id[i-1:i])).detach().numpy()
    train_emb.append(emb[0])
    a.add_item(i, emb[0])
a.build(128)

HBox(children=(IntProgress(value=0, max=11313), HTML(value='')))

True

In [29]:
test_emb = list()

for i in tqdm_notebook(range(1, len(test_id))):    
    emb = model.branch(tt.tensor(test_id[i-1:i])).detach().numpy()
    test_emb.append(emb[0])

HBox(children=(IntProgress(value=0, max=3755), HTML(value='')))

In [60]:
np.save("test_emb.npy", np.array(test_emb))
np.save("train_emb.npy", np.array(train_emb))

In [61]:
accuracy_score = 0

for i, v in tqdm_notebook(enumerate(test_emb)):

    ten_nearest = a.get_nns_by_vector(v, 10)
    nearest_v = list()
    nearest_y = list()

    for n in ten_nearest:
        n_vec = train_emb[n]
        real_train = y_train[n]
        nearest_v.append(list(n_vec))
        nearest_y.append(real_train)

    pred = nearest_v[cKDTree(nearest_v).query(v, k=1)[1]]
    id_pred = nearest_v.index(pred)
    if y_test[i] == nearest_y[id_pred]:
        accuracy_score += 1
        
print('Accuracy:', accuracy_score/len(test_emb))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Accuracy: 0.03089214380825566


#### LSH

In [62]:
accuracy_score = 0
for i, v in tqdm_notebook(enumerate(test_emb)):

    ten_nearest = a.get_nns_by_vector(v, 10)
    nearest_v = list()
    nearest_y = list()

    for n in ten_nearest:
        n_vec = train_emb[n]
        real_train = y_train[n]
        nearest_v.append(n_vec)
        nearest_y.append(real_train)
    
       
    if y_test[i] == max(set(nearest_y), key = nearest_y.count):
        accuracy_score += 1
        
print('Accuracy:', accuracy_score/len(test_emb))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Accuracy: 0.05193075898801598
