# Assignment 8
Develop a model for 20 news groups dataset. Select 20% of data for test set.

Use metric learning with siamese networks and triplet loss.

Use KNN and LSH (annoy library) for final prediction after the network was trained.

! Remember, that LSH gives you a set of neighbor candidates, for which you have to calculate distances to choose top-k nearest neighbors.

Your quality = accuracy score

In [1]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [2]:
df = pd.read_csv('data.csv')
len(df)

11314

In [3]:
df.head()

Unnamed: 0,id,message,topic,tokenized_message,lemmatized_message
0,0,From: lerxst@wam.umd.edu (where's my thing)\r\...,7,From where have my thing Subject WHAT car be t...,from where have my thing subject what car be t...
1,1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,From Guy Kuo Subject SI Clock Poll Final Call ...,from guy kuo subject si clock poll final call ...
2,2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,From Thomas E Willis Subject PB question Organ...,from thomas e willis subject pb question organ...
3,3,From: jgreen@amber (Joe Green)\r\nSubject: Re:...,1,From Joe Green Subject Re Weitek Organization ...,from joe green subject re weitek organization ...
4,4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,From Jonathan McDowell Subject Re Shuttle Laun...,from jonathan mcdowell subject re shuttle laun...


In [4]:
from sklearn.model_selection import train_test_split

SEED = 42
import numpy as np
np.random.seed(SEED)

In [5]:
import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchtext import data
from allennlp.modules.elmo import Elmo, batch_to_ids

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [7]:
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
elmo = Elmo(options_file, weight_file, 2, dropout=0)

# Создание триплетов

In [6]:
def pdist(emb1, emb2):
    '''
    compute the eucilidean distance matrix between embeddings
    '''
    m, n = emb1.shape[0], emb2.shape[0]
    emb1_pow = tt.pow(emb1, 2).sum(dim = 1, keepdim = True).expand(m, n)
    emb2_pow = tt.pow(emb2, 2).sum(dim = 1, keepdim = True).expand(n, m).t()
    dist_mtx = emb1_pow + emb2_pow
    dist_mtx = dist_mtx.addmm_(1, -2, emb1, emb2.t())
    dist_mtx = tt.sqrt(dist_mtx.clamp(min = 1e-12))
    return dist_mtx

class TripletSelector(object):
    '''
    generate triplet
    '''
    def __init__(self, *args, **kwargs):
        super(TripletSelector, self).__init__()

    def __call__(self, embeds, labels):
        dist_mtx = pdist(embeds, embeds).detach().cpu().numpy()
        labels = labels.contiguous().cpu().numpy().reshape((-1, 1))
        num = labels.shape[0]
        dia_inds = np.diag_indices(num)
        lb_eqs = labels == labels.T
        lb_eqs[dia_inds] = False
        dist_same = dist_mtx.copy()
        dist_same[lb_eqs == False] = -np.inf
        pos_idxs = np.argmax(dist_same, axis = 1)
        dist_diff = dist_mtx.copy()
        lb_eqs[dia_inds] = True
        dist_diff[lb_eqs == True] = np.inf
        neg_idxs = np.argmin(dist_diff, axis = 1)
        pos = embeds[pos_idxs].contiguous().view(num, -1)
        neg = embeds[neg_idxs].contiguous().view(num, -1)
        return embeds, pos, neg

In [7]:
selector = TripletSelector()

# Модель

In [8]:
def triplet_loss(anchor_embed, pos_embed, neg_embed):
    loss = F.cosine_similarity(anchor_embed, neg_embed) - F.cosine_similarity(anchor_embed, pos_embed)
    loss = loss.mean()
    return loss

class Tripletnet(nn.Module):
    def __init__(self):
        super(Tripletnet, self).__init__()
        self.fc = nn.Linear(256*2, 128)
        
    def branch(self, x):
        x = self.fc(x)
        return x

    def forward(self, anchor, pos, neg):
        anchor = self.branch(anchor)
        pos = self.branch(pos)
        neg = self.branch(neg)
        return triplet_loss(anchor, pos, neg)

In [9]:
def _train_epoch(model, iterator, optimizer, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        anchor, pos, neg = selector(batch[0], batch[1])
        loss = model(anchor, pos, neg)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            anchor, pos, neg = selector(batch[0], batch[1])
            loss = model(anchor, pos, neg)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, epoch)
        valid_loss = _test_epoch(model, valid_iterator)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)
    print('Saving model...')
    tt.save(model, "model.pt")

# Подготовка данных

In [12]:
for i, text in tqdm(enumerate(df.lemmatized_message.values)):
    x = batch_to_ids([text.split()])
    x = elmo(x)['elmo_representations']
    x = tt.cat(x, dim=-1)
    x = x.mean(dim=1)
    x = x.detach().numpy()
    if i == 0:
        data = x
    else:
        data = np.vstack((data, x))
    if i % 100 == 0:
        np.save('data.npy', data)
np.save('data.npy', data)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
data = np.load('data.npy')
data.shape

(11314, 512)

# Делим на train, validation и test

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, df.topic.values, random_state=SEED, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=SEED, test_size=0.2)

In [12]:
batch_size = 32
train_dataset = TensorDataset(tt.tensor(X_train), tt.tensor(y_train))
train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_dataset = TensorDataset(tt.tensor(X_val), tt.tensor(y_val))
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [13]:
model = Tripletnet()
optimizer = optim.Adam(model.parameters())

In [14]:
nn_train(model, train_loader, val_loader, optimizer, n_epochs=3)

HBox(children=(IntProgress(value=0, description='epoch 0', max=227, style=ProgressStyle(description_width='ini…


validation loss 0.00030


HBox(children=(IntProgress(value=0, description='epoch 1', max=227, style=ProgressStyle(description_width='ini…


validation loss 0.00018


HBox(children=(IntProgress(value=0, description='epoch 2', max=227, style=ProgressStyle(description_width='ini…


validation loss 0.00016
Saving model...


  "type " + obj.__name__ + ". It won't be checked "


# Точность

In [15]:
model = tt.load('model.pt')
#_train = np.load('_train.npy')
#_test = np.load('_test.npy')

In [16]:
_train = tt.from_numpy(X_train)
_train = model.branch(_train).detach().numpy()
np.save("_train.npy", _train)
_train.shape

(7240, 128)

In [17]:
_test = tt.from_numpy(X_test)
_test = model.branch(_test).detach().numpy()
np.save("_test.npy", _test)
_test.shape

(2263, 128)

In [18]:
from annoy import AnnoyIndex

In [19]:
f = 128
t = AnnoyIndex(f)
for i, v in tqdm(enumerate(_train)):
    t.add_item(i, v)
t.build(128)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




True

In [21]:
accuracy = 0
for i, v in tqdm(enumerate(_test)):
    # находим индексы 20-и ближайших соседей
    ids_nearest_neighbors = t.get_nns_by_vector(v, 10)
    neighbor_vectors = []
    # находим их эмбеддинги
    for id_neighbor in ids_nearest_neighbors:
        neighbor_v = _train[id_neighbor]
        true_train = y_train[id_neighbor]
        neighbor_vectors.append((true_train, neighbor_v))
    # находим расстояние между вектором теста и каждым вектором ближашего соседа
    probs = np.zeros(20)
    for k, v_n in enumerate(neighbor_vectors):
        dist = np.linalg.norm(v-v_n[1])
        probs[v_n[0]] += dist
    pred = np.argmax(probs)
    if y_test[i] == pred:
        accuracy += 1
print('Accuracy:', accuracy/len(_test))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy: 0.447635881573133
