# Assignment 8
Develop a model for 20 news groups dataset from scikit-learn. Select 20% of data for test set.

Develop metric learning model with siamese network [3 points] and softmax loss or triplet loss [3 points] (from seminar). Use KNN and LSH (any library for approximate nearest neighbor search) for final prediction after the network was trained. [2 points]

! Remember, that LSH gives you a set of neighbor candidates, for which you have to calculate distances to choose top-k nearest neighbors.

Your quality metric = accuracy score [2 points if acc > 0.8 ]

In [0]:
import gensim
import gensim.downloader as api
import matplotlib.pyplot as plt
import numpy as np
import nltk
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset

from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.ndimage.filters import gaussian_filter1d
from tqdm import tqdm, tqdm_notebook

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
batch_size = 256
random_state = 42

In [0]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else tt.device('cpu')

In [5]:
newsgroups = fetch_20newsgroups()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
df = pd.DataFrame(columns=['text', 'target'])
df['text'] = newsgroups.data
df['text']  = df['text'].apply(lambda sent: nltk.word_tokenize(sent))
df['target'] = newsgroups.target
df['target'] = df['target'].apply(float)

In [7]:
wv = api.load('word2vec-google-news-300')
wv['king'][:10]



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


array([ 0.12597656,  0.02978516,  0.00860596,  0.13964844, -0.02563477,
       -0.03613281,  0.11181641, -0.19824219,  0.05126953,  0.36328125],
      dtype=float32)

In [0]:
emb_dim = wv['king'].size

In [0]:
def vectorize_sent(sent, wv):
  sent_vec = []
  for w in sent:
    try:
      vec = wv[w]
      sent_vec.append(vec)
    except KeyError:
      continue
  return np.mean(np.array(sent_vec), axis=0)

In [0]:
df['vec'] = df['text'].apply(lambda sent: vectorize_sent(sent, wv))

In [11]:
df.head()

Unnamed: 0,text,target,vec
0,"[From, :, lerxst, @, wam.umd.edu, (, where, 's...",7.0,"[0.032288477, 0.03263666, 0.07366503, 0.066889..."
1,"[From, :, guykuo, @, carson.u.washington.edu, ...",4.0,"[-0.020951407, 0.072069034, 0.043153763, 0.052..."
2,"[From, :, twillis, @, ec.ecn.purdue.edu, (, Th...",4.0,"[0.02607478, 0.023831822, 0.02035976, 0.102583..."
3,"[From, :, jgreen, @, amber, (, Joe, Green, ), ...",1.0,"[-0.005437399, -0.0005287288, 0.017397419, 0.0..."
4,"[From, :, jcm, @, head-cfa.harvard.edu, (, Jon...",14.0,"[-0.07343274, 0.00018738056, 0.033099294, 0.09..."


### add negative and positive

In [0]:
def add_positive(row, df):
  target = row['target'] 
  vec = row['vec']
  positive = df[df['target'] == target]['vec']
  selected = np.random.choice(positive)
  while selected.tolist() == vec.tolist():
    selected = np.random.choice(positive)
  return selected

In [0]:
def add_negative(row, df):
  target = row['target'] 
  vec = row['vec']
  negative = df[df['target'] != target]['vec']
  return np.random.choice(negative)

In [14]:
positives = []
negatives = []
for i, row in df.iterrows():
  positives.append(add_positive(row, df))
  negatives.append(add_negative(row, df))
len(positives)

11314

In [0]:
df['positive'] = positives
df['negative'] = negatives

In [16]:
df.head()

Unnamed: 0,text,target,vec,positive,negative
0,"[From, :, lerxst, @, wam.umd.edu, (, where, 's...",7.0,"[0.032288477, 0.03263666, 0.07366503, 0.066889...","[-0.02043521, 0.04807749, 0.044298727, 0.06962...","[-0.031017743, 0.0011712471, 0.0742548, 0.0510..."
1,"[From, :, guykuo, @, carson.u.washington.edu, ...",4.0,"[-0.020951407, 0.072069034, 0.043153763, 0.052...","[-0.015402461, 0.0016477992, 0.023189358, 0.12...","[-0.047387306, 0.012864274, 0.047276933, 0.094..."
2,"[From, :, twillis, @, ec.ecn.purdue.edu, (, Th...",4.0,"[0.02607478, 0.023831822, 0.02035976, 0.102583...","[-0.020003768, 0.0018544617, 0.041496783, 0.09...","[-0.0035505022, 0.0014508929, 0.030938284, 0.0..."
3,"[From, :, jgreen, @, amber, (, Joe, Green, ), ...",1.0,"[-0.005437399, -0.0005287288, 0.017397419, 0.0...","[0.021669481, 0.029209606, 0.042437445, 0.1068...","[-0.012189492, 0.04209396, 0.031411484, 0.0665..."
4,"[From, :, jcm, @, head-cfa.harvard.edu, (, Jon...",14.0,"[-0.07343274, 0.00018738056, 0.033099294, 0.09...","[-0.04802665, 0.01356139, 0.030353839, 0.08902...","[0.022399005, 0.033801246, 0.059539437, 0.0744..."


### form batches

In [0]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=random_state, shuffle=True)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=random_state, shuffle=True)

In [0]:
class Batch:
  def __init__(self, anchors, positives, negatives, targets):
    self.anc = anchors
    self.pos = positives
    self.neg = negatives
    self.tgt = targets

In [0]:
class Iterator:
  def __init__(self, df, batch_size=128, device='cpu'):
    self.batches = self.make_batches(df, batch_size, device)

  @staticmethod
  def make_batches(df, batch_size, device):
    batches = []
    shuffled = df.sample(frac=1)
    i = 0
    while i < len(df):
      b = shuffled[i: i+batch_size]
      if len(b) == batch_size:
        anc = torch.tensor(list(b.vec.values)).to(device)
        pos = torch.tensor(list(b.positive.values)).to(device)
        neg = torch.tensor(list(b.negative.values)).to(device)
        tgt = torch.tensor(list(b.target.values)).to(device)
        batch = Batch(anc, pos, neg, tgt)
        batches.append(batch)
      i += batch_size
    return batches

In [20]:
Iterator(df, batch_size=batch_size, device=DEVICE).batches[0].pos

tensor([[ 0.0392,  0.0082,  0.0270,  ..., -0.0602,  0.0030, -0.0106],
        [ 0.0251, -0.0025,  0.0351,  ..., -0.0395, -0.0041, -0.0297],
        [-0.0071,  0.0033,  0.0459,  ..., -0.0374,  0.0153, -0.0435],
        ...,
        [ 0.0377, -0.0025,  0.0208,  ..., -0.0433,  0.0143, -0.0046],
        [ 0.0270,  0.0059,  0.0295,  ..., -0.0174, -0.0069, -0.0305],
        [-0.0329, -0.0025,  0.0542,  ..., -0.0407,  0.0255, -0.0254]],
       device='cuda:0')

In [0]:
trn_itr = Iterator(train_df, batch_size=batch_size, device=DEVICE).batches
vld_itr = Iterator(val_df, batch_size=batch_size, device=DEVICE).batches
tst_itr = Iterator(test_df, batch_size=batch_size, device=DEVICE).batches

In [0]:
len_train = len(trn_itr)
len_valid = len(vld_itr)
len_test = len(tst_itr)

## Network

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, inp_dim=300, lin_dim=128):
        super(MyModel, self).__init__()
        self.fc = nn.Linear(inp_dim, lin_dim)
        
    def branch(self, x):
        x = self.fc(x)
        return x
        
    def forward(self, batch):
        anc = batch.anc
        pos = batch.pos
        neg = batch.neg
        
        anc = self.branch(anc)
        pos = self.branch(pos)
        neg = self.branch(neg)
        
        return anc, pos, neg


In [0]:
def train_epoch(data_iter, len_iter, n_epoch, model, criterion, optimizer=None):
    train_losses = []
    total_loss = 0
    data_iter = tqdm_notebook(data_iter, total=len_iter, desc=f"Epoch {n_epoch + 1}", leave=True)
    counter = 0
    for batch in data_iter:
        if optimizer:
          optimizer.zero_grad()
        anc, pos, neg = model.forward(batch)
        loss = criterion(anc, pos, neg)
        loss.backward()
        if optimizer:
          optimizer.step()
        loss_value = loss.detach().item()
        total_loss += loss_value
        train_losses.append(loss_value)
        data_iter.set_postfix(loss = loss_value)
        counter += 1
        
    total_loss /= counter
    return total_loss, train_losses


def valid_epoch(data_iter, len_iter, n_epoch, model, criterion):
    valid_losses = []
    total_loss = 0
    data_iter = tqdm_notebook(data_iter, total=len_iter, desc=f"Eval epoch {n_epoch + 1}", leave=True)
    counter = 0
    for batch in data_iter:
        with torch.no_grad():
            anc, pos, neg = model.forward(batch)
            loss = criterion(anc, pos, neg)
            loss_value = loss.detach().item()
            total_loss += loss_value
            valid_losses.append(loss_value)
            data_iter.set_postfix(loss = loss_value)
            counter +=1
        
    total_loss /= counter
    return total_loss, valid_losses

### Loss

In [0]:
class TripletLoss(nn.L1Loss):
  def __init__(self, margin=0.1):
    super(TripletLoss, self).__init__()
    self.margin = margin
  
  def forward(self, anc, pos, neg):
    return torch.mean(F.cosine_similarity(anc, neg, dim=-1) - F.cosine_similarity(anc, pos, dim=-1) + self.margin)

In [26]:
criterion = TripletLoss()
criterion(torch.tensor([[1.0,2.0,3.0], [1.0,1.0,2.0]]), torch.tensor([[1.0,1.0,1.0], [1.0,0.0,2.0]]), torch.tensor([[0.0,0.0,0.0], [0.0,2.0,10.0]]))

tensor(-0.3790)

## Train

In [0]:
criterion = TripletLoss()
criterion = criterion.to(DEVICE)
model = MyModel(inp_dim=emb_dim, lin_dim=512)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

In [0]:
num_epochs = 15

In [29]:
total_train_losses = []
total_valid_losses = []
for epoch in range(num_epochs):
    model.train()
    loss, train_losses = train_epoch(trn_itr, len_train, epoch, model, criterion, optimizer)
    total_train_losses += train_losses
    print('train', loss)
    
    model.eval()
    with torch.no_grad():
        loss, valid_losses = valid_epoch(vld_itr, len_valid, epoch, model, criterion)
        total_valid_losses += valid_losses
        if scheduler:
          if type(scheduler) == torch.optim.lr_scheduler.ReduceLROnPlateau:
            scheduler.step(loss)
          else:
            scheduler.step() 
        print('valid', loss)

HBox(children=(IntProgress(value=0, description='Epoch 1', max=31, style=ProgressStyle(description_width='init…


train -0.25843667010626487


HBox(children=(IntProgress(value=0, description='Eval epoch 1', max=3, style=ProgressStyle(description_width='…


valid -0.47955307364463806


HBox(children=(IntProgress(value=0, description='Epoch 2', max=31, style=ProgressStyle(description_width='init…


train -0.5441000884579074


HBox(children=(IntProgress(value=0, description='Eval epoch 2', max=3, style=ProgressStyle(description_width='…


valid -0.5745790600776672


HBox(children=(IntProgress(value=0, description='Epoch 3', max=31, style=ProgressStyle(description_width='init…


train -0.6076347693320243


HBox(children=(IntProgress(value=0, description='Eval epoch 3', max=3, style=ProgressStyle(description_width='…


valid -0.6078996260960897


HBox(children=(IntProgress(value=0, description='Epoch 4', max=31, style=ProgressStyle(description_width='init…


train -0.6391072830846233


HBox(children=(IntProgress(value=0, description='Eval epoch 4', max=3, style=ProgressStyle(description_width='…


valid -0.625820537408193


HBox(children=(IntProgress(value=0, description='Epoch 5', max=31, style=ProgressStyle(description_width='init…


train -0.6619296227732012


HBox(children=(IntProgress(value=0, description='Eval epoch 5', max=3, style=ProgressStyle(description_width='…


valid -0.6403262813886007


HBox(children=(IntProgress(value=0, description='Epoch 6', max=31, style=ProgressStyle(description_width='init…


train -0.6801850738063935


HBox(children=(IntProgress(value=0, description='Eval epoch 6', max=3, style=ProgressStyle(description_width='…


valid -0.6544007857640585


HBox(children=(IntProgress(value=0, description='Epoch 7', max=31, style=ProgressStyle(description_width='init…


train -0.6944240908468923


HBox(children=(IntProgress(value=0, description='Eval epoch 7', max=3, style=ProgressStyle(description_width='…


valid -0.6665329535802206


HBox(children=(IntProgress(value=0, description='Epoch 8', max=31, style=ProgressStyle(description_width='init…


train -0.7056053607694565


HBox(children=(IntProgress(value=0, description='Eval epoch 8', max=3, style=ProgressStyle(description_width='…


valid -0.676409105459849


HBox(children=(IntProgress(value=0, description='Epoch 9', max=31, style=ProgressStyle(description_width='init…


train -0.7147666965761492


HBox(children=(IntProgress(value=0, description='Eval epoch 9', max=3, style=ProgressStyle(description_width='…


valid -0.6845846176147461


HBox(children=(IntProgress(value=0, description='Epoch 10', max=31, style=ProgressStyle(description_width='ini…


train -0.7225412053446616


HBox(children=(IntProgress(value=0, description='Eval epoch 10', max=3, style=ProgressStyle(description_width=…


valid -0.6915806531906128


HBox(children=(IntProgress(value=0, description='Epoch 11', max=31, style=ProgressStyle(description_width='ini…


train -0.7292830924833974


HBox(children=(IntProgress(value=0, description='Eval epoch 11', max=3, style=ProgressStyle(description_width=…


valid -0.6977261702219645


HBox(children=(IntProgress(value=0, description='Epoch 12', max=31, style=ProgressStyle(description_width='ini…


train -0.7352176347086506


HBox(children=(IntProgress(value=0, description='Eval epoch 12', max=3, style=ProgressStyle(description_width=…


valid -0.7032004197438558


HBox(children=(IntProgress(value=0, description='Epoch 13', max=31, style=ProgressStyle(description_width='ini…


train -0.7405140746024347


HBox(children=(IntProgress(value=0, description='Eval epoch 13', max=3, style=ProgressStyle(description_width=…


valid -0.7081102530161539


HBox(children=(IntProgress(value=0, description='Epoch 14', max=31, style=ProgressStyle(description_width='ini…


train -0.7453108410681447


HBox(children=(IntProgress(value=0, description='Eval epoch 14', max=3, style=ProgressStyle(description_width=…


valid -0.7125217119852701


HBox(children=(IntProgress(value=0, description='Epoch 15', max=31, style=ProgressStyle(description_width='ini…


train -0.7497115269784005


HBox(children=(IntProgress(value=0, description='Eval epoch 15', max=3, style=ProgressStyle(description_width=…


valid -0.7164672613143921


## Predict


In [32]:
predict_vecs = []
predict_tgts = []
model.eval()
with torch.no_grad():
  data_iter = tqdm_notebook(tst_itr, total=len_test, desc=f"Test", leave=True)
  for batch in data_iter:
    anc, pos, neg = model.forward(batch)
    tgt = batch.tgt
    predict_vecs.append(anc.cpu().detach().numpy())
    predict_tgts += tgt.cpu().detach()

HBox(children=(IntProgress(value=0, description='Test', max=8, style=ProgressStyle(description_width='initial'…




In [33]:
predict_df = pd.DataFrame(columns=['target', 'vec'])
predict_df['target'] = [x.item() for x in predict_tgts]
predict_df['vec'] = list(np.concatenate(predict_vecs))
predict_df.tail()

Unnamed: 0,target,vec
2043,16.0,"[-0.023727303, 0.01833433, -0.018880416, -0.00..."
2044,4.0,"[0.061534, -0.04067895, 0.048892073, 0.0317100..."
2045,12.0,"[0.014112316, 0.00051000156, -0.018455632, -0...."
2046,19.0,"[-0.06338209, 0.04284886, -0.0397351, -0.02226..."
2047,18.0,"[-0.043457314, 0.047806196, -0.05149258, -0.03..."


## Nearest neighbours

In [34]:
X = np.array([[-1, -1, 3], [-2, -1, 12], [-3, -2, 1], [1, 1, 1], [1, 2, 1], [1, 3, 2]])
nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(X)
x_distances, x_indices = nbrs.kneighbors(X)
x_indices

array([[0, 2, 3],
       [1, 0, 2],
       [2, 0, 3],
       [3, 4, 5],
       [4, 3, 5],
       [5, 4, 3]])

In [35]:
x_distances

array([[ 0.        ,  3.        ,  3.46410162],
       [ 0.        ,  9.05538514, 11.09053651],
       [ 0.        ,  3.        ,  5.        ],
       [ 0.        ,  1.        ,  2.23606798],
       [ 0.        ,  1.        ,  1.41421356],
       [ 0.        ,  1.41421356,  2.23606798]])

In [0]:
predicted = np.concatenate(predict_vecs)
nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(predicted)
distances, indices = nbrs.kneighbors(predicted)

In [0]:
predicted_classes = []
for i, row in predict_df.iterrows():
  distances_to_classes = [100]*20
  distances_i = distances[i]
  indices_i = indices[i]
  for n_i, n_d in enumerate(distances_i):
    class_n = predict_df['target'][indices_i[n_i]]
    if distances_to_classes[int(class_n)] == 100:
      distances_to_classes[int(class_n)] = n_d
    else:
      distances_to_classes[int(class_n)] += n_d
  class_i = distances_to_classes.index(min(distances_to_classes))

  predicted_classes.append(class_i)

In [38]:
accuracy_score(y_true = list(predict_df.target.values), y_pred=predicted_classes)

0.8701171875