In [23]:
import json
import pandas as pd
import numpy as np
import torch
from torch import nn
from pprint import pprint
import jsonlines
import os
import re
from tqdm.notebook import tqdm
import pickle

from typing import *

In [2]:
# Locations of the training and development splits, given relative to the
# notebook's working directory. Both files are read with `jsonlines` further down.
train_path = 'data/train.jsonl'
dev_path = 'data/dev.jsonl'

# Utils

In [3]:
def save_dictionary(dictionary, path):
    """Pickle `dictionary` into the file at `path`.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is serialized with the highest pickle protocol available.

    :param dictionary: object to serialize
    :param path: destination file path
    """
    with open(path, mode='wb') as handle:
        pickler = pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(dictionary)

def load_dictionary(path):
    """Read back an object previously pickled to `path`.

    NOTE(review): unpickling can execute arbitrary code, so only call this on
    files that were produced locally / come from a trusted source.

    :param path: path of the pickle file to read
    :return: the unpickled object
    """
    with open(path, mode='rb') as handle:
        restored = pickle.Unpickler(handle).load()
    return restored

In [26]:
# Saving / loading models
class Checkpoint:
    """Save and restore training checkpoints inside a single directory.

    Each checkpoint is a `torch.save`d dictionary with the keys
    `model_state_dict`, `optimizer_state_dict`, `epoch`, `loss` and `accuracy`,
    stored as `<epoch zero-padded to 3 digits>.pth`.

    :param path: directory holding the checkpoints (created if missing)
    :param resume: when True, the load methods ignore `id_path` and pick the
        last `.pth` file of the directory in lexicographic order
    """

    def __init__(self, path, resume=False):
        self.path = path
        os.makedirs(path, exist_ok=True)
        self.resume = resume

    @staticmethod
    def _to_builtin_float(value):
        """Convert numeric scalars (NumPy / 0-d tensors) to a plain `float`; leave anything else untouched."""
        try:
            return float(value)
        except (TypeError, ValueError, RuntimeError):
            return value

    def _resolve_file_name(self, id_path):
        """Return the checkpoint file name to read, honouring `self.resume`."""
        if self.resume:
            # Only consider files written by `save`, so stray files in the
            # directory can never be mistaken for the latest checkpoint.
            candidates = sorted(
                name for name in os.listdir(self.path) if name.endswith(".pth")
            )
            if not candidates:
                raise RuntimeError(
                    f"Cannot resume: no '.pth' checkpoint found in '{self.path}'."
                )
            return candidates[-1]
        if id_path == "":
            raise RuntimeError(
                "No checkpoint specified: pass `id_path` or create the "
                "Checkpoint with resume=True."
            )
        return id_path

    def _read(self, id_path):
        """Load the selected checkpoint dictionary onto the CPU and cache it in `self.checkpoint`."""
        file_name = self._resolve_file_name(id_path)
        self.checkpoint = torch.load(
            os.path.join(self.path, file_name),
            # keep every storage where it was deserialized (CPU), regardless
            # of the device it was saved from
            map_location=lambda storage, loc: storage,
        )
        if self.checkpoint is None:
            raise RuntimeError("Checkpoint empty.")
        return self.checkpoint

    def load(self, model, optimizer, id_path=""):
        """Restore model and optimizer state from a checkpoint.

        :param model: module whose `load_state_dict` receives the saved weights
        :param optimizer: optimizer whose `load_state_dict` receives the saved state
        :param id_path: checkpoint file name inside `self.path`; ignored when
            the instance was created with `resume=True`
        :return: tuple `(model, optimizer, epoch, loss)`
        :raises RuntimeError: if no checkpoint is specified / available, or
            the loaded checkpoint is empty
        """
        state = self._read(id_path)
        epoch = state["epoch"]
        model.load_state_dict(state["model_state_dict"])
        optimizer.load_state_dict(state["optimizer_state_dict"])
        loss = state["loss"]
        return (model, optimizer, epoch, loss)

    def save(self, model, optimizer, epoch, loss, accuracy):
        """Write a checkpoint for `epoch` and return its file name.

        `loss` and `accuracy` are stored as plain Python floats so the file
        holds only tensors and built-in types: loading it then needs no NumPy
        unpickling support and stays compatible with restricted (weights-only)
        loading.

        :return: the checkpoint file name, e.g. `"007.pth"`
        """
        model_checkpoint = {
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "epoch": epoch,
            "loss": self._to_builtin_float(loss),
            "accuracy": self._to_builtin_float(accuracy),
        }
        checkpoint_name = "{}.pth".format(str(epoch).zfill(3))
        complete_path = os.path.join(self.path, checkpoint_name)
        torch.save(model_checkpoint, complete_path)
        return checkpoint_name

    def load_just_model(self, model, id_path=""):
        """Restore only the model weights from a checkpoint (same selection rules as `load`).

        :return: `model`, with the saved state dict loaded
        :raises RuntimeError: if no checkpoint is specified / available, or
            the loaded checkpoint is empty
        """
        state = self._read(id_path)
        model.load_state_dict(state["model_state_dict"])
        return model

# Load GloVe word vectors and build sentence embeddings

In [4]:
def sentence2vector(sentence: str) -> Optional[torch.Tensor]:
    """Embed a sentence as the mean of the vectors of its known words.

    The sentence is split on single spaces and every token is looked up in the
    module-level `word_vectors` table. A token that is not found verbatim is
    looked up again in lower case: the glove.6B vocabulary is lower-cased, so
    without this fallback capitalised tokens (e.g. the first word of a
    sentence) would be silently dropped.

    :param sentence: raw sentence text
    :return: 1-D tensor (mean over the found word vectors), or None when no
        token of the sentence is in the vocabulary
    """
    found_vectors = []
    for token in sentence.split(' '):
        if token in word_vectors:
            found_vectors.append(word_vectors[token])
            continue
        lowered = token.lower()
        if lowered in word_vectors:
            found_vectors.append(word_vectors[lowered])

    if not found_vectors:
        return None

    # stacked shape: (#found words, #features) -> mean over the word axis
    return torch.stack(found_vectors).mean(dim=0)

In [5]:
def cosine_similarity(v1: torch.Tensor, v2: torch.Tensor) -> float:
    """Return the cosine of the angle between `v1` and `v2` as a Python float.

    Computed as the sum of the element-wise product divided by the product of
    the two norms. No guard is applied for zero-norm inputs.
    """
    dot_product = (v1 * v2).sum()
    norm_product = torch.linalg.norm(v1) * torch.linalg.norm(v2)
    similarity = dot_product / norm_product
    return similarity.item()

In [6]:
# Parse the GloVe text file into a {word: 1-D float tensor} lookup table.
# Each line has the form "<word> <v1> <v2> ... <vN>", separated by single spaces.
word_vectors = dict()
n_words = 400_000  # expected number of entries; only used to size the progress bar
# The encoding is given explicitly: the vocabulary contains non-ASCII tokens and
# the platform default encoding is not guaranteed to be UTF-8.
with open('embeddings/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f, total=n_words):

        word, *vector = line.strip().split(' ')
        if not vector:
            # blank / malformed line: nothing to store
            continue

        word_vectors[word] = torch.tensor([float(c) for c in vector])

  0%|          | 0/400000 [00:00<?, ?it/s]

In [34]:
# Pickle the in-memory embedding table to 'vocabulary.pkl' (readable back with
# `load_dictionary`). NOTE(review): presumably meant as a cache to avoid
# re-parsing the GloVe text file; nothing in this notebook reads it back.
save_dictionary(word_vectors, 'vocabulary.pkl')

# Dataset class using GloVe

In [14]:
class WiCDataset(torch.utils.data.Dataset):
    """In-memory dataset of sentence pairs encoded with averaged word vectors.

    Every record of the JSON Lines file must provide `sentence1`, `sentence2`
    and `label`. Each sample is a tuple `(features, label)` where `features`
    is the concatenation of the two sentence vectors returned by
    `sentence2vector` and `label` is a float32 scalar tensor (1 for a true
    label, 0 otherwise).

    Attributes:
        data: list of `(features, label)` tuples.
        skipped: number of records dropped because neither sentence contained
            a word of the vocabulary.
    """

    def __init__(self, dataset_path: str):
        self.data = []
        self.skipped = 0
        self.create_dataset(dataset_path)

    def create_dataset(self, dataset_path: str) -> None:
        """Read `dataset_path` and append one encoded sample per usable record."""
        with jsonlines.open(dataset_path, 'r') as f:
            for line in f.iter():
                s1 = sentence2vector(line['sentence1'])
                s2 = sentence2vector(line['sentence2'])

                # `sentence2vector` returns None when no word is in the
                # vocabulary; concatenating None would raise. With nothing to
                # encode on either side the record is dropped (and counted) ...
                if s1 is None and s2 is None:
                    self.skipped += 1
                    continue
                # ... otherwise the missing side becomes an all-zero vector so
                # the sample keeps its place and the feature size is unchanged.
                if s1 is None:
                    s1 = torch.zeros_like(s2)
                if s2 is None:
                    s2 = torch.zeros_like(s1)
                sentence_vector = torch.cat((s1, s2))

                # Accept both the string labels 'True'/'False' and JSON
                # booleans: str(True) == 'True'.
                is_positive = str(line['label']) == 'True'
                label = torch.tensor(1 if is_positive else 0, dtype=torch.float32)
                self.data.append((sentence_vector, label))

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the `(features, label)` pair stored at position `idx`."""
        return self.data[idx]

# Model Class

In [9]:
class MLP(nn.Module):
    """Feed-forward binary classifier producing one probability per input row.

    Architecture: `Linear(n_features -> hidden_dim)`, then `num_layers` blocks
    of `Linear(hidden_dim -> hidden_dim)` followed by `activation`, then
    `Linear(hidden_dim -> 1)` and a sigmoid.

    NOTE(review): no activation is applied between `first_layer` and the first
    hidden linear layer, so those two linear maps compose into a single linear
    map - confirm this is intended.

    :param n_features: size of the input feature vector
    :param num_layers: number of hidden (linear, activation) blocks
    :param hidden_dim: width of every hidden layer
    :param activation: callable applied after each hidden linear layer
    """

    def __init__(
        self,
        n_features: int,
        num_layers: int,
        hidden_dim: int,
        activation: Callable[[torch.Tensor], torch.Tensor],
    ) -> None:
        super().__init__()

        # Input projection: n_features -> hidden_dim
        self.first_layer = nn.Linear(n_features, hidden_dim)

        # Stack of hidden_dim -> hidden_dim linear layers
        hidden_layers = [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_layers)]
        self.layers = nn.ModuleList(hidden_layers)
        self.activation = activation

        # Registered (so it is part of the state dict) but not used by `forward`.
        self.batchnorm = nn.BatchNorm1d(hidden_dim)

        # Output head: hidden_dim -> 1, squashed to (0, 1) by the sigmoid
        self.last_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, meshgrid: torch.Tensor) -> torch.Tensor:
        """
        Compute the predicted probability for every input row.

        :param meshgrid: input features, tensor of dimensions [..., n_features]
        :return: tensor of sigmoid outputs with the trailing size-1 dimension removed
        """
        hidden = self.first_layer(meshgrid)
        for linear in self.layers:
            hidden = self.activation(linear(hidden))
        probabilities = self.sigmoid(self.last_layer(hidden))
        return probabilities.squeeze(-1)


# Training process

In [27]:
def correctly_predicted(predicted, gt):
    """Count how many thresholded predictions match the ground truth.

    A prediction counts as class 1 when it is strictly greater than 0.5.

    :param predicted: tensor of predicted scores
    :param gt: tensor of ground-truth labels
    :return: tuple `(number of matches, size of gt's first dimension)`
    """
    hard_labels = (predicted > 0.5).float()
    matches = hard_labels == gt
    n_correct = matches.sum().item()
    n_total = gt.shape[0]
    return n_correct, n_total

def step(model, criterion, xb, yb, opt=None):
    """Evaluate one batch and, when an optimizer is given, update the model.

    :param model: callable producing predictions for `xb`
    :param criterion: loss function called as `criterion(predictions, yb)`
    :param xb: input batch
    :param yb: target batch
    :param opt: optimizer; when None no backward pass / update is performed
    :return: tuple `(loss as a Python float, len(xb))`
    """
    predictions = model(xb)
    batch_loss = criterion(predictions, yb)

    is_training_step = opt is not None
    if is_training_step:
        batch_loss.backward()
        opt.step()
        # gradients are cleared after the update, ready for the next batch
        opt.zero_grad()

    batch_size = len(xb)
    return batch_loss.item(), batch_size


def fit(epochs, model, criterion, opt, train_dl, valid_dl, checkpoint=None, device=None):
    """Train `model` for `epochs` epochs, validating after each one.

    After every epoch the sample-weighted mean validation loss and the
    validation accuracy are printed (and saved through `checkpoint`, if given).

    :param epochs: number of passes over `train_dl`
    :param model: module to train
    :param criterion: loss function called as `criterion(predictions, targets)`
    :param opt: optimizer used for the training steps
    :param train_dl: iterable of `(inputs, targets)` training batches
    :param valid_dl: iterable of `(inputs, targets)` validation batches
    :param checkpoint: optional object exposing
        `save(model, optimizer, epoch, loss, accuracy)`
    :param device: device the batches are moved to; defaults to the device of
        the model's first parameter (CPU for a parameter-less model), so the
        function does not rely on a notebook-level `device` variable
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in tqdm(range(epochs)):
        model.train()
        for xb, yb in train_dl:
            step(model, criterion, xb.to(device), yb.to(device), opt)

        model.eval()
        losses = []
        nums = []
        corrects = []
        with torch.no_grad():
            for xb, yb in valid_dl:
                xb = xb.to(device)
                yb = yb.to(device)

                # a single forward pass feeds both the loss and the accuracy
                predictions = model(xb)
                correct, _ = correctly_predicted(predictions, yb)
                losses.append(criterion(predictions, yb).item())
                nums.append(len(xb))
                corrects.append(correct)

        total = int(np.sum(nums))
        if total > 0:
            # plain floats keep the checkpoint free of NumPy scalar objects
            val_loss = float(np.sum(np.multiply(losses, nums)) / total)
            val_acc = float(np.sum(corrects) / total)
        else:
            # empty validation set: report NaN instead of dividing by zero
            val_loss = val_acc = float("nan")

        if checkpoint is not None:
            checkpoint.save(model, opt, epoch, val_loss, val_acc)

        print(f"{epoch} \t {val_loss:.2f} \t {val_acc}")


In [15]:
# Encode both splits up front; every sample is held in memory as a
# (features, label) pair.
train_dataset = WiCDataset(train_path)
val_dataset = WiCDataset(dev_path)

# Mini-batches of 64 samples. Only the training data is reshuffled each epoch;
# the validation order is kept fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

In [31]:
# Use the GPU when one is available. NOTE: `fit` (defined in an earlier cell)
# reads this `device` variable, so this cell must run before training starts.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Binary cross-entropy on probabilities: the model already ends with a sigmoid.
criterion = nn.BCELoss()#.to(device)
# n_features=600: each sample concatenates two sentence vectors built from the
# 300-dimensional GloVe embeddings.
model = MLP(n_features=600,
            num_layers=5, 
            hidden_dim=150, 
            activation=torch.nn.functional.relu).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.00001)

# One checkpoint file per epoch is written under 'checkpoints/'.
checkpoint = Checkpoint(path='checkpoints')

In [32]:
# Train for 50 epochs; each printed row is: epoch, validation loss, validation accuracy.
# NOTE(review): in the recorded run below the validation loss bottoms out
# around epoch 9 (0.65) and then climbs steadily to 3.14, while accuracy stays
# near 0.6 - the model overfits; consider fewer epochs / early stopping.
fit(50, model, criterion, optimizer, train_loader, val_loader, checkpoint)

  0%|          | 0/50 [00:00<?, ?it/s]

0 	 0.69 	 0.5
1 	 0.69 	 0.501
2 	 0.69 	 0.55
3 	 0.69 	 0.565
4 	 0.68 	 0.56
5 	 0.67 	 0.576
6 	 0.67 	 0.58
7 	 0.66 	 0.614
8 	 0.67 	 0.597
9 	 0.65 	 0.617
10 	 0.66 	 0.605
11 	 0.66 	 0.617
12 	 0.67 	 0.622
13 	 0.66 	 0.628
14 	 0.69 	 0.612
15 	 0.70 	 0.628
16 	 0.71 	 0.623
17 	 0.77 	 0.609
18 	 0.76 	 0.621
19 	 0.81 	 0.622
20 	 0.87 	 0.605
21 	 0.86 	 0.634
22 	 0.91 	 0.629
23 	 0.96 	 0.617
24 	 1.06 	 0.61
25 	 1.12 	 0.619
26 	 1.20 	 0.6
27 	 1.21 	 0.617
28 	 1.28 	 0.608
29 	 1.36 	 0.609
30 	 1.49 	 0.594
31 	 1.50 	 0.616
32 	 1.63 	 0.591
33 	 1.71 	 0.598
34 	 1.66 	 0.613
35 	 1.84 	 0.602
36 	 1.85 	 0.591
37 	 2.02 	 0.59
38 	 2.29 	 0.597
39 	 2.38 	 0.6
40 	 2.65 	 0.612
41 	 2.23 	 0.59
42 	 2.37 	 0.587
43 	 2.38 	 0.597
44 	 2.51 	 0.593
45 	 2.64 	 0.587
46 	 2.97 	 0.596
47 	 2.86 	 0.592
48 	 3.10 	 0.589
49 	 3.14 	 0.596
