In [192]:
import json
import pandas as pd
import numpy as np
import torch
from torch import nn
from pprint import pprint
import jsonlines
import os
import re
from tqdm.notebook import tqdm
import pickle
import string

import nltk
from nltk.stem.porter import PorterStemmer

from typing import *

In [15]:
# Locations of the training and development splits (JSON Lines files).
train_path, dev_path = 'data/train.jsonl', 'data/dev.jsonl'

# Utils

In [30]:
# https://stackoverflow.com/questions/49239941/what-is-unk-in-the-pretrained-glove-vector-files-e-g-glove-6b-50d-txt

unk_embedding = '0.22418134 -0.28881392 0.13854356 0.00365387 -0.12870757 0.10243822 0.061626635 0.07318011 -0.061350107 -1.3477012 0.42037755 -0.063593924 -0.09683349 0.18086134 0.23704372 0.014126852 0.170096 -1.1491593 0.31497982 0.06622181 0.024687296 0.076693475 0.13851812 0.021302193 -0.06640582 -0.010336159 0.13523154 -0.042144544 -0.11938788 0.006948221 0.13333307 -0.18276379 0.052385733 0.008943111 -0.23957317 0.08500333 -0.006894406 0.0015864656 0.063391194 0.19177166 -0.13113557 -0.11295479 -0.14276934 0.03413971 -0.034278486 -0.051366422 0.18891625 -0.16673574 -0.057783455 0.036823478 0.08078679 0.022949161 0.033298038 0.011784158 0.05643189 -0.042776518 0.011959623 0.011552498 -0.0007971594 0.11300405 -0.031369694 -0.0061559738 -0.009043574 -0.415336 -0.18870236 0.13708843 0.005911723 -0.113035575 -0.030096142 -0.23908928 -0.05354085 -0.044904727 -0.20228513 0.0065645403 -0.09578946 -0.07391877 -0.06487607 0.111740574 -0.048649278 -0.16565254 -0.052037314 -0.078968436 0.13684988 0.0757494 -0.006275573 0.28693774 0.52017444 -0.0877165 -0.33010918 -0.1359622 0.114895485 -0.09744406 0.06269521 0.12118575 -0.08026362 0.35256687 -0.060017522 -0.04889904 -0.06828978 0.088740796 0.003964443 -0.0766291 0.1263925 0.07809314 -0.023164088 -0.5680669 -0.037892066 -0.1350967 -0.11351585 -0.111434504 -0.0905027 0.25174105 -0.14841858 0.034635577 -0.07334565 0.06320108 -0.038343467 -0.05413284 0.042197507 -0.090380974 -0.070528865 -0.009174437 0.009069661 0.1405178 0.02958134 -0.036431845 -0.08625681 0.042951006 0.08230793 0.0903314 -0.12279937 -0.013899368 0.048119213 0.08678239 -0.14450377 -0.04424887 0.018319942 0.015026873 -0.100526 0.06021201 0.74059093 -0.0016333034 -0.24960588 -0.023739101 0.016396184 0.11928964 0.13950661 -0.031624354 -0.01645025 0.14079992 -0.0002824564 -0.08052984 -0.0021310581 -0.025350995 0.086938225 0.14308536 0.17146006 -0.13943303 0.048792403 0.09274929 -0.053167373 0.031103406 0.012354865 0.21057427 0.32618305 0.18015954 -0.15881181 
0.15322933 -0.22558987 -0.04200665 0.0084689725 0.038156632 0.15188617 0.13274793 0.113756925 -0.095273495 -0.049490947 -0.10265804 -0.27064866 -0.034567792 -0.018810693 -0.0010360252 0.10340131 0.13883452 0.21131058 -0.01981019 0.1833468 -0.10751636 -0.03128868 0.02518242 0.23232952 0.042052146 0.11731903 -0.15506615 0.0063580726 -0.15429358 0.1511722 0.12745973 0.2576985 -0.25486213 -0.0709463 0.17983761 0.054027 -0.09884228 -0.24595179 -0.093028545 -0.028203879 0.094398156 0.09233813 0.029291354 0.13110267 0.15682974 -0.016919162 0.23927948 -0.1343307 -0.22422817 0.14634751 -0.064993896 0.4703685 -0.027190214 0.06224946 -0.091360025 0.21490277 -0.19562101 -0.10032754 -0.09056772 -0.06203493 -0.18876675 -0.10963594 -0.27734384 0.12616494 -0.02217992 -0.16058226 -0.080475815 0.026953284 0.110732645 0.014894041 0.09416802 0.14299914 -0.1594008 -0.066080004 -0.007995227 -0.11668856 -0.13081996 -0.09237365 0.14741232 0.09180138 0.081735 0.3211204 -0.0036552632 -0.047030564 -0.02311798 0.048961394 0.08669574 -0.06766279 -0.50028914 -0.048515294 0.14144728 -0.032994404 -0.11954345 -0.14929578 -0.2388355 -0.019883996 -0.15917352 -0.052084364 0.2801028 -0.0029121689 -0.054581646 -0.47385484 0.17112483 -0.12066923 -0.042173345 0.1395337 0.26115036 0.012869649 0.009291686 -0.0026459037 -0.075331464 0.017840583 -0.26869613 -0.21820338 -0.17084768 -0.1022808 -0.055290595 0.13513643 0.12362477 -0.10980586 0.13980341 -0.20233242 0.08813751 0.3849736 -0.10653763 -0.06199595 0.028849555 0.03230154 0.023856193 0.069950655 0.19310954 -0.077677034 -0.144811'
# Turn the space-separated literal above into a 1-D float tensor.
unk_embedding = torch.tensor(list(map(float, unk_embedding.strip().split(' '))))

In [16]:
def save_dictionary(dictionary, path):
    """Pickle ``dictionary`` into the file at ``path`` (highest pickle protocol).

    The file is opened in binary write mode, so existing content is replaced.
    """
    payload = pickle.dumps(dictionary, protocol=pickle.HIGHEST_PROTOCOL)
    with open(path, 'wb') as out_file:
        out_file.write(payload)

def load_dictionary(path):
    """Return the object unpickled from the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(path, 'rb') as in_file:
        unpickler = pickle.Unpickler(in_file)
        return unpickler.load()

In [17]:
# Saving / loading models
class Checkpoint:
    """Save and restore training state as ``.pth`` files inside one directory.

    Files are named after the zero-padded epoch (``003.pth``), so the
    lexicographically last file name is treated as the most recent checkpoint
    when ``resume`` is enabled.
    """

    def __init__(self, path, resume=False):
        """Create the directory ``path`` if needed.

        :param path: directory that holds the checkpoint files
        :param resume: when True, ``load``/``load_just_model`` ignore ``id_path``
            and pick the last file of the sorted directory listing
        """
        self.path = path
        os.makedirs(path, exist_ok=True)
        self.resume = resume

    @staticmethod
    def _to_builtin(value):
        """Convert NumPy / 0-d tensor scalars to plain Python numbers.

        Plain numbers keep the saved file loadable with ``torch.load``'s safe
        (weights-only) mode; any other value is returned unchanged.
        """
        item = getattr(value, "item", None)
        if callable(item):
            try:
                return item()
            except (ValueError, RuntimeError):
                # Not a single-element container: store it as it is.
                return value
        return value

    def _read(self, id_path):
        """Load one checkpoint file, store it on ``self.checkpoint`` and return it.

        :raises RuntimeError: if no file can be selected or the file is empty
        """
        if self.resume:
            saved = sorted(os.listdir(self.path))
            if not saved:
                raise RuntimeError(
                    "No checkpoint found in '{}'.".format(self.path)
                )
            id_path = saved[-1]
        elif id_path == "":
            raise RuntimeError(
                "No checkpoint file given and resume is disabled."
            )
        # The identity map_location keeps storages where they were deserialized
        # instead of moving them to the device they were saved from.
        self.checkpoint = torch.load(
            os.path.join(self.path, id_path), map_location=lambda storage, loc: storage
        )
        if self.checkpoint is None:
            raise RuntimeError("Checkpoint empty.")
        return self.checkpoint

    def load(self, model, optimizer, id_path=""):
        """Restore model and optimizer state.

        :param id_path: checkpoint file name inside ``self.path``; ignored when
            ``resume`` is True
        :return: ``(model, optimizer, epoch, loss)``
        :raises RuntimeError: if no checkpoint can be selected or it is empty
        """
        state = self._read(id_path)
        epoch = state["epoch"]
        model.load_state_dict(state["model_state_dict"])
        optimizer.load_state_dict(state["optimizer_state_dict"])
        loss = state["loss"]
        return (model, optimizer, epoch, loss)

    def save(self, model, optimizer, epoch, loss, accuracy):
        """Write the current training state to ``<epoch zero-padded to 3>.pth``.

        :return: the file name of the checkpoint that was written
        """
        model_checkpoint = {
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "epoch": epoch,
            "loss": self._to_builtin(loss),
            "accuracy": self._to_builtin(accuracy),
        }
        checkpoint_name = "{}.pth".format(str(epoch).zfill(3))
        complete_path = os.path.join(self.path, checkpoint_name)
        torch.save(model_checkpoint, complete_path)
        return checkpoint_name

    def load_just_model(self, model, id_path=""):
        """Restore only the model weights (same file selection rules as ``load``).

        :return: ``model`` with the stored state dict loaded
        :raises RuntimeError: if no checkpoint can be selected or it is empty
        """
        state = self._read(id_path)
        model.load_state_dict(state["model_state_dict"])
        return model

In [251]:
def preprocess(sentence):
    """Normalize a sentence before tokenization.

    Lower-cases the text, replaces every character that is neither a word
    character nor whitespace with a space, collapses each run of whitespace
    (spaces, tabs, newlines) into a single space and trims both ends.

    :param sentence: raw text
    :return: the normalized text, with tokens separated by exactly one space
    """
    sentence = sentence.lower()
    # Raw strings: `\w` and `\s` are regex classes, not Python string escapes.
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    # Collapse any whitespace so the `" +"` tokenizers used elsewhere see
    # single-space separators only.
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    return sentence

In [242]:
preprocess('this Lemmatization, is! what we expected')

'this lemmatization is what we expected'

In [37]:
def find_position(sentence: str, start: int) -> int:
    """Return the number of space runs in ``sentence`` before offset ``start``.

    For a sentence whose words are separated by spaces and that has no leading
    spaces, this is the 0-based index of the word containing ``start``.

    :param sentence: the sentence to inspect
    :param start: character offset into ``sentence``
    """
    # The original body read an undefined name instead of the parameter.
    return len(re.findall(" +", sentence[:start]))

# Create word embedding with GloVe

In [109]:
def sentence2vector(sentence: str) -> Optional[torch.Tensor]:
    """Average the word embeddings of a sentence.

    The sentence is split on runs of spaces; each token is looked up in the
    module-level ``word_vectors`` mapping and falls back to ``unk_embedding``
    when it is missing. Empty tokens (produced by leading/trailing spaces or
    an empty sentence) are skipped so they are not counted as unknown words.

    :param sentence: space-separated tokens
    :return: the mean embedding, or ``None`` when the sentence has no tokens
    """
    tokens = [token for token in re.split(" +", sentence) if token]
    if not tokens:
        return None

    embeddings = [
        word_vectors[token] if token in word_vectors else unk_embedding
        for token in tokens
    ]
    # Stacked tensor has one row per word and one column per embedding feature.
    return torch.stack(embeddings).mean(dim=0)

In [155]:
def w_sentence2vector(sentence: str, target_start: int) -> Optional[torch.Tensor]:
    """Distance-weighted average of the word embeddings of ``sentence``.

    The word located by ``find_position(sentence, target_start)`` gets weight
    1.0; weights of the other words follow a linear ramp from 1.0 down to 0.1
    indexed by their distance (in words) from that target. Words missing from
    ``word_vectors`` use ``unk_embedding``.

    :param sentence: tokens separated by runs of spaces
    :param target_start: character offset of the target word in ``sentence``
    :return: the weighted sum of embeddings divided by the sum of the weights
    """
    target_idx = find_position(sentence, target_start)

    embeddings = torch.stack([
        word_vectors[token] if token in word_vectors else unk_embedding
        for token in re.split(" +", sentence)
    ])
    word_count = len(embeddings)

    # ramp[k] is the weight for a word k positions away from the target.
    ramp = torch.linspace(1, 0.1, word_count).unsqueeze(1)
    right_side = ramp[:word_count - target_idx]   # target itself and words after it
    left_side = ramp[1:target_idx + 1]            # words before it, nearest first

    # Lay the weights out in sentence order: farthest-left ... target ... farthest-right.
    per_word = torch.cat((left_side.flip(0), right_side))
    weighted = embeddings * per_word

    return weighted.sum(dim=0) / (right_side.sum() + left_side.sum())

In [23]:
def cosine_similarity(v1: torch.Tensor, v2: torch.Tensor) -> float:
    """Cosine of the angle between ``v1`` and ``v2`` as a Python float.

    Computed as the sum of the element-wise product divided by the product of
    the two norms; no epsilon is added to the denominator.
    """
    dot_product = (v1 * v2).sum()
    norm_product = torch.linalg.norm(v1) * torch.linalg.norm(v2)
    similarity = dot_product / norm_product
    return similarity.item()

In [209]:
# Map each GloVe token to its embedding tensor.
word_vectors = dict()
n_words = 400_000  # only used as the progress-bar total
# Explicit UTF-8: the platform default encoding may fail on non-ASCII tokens.
with open('embeddings/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f, total=n_words):
        # Each line is "<token> <v1> <v2> ...", separated by single spaces.
        word, *vector = line.strip().split(' ')
        word_vectors[word] = torch.tensor([float(c) for c in vector])

  0%|          | 0/400000 [00:00<?, ?it/s]

In [34]:
# save_dictionary(word_vectors, 'model/vocabulary.pkl')

In [297]:
# Bind the stemmer to a name: a later cell calls `stemmer.stem(...)`.
stemmer = PorterStemmer()
stemmer

<PorterStemmer>

In [301]:
no_emb = []      # preprocessed target words of sentence1 that have no GloVe vector
sentences = []   # every raw sentence of the training split; a later cell iterates over it
with jsonlines.open(train_path, 'r') as f:
    for line in f.iter():
        s1 = line['sentence1']
        s2 = line['sentence2']
        sentences.append(s1)
        sentences.append(s2)

        # start1/end1 are character offsets of the target word in the raw sentence.
        lemma1 = preprocess(s1[int(line['start1']):int(line['end1'])])
        if lemma1 not in word_vectors:
            no_emb.append(lemma1)

In [308]:
# Number of distinct stems among the target words without an embedding.
len({stemmer.stem(word) for word in no_emb})

49

In [252]:
# Collect every token (duplicates included) that has no entry in `word_vectors`.
no_embeddings = []
for sentence in tqdm(sentences):
    tokens = re.split(' +', preprocess(sentence))
    no_embeddings.extend(tk for tk in tokens if tk not in word_vectors)

  0%|          | 0/16000 [00:00<?, ?it/s]

# Dataset class using GloVe

In [284]:
class WiCDataset(torch.utils.data.Dataset):
    """In-memory dataset of (features, label) pairs read from a JSON Lines file.

    Each record provides ``sentence1``/``sentence2``, the character offsets
    ``start1``/``start2`` of the target word in each raw sentence, and a
    ``label``. The features are the two weighted sentence vectors concatenated.
    """

    def __init__(self, dataset_path: str):
        """Read ``dataset_path`` eagerly and keep all samples in ``self.data``."""
        self.data = []
        self.create_dataset(dataset_path)

    @staticmethod
    def _clean_with_offset(raw_sentence: str, start: int) -> Tuple[str, int]:
        """Preprocess a sentence and translate ``start`` to the cleaned text.

        ``preprocess`` can shorten the text (punctuation becomes spaces that
        are then collapsed), so an offset into the raw sentence does not in
        general point at the same word afterwards. Preprocessing the raw
        prefix on its own yields exactly the cleaned text that precedes the
        target word; the target then starts one separator after it.

        :return: ``(cleaned sentence, offset of the target in the cleaned sentence)``
        """
        clean = preprocess(raw_sentence)
        clean_prefix = preprocess(raw_sentence[:start])
        clean_start = len(clean_prefix) + 1 if clean_prefix else 0
        return clean, clean_start

    def create_dataset(self, dataset_path: str) -> None:
        """Append one ``(features, label)`` tuple to ``self.data`` per record."""
        with jsonlines.open(dataset_path, 'r') as f:
            for line in f.iter():
                clean1, start1 = self._clean_with_offset(line['sentence1'], int(line['start1']))
                clean2, start2 = self._clean_with_offset(line['sentence2'], int(line['start2']))
                s1 = w_sentence2vector(clean1, start1)
                s2 = w_sentence2vector(clean2, start2)

                sentence_vector = torch.cat((s1, s2))

                # The label is compared against the string 'True': 1.0 if equal, else 0.0.
                label = torch.tensor(float(line['label'] == 'True'), dtype=torch.float32)
                self.data.append((sentence_vector, label))

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(features, label)`` tuple stored at ``idx``."""
        return self.data[idx]

# Model Class

In [132]:
class MLP(nn.Module):
    """Feed-forward binary classifier.

    Pipeline: input linear layer, then ``num_layers`` blocks of
    (linear, activation), then a linear layer with one output followed by a
    sigmoid.
    """

    def __init__(
        self,
        n_features: int,
        num_layers: int,
        hidden_dim: int,
        activation: Callable[[torch.Tensor], torch.Tensor],
    ) -> None:
        """
        :param n_features: size of the last dimension of the input
        :param num_layers: number of hidden (linear, activation) blocks
        :param hidden_dim: width of every hidden layer
        :param activation: callable applied after each hidden linear layer
        """
        super().__init__()

        # Attribute order is kept stable: it determines the state_dict key
        # order and the order in which parameters are initialized.
        self.first_layer = nn.Linear(n_features, hidden_dim)
        self.layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_layers)]
        )
        self.activation = activation

        # Registered (so it appears in the state dict) but not applied in forward().
        self.batchnorm = nn.BatchNorm1d(hidden_dim)

        self.last_layer = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, meshgrid: torch.Tensor) -> torch.Tensor:
        """
        Map a batch of feature vectors to values in (0, 1).

        :param meshgrid: input tensor whose last dimension has `n_features` entries
        :return: the sigmoid outputs with the trailing size-1 dimension removed
        """
        # Project the input features to `hidden_dim`; no activation is applied here.
        hidden = self.first_layer(meshgrid)

        # Hidden blocks: linear layer followed by the activation.
        for hidden_layer in self.layers:
            hidden = self.activation(hidden_layer(hidden))

        # Single output unit squashed to a probability-like score.
        scores = self.sigmoid(self.last_layer(hidden))
        return scores.squeeze(-1)


# Training process

In [281]:
def correctly_predicted(predicted, gt):
    """Count predictions that match the ground truth after thresholding.

    A prediction counts as label 1.0 when it is strictly greater than 0.5,
    otherwise as 0.0.

    :return: ``(number of matching labels, size of gt's first dimension)``
    """
    hard_labels = (predicted > 0.5).float()
    matches = hard_labels == gt
    n_correct = matches.sum().item()
    n_total = gt.shape[0]
    return n_correct, n_total

def step(model, criterion, xb, yb, opt=None):
    """Evaluate the loss on one batch and, if ``opt`` is given, update the model.

    With an optimizer the sequence is: backward pass, optimizer step, then
    gradient reset.

    :return: ``(loss as a Python float, number of items in xb)``
    """
    batch_loss = criterion(model(xb), yb)
    batch_size = len(xb)

    if opt is None:
        return batch_loss.item(), batch_size

    batch_loss.backward()
    opt.step()
    opt.zero_grad()
    return batch_loss.item(), batch_size


def fit(epochs, model, criterion, opt, train_dl, valid_dl, checkpoint=None):
    """Train ``model`` and report validation loss / accuracy after every epoch.

    :param epochs: number of passes over ``train_dl``
    :param model: module to optimize
    :param criterion: loss called as ``criterion(model(xb), yb)``
    :param opt: optimizer driving the updates
    :param train_dl: iterable of ``(xb, yb)`` training batches
    :param valid_dl: iterable of ``(xb, yb)`` validation batches
    :param checkpoint: optional object whose
        ``save(model, opt, epoch, val_loss, val_acc)`` is called once per epoch
    """
    # Move batches to wherever the model lives instead of relying on a
    # notebook-global `device` that may not exist yet.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in tqdm(range(epochs)):
        model.train()
        for xb, yb in train_dl:
            step(model, criterion, xb.to(target_device), yb.to(target_device), opt)

        model.eval()
        loss_sum = 0.0   # sum of per-batch mean losses weighted by batch size
        n_correct = 0
        n_seen = 0
        with torch.no_grad():
            for xb, yb in valid_dl:
                xb = xb.to(target_device)
                yb = yb.to(target_device)

                # One forward pass serves both the loss and the accuracy.
                predictions = model(xb)
                batch_loss = criterion(predictions, yb).item()
                correct, num = correctly_predicted(predictions, yb)

                loss_sum += batch_loss * num
                n_correct += correct
                n_seen += num

        # Plain Python floats keep checkpoints free of NumPy scalar objects.
        val_loss = loss_sum / n_seen if n_seen else float('nan')
        val_acc = n_correct / n_seen if n_seen else float('nan')

        if checkpoint:
            checkpoint.save(model, opt, epoch, val_loss, val_acc)

        print(f"{epoch} \t {val_loss:.2f} \t {val_acc}")


In [285]:
# Build the in-memory datasets for both splits.
train_dataset, val_dataset = WiCDataset(train_path), WiCDataset(dev_path)

# Same batch size for both loaders; only the training loader shuffles.
loader_options = dict(batch_size=128)
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = torch.utils.data.DataLoader(val_dataset, shuffle=False, **loader_options)

In [286]:
# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Binary cross-entropy on the sigmoid outputs produced by the model.
criterion = nn.BCELoss()

model = MLP(
    n_features=600,
    num_layers=5,
    hidden_dim=150,
    activation=torch.nn.functional.relu,
)
model = model.to(device)

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.0001,
    weight_decay=0.00001,
)

# One checkpoint file per epoch is written under ./checkpoints.
checkpoint = Checkpoint(path='checkpoints')

In [287]:
# Train for 150 epochs. Each printed row is: epoch, validation loss, validation accuracy.
# NOTE(review): in the log below the validation loss rises steadily from about
# epoch 18 while accuracy stays near 0.6 - this looks like overfitting; consider
# early stopping or stronger regularization. The log also stops at epoch 56.
fit(150, model, criterion, optimizer, train_loader, val_loader, checkpoint)

  0%|          | 0/150 [00:00<?, ?it/s]

0 	 0.69 	 0.5
1 	 0.69 	 0.521
2 	 0.69 	 0.523
3 	 0.69 	 0.545
4 	 0.69 	 0.531
5 	 0.68 	 0.565
6 	 0.68 	 0.547
7 	 0.68 	 0.56
8 	 0.68 	 0.581
9 	 0.68 	 0.581
10 	 0.67 	 0.574
11 	 0.67 	 0.593
12 	 0.66 	 0.604
13 	 0.66 	 0.601
14 	 0.66 	 0.609
15 	 0.66 	 0.614
16 	 0.66 	 0.624
17 	 0.66 	 0.626
18 	 0.69 	 0.609
19 	 0.69 	 0.621
20 	 0.73 	 0.609
21 	 0.73 	 0.616
22 	 0.77 	 0.601
23 	 0.79 	 0.61
24 	 0.83 	 0.611
25 	 0.86 	 0.592
26 	 0.90 	 0.6
27 	 0.93 	 0.607
28 	 1.00 	 0.595
29 	 1.04 	 0.611
30 	 1.11 	 0.591
31 	 1.16 	 0.605
32 	 1.19 	 0.587
33 	 1.26 	 0.597
34 	 1.35 	 0.596
35 	 1.40 	 0.601
36 	 1.44 	 0.606
37 	 1.54 	 0.605
38 	 1.58 	 0.596
39 	 1.62 	 0.61
40 	 1.70 	 0.601
41 	 1.77 	 0.594
42 	 1.88 	 0.598
43 	 1.90 	 0.596
44 	 1.95 	 0.603
45 	 2.04 	 0.609
46 	 2.04 	 0.592
47 	 2.11 	 0.597
48 	 2.25 	 0.592
49 	 2.28 	 0.594
50 	 2.33 	 0.6
51 	 2.35 	 0.6
52 	 2.68 	 0.593
53 	 2.55 	 0.594
54 	 2.56 	 0.596
55 	 3.15 	 0.579
56 	 1.69 	 0

In [28]:
# NOTE(review): this cell's execution count (In [28]) is lower than that of the
# cells above, so its log presumably comes from an earlier state of the
# notebook - confirm before comparing it with the 150-epoch run.
fit(50, model, criterion, optimizer, train_loader, val_loader, checkpoint)

  0%|          | 0/50 [00:00<?, ?it/s]

0 	 0.69 	 0.5
1 	 0.69 	 0.5
2 	 0.69 	 0.5
3 	 0.69 	 0.521
4 	 0.69 	 0.551
5 	 0.68 	 0.552
6 	 0.68 	 0.552
7 	 0.68 	 0.546
8 	 0.68 	 0.559
9 	 0.69 	 0.563
10 	 0.68 	 0.573
11 	 0.68 	 0.58
12 	 0.68 	 0.569
13 	 0.67 	 0.584
14 	 0.69 	 0.59
15 	 0.67 	 0.598
16 	 0.67 	 0.591
17 	 0.67 	 0.602
18 	 0.68 	 0.604
19 	 0.70 	 0.595
20 	 0.71 	 0.606
21 	 0.72 	 0.609
22 	 0.73 	 0.607
23 	 0.79 	 0.585
24 	 0.77 	 0.608
25 	 0.82 	 0.596
26 	 0.85 	 0.609
27 	 0.88 	 0.583
28 	 0.91 	 0.597
29 	 0.97 	 0.586
30 	 1.00 	 0.589
31 	 1.09 	 0.589
32 	 1.14 	 0.605
33 	 1.19 	 0.614
34 	 1.26 	 0.596
35 	 1.32 	 0.602
36 	 1.42 	 0.608
37 	 1.48 	 0.589
38 	 1.62 	 0.597
39 	 1.68 	 0.594
40 	 1.75 	 0.596
41 	 1.81 	 0.603
42 	 1.90 	 0.596
43 	 1.98 	 0.594
44 	 2.04 	 0.596
45 	 2.11 	 0.601
46 	 2.23 	 0.596
47 	 2.37 	 0.6
48 	 2.44 	 0.601
49 	 2.58 	 0.602
