In [1]:
import json
import pandas as pd
import numpy as np
import torch
from torch import nn
from pprint import pprint
import jsonlines
import os
import re
from tqdm.notebook import tqdm
import pickle

from typing import *

In [2]:
train_path = 'data/train.jsonl'
dev_path = 'data/dev.jsonl'

# Utils

In [3]:
def save_dictionary(dictionary, path):
    with open(path, 'wb') as f:
        pickle.dump(dictionary, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_dictionary(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

# Create word embedding with GloVe

In [4]:
def sentence2vector(sentence: str) -> Optional[torch.Tensor]:
    sentences_word_vector = [word_vectors[w] for w in sentence.split(' ') if w in word_vectors]
    
    if len(sentences_word_vector) == 0:
        return None

    sentences_word_vector = torch.stack(sentences_word_vector)  # tensor shape: (#words X #features)
    return torch.mean(sentences_word_vector, dim=0)

In [5]:
def cosine_similarity(v1: torch.Tensor, v2: torch.Tensor) -> float:
    num = torch.sum(v1 * v2)
    den = torch.linalg.norm(v1) * torch.linalg.norm(v2)
    return (num / den).item()

In [6]:
word_vectors = dict()
n_words = 400_000
with open('embeddings/glove.6B.300d.txt') as f:
    for i, line in tqdm(enumerate(f), total=n_words):

        word, *vector = line.strip().split(' ')
        vector = torch.tensor([float(c) for c in vector])
        
        word_vectors[word] = vector

  0%|          | 0/400000 [00:00<?, ?it/s]

# Dataset class using GloVe

In [26]:
x1 = train_dataset[0][0]
x2 = train_dataset[1][0]

In [39]:
with jsonlines.open(train_path, 'r') as f:
    for i, line in enumerate(f.iter()):
        break

In [41]:
line['sentence1']

'In that context of coordination and integration, Bolivia holds a key play in any process of infrastructure development.'

In [None]:
sentence2vector()

In [49]:
class WiCDataset(torch.utils.data.Dataset):
    def __init__(self, dataset_path: str):
        self.data = []
        self.create_dataset(dataset_path)
    
    def create_dataset(self, dataset_path: str) -> None:
        with jsonlines.open(dataset_path, 'r') as f:
            for i, line in enumerate(f.iter()):
                s1 = sentence2vector(line['sentence1'])
                s2 = sentence2vector(line['sentence2'])
                # sentence = f"{line['sentence1']} {line['sentence2']}"
                sentence_vector = torch.cat((s1, s2))
                
                label = torch.tensor(1, dtype=torch.float32) if line['label'] == 'True' else torch.tensor(0, dtype=torch.float32)
                self.data.append((sentence_vector, label))


    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.data[idx]

# Model Class

In [122]:
class MLP(nn.Module):
    def __init__(
        self,
        n_features: int,
        num_layers: int,
        hidden_dim: int,
        activation: Callable[[torch.Tensor], torch.Tensor],
    ) -> None:
        super().__init__()

        self.first_layer = nn.Linear(in_features=n_features, out_features=hidden_dim)

        self.layers = (
            nn.ModuleList()
        )

        for i in range(num_layers):
            self.layers.append(
                nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
            )
        self.activation = activation
        
        self.batchnorm = nn.BatchNorm1d(hidden_dim)

        self.last_layer = nn.Linear(in_features=hidden_dim, out_features=1)
        
        self.sigmoid = nn.Sigmoid()

    def forward(self, meshgrid: torch.Tensor) -> torch.Tensor:
        """
        Applies transformations to each (x, y) independently

        :param meshgrid: tensor of dimensions [..., 2], where ... means any number of dims
        """
        out = meshgrid

        out = self.first_layer(
            out
        )  # First linear layer, transforms the hidden dimensions from `n_features` (embedding dimension) to `hidden_dim`
        for layer in self.layers:  # Apply `k` (linear, activation) layer
            out = layer(out)
            out = self.activation(out)
            # out = self.batchnorm(out)
            # out = nn.Dropout(p=0.2)(out)
        out = self.last_layer(
            out
        )  # Last linear layer to bring the `hiddem_dim` features to a binary space (`True`/`False`)
        
        out = self.sigmoid(out)
        return out.squeeze(-1)


# Training process

In [71]:
def correctly_predicted(predicted, gt):
    predicted_labels = (predicted > 0.5).float()

    return (predicted_labels == gt).sum().item(), gt.shape[0]

def step(model, criterion, xb, yb, opt=None):
    loss = criterion(model(xb), yb)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item(), len(xb)


def fit(epochs, model, criterion, opt, train_dl, valid_dl):
    for epoch in tqdm(range(epochs)):
        model.train()
        for xb, yb in train_dl:
            xb = xb.to(device)
            yb = yb.to(device)
            step(model, criterion, xb, yb, opt)

        model.eval()
        with torch.no_grad():
            # losses, nums = zip(*[step(model, criterion, xb, yb) for xb, yb in valid_dl])
            losses = []
            nums = []
            corrects = []
            for xb, yb in valid_dl:
                xb = xb.to(device)
                yb = yb.to(device)
                
                loss, num = step(model, criterion, xb, yb)
                correct, _ = correctly_predicted(model(xb), yb)
                losses.append(loss)
                nums.append(num)
                corrects.append(correct)
                

        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
        val_acc = np.sum(corrects) / np.sum(nums)

        print(f"{epoch} \t {val_loss} \t {val_acc}")


In [91]:
train_dataset = WiCDataset(train_path)
val_dataset = WiCDataset(dev_path)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

In [132]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
criterion = nn.BCELoss()#.to(device)
model = MLP(n_features=600,
            num_layers=5, 
            hidden_dim=150, 
            activation=torch.nn.functional.glu).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.00001)

In [133]:
fit(50, model, criterion, optimizer, train_loader, val_loader)

  0%|          | 0/50 [00:00<?, ?it/s]

RuntimeError: mat1 dim 1 must match mat2 dim 0