In [1]:
!pip install sentence_transformers
!pip install pandas
!pip install matplotlib
!pip install spacy
!pip install nltk
!pip install torchtext
!pip install transformers
!pip install datasets

[0m

In [2]:
#!wget https://nlp.stanford.edu/data/glove.6B.zip
# !apt install unzip
# !unzip glove.6B.zip

In [3]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import numpy as np
import pandas as pd

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout



#Check if dataset exsist. If not, download and extract  it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)



# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")

train_samples = []
dev_samples = []
test_samples = []

train_dict = {
    "s1": [],
    "s2": [],
    "score": []
}
test_dict = {
    "s1": [],
    "s2": [],
    "score": []
}
dev_dict = {
     "s1": [],
    "s2": [],
    "score": []
}
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        s1 = row["sentence1"].lower()
        s2 = row["sentence2"].lower()

        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)

        if row['split'] == 'dev':
            dev_samples.append(inp_example)
            dev_dict["s1"].append(s1)
            dev_dict["s2"].append(s2)
            dev_dict["score"].append(score)
        elif row['split'] == 'test':
            test_samples.append(inp_example)
            test_dict["s1"].append(s1)
            test_dict["s2"].append(s2)
            test_dict["score"].append(score)
        else:
            train_samples.append(inp_example)
            train_dict["s1"].append(s1)
            train_dict["s2"].append(s2)
            train_dict["score"].append(score)

train_df = pd.DataFrame(train_dict)
test_df = pd.DataFrame(test_dict)
dev_df = pd.DataFrame(dev_dict)

2023-10-08 04:28:20 - Read STSbenchmark train dataset


In [4]:
train_samples[0].texts, train_samples[0].label

(['A plane is taking off.', 'An air plane is taking off.'], 1.0)

In [5]:
from torchtext.data import get_tokenizer

tokenizer = get_tokenizer("basic_english") ## We'll use tokenizer available from PyTorch

tokenizer("Hello, How are you?")

['hello', ',', 'how', 'are', 'you', '?']

In [6]:
from torchtext.vocab import GloVe

global_vectors = GloVe(name='6B', dim=300)

2023-10-08 04:28:20 - Loading vectors from .vector_cache/glove.6B.300d.txt.pt


In [7]:
#global_vectors.get_vecs_by_tokens(["a", "banana"])
global_vectors.vectors.shape, global_vectors.itos[-1]

(torch.Size([400000, 300]), 'sandberger')

In [8]:
try:
    import gc
    try:
        model.cpu()
        del model
    except Exception as e:
        print(e)
    gc.collect()
    torch.cuda.empty_cache()
except Exception as e:
    print(e)

name 'model' is not defined
name 'torch' is not defined


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight.to(x.device)


class Block(nn.Module):
    def __init__(self, n_hidden):
        super().__init__()
        self.norm1 = RMSNorm(1)
        self.fc = nn.Linear(n_hidden, n_hidden)
        self.relu = nn.ReLU()

    def forward(self, x):
        x1 = self.norm1(x)
        x2 = self.relu(x1)
        x3 = x + x2
        return x3
        

class attbilstm(nn.Module):
    def __init__(self, vocab_size, config, vec=None):
        super().__init__()
        self.hidden_dim = config['hidden_dim']
        self.batch_size = config['batch_size']
        self.emb_dim = config['emb_dim']
        self.gpu = config['gpu']
        
        self.embedding = nn.Embedding(vocab_size, config['emb_dim'])
        if vec is not None:
            self.embedding.weight.data.copy_(vec) #load pretrained
            self.embedding.weight.requires_grad = False #non-trainable
        self.encoder = nn.LSTM(config['emb_dim'], config['hidden_dim'], num_layers=config['nlayers'], bidirectional=config['bidir'], dropout=config['dropout'])
        self.fc = nn.Linear(config['hidden_dim'] * 2, config['hidden_dim'] * 2)
        self.dropout = nn.Dropout(config['dropout'])
        self.layers = []
        for i in range(config["num_layers"]):
            self.layers.append(Block(config['hidden_dim'] * 2))
        # self.hidden = nn.Parameters(self.batch_size, self.hidden_dim)
        self.sigmoid = nn.Sigmoid()
    
    def attnetwork(self, encoder_out, final_hidden):
        hidden = final_hidden.squeeze(0)
        #M = torch.tanh(encoder_out)
        attn_weights = torch.bmm(encoder_out, hidden.unsqueeze(2)).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, 1)
        new_hidden = torch.bmm(encoder_out.transpose(1,2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        # print (wt.shape, new_hidden.shape)
        # new_hidden = torch.tanh(new_hidden)
        # print ('UP:', new_hidden.shape)
        # print("soft_attn_weights", soft_attn_weights.shape)
        
        return new_hidden, soft_attn_weights
    
    def forward(self, sequence):
        #print(sequence.shape)
        emb_input = self.embedding(sequence)    
        inputx = self.dropout(emb_input)
        output, (hn, cn) = self.encoder(inputx)
        # print("shape output, hn, cn", output.shape, hn.shape, cn.shape)
        fbout = output[:, :, :self.hidden_dim]+ output[:, :, self.hidden_dim:] #sum bidir outputs F+B
        fbout = fbout.permute(1,0,2)
        fbhn = (hn[-2,:,:]+hn[-1,:,:]).unsqueeze(0)
        # print(fbout.shape, fbhn.shape)
        attn_out, attn_weight = self.attnetwork(fbout, fbhn)
        # print("att shape", attn_out.shape, attn_weight.shape)
        #attn1_out = self.attnetwork1(output, hn)
        # logits = self.fc(attn_out)
        # print(fbout.shape, output.shape, attn_weight.permute(1,0).unsqueeze(-1).shape)
        out = output * attn_weight.permute(1,0).unsqueeze(-1)
        out = torch.mean(out, dim=(1))
        for l in self.layers:
            out = l(out)
        #print("out, attn", out.shape, attn_weight.shape)
        # out = self.sigmoid(out)
        return out
        

In [10]:
config = {
    "hidden_dim": 256,
    "batch_size": 4,
    "emb_dim": 300,
    "nlayers": 2,
    "dropout": 0.5,
    "bidir": True,
    "gpu": True,
    "out_dim": 256,
    "num_layers": 8
}
global_vectors.vectors.shape
vectors = torch.vstack([global_vectors.vectors, torch.zeros((1, 300))])
vocab_size = vectors.shape[0]
model = attbilstm(vocab_size=vocab_size, config=config, vec=vectors)

output = model.forward(torch.from_numpy(np.array([[1, 0, 0, 0], [2,3,0, 0], [4,5,6, 0]])))
output, output.shape

(tensor([[-7.2892e-03,  2.7604e+00,  7.0965e-01,  ..., -1.6619e-02,
           2.5715e+00, -1.0113e-02],
         [-1.1048e-02,  4.1293e+00,  1.2515e+00,  ..., -8.3236e-03,
           3.8774e+00, -4.1962e-03],
         [-1.2167e-02,  3.9126e+00, -3.2774e-03,  ..., -3.1512e-03,
           5.0554e-01,  1.8145e+00]], grad_fn=<AddBackward0>),
 torch.Size([3, 512]))

In [11]:
# model.to("cuda")
# torch.save(model.state_dict(), "test.pt")
# model.load_state_dict(torch.load("test.pt"))

In [12]:
global_vectors.vectors.shape
torch.vstack([global_vectors.vectors, torch.zeros((1, 300))]).shape, global_vectors.vectors.dtype

(torch.Size([400001, 300]), torch.float32)

In [13]:
#global_vectors.unk_init = nn.init.xavier_uniform_

def process_df(train_df, mx = 64):
    t = []
    for line in train_df["s1"]:
        parts = tokenizer(line)
        l = []
        for k in parts:
            if k in global_vectors.stoi:
                l.append(global_vectors.stoi[k])
            # else:
                # l.append(global_vectors.vectors.shape[0]-1)
        l.extend([vectors.shape[0]-1] * (mx - len(l)))
        t.append(np.array(l))
        
    
    train_df["i1"] = t
    
    t = []
    for line in train_df["s2"]:
        parts = tokenizer(line)
        l = []
        for k in parts:
            if k in global_vectors.stoi:
                l.append(global_vectors.stoi[k])
            # else:
            #     l.append(global_vectors.vectors.shape[0]-1)
        l.extend([vectors.shape[0]-1] * (mx - len(l)))
        t.append(np.array(l))
    train_df["i2"] = t

process_df(train_df)
process_df(test_df)


In [14]:
train_df

Unnamed: 0,s1,s2,score,i1,i2
0,a plane is taking off.,an air plane is taking off.,1.00,"[7, 1313, 14, 582, 138, 2, 400000, 400000, 400...","[29, 325, 1313, 14, 582, 138, 2, 400000, 40000..."
1,a man is playing a large flute.,a man is playing a flute.,0.76,"[7, 300, 14, 697, 7, 426, 16677, 2, 400000, 40...","[7, 300, 14, 697, 7, 16677, 2, 400000, 400000,..."
2,a man is spreading shreded cheese on a pizza.,a man is spreading shredded cheese on an uncoo...,0.76,"[7, 300, 14, 6002, 5795, 13, 7, 9388, 2, 40000...","[7, 300, 14, 6002, 20256, 5795, 13, 29, 53867,..."
3,three men are playing chess.,two men are playing chess.,0.52,"[87, 301, 32, 697, 7162, 2, 400000, 400000, 40...","[55, 301, 32, 697, 7162, 2, 400000, 400000, 40..."
4,a man is playing the cello.,a man seated is playing the cello.,0.85,"[7, 300, 14, 697, 0, 19641, 2, 400000, 400000,...","[7, 300, 9928, 14, 697, 0, 19641, 2, 400000, 4..."
...,...,...,...,...,...
5744,severe gales as storm clodagh hits britain,merkel pledges nato solidarity with latvia,0.00,"[2546, 43694, 19, 1836, 213914, 2042, 695, 400...","[6648, 7700, 945, 6132, 17, 7211, 400000, 4000..."
5745,dozens of egyptians hostages taken by libyan t...,egyptian boat crash death toll rises as more b...,0.00,"[2209, 3, 13007, 4005, 492, 21, 7176, 2712, 19...","[2434, 2377, 2005, 336, 2493, 4890, 19, 56, 17..."
5746,president heading to bahrain,president xi: china to continue help to fight ...,0.00,"[90, 3339, 4, 6700, 400000, 400000, 400000, 40...","[90, 9163, 132, 4, 660, 275, 4, 838, 19127, 40..."
5747,"china, india vow to further bilateral ties",china scrambles to reassure jittery stock traders,0.00,"[132, 1, 474, 12887, 4, 489, 2902, 1445, 40000...","[132, 45583, 4, 12182, 21350, 452, 3182, 40000..."


In [15]:
from datasets import IterableDataset
from torch.utils.data import Dataset, DataLoader

In [16]:
class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        t = self.df.iloc[idx]
        return t[3],t[4], t[2]
    

In [None]:
import torch.nn as nn
import tqdm
from scipy import stats
ds = CustomDataset(train_df)
dl = DataLoader(ds, batch_size=16, shuffle=True) 

test_ds = CustomDataset(test_df)
test_dl = DataLoader(test_ds, batch_size=4)

import torch
from torch import nn, Tensor
from typing import Iterable, Dict



class CosineSimilarityLoss(nn.Module):

    def __init__(self, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity()):
        super(CosineSimilarityLoss, self).__init__()
        self.loss_fct = loss_fct
        self.cos_score_transformation = cos_score_transformation


    def forward(self, embeddings, labels: Tensor):
        emb_1 = embeddings[0]
        emb_2 = embeddings[1]
        output = self.cos_score_transformation(nn.functional.cosine_similarity(emb_1, emb_2))
        return self.loss_fct(output, labels.view(-1))

def predict(s1, s2):
    emb_1 = model(s1)
    emb_2 = model(s2)
    out = nn.functional.cosine_similarity(emb_1, emb_2)
    return out

def test(model, dl, crition, name="test"):
    print("test model ", name)
    s1 = []
    s2 = []
    avg_loss = 0
    for i,b in enumerate(dl):
        i1, i2, s = b
        emb_1 = model(i1.to(device))
        emb_2 = model(i2.to(device))
        score = nn.functional.cosine_similarity(emb_1, emb_2)
        s1.append(score.cpu().detach().numpy())
        s2.append(s.cpu().detach().numpy())
        #print(score, s)
        avg_loss += crition([emb_1,emb_2], s.to(device))
    x = np.hstack(s1)
    y = np.hstack(s2)
    p = stats.spearmanr(x,y)
    print(f"avg_{name}_loss = {avg_loss/len(dl)}, p= {p}")

device = "cuda"
model.to(device)
crition = CosineSimilarityLoss().to(device)
optimizer = torch.optim.Adam(lr=0.0001, params = model.parameters())

num_epoch = 10000
save_every = 100
#test(model, test_dl)



for epoch in tqdm.tqdm(range(num_epoch)):
    print("-" * 10, epoch + 1)
    avg_loss = 0
    model.train()
    for i, b in enumerate(dl):
        i1, i2, s = b
        #print(i1.shape, i2.shape, s.shape)
        s = s.to(torch.float32)
        optimizer.zero_grad()
        o1 = model(i1.to(device))
        o2 = model(i2.to(device))
        #print("o1.shape, o2.shape", o1.shape, o2.shape)
        loss = crition([o1,o2], s.to(device))
        loss.backward()
        optimizer.step()

        avg_loss += loss.item()
        # if (i + 1) % 50 == 0:
            # print(f"loss = {loss.item()}, epoch={epoch}")
    
    print(f"avg_train_loss = {avg_loss/len(dl)}")
    test(model, dl, crition, "train")
    test(model, test_dl, crition, "test")
    if  (epoch + 1 ) % save_every == 0:
        try:
            torch.save(model.state_dict(), f"models/model-{epoch+1}.pt")
        except Exception as e:
            print(e)

  return t[3],t[4], t[2]


---------- 1
avg_train_loss = 0.13091778094983764
test model  train
avg_train_loss = 0.1039533252083504, p= SignificanceResult(statistic=0.0770661304859091, pvalue=4.893546241267359e-09)
test model  test


  0%|          | 1/10000 [00:08<22:50:18,  8.22s/it]

avg_test_loss = 0.14002217002289824, p= SignificanceResult(statistic=0.09649716068432411, pvalue=0.0003325362758987969)
---------- 2
avg_train_loss = 0.10516761852842238
test model  train
avg_train_loss = 0.09978624621332445, p= SignificanceResult(statistic=0.06580695774364458, pvalue=5.915540846895201e-07)
test model  test


  0%|          | 2/10000 [00:15<21:56:59,  7.90s/it]

avg_test_loss = 0.13030988721273287, p= SignificanceResult(statistic=0.055774976816270784, pvalue=0.03836519716866538)
---------- 3
avg_train_loss = 0.10109734749938878
test model  train
avg_train_loss = 0.096375494507379, p= SignificanceResult(statistic=0.09131676973609479, pvalue=4.0100773478644114e-12)
test model  test


  0%|          | 3/10000 [00:23<22:09:48,  7.98s/it]

avg_test_loss = 0.12677762630302936, p= SignificanceResult(statistic=0.11737694733725153, pvalue=1.2428030766860527e-05)
---------- 4
avg_train_loss = 0.09850424243551162
test model  train
avg_train_loss = 0.0976708659588712, p= SignificanceResult(statistic=0.0717751462173519, pvalue=5.094208294205859e-08)
test model  test


  0%|          | 4/10000 [00:31<22:04:55,  7.95s/it]

avg_test_loss = 0.1254943662777841, p= SignificanceResult(statistic=0.11661966429506937, pvalue=1.4147949057458374e-05)
---------- 5
avg_train_loss = 0.09439895167015493
test model  train


In [None]:
np.hstack([np.array([1,2]), np.array([3])])