In [None]:
# Colab only: mount Google Drive so the files under /content/drive/MyDrive that the
# later cells read (checkpoints, pickles, the MSR corpus) are reachable.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install pytorch-lightning

In [None]:
!pip install transformers

# Imports

In [None]:
# Imports for the whole notebook: data handling, text preprocessing, the BERT / TF-IDF / Word2Vec
# models and the PyTorch Lightning utilities. The global random seed is fixed at the end of this cell.
import os
import pandas as pd # importing libraries
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler

import transformers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch
import torchmetrics
from torch.utils.data import DataLoader

from tqdm import tqdm

import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

seed_everything(42, workers=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Global seed set to 42


42

In [None]:
def get_msr_data(paraphrase_data_path, data_part):
  """Read one split of the MSR Paraphrase (MRPC) corpus.

  The file is read as tab-separated text whose first line is a header. For every
  remaining row, column 0 supplies the label (its first character is parsed as an
  int) and columns 3 and 4 supply the two sentences.

  Args:
    paraphrase_data_path: directory that contains the corpus files.
    data_part: file name of the split to read.

  Returns:
    A tuple (sentence_1_data, sentence_2_data, targets) of equally long lists:
    two lists of str and one list of int.
  """
  with open(os.path.join(paraphrase_data_path, data_part), 'r', encoding='utf-8') as f:
    data = f.read()

  # Drop the header, then split each row exactly once. Filtering blank lines
  # (rather than slicing off the last element) keeps the final sample even when
  # the file does not end with a newline.
  rows = [line.split('\t') for line in data.split('\n')[1:] if line.strip()]

  sentence_1_data = [fields[3] for fields in rows]
  sentence_2_data = [fields[4] for fields in rows]
  targets = [int(fields[0][0]) for fields in rows]
  return sentence_1_data, sentence_2_data, targets

# BERT

In [None]:
class BERTMSRDataset:
  """Map-style dataset that turns a sentence pair into BERT model inputs.

  Args:
    sent_1: sequence of first sentences (str).
    sent_2: sequence of second sentences (str), aligned with `sent_1`.
    targets: sequence of labels, aligned with the sentences.
    max_len: length every encoded pair is padded / truncated to. When omitted,
      the notebook-level MAX_LEN_BERT_W2V constant is used.
  """

  def __init__(self, sent_1, sent_2, targets, max_len=None):
    self.sent_1 = sent_1
    self.sent_2 = sent_2
    self.targets = targets
    self.sent_len = len(sent_1)
    self.max_len = max_len

  def __len__(self):
    return self.sent_len

  def __getitem__(self, idx):
    """Return the encoded pair at `idx` as a dict of tensors plus the raw text."""
    # Resolved lazily so the class can be defined before the constant is assigned.
    max_len = MAX_LEN_BERT_W2V if self.max_len is None else self.max_len

    target = torch.FloatTensor([self.targets[idx]])
    # collapse redundant whitespace and lowercase both sentences
    s1 = " ".join(self.sent_1[idx].split()).lower()
    s2 = " ".join(self.sent_2[idx].split()).lower()

    # truncation=True keeps every sample at exactly max_len tokens; without it an
    # over-long pair would produce longer tensors that cannot be batched together.
    inputs = TOKENIZER.encode_plus(
        s1, s2,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
    ) # encode sentences

    ids = torch.LongTensor(inputs['input_ids']) # a padded vector of encoded words
    token_type_ids = torch.LongTensor(inputs['token_type_ids']) # a mask that separates one sentence from the other
    mask = torch.LongTensor(inputs['attention_mask']) # a mask that highlights what part of the token ids the model needs to attend

    # "sent" keeps the raw text in the format: SENTENCE1 [SEP] SENTENCE2

    return {
        "ids": ids,
        "token_type_ids": token_type_ids,
        "mask": mask,
        "target": target,
        "sent": self.sent_1[idx] + '[SEP]' + self.sent_2[idx]
    }

class BERTModel(pl.LightningModule):
  """Pre-trained BERT encoder with a dropout + linear head for binary sentence-pair classification.

  Args:
    conf: configuration object forwarded to `transformers.BertModel.from_pretrained`.
    learning_rate: learning rate used by the AdamW optimizer.
  """

  def __init__(self, conf, learning_rate=1e-4):
    super().__init__()
    self.bert = transformers.BertModel.from_pretrained('bert-base-uncased', config=conf) # load pretrained bert model
    self.drop = torch.nn.Dropout(0.5) # add regularization
    self.out = torch.nn.Linear(self.bert.config.hidden_size, 1) # add a classification layer

    # BCELoss works on probabilities, which is why forward() ends with a sigmoid
    self.loss = torch.nn.BCELoss()

    # separate metric objects so train and validation statistics never mix
    self.accuracy_t = torchmetrics.Accuracy()
    self.accuracy_v = torchmetrics.Accuracy()
    self.learning_rate = learning_rate

  def forward(self, ids, token_type_ids, mask, labels=None):
    """Return one sigmoid score per input row; `labels` is accepted but not used."""
    out = self.bert(input_ids=ids, token_type_ids=token_type_ids, attention_mask=mask)['pooler_output']
    out = self.drop(out)
    out = self.out(out)
    out = torch.sigmoid(out) # pass the output of the model through the sigmoid function

    return out

  def configure_optimizers(self):
    """Return AdamW together with a cosine-annealing warm-restart scheduler."""
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, eta_min=1e-5)
    return [optimizer], [lr_scheduler]

  def _shared_step(self, batch, accuracy):
    """Run the forward pass on one batch and return (loss, accuracy of this batch)."""
    # relies on the batch dict yielding ids, token_type_ids, mask, target, sent in this order
    ids, token_type_ids, mask, target, _sent = batch.values()
    y = target.to(torch.float)
    y_hat = self.forward(ids, token_type_ids, mask)
    loss = self.loss(y_hat, y)
    preds = (y_hat > 0.5).to(torch.int) # scores above 0.5 count as the positive class
    # calling the metric also accumulates the batch into its running state
    return loss, accuracy(preds, y.to(torch.int))

  def _log_epoch_accuracy(self, name, accuracy):
    """Log and print the accumulated accuracy, then clear it for the next epoch."""
    epoch_acc = accuracy.compute()
    self.log(name, epoch_acc)
    print(name, epoch_acc)
    # compute() is called manually here, so the state has to be reset manually too;
    # otherwise the value would keep averaging over all previous epochs.
    accuracy.reset()

  def training_step(self, train_batch, batch_idx): #function for training
    loss, step_acc = self._shared_step(train_batch, self.accuracy_t)
    self.log('train_loss', loss)
    self.log('train_acc_step', step_acc)
    return loss

  def validation_step(self, val_batch, batch_idx): #function for validation
    loss, step_acc = self._shared_step(val_batch, self.accuracy_v)
    self.log('valid_loss', loss)
    self.log('valid_acc_step', step_acc)
    return loss

  def validation_epoch_end(self, validation_step_outputs):
    self._log_epoch_accuracy('valid_acc_epoch', self.accuracy_v)

  def training_epoch_end(self, outs):
    self._log_epoch_accuracy('train_acc_epoch', self.accuracy_t)

In [None]:
def predict_similarity_bert(s1, s2):
  """Score a sentence pair with the BERT classifier.

  Uses the notebook-level TOKENIZER, MAX_LEN_BERT_W2V, device and bert objects.

  Args:
    s1: first sentence (str).
    s2: second sentence (str).

  Returns:
    The model's sigmoid output for the pair as a NumPy scalar.
  """
  # truncation=True guarantees the encoded pair never exceeds the padded length
  inputs = TOKENIZER.encode_plus(s1, s2, add_special_tokens=True, max_length=MAX_LEN_BERT_W2V,
                                 padding='max_length', truncation=True)

  # unsqueeze(0) adds the batch dimension the model expects
  ids = torch.LongTensor(inputs['input_ids']).unsqueeze(0).to(device)
  token_type_ids = torch.LongTensor(inputs['token_type_ids']).unsqueeze(0).to(device)
  mask = torch.LongTensor(inputs['attention_mask']).unsqueeze(0).to(device)

  # Inference: switch dropout off and skip gradient bookkeeping. Without eval() the
  # Dropout(0.5) in front of the classification layer randomly zeroes features.
  bert.eval()
  with torch.no_grad():
    outputs = bert(ids, token_type_ids, mask)
  outputs = outputs.cpu().squeeze(0).numpy()[0]

  return outputs

# TFIDF

In [None]:
class SiameseClassifierTFIDF(pl.LightningModule):
    """Siamese feed-forward classifier over a pair of TF-IDF vectors.

    Both inputs go through the same fc1 -> ReLU -> dropout -> fc2 -> ReLU branch;
    the two 64-d results are concatenated and mapped to one sigmoid score.

    Args:
        learning_rate: learning rate used by the AdamW optimizer.
        input_dim: length of each input vector. The default 12340 is presumably the
            vocabulary size of the fitted TF-IDF vectorizer -- verify against it.
    """

    def __init__(self, learning_rate=1e-3, input_dim=12340):
        super().__init__()

        self.fc1 = torch.nn.Linear(input_dim, 512) # first projection of the input vector
        self.fc2 = torch.nn.Linear(512, 64)
        self.drop = torch.nn.Dropout(0.5)
        self.fc3 = torch.nn.Linear(128, 1) # 128 = two concatenated 64-d branch outputs
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
        self.loss = torch.nn.BCELoss()
        self.accuracy_t = torchmetrics.Accuracy()
        self.accuracy_v = torchmetrics.Accuracy()
        self.learning_rate = learning_rate

    def _encode(self, emb):
        """Shared branch applied to each of the two inputs."""
        hidden = self.drop(self.relu(self.fc1(emb)))
        return self.relu(self.fc2(hidden))

    def forward(self, emb1, emb2): #forward pass function
        """Return a sigmoid score of shape (batch, 1) for each pair of rows."""
        x1 = self._encode(emb1)
        x2 = self._encode(emb2)

        x = torch.cat([x1, x2], 1)

        x = self.fc3(x)
        x = self.sigmoid(x) #sigmoid function

        return x

    def configure_optimizers(self):
        """Return AdamW together with a cosine-annealing warm-restart scheduler."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, eta_min=1e-5)
        return [optimizer], [lr_scheduler]

    def _shared_step(self, batch, accuracy):
        """Run the forward pass on one batch and return (loss, accuracy of this batch)."""
        # relies on the batch dict yielding first input, second input, target in this order
        x1, x2, y = batch.values()
        y = y.to(torch.float)
        y_hat = self.forward(x1, x2).squeeze(1)
        loss = self.loss(y_hat, y)
        preds = (y_hat > 0.5).to(torch.int)
        # calling the metric also accumulates the batch into its running state
        return loss, accuracy(preds, y.to(torch.int))

    def _log_epoch_accuracy(self, name, accuracy):
        """Log and print the accumulated accuracy, then clear it for the next epoch."""
        epoch_acc = accuracy.compute()
        self.log(name, epoch_acc)
        print(name, epoch_acc)
        # compute() is called manually, so the state must be reset manually as well;
        # otherwise the value would keep averaging over all previous epochs.
        accuracy.reset()

    def training_step(self, train_batch, batch_idx): #training and validation
        loss, step_acc = self._shared_step(train_batch, self.accuracy_t)
        self.log('train_loss', loss)
        self.log('train_acc_step', step_acc)
        return loss

    def validation_step(self, val_batch, batch_idx):
        loss, step_acc = self._shared_step(val_batch, self.accuracy_v)
        self.log('val_loss', loss)
        self.log('valid_acc_step', step_acc)
        return loss

    def validation_epoch_end(self, validation_step_outputs):
        self._log_epoch_accuracy('valid_acc_epoch', self.accuracy_v)

    def training_epoch_end(self, outs):
        self._log_epoch_accuracy('train_acc_epoch', self.accuracy_t)

In [None]:
def predict_similarity_tfidf(sent1, sent2): #predicting similarity
  """Score a sentence pair with the TF-IDF siamese classifier.

  Uses the notebook-level `tfidfvectorizer` and `tfidf` objects.

  Args:
    sent1: first sentence (str).
    sent2: second sentence (str).

  Returns:
    The model's sigmoid output for the pair as a NumPy scalar.
  """
  # keep only lowercase letters and spaces
  sent1, sent2 = re.sub(r'[^a-z ]', '', sent1.lower()), re.sub(r'[^a-z ]', '', sent2.lower())

  # vectorize both sentences in one call: row 0 is sent1, row 1 is sent2
  vectors = torch.Tensor(tfidfvectorizer.transform([sent1, sent2]).toarray().astype(np.float32))

  # Inference: switch dropout off and skip gradient bookkeeping. Without eval() the
  # Dropout(0.5) inside the model makes the score random from call to call.
  tfidf.eval()
  with torch.no_grad():
    pred = tfidf(vectors[0:1], vectors[1:2]).cpu().numpy().flatten()[0]
  return pred

# W2V

In [None]:
class SiameseClassifierW2V(pl.LightningModule):
    """Siamese feed-forward classifier over a pair of averaged word-embedding vectors.

    Both inputs go through the same fc1 -> ReLU -> fc2 -> ReLU branch; the two 32-d
    results are concatenated and mapped to one sigmoid score.

    Args:
        learning_rate: learning rate used by the AdamW optimizer.
        input_dim: length of each input vector (300 by default).
    """

    def __init__(self, learning_rate=1e-3, input_dim=300):
        super().__init__()

        self.fc1 = torch.nn.Linear(input_dim, 128) # first projection of the document embedding
        self.fc2 = torch.nn.Linear(128, 32)
        # kept as an attribute, but deliberately not applied in the forward pass
        self.drop = torch.nn.Dropout(0.1)
        self.fc3 = torch.nn.Linear(64, 1) # 64 = two concatenated 32-d branch outputs
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
        self.loss = torch.nn.BCELoss()
        self.accuracy_t = torchmetrics.Accuracy()
        self.accuracy_v = torchmetrics.Accuracy()
        self.learning_rate = learning_rate

    def _encode(self, emb):
        """Shared branch applied to each of the two inputs (no dropout)."""
        return self.relu(self.fc2(self.relu(self.fc1(emb))))

    def forward(self, emb1, emb2): #for forward pass
        """Return a sigmoid score of shape (batch, 1) for each pair of rows."""
        x1 = self._encode(emb1)
        x2 = self._encode(emb2)

        x = torch.cat([x1, x2], 1)

        x = self.fc3(x)
        x = self.sigmoid(x) #sigmoid function

        return x

    def configure_optimizers(self):
        """Return AdamW together with a cosine-annealing warm-restart scheduler."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, eta_min=1e-5)
        return [optimizer], [lr_scheduler]

    def _shared_step(self, batch, accuracy):
        """Run the forward pass on one batch and return (loss, accuracy of this batch)."""
        # relies on the batch dict yielding first input, second input, target in this order
        x1, x2, y = batch.values()
        y = y.to(torch.float)
        y_hat = self.forward(x1, x2).squeeze(1)
        loss = self.loss(y_hat, y)
        preds = (y_hat > 0.5).to(torch.int)
        # calling the metric also accumulates the batch into its running state
        return loss, accuracy(preds, y.to(torch.int))

    def _log_epoch_accuracy(self, name, accuracy):
        """Log and print the accumulated accuracy, then clear it for the next epoch."""
        epoch_acc = accuracy.compute()
        self.log(name, epoch_acc)
        print(name, epoch_acc)
        # compute() is called manually, so the state must be reset manually as well;
        # otherwise the value would keep averaging over all previous epochs.
        accuracy.reset()

    def training_step(self, train_batch, batch_idx): #training step
        loss, step_acc = self._shared_step(train_batch, self.accuracy_t)
        self.log('train_loss', loss)
        self.log('train_acc_step', step_acc)
        return loss

    def validation_step(self, val_batch, batch_idx):
        loss, step_acc = self._shared_step(val_batch, self.accuracy_v)
        self.log('val_loss', loss)
        self.log('valid_acc_step', step_acc)
        return loss

    def validation_epoch_end(self, validation_step_outputs):
        self._log_epoch_accuracy('valid_acc_epoch', self.accuracy_v)

    def training_epoch_end(self, outs):
        self._log_epoch_accuracy('train_acc_epoch', self.accuracy_t)

class EmbeddingData:
  """Indexable collection that pairs two embedding sequences with their targets."""

  def __init__(self, embs1, embs2, targets):
    self.embs1, self.embs2, self.targets = embs1, embs2, targets

  def __len__(self):
    # the first embedding sequence defines the number of samples
    size = len(self.embs1)
    return size

  def __getitem__(self, idx):
    # one sample: both embeddings at `idx` plus the matching target
    sample = dict(emb1=self.embs1[idx], emb2=self.embs2[idx])
    sample['target'] = self.targets[idx]
    return sample

In [None]:
def get_doc_embeddings_w2v(tokenized_paded_documents):
  """Turn sequences of token ids into one averaged vector per document.

  Every id is looked up in the notebook-level `embedding_matrix`, and the
  resulting word vectors are averaged along axis 1, so every position of a
  sequence contributes to the mean.
  """
  return embedding_matrix[tokenized_paded_documents].mean(1)

def predict_similarity_w2v(sent1, sent2):
  """Score a sentence pair with the word-embedding siamese classifier.

  Uses the notebook-level `tokenizer`, MAX_LEN_BERT_W2V and `w2v` objects.

  Args:
    sent1: first sentence (str).
    sent2: second sentence (str).

  Returns:
    The model's sigmoid output for the pair as a NumPy scalar.
  """
  # keep only lowercase letters and spaces
  sent1, sent2 = re.sub(r'[^a-z ]', '', sent1.lower()), re.sub(r'[^a-z ]', '', sent2.lower())
  docs = tokenizer.texts_to_sequences([sent1, sent2])
  docs = pad_sequences(docs,maxlen=MAX_LEN_BERT_W2V,padding='post')
  # row 0 is the averaged embedding of sent1, row 1 that of sent2
  embeddings = torch.Tensor(get_doc_embeddings_w2v(docs))

  # inference only: evaluation mode and no gradient bookkeeping
  w2v.eval()
  with torch.no_grad():
    pred = w2v(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0)).cpu().numpy().flatten()[0]
  return pred

# Main

In [None]:
# Sequence length shared by the BERT encoding in predict_similarity_bert and the
# padding step in predict_similarity_w2v.
MAX_LEN_BERT_W2V = 128

In [None]:
# loading pre-trained embeddings, each word is represented as a 300 dimensional vector
# NOTE(review): `model_w2v` is not referenced anywhere else in the visible notebook (inference
# uses the pickled `embedding_matrix` instead), so this load looks unnecessary here -- confirm
# before removing it.
W2V_PATH= "/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz"
model_w2v= gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH,binary=True)

In [None]:
# Freshly initialised model objects; a later cell rebinds these names to the
# checkpoint-restored models.
w2v = SiameseClassifierW2V()
tfidf = SiameseClassifierTFIDF()

# Tokenizer and config for the same 'bert-base-uncased' weights that BERTModel loads.
TOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model_config = transformers.BertConfig.from_pretrained('bert-base-uncased')
# NOTE(review): BERTModel.forward only reads 'pooler_output', so the hidden states
# requested here look unused -- confirm.
model_config.output_hidden_states = True

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

bert = BERTModel(model_config).to(device)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# load_from_checkpoint is a classmethod that builds a new model, so it is called on the
# classes rather than on throw-away instances. .eval() puts the restored models into
# inference mode so their dropout layers are disabled when scoring sentence pairs.
w2v = SiameseClassifierW2V.load_from_checkpoint('/content/drive/MyDrive/w2v_model/emorec-epoch=99-valid_acc_epoch=0.60.ckpt').eval()
tfidf = SiameseClassifierTFIDF.load_from_checkpoint('/content/drive/MyDrive/tfidf_model/tfidf-epoch=42-valid_acc_epoch=0.58.ckpt').eval()
# BERTModel needs its `conf` argument passed again when it is rebuilt from the checkpoint
bert = BERTModel.load_from_checkpoint('/content/drive/MyDrive/text_similarity/bert-epoch=00-valid_acc_epoch=0.77.ckpt', conf=model_config).to(device).eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Directory on the mounted Drive that holds the MSR Paraphrase corpus files.
paraphrase_data_path = '/content/drive/MyDrive/msr_paraphrase'
# Each call returns (first sentences, second sentences, integer labels) for one split.
train_1, train_2, train_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_train.txt')
test_1, test_2, test_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_test.txt')

In [None]:
import pickle
# NOTE: pickle.load can execute arbitrary code -- only load files you created or trust.
# Tokenizer and embedding matrix read by predict_similarity_w2v / get_doc_embeddings_w2v.
with open('/content/drive/MyDrive/w2v_model/tok_emb.pickle', 'rb') as f:
  tokenizer, embedding_matrix = pickle.load(f)

# Vectorizer read by predict_similarity_tfidf.
with open('/content/drive/MyDrive/tfidf_model/tfidfvectorizer.pickle', 'rb') as f:
  tfidfvectorizer = pickle.load(f)

In [None]:
# Weighted soft-voting ensemble: every model scores each test pair and the scores are
# blended with fixed weights (BERT 0.6, TF-IDF 0.2, Word2Vec 0.2).
ENSEMBLE_WEIGHTS = {'tfidf': 0.2, 'w2v': 0.2, 'bert': 0.6}

ensemble_scores = []
# total= lets tqdm show a real progress bar; zip() alone has no length
for s1, s2 in tqdm(zip(test_1, test_2), total=len(test_1)):
  out_tfidf = predict_similarity_tfidf(s1, s2)
  out_bert = predict_similarity_bert(s1, s2)
  out_w2v = predict_similarity_w2v(s1, s2)

  ensemble_scores.append(
      out_tfidf * ENSEMBLE_WEIGHTS['tfidf']
      + out_w2v * ENSEMBLE_WEIGHTS['w2v']
      + out_bert * ENSEMBLE_WEIGHTS['bert']
  )

# a blended score above 0.5 is classified as a paraphrase
preds = (np.array(ensemble_scores) > 0.5).astype(int)
# accuracy on the test split (displayed as the cell output)
np.sum(np.array(test_targets) == preds)/len(test_targets)

1725it [01:02, 27.72it/s]


0.7878260869565218