In [None]:
# Mount Google Drive so the dataset and checkpoints under /content/drive are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
!pip install pytorch-lightning



In [None]:
# Imports for the whole notebook, followed by global RNG seeding

import os
import pandas as pd #importing libraries
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch
import torchmetrics
from torch.utils.data import DataLoader

import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping



# Seed every RNG that Lightning manages (workers=True also seeds dataloader workers).
SEED = 42
seed_everything(SEED, workers=True)

Global seed set to 42


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


42

In [None]:
def get_msr_data(paraphrase_data_path, data_part):
  """Read one split of the tab-separated MRPC paraphrase file.

  The first line is treated as a header and skipped. For every remaining
  non-blank line, column 0 holds the label and columns 3 and 4 hold the two
  sentences.

  Args:
    paraphrase_data_path: directory that contains the data file.
    data_part: file name of the split inside that directory.

  Returns:
    (sentence_1_data, sentence_2_data, targets): three parallel lists; the
    targets are ints parsed from the first character of column 0.
  """
  sentence_1_data, sentence_2_data, targets = [], [], []
  # 'utf-8-sig' decodes plain UTF-8 and also strips a leading BOM if present,
  # so the result no longer depends on the machine's locale encoding.
  with open(os.path.join(paraphrase_data_path, data_part), 'r', encoding='utf-8-sig') as f:
    next(f, None)  # skip the header row
    for line in f:
      # Blank lines are ignored, so the final record is kept whether or not
      # the file ends with a newline.
      if not line.strip():
        continue
      fields = line.rstrip('\n').split('\t')
      targets.append(int(fields[0][0]))
      sentence_1_data.append(fields[3])
      sentence_2_data.append(fields[4])
  return sentence_1_data, sentence_2_data, targets

In [None]:
def get_processed_df(dset, stop_words_l):
  """Build a one-column DataFrame of cleaned documents.

  Each document is split on whitespace; in every token each non-letter
  character is replaced by a space and the token is lower-cased. Tokens whose
  cleaned form is in `stop_words_l` are dropped, and the rest are joined with
  single spaces.

  Args:
    dset: iterable of raw document strings.
    stop_words_l: collection of lower-case stop words.

  Returns:
    DataFrame with a single 'documents' column holding the cleaned text.
  """
  non_letter = re.compile(r'[^a-zA-Z]')
  # Set membership instead of scanning the stop-word list for every token.
  stop_words = set(stop_words_l)

  def clean_document(text):
    # Clean each token exactly once, then filter on the cleaned form.
    # NOTE(review): a token such as "the," is cleaned to "the " (trailing
    # space), which does not match the stop word "the" and is therefore kept.
    # Behaviour is preserved here; confirm whether that is intended.
    cleaned_tokens = (non_letter.sub(' ', token).lower() for token in text.split())
    return " ".join(token for token in cleaned_tokens if token not in stop_words)

  documents_df = pd.DataFrame(dset, columns=['documents'])
  documents_df['documents'] = documents_df.documents.apply(clean_document)
  return documents_df

In [None]:
def get_doc_embeddings(tokenized_paded_documents, word_vectors=None):
  """Average the word vectors of every document.

  Args:
    tokenized_paded_documents: integer index array, one row per document;
      each entry selects a row of the word-vector table.
    word_vectors: table indexed by token id. When omitted, the module-level
      `embedding_matrix` is used, as before.
      NOTE(review): `embedding_matrix` is not defined in the visible part of
      this notebook, so callers should pass `word_vectors` explicitly or
      define that global first.

  Returns:
    Array with one averaged vector per document (mean over axis 1).
  """
  if word_vectors is None:
    word_vectors = embedding_matrix
  # Fancy indexing yields (documents, tokens, dim); the mean collapses the token axis.
  document_word_embeddings = word_vectors[tokenized_paded_documents]
  return document_word_embeddings.mean(1)

In [None]:
class EmbeddingData:
  """Map-style dataset over paired vectors and their labels.

  Item `idx` is a dict with keys 'emb1', 'emb2' and 'target' (in that order),
  taken from position `idx` of the three sequences given at construction.
  """

  def __init__(self, embs1, embs2, targets):
    # Three parallel, index-aligned sequences.
    self.embs1, self.embs2, self.targets = embs1, embs2, targets

  def __len__(self):
    # The first sequence defines the dataset length.
    return len(self.embs1)

  def __getitem__(self, idx):
    return dict(
        emb1=self.embs1[idx],
        emb2=self.embs2[idx],
        target=self.targets[idx],
    )

class SiameseClassifier(pl.LightningModule):
    """Siamese binary classifier over a pair of fixed-size input vectors.

    Both inputs go through the same tower (fc1 -> ReLU -> dropout -> fc2 ->
    ReLU); the two 64-d outputs are concatenated and mapped by fc3 + sigmoid
    to a single probability.

    Args:
        first_dim: size of each input vector.
        learning_rate: initial learning rate for AdamW.
    """

    def __init__(self, first_dim, learning_rate=1e-3):
        super().__init__()

        # Shared tower applied to each side of the pair.
        self.fc1 = torch.nn.Linear(first_dim, 512)
        self.fc2 = torch.nn.Linear(512, 64)
        self.drop = torch.nn.Dropout(0.5)
        # Head over the concatenation of the two 64-d tower outputs (2 * 64 = 128).
        self.fc3 = torch.nn.Linear(128, 1)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
        self.loss = torch.nn.BCELoss()
        # Separate accumulators so train and validation statistics never mix.
        self.accuracy_t = torchmetrics.Accuracy()
        self.accuracy_v = torchmetrics.Accuracy()
        self.learning_rate = learning_rate

    def _encode(self, emb):
        """Run one side of the pair through the shared tower."""
        hidden = self.drop(self.relu(self.fc1(emb)))
        return self.relu(self.fc2(hidden))

    def forward(self, emb1, emb2):
        """Return sigmoid scores of shape (batch, 1) for the input pair."""
        # Encode side 1 fully, then side 2, with the same weights.
        x1 = self._encode(emb1)
        x2 = self._encode(emb2)

        # Concatenate along the feature axis before the single-output head.
        x = torch.cat([x1, x2], 1)
        return self.sigmoid(self.fc3(x))

    def configure_optimizers(self):
        """AdamW with a cosine-annealing warm-restart schedule."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, eta_min=1e-5)
        return [optimizer], [lr_scheduler]

    def _shared_step(self, batch):
        """Compute loss and hard predictions for one batch.

        The batch is read by key rather than by `dict.values()` order, so it
        does not depend on how the dataset happens to order its dict.
        """
        x1, x2, y = batch['emb1'], batch['emb2'], batch['target']
        y = y.to(torch.float)
        y_hat = self.forward(x1, x2).squeeze(1)
        loss = self.loss(y_hat, y)
        # Scores lie in (0, 1); anything above 0.5 counts as the positive class.
        preds = (y_hat > 0.5).to(torch.int)
        return loss, preds, y.to(torch.int)

    def training_step(self, train_batch, batch_idx):
        """Log batch loss/accuracy and return the loss for backpropagation."""
        loss, preds, targets = self._shared_step(train_batch)
        self.log('train_loss', loss)
        self.log('train_acc_step', self.accuracy_t(preds, targets))
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Log batch loss/accuracy on the validation loader."""
        loss, preds, targets = self._shared_step(val_batch)
        self.log('val_loss', loss)
        self.log('valid_acc_step', self.accuracy_v(preds, targets))
        return loss

    def validation_epoch_end(self, validation_step_outputs):
        """Log this epoch's validation accuracy, then clear the accumulator."""
        epoch_acc = self.accuracy_v.compute()
        self.log('valid_acc_epoch', epoch_acc)
        print('valid_acc_epoch', epoch_acc)
        # Only the value returned by calling the metric is logged in the step
        # methods, so the accumulator must be reset by hand here. Without this,
        # compute() keeps averaging over every earlier epoch (and the sanity
        # check), and the early-stopping/checkpoint monitor sees a running mean.
        self.accuracy_v.reset()

    def training_epoch_end(self, outs):
        """Log this epoch's training accuracy, then clear the accumulator."""
        epoch_acc = self.accuracy_t.compute()
        self.log('train_acc_epoch', epoch_acc)
        print('train_acc_epoch', epoch_acc)
        self.accuracy_t.reset()

In [None]:
# Run configuration.
MAX_LEN = 128    # sequence length used when padding tokenised text
EPOCHS = 100     # upper bound on training epochs
batch_size = 64  # samples per DataLoader batch

# Prefer the first CUDA device when one is available, otherwise use the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

In [None]:
# Load both MRPC splits from Drive.
paraphrase_data_path = '/content/drive/MyDrive/msr_paraphrase'
train_1, train_2, train_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_train.txt')
test_1, test_2, test_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_test.txt')

# Keep the two sentences of a pair in one row so they are resampled together.
train_1_2 = np.array(list(zip(train_1, train_2)))
test_1_2 = np.array(list(zip(test_1, test_2)))

# Balance the classes by randomly duplicating minority-class pairs.
# NOTE(review): the evaluation split is oversampled as well, so the reported
# validation accuracy is measured on a rebalanced set - confirm this is intended.
ros = RandomOverSampler(random_state=0, sampling_strategy='minority')
train_1_2, train_targets = ros.fit_resample(train_1_2, train_targets)
ros = RandomOverSampler(random_state=0, sampling_strategy='minority')
test_1_2, test_targets = ros.fit_resample(test_1_2, test_targets)

# Split the resampled pairs back into per-side sentence lists.
train_1, train_2 = (list(column) for column in train_1_2.T)
test_1, test_2 = (list(column) for column in test_1_2.T)

In [None]:
# Clean every sentence list (train/test, first/second sentence) with the
# English stop-word list from NLTK.
stop_words_l = stopwords.words('english')
documents_df_train_1 = get_processed_df(train_1, stop_words_l)
documents_df_train_2 = get_processed_df(train_2, stop_words_l)
documents_df_test_1 = get_processed_df(test_1, stop_words_l)
documents_df_test_2 = get_processed_df(test_2, stop_words_l)

In [None]:
# Fit the TF-IDF vocabulary on every individual training sentence (both sides
# of each pair). The two columns are joined as Python lists on purpose:
# adding the two Series with `+` concatenates the paired strings element-wise
# without a separator, which fuses the last word of sentence 1 with the first
# word of sentence 2 and treats each pair as a single document.
tfidfvectorizer = TfidfVectorizer()
tfidfvectorizer.fit(list(documents_df_train_1.documents) + list(documents_df_train_2.documents))

# Dense float32 matrices, one row per sentence, for each split/side.
tfidf_vectors_train_1, tfidf_vectors_train_2, tfidf_vectors_test_1, tfidf_vectors_test_2 = (
    tfidfvectorizer.transform(doc.documents).toarray().astype(np.float32)
    for doc in (documents_df_train_1, documents_df_train_2, documents_df_test_1, documents_df_test_2)
)

In [None]:
# Disabled snippet: the code below is wrapped in a string literal, so it is
# not executed. If run, it would pickle the fitted `tfidfvectorizer` to the
# given Drive path.
'''import pickle
with open('/content/drive/MyDrive/tfidf_model/tfidfvectorizer.pickle', 'wb') as f:
  pickle.dump(tfidfvectorizer, f)'''

"import pickle\nwith open('/content/drive/MyDrive/tfidf_model/tfidfvectorizer.pickle', 'wb') as f:\n  pickle.dump(tfidfvectorizer, f)"

In [None]:
# Wrap the paired TF-IDF matrices and their labels as index-addressable datasets.
ds_train = EmbeddingData(
    embs1=tfidf_vectors_train_1,
    embs2=tfidf_vectors_train_2,
    targets=train_targets,
)
ds_test = EmbeddingData(
    embs1=tfidf_vectors_test_1,
    embs2=tfidf_vectors_test_2,
    targets=test_targets,
)

In [None]:
# Shuffle the training data every epoch. The training set was produced by
# random oversampling, and without shuffling the batches follow the stored
# order (presumably with the duplicated minority pairs grouped together -
# verify against the RandomOverSampler output).
train_loader = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
# Evaluation order does not matter, so the validation loader stays sequential.
val_loader = DataLoader(ds_test, batch_size=batch_size)

# model: the input width equals the TF-IDF vocabulary size
model = SiameseClassifier(first_dim=tfidf_vectors_train_1.shape[1])

In [None]:
# Stop when the epoch-level validation accuracy has not improved for 15 epochs.
early_stop_callback = EarlyStopping(monitor="valid_acc_epoch", min_delta=0.00, patience=15, verbose=False, mode="max")
# Keep only the single best checkpoint (highest validation accuracy) on Drive.
checkpoint_callback = ModelCheckpoint(
    monitor="valid_acc_epoch",
    dirpath="/content/drive/MyDrive/tfidf_model/",
    filename="tfidf-{epoch:02d}-{valid_acc_epoch:.2f}",
    save_top_k=1,
    mode="max",
)

# training
# Trainer expects an accelerator name (or Accelerator instance), not a
# torch.device, so translate the configured device into "gpu"/"cpu" and
# request a single device explicitly.
trainer = pl.Trainer(
    accelerator="gpu" if device.type == "cuda" else "cpu",
    devices=1,
    callbacks=[early_stop_callback, checkpoint_callback],
    max_epochs=EPOCHS,
    check_val_every_n_epoch=1,
)
trainer.fit(model, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name       | Type     | Params
----------------------------------------
0 | fc1        | Linear   | 6.3 M 
1 | fc2        | Linear   | 32.8 K
2 | drop       | Dropout  | 0     
3 | fc3        | Linear   | 129   
4 | relu       | ReLU     | 0     
5 | sigmoid    | Sigmoid  | 0     
6 | loss       | BCELoss  | 0     
7 | accuracy_t | Accuracy | 0     
8 | accuracy_v | Accuracy | 0     
----------------------------------------
6.4 M     Trainable params
0         Non-trainable params
6.4 M     Total params
25.406    Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


valid_acc_epoch tensor(0.6875)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5099)
train_acc_epoch tensor(0.6351)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5106)
train_acc_epoch tensor(0.6257)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5121)
train_acc_epoch tensor(0.6794)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5244)
train_acc_epoch tensor(0.7331)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5372)
train_acc_epoch tensor(0.7791)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5452)
train_acc_epoch tensor(0.8122)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5507)
train_acc_epoch tensor(0.8368)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5548)
train_acc_epoch tensor(0.8557)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5581)
train_acc_epoch tensor(0.8704)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5610)
train_acc_epoch tensor(0.8825)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5628)
train_acc_epoch tensor(0.8925)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5642)
train_acc_epoch tensor(0.9011)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5653)
train_acc_epoch tensor(0.9083)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5664)
train_acc_epoch tensor(0.9147)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5673)
train_acc_epoch tensor(0.9202)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5679)
train_acc_epoch tensor(0.9250)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5686)
train_acc_epoch tensor(0.9293)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5691)
train_acc_epoch tensor(0.9331)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5698)
train_acc_epoch tensor(0.9365)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5703)
train_acc_epoch tensor(0.9396)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5713)
train_acc_epoch tensor(0.9423)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5721)
train_acc_epoch tensor(0.9447)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5728)
train_acc_epoch tensor(0.9470)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5736)
train_acc_epoch tensor(0.9491)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5742)
train_acc_epoch tensor(0.9510)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5745)
train_acc_epoch tensor(0.9528)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5748)
train_acc_epoch tensor(0.9545)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5751)
train_acc_epoch tensor(0.9561)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5755)
train_acc_epoch tensor(0.9576)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5759)
train_acc_epoch tensor(0.9590)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5761)
train_acc_epoch tensor(0.9602)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5764)
train_acc_epoch tensor(0.9615)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5766)
train_acc_epoch tensor(0.9626)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5769)
train_acc_epoch tensor(0.9637)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5773)
train_acc_epoch tensor(0.9647)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5775)
train_acc_epoch tensor(0.9657)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5778)
train_acc_epoch tensor(0.9666)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5780)
train_acc_epoch tensor(0.9674)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5782)
train_acc_epoch tensor(0.9682)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5784)
train_acc_epoch tensor(0.9690)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5785)
train_acc_epoch tensor(0.9698)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5787)
train_acc_epoch tensor(0.9705)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5788)
train_acc_epoch tensor(0.9711)


In [None]:
def predict_similarity(sent1, sent2):
  """Score a sentence pair with the trained TF-IDF Siamese model.

  The sentences are cleaned with `get_processed_df` and vectorised with the
  fitted `tfidfvectorizer`, i.e. the same pipeline that produced the training
  features, so the input width matches what `model` was built with.

  Args:
    sent1: first raw sentence.
    sent2: second raw sentence.

  Returns:
    (pred, result): `pred` is a one-element NumPy array holding the sigmoid
    score; `result` is 'Similar' when the score exceeds 0.5, otherwise
    'Not Similar'.
  """
  processed = get_processed_df([sent1, sent2], stop_words_l)
  features = tfidfvectorizer.transform(processed.documents).toarray().astype(np.float32)
  embeddings = torch.from_numpy(features).to(model.device)

  # Inference must run with dropout disabled and without autograd bookkeeping;
  # the previous train/eval mode is restored afterwards.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      pred = model(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0)).detach().cpu().numpy()[0]
  finally:
    model.train(was_training)

  result = 'Similar' if pred > 0.5 else 'Not Similar'
  return pred, result

In [None]:
# Spot-check one (resampled) test pair: show both sentences with the gold
# label, then display the model's score and verdict.
i = 76
sent1 = test_1[i]
sent2 = test_2[i]
print(sent1, sent2, test_targets[i], sep=' \n ')
predict_similarity(sent1, sent2)