In [None]:
!pip install pytorch-lightning

In [None]:
from google.colab import drive

# Mount Google Drive; the corpus path and the checkpoint directory used further
# down both live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#This creates vectors out of documents
import os
import pandas as pd #importing libraries
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

from imblearn.over_sampling import RandomOverSampler
import torch
import torchmetrics
from torch.utils.data import DataLoader

import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

seed_everything(42, workers=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Global seed set to 42


42

In [None]:
def get_msr_data(paraphrase_data_path, data_part):
  """Read one split of the MSR paraphrase corpus from a tab-separated file.

  The first line of the file is treated as a header and skipped. Every other
  non-blank line must have at least five tab-separated columns: the label in
  column 0 (only its first character is parsed) and the two sentences in
  columns 3 and 4.

  Args:
    paraphrase_data_path: directory that contains the corpus files.
    data_part: file name of the split to read.

  Returns:
    (sentence_1_data, sentence_2_data, targets): three lists of equal length
    holding the first sentences, the second sentences and the integer labels.

  Raises:
    IndexError: if a non-blank data line has fewer than five columns.
    ValueError: if the first character of the label column is not a digit.
  """
  sentence_1_data, sentence_2_data, targets = [], [], []
  with open(os.path.join(paraphrase_data_path, data_part), 'r', encoding='utf-8') as f:
    next(f, None)  # skip the header row (also a no-op on an empty file)
    for line in f:
      # Iterating the file keeps the last record even without a trailing newline.
      if not line.strip():
        continue
      fields = line.rstrip('\n').split('\t')
      label, first, second = int(fields[0][0]), fields[3], fields[4]
      targets.append(label)
      sentence_1_data.append(first)
      sentence_2_data.append(second)
  return sentence_1_data, sentence_2_data, targets

In [None]:
def get_processed_df(dset, stop_words_l):
  """Build a one-column DataFrame of cleaned documents.

  Each text has every non-letter character replaced by a space, is lower-cased,
  split on whitespace, stripped of stop words and re-joined with single spaces.
  Cleaning happens before the stop-word check, so a stop word that touches
  punctuation (e.g. "the,") is removed as well.

  Args:
    dset: iterable of raw text strings.
    stop_words_l: iterable of lower-case stop words to drop.

  Returns:
    pandas.DataFrame with a single 'documents' column holding the cleaned text.
  """
  stop_words = set(stop_words_l)  # constant-time membership tests

  def _clean(text):
    tokens = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    return " ".join(tok for tok in tokens if tok not in stop_words)

  documents_df = pd.DataFrame(dset, columns=['documents'])
  documents_df['documents'] = documents_df.documents.apply(_clean)
  return documents_df

In [None]:
def get_doc_embeddings(documents_df):
  """Infer one doc2vec vector per row of `documents_df`.

  Uses the notebook-level `model_d2v` model and `VEC_SIZE` constant.

  Args:
    documents_df: DataFrame with a 'documents' column of cleaned text.

  Returns:
    numpy array of shape (len(documents_df), VEC_SIZE); row k is the vector
    inferred for the k-th document of `documents_df`.
  """
  documents = documents_df.documents
  document_embeddings = np.zeros((len(documents), VEC_SIZE))
  # Embed the documents of the frame that was passed in (not a fixed global
  # frame); enumerate gives positional rows regardless of the frame's index labels.
  for row, text in enumerate(documents):
      document_embeddings[row] = model_d2v.infer_vector(word_tokenize(text))
  return document_embeddings

In [None]:
class EmbeddingData:
  """Indexable collection of (first embedding, second embedding, target) samples.

  Implements `__len__` and `__getitem__`, which is what the DataLoader calls
  further down in the notebook rely on.
  """

  def __init__(self, embs1, embs2, targets):
    # Three parallel, index-aligned collections.
    self.embs1, self.embs2, self.targets = embs1, embs2, targets

  def __len__(self):
    # The sample count is taken from the first embedding collection.
    return len(self.embs1)

  def __getitem__(self, idx):
    # Key order (emb1, emb2, target) is kept stable for consumers that unpack by position.
    sample = dict(emb1=self.embs1[idx], emb2=self.embs2[idx])
    sample['target'] = self.targets[idx]
    return sample

class SiameseClassifier(pl.LightningModule):
    """Binary classifier over pairs of document vectors.

    Both inputs pass through the same `fc1 -> ReLU -> Dropout -> fc2 -> ReLU`
    stack (shared weights). The two 32-dimensional results are concatenated
    and mapped by `fc3` + sigmoid to one score in (0, 1).

    Args:
        learning_rate: learning rate handed to the AdamW optimizer.
    """

    def __init__(self, learning_rate=1e-3):
        super().__init__()

        self.fc1 = torch.nn.Linear(VEC_SIZE, 128)
        self.fc2 = torch.nn.Linear(128, 32)
        self.drop = torch.nn.Dropout(0.1)
        self.fc3 = torch.nn.Linear(64, 1)  # 64 = two concatenated 32-d branch outputs
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
        self.loss = torch.nn.BCELoss()  # expects probabilities, hence the sigmoid in forward
        # Separate metric objects so train and validation statistics never mix.
        self.accuracy_t = torchmetrics.Accuracy()
        self.accuracy_v = torchmetrics.Accuracy()
        self.learning_rate = learning_rate

    def _encode(self, emb):
        """Run one input through the shared branch and return its 32-d representation."""
        hidden = self.drop(self.relu(self.fc1(emb)))
        return self.relu(self.fc2(hidden))

    def forward(self, emb1, emb2):
        """Score a batch of embedding pairs.

        Args:
            emb1: tensor of first embeddings, one row per pair.
            emb2: tensor of second embeddings, row-aligned with `emb1`.

        Returns:
            Tensor with one sigmoid output per pair along dim 1 (size 1).
        """
        x1 = self._encode(emb1)
        x2 = self._encode(emb2)
        # Concatenate the two branch outputs feature-wise before the output layer.
        x = torch.cat([x1, x2], 1)
        return self.sigmoid(self.fc3(x))

    def configure_optimizers(self):
        """Return AdamW plus a cosine-annealing-with-warm-restarts LR schedule."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, eta_min=1e-5)
        return [optimizer], [lr_scheduler]

    def _shared_step(self, batch):
        """Compute loss, thresholded predictions and integer targets for one batch."""
        # Read the batch by key rather than by dict value order.
        y = batch['target'].to(torch.float)
        y_hat = self.forward(batch['emb1'], batch['emb2']).squeeze(1)
        loss = self.loss(y_hat, y)
        preds = (y_hat > 0.5).to(torch.int)  # score above 0.5 -> positive class
        return loss, preds, y.to(torch.int)

    def training_step(self, train_batch, batch_idx):
        """Log loss and batch accuracy for one training batch; return the loss."""
        loss, preds, targets = self._shared_step(train_batch)
        self.log('train_loss', loss)
        self.log('train_acc_step', self.accuracy_t(preds, targets))
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Log loss and batch accuracy for one validation batch; return the loss."""
        loss, preds, targets = self._shared_step(val_batch)
        self.log('val_loss', loss)
        self.log('valid_acc_step', self.accuracy_v(preds, targets))
        return loss

    def validation_epoch_end(self, validation_step_outputs):
        """Log and print the validation accuracy accumulated over this epoch."""
        epoch_acc = self.accuracy_v.compute()
        self.log('valid_acc_epoch', epoch_acc)
        print('valid_acc_epoch', epoch_acc)
        # Clear the accumulated state; otherwise the next epoch's value would be a
        # running average over every batch seen so far (sanity check included).
        self.accuracy_v.reset()

    def training_epoch_end(self, outs):
        """Log and print the training accuracy accumulated over this epoch."""
        epoch_acc = self.accuracy_t.compute()
        self.log('train_acc_epoch', epoch_acc)
        print('train_acc_epoch', epoch_acc)
        self.accuracy_t.reset()  # start the next epoch from a clean metric state

In [None]:
# Hyperparameters and runtime configuration shared by the cells below.
VEC_SIZE = 20      # dimensionality of the doc2vec vectors and of the classifier input
MAX_LEN = 128      # not referenced by any other cell visible in this notebook
EPOCHS = 100       # upper bound on training epochs handed to the Trainer
batch_size = 64    # batch size for both DataLoaders
# First CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Directory on the mounted Drive that holds the corpus files.
paraphrase_data_path = '/content/drive/MyDrive/msr_paraphrase'

# --- training split ---------------------------------------------------------
train_1, train_2, train_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_train.txt')
# Pair the sentences row-wise so that resampling keeps each pair together.
train_1_2 = np.array(list(zip(train_1, train_2)))
ros = RandomOverSampler(random_state=0, sampling_strategy='minority')
train_1_2, train_targets = ros.fit_resample(train_1_2, train_targets)
train_1, train_2 = list(train_1_2[:, 0]), list(train_1_2[:, 1])

# --- test split -------------------------------------------------------------
# NOTE(review): the test split is oversampled as well, so the validation numbers
# reported later are measured on a class-balanced resample, not on the original
# test distribution — confirm this is intended.
test_1, test_2, test_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_test.txt')
test_1_2 = np.array(list(zip(test_1, test_2)))
ros = RandomOverSampler(random_state=0, sampling_strategy='minority')
test_1_2, test_targets = ros.fit_resample(test_1_2, test_targets)
test_1, test_2 = list(test_1_2[:, 0]), list(test_1_2[:, 1])

In [None]:
# English stop-word list from NLTK, shared by all four cleaning calls below.
stop_words_l = stopwords.words('english')

# Clean each sentence list into its own one-column DataFrame.
documents_df_train_1 = get_processed_df(train_1, stop_words_l)
documents_df_train_2 = get_processed_df(train_2, stop_words_l)
documents_df_test_1 = get_processed_df(test_1, stop_words_l)
documents_df_test_2 = get_processed_df(test_2, stop_words_l)

In [None]:
def _tag_documents(documents, first_tag):
  """Tokenize each document and tag it with consecutive integers starting at `first_tag`."""
  return [TaggedDocument(words=word_tokenize(doc), tags=[tag]) for tag, doc in enumerate(documents, start=first_tag)]

# The four lists share one running tag sequence so every document gets a distinct tag.
tagged_data_train_1 = _tag_documents(documents_df_train_1.documents, 0)
_next_tag = len(tagged_data_train_1)
tagged_data_train_2 = _tag_documents(documents_df_train_2.documents, _next_tag)
_next_tag += len(tagged_data_train_2)

tagged_data_test_1 = _tag_documents(documents_df_test_1.documents, _next_tag)
_next_tag += len(tagged_data_test_1)
tagged_data_test_2 = _tag_documents(documents_df_test_2.documents, _next_tag)

In [None]:
# Build the combined corpus once instead of re-concatenating it for every call.
# NOTE(review): the test sentences are part of the doc2vec training corpus, so the
# embedding model has seen the evaluation text — confirm this is acceptable.
tagged_corpus = tagged_data_train_1 + tagged_data_train_2 + tagged_data_test_1 + tagged_data_test_2

# `epochs` replaces the `iter` keyword, which gensim deprecated and removed in 4.0.
model_d2v = Doc2Vec(vector_size=VEC_SIZE, window=100, epochs=20, dm=1, workers=2)
model_d2v.build_vocab(tagged_corpus)
model_d2v.train(tagged_corpus, total_examples=len(tagged_corpus), epochs=model_d2v.epochs)



In [None]:
# One embedding matrix per cleaned frame, cast to float32
# (presumably to match the classifier's default parameter dtype — TODO confirm).
doc_emb_train_1 = get_doc_embeddings(documents_df_train_1).astype(np.float32)
doc_emb_train_2 = get_doc_embeddings(documents_df_train_2).astype(np.float32)
doc_emb_test_1 = get_doc_embeddings(documents_df_test_1).astype(np.float32)
doc_emb_test_2 = get_doc_embeddings(documents_df_test_2).astype(np.float32)

In [None]:
'''scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(np.vstack((doc_emb_train_1, doc_emb_train_2)))
doc_emb_train_1 = scaler.transform(doc_emb_train_1)
doc_emb_train_2 = scaler.transform(doc_emb_train_2)
doc_emb_test_1 = scaler.transform(doc_emb_test_1)
doc_emb_test_2 = scaler.transform(doc_emb_test_2)'''

'scaler = sklearn.preprocessing.StandardScaler()\nscaler.fit(np.vstack((doc_emb_train_1, doc_emb_train_2)))\ndoc_emb_train_1 = scaler.transform(doc_emb_train_1)\ndoc_emb_train_2 = scaler.transform(doc_emb_train_2)\ndoc_emb_test_1 = scaler.transform(doc_emb_test_1)\ndoc_emb_test_2 = scaler.transform(doc_emb_test_2)'

In [None]:
# Wrap the embedding matrices and labels of each split for the DataLoaders.
ds_train = EmbeddingData(embs1=doc_emb_train_1, embs2=doc_emb_train_2, targets=train_targets)
ds_test = EmbeddingData(embs1=doc_emb_test_1, embs2=doc_emb_test_2, targets=test_targets)

In [None]:
# Shuffle the training data every epoch: without it the batches are identical and
# identically ordered in every epoch, and the rows added by the oversampler stay
# grouped wherever the sampler placed them. Validation order is left untouched.
train_loader = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(ds_test, batch_size=batch_size)

# model
model = SiameseClassifier()

In [None]:
# Stop when the epoch-level validation accuracy has not improved for `patience` epochs.
early_stop_callback = EarlyStopping(
    monitor="valid_acc_epoch",
    min_delta=0.00,
    patience=90,
    verbose=False,
    mode="max",
)
# Keep only the single best checkpoint, ranked by the same metric.
checkpoint_callback = ModelCheckpoint(
    monitor="valid_acc_epoch", dirpath="/content/drive/MyDrive/d2v_model/",
    filename="d2v-{epoch:02d}-{valid_acc_epoch:.2f}", save_top_k=1, mode="max",
)

# training
# NOTE(review): `accelerator` receives a torch.device object here; the Trainer
# documentation describes a string or an Accelerator instance — confirm that the
# installed version accepts this and actually selects the intended device.
trainer = pl.Trainer(
    accelerator=device,
    callbacks=[early_stop_callback, checkpoint_callback],
    max_epochs=EPOCHS,
    check_val_every_n_epoch=1,
)
trainer.fit(model, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name       | Type     | Params
----------------------------------------
0 | fc1        | Linear   | 2.7 K 
1 | fc2        | Linear   | 4.1 K 
2 | drop       | Dropout  | 0     
3 | fc3        | Linear   | 65    
4 | relu       | ReLU     | 0     
5 | sigmoid    | Sigmoid  | 0     
6 | loss       | BCELoss  | 0     
7 | accuracy_t | Accuracy | 0     
8 | accuracy_v | Accuracy | 0     
----------------------------------------
6.9 K     Trainable params
0         Non-trainable params
6.9 K     Total params
0.028     Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


valid_acc_epoch tensor(0.4297)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.4950)
train_acc_epoch tensor(0.5380)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.4977)
train_acc_epoch tensor(0.5461)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.4990)
train_acc_epoch tensor(0.5690)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5001)
train_acc_epoch tensor(0.5820)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5009)
train_acc_epoch tensor(0.5906)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5013)
train_acc_epoch tensor(0.5967)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5014)
train_acc_epoch tensor(0.6015)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5013)
train_acc_epoch tensor(0.6055)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5016)
train_acc_epoch tensor(0.6091)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5021)
train_acc_epoch tensor(0.6124)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5024)
train_acc_epoch tensor(0.6151)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5026)
train_acc_epoch tensor(0.6178)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5026)
train_acc_epoch tensor(0.6202)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5026)
train_acc_epoch tensor(0.6226)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5026)
train_acc_epoch tensor(0.6247)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5027)
train_acc_epoch tensor(0.6268)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5028)
train_acc_epoch tensor(0.6289)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5028)
train_acc_epoch tensor(0.6309)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5028)
train_acc_epoch tensor(0.6327)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5028)
train_acc_epoch tensor(0.6343)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5023)
train_acc_epoch tensor(0.6349)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5024)
train_acc_epoch tensor(0.6344)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5024)
train_acc_epoch tensor(0.6349)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5025)
train_acc_epoch tensor(0.6357)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5026)
train_acc_epoch tensor(0.6364)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5026)
train_acc_epoch tensor(0.6372)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5026)
train_acc_epoch tensor(0.6380)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5025)
train_acc_epoch tensor(0.6390)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5024)
train_acc_epoch tensor(0.6400)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5022)
train_acc_epoch tensor(0.6411)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5020)
train_acc_epoch tensor(0.6423)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5019)
train_acc_epoch tensor(0.6435)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5017)
train_acc_epoch tensor(0.6447)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5016)
train_acc_epoch tensor(0.6460)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5015)
train_acc_epoch tensor(0.6473)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5015)
train_acc_epoch tensor(0.6485)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5014)
train_acc_epoch tensor(0.6499)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5014)
train_acc_epoch tensor(0.6512)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5013)
train_acc_epoch tensor(0.6525)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5013)
train_acc_epoch tensor(0.6537)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5012)
train_acc_epoch tensor(0.6548)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5011)
train_acc_epoch tensor(0.6548)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5010)
train_acc_epoch tensor(0.6554)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5009)
train_acc_epoch tensor(0.6560)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5008)
train_acc_epoch tensor(0.6566)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5007)
train_acc_epoch tensor(0.6572)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5007)
train_acc_epoch tensor(0.6580)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5006)
train_acc_epoch tensor(0.6587)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5006)
train_acc_epoch tensor(0.6594)


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
def predict_similarity(sent1, sent2):
  """Score how similar two raw sentences are with the trained classifier.

  Relies on the notebook-level `model`, `model_d2v` and `stop_words_l`.

  Args:
    sent1: first sentence (raw text).
    sent2: second sentence (raw text).

  Returns:
    (pred, result): `pred` is a one-element numpy array with the model score,
    `result` is 'Similar' when the score exceeds 0.5 and 'Not Similar' otherwise.
  """
  # Clean the sentences with the same helper used for the training documents so
  # inference sees text in the same form the embedding model was trained on.
  cleaned = get_processed_df([sent1, sent2], stop_words_l).documents
  emb1, emb2 = (
      torch.Tensor(model_d2v.infer_vector(word_tokenize(text))).unsqueeze(0).to(model.device)
      for text in cleaned
  )

  # Score in eval mode (dropout off) without tracking gradients, then put the
  # model back into whatever mode it was in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      pred = model(emb1, emb2).detach().cpu().numpy()[0]
  finally:
    model.train(was_training)

  result = 'Similar' if pred > 0.5 else 'Not Similar'
  return pred, result

In [None]:
# Pick one test pair by position, show it with its label, then score it.
i = 76
sent1, sent2 = test_1[i], test_2[i]
print(sent1, '\n', sent2, '\n', test_targets[i])
predict_similarity(sent1, sent2)

From the start, however, the United States' declared goal was not just to topple Saddam but to stabilize Iraq and install a friendly government. 
 But the United States' ultimate goal was not just to topple Mr. Hussein but to stabilize the country and install a friendly government. 
 1


(array([0.9878164], dtype=float32), 'Similar')