In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-1.5.2-py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 5.1 MB/s 
[?25hCollecting torchmetrics>=0.4.1
  Downloading torchmetrics-0.6.0-py3-none-any.whl (329 kB)
[K     |████████████████████████████████| 329 kB 60.9 MB/s 
[?25hCollecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 62.7 MB/s 
Collecting PyYAML>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 34.9 MB/s 
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 33.8 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylin

In [None]:
# tokenize and pad every document to make them of the same size
# this makes vectors from the documents
import os
import pandas as pd # importing libraries
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils import shuffle


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from imblearn.over_sampling import RandomOverSampler
import torch
import torchmetrics
from torch.utils.data import DataLoader

import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping



seed_everything(42, workers=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Global seed set to 42


42

In [None]:
def get_msr_data(paraphrase_data_path, data_part):
  # This function reads the MRPC dataset
  with open(os.path.join(paraphrase_data_path, data_part), 'r') as f:
    data = f.read()

  sentence_1_data = [el.split('\t')[3:][0] for el in data.split('\n')[1:-1]]
  sentence_2_data = [el.split('\t')[3:][1] for el in data.split('\n')[1:-1]]
  targets = [int(el.split('\t')[0][0]) for el in data.split('\n')[1:-1]]
  return sentence_1_data, sentence_2_data, targets

In [None]:
def get_processed_df(dset, stop_words_l):
  documents_df = pd.DataFrame(dset, columns=['documents'])
  # removing special characters and stop words from the text
  documents_df['documents']=documents_df.documents.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )
  return documents_df

In [None]:
def get_doc_embeddings(tokenized_paded_documents):
  # creating document-word embeddings
  document_word_embeddings=embedding_matrix[tokenized_paded_documents]
  # calculating average of word vectors of documents
  document_embeddings = document_word_embeddings.mean(1)
  return document_embeddings

In [None]:
class EmbeddingData:
  
  def __init__(self, embs1, embs2, targets):
    self.embs1 = embs1
    self.embs2 = embs2
    self.targets = targets

  def __len__(self):
    return len(self.embs1)

  def __getitem__(self, idx):
    return {
        'emb1': self.embs1[idx],
        'emb2': self.embs2[idx],
        'target': self.targets[idx]
    }

class SiameseClassifier(pl.LightningModule):

    def __init__(self, learning_rate=1e-3):
        super().__init__()

        self.fc1 = torch.nn.Linear(300, 128)
        self.fc2 = torch.nn.Linear(128, 32)
        self.drop = torch.nn.Dropout(0.1)
        self.fc3 = torch.nn.Linear(64, 1)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
        self.loss = torch.nn.BCELoss()
        self.accuracy_t = torchmetrics.Accuracy()
        self.accuracy_v = torchmetrics.Accuracy()
        self.learning_rate = learning_rate

    def forward(self, emb1, emb2):

        x1 = self.fc1(emb1)
        x2 = self.fc1(emb2)

        x1 = self.relu(x1)
        x2 = self.relu(x2)

        #x1 = self.drop(x1)
        #x2 = self.drop(x2)

        x1 = self.fc2(x1)
        x2 = self.fc2(x2)

        x1 = self.relu(x1)
        x2 = self.relu(x2)

        x = torch.cat([x1, x2], 1)
        
        x = self.fc3(x)
        x = self.sigmoid(x)

        return x

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, eta_min=1e-5)
        return [optimizer], [lr_scheduler]

    def training_step(self, train_batch, batch_idx):
        x1, x2, y = train_batch.values()
        y = y.to(torch.float)
        y_hat = self.forward(x1, x2).squeeze(1)
        loss = self.loss(y_hat, y)
        preds = (y_hat > 0.5).to(torch.int)
        self.log('train_loss', loss)
        self.log('train_acc_step', self.accuracy_t(preds, y.to(torch.int)))
        return loss

    def validation_step(self, val_batch, batch_idx):
        x1, x2, y = val_batch.values()
        y = y.to(torch.float)
        y_hat = self.forward(x1, x2).squeeze(1)
        loss = self.loss(y_hat, y)
        preds = (y_hat > 0.5).to(torch.int)
        self.log('val_loss', loss)
        self.log('valid_acc_step', self.accuracy_v(preds, y.to(torch.int)))
        return loss
    
    def validation_epoch_end(self, validation_step_outputs):
        self.log('valid_acc_epoch', self.accuracy_v.compute())
        print('valid_acc_epoch', self.accuracy_v.compute())

    def training_epoch_end(self, outs):
        self.log('train_acc_epoch', self.accuracy_t.compute())
        print('train_acc_epoch', self.accuracy_t.compute())

In [None]:
MAX_LEN = 128
EPOCHS = 100
batch_size = 64
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# loading pre-trained embeddings, each word is represented as a 300 dimensional vector
W2V_PATH= "/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz"
model_w2v= gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH,binary=True)

In [None]:
paraphrase_data_path = '/content/drive/MyDrive/msr_paraphrase'
train_1, train_2, train_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_train.txt')
test_1, test_2, test_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_test.txt')
train_1_2 = np.array([[el1, el2] for el1, el2 in zip(train_1, train_2)])
test_1_2 = np.array([[el1, el2] for el1, el2 in zip(test_1, test_2)])

ros = RandomOverSampler(random_state=0, sampling_strategy='minority')
train_1_2, train_targets = ros.fit_resample(train_1_2, train_targets)
ros = RandomOverSampler(random_state=0, sampling_strategy='minority')
test_1_2, test_targets = ros.fit_resample(test_1_2, test_targets)

test_1, test_2 = list(test_1_2[:,0]), list(test_1_2[:,1])
train_1, train_2 = list(train_1_2[:,0]), list(train_1_2[:,1])

In [None]:
# Sample corpus
# The text will be enable the testing of this algorithm
stop_words_l = stopwords.words('english')
documents_df_train_1, documents_df_train_2, documents_df_test_1, documents_df_test_2 = (get_processed_df(dset, stop_words_l) for dset in (train_1, train_2, test_1, test_2))

In [None]:
# tokenize and pad every document to make them of the same size
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents_df_train_1.documents + documents_df_train_2.documents)
tokenized_documents_train_1, tokenized_documents_train_2, tokenized_documents_test_1, tokenized_documents_test_2 = (tokenizer.texts_to_sequences(doc.documents) for doc in (documents_df_train_1, documents_df_train_2, documents_df_test_1, documents_df_test_2))
tokenized_doc_pad_train_1, tokenized_doc_pad_train_2, tokenized_doc_pad_test_1, tokenized_doc_pad_test_2 = (pad_sequences(doc,maxlen=MAX_LEN,padding='post') for doc in (tokenized_documents_train_1, tokenized_documents_train_2, tokenized_documents_test_1, tokenized_documents_test_2))
vocab_size=len(tokenizer.word_index)+1

In [None]:
# creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
embedding_matrix=np.zeros((vocab_size,300))
for word,i in tokenizer.word_index.items():
    if word in model_w2v:
        embedding_matrix[i]=model_w2v[word]

In [None]:
'''import pickle
with open('/content/drive/MyDrive/w2v_model/tok_emb.pickle', 'wb') as f:
  pickle.dump((tokenizer, embedding_matrix), f)'''

In [None]:
doc_emb_train_1, doc_emb_train_2, doc_emb_test_1, doc_emb_test_2 = (get_doc_embeddings(docs).astype(np.float32) for docs in (tokenized_doc_pad_train_1, tokenized_doc_pad_train_2, tokenized_doc_pad_test_1, tokenized_doc_pad_test_2))

In [None]:
ds_train = EmbeddingData(doc_emb_train_1, doc_emb_train_2, train_targets)
ds_test = EmbeddingData(doc_emb_test_1, doc_emb_test_2, test_targets)

In [None]:
train_loader = DataLoader(ds_train, batch_size=batch_size)
val_loader = DataLoader(ds_test, batch_size=batch_size)

# model
model = SiameseClassifier()

In [None]:
early_stop_callback = EarlyStopping(monitor="valid_acc_epoch", min_delta=0.00, patience=15, verbose=False, mode="max")
checkpoint_callback = ModelCheckpoint(
    monitor="valid_acc_epoch",
    dirpath="/content/drive/MyDrive/w2v_model/",
    filename="emorec-{epoch:02d}-{valid_acc_epoch:.2f}",
    save_top_k=1,
    mode="max",
)

# training
trainer = pl.Trainer(accelerator=device, callbacks=[early_stop_callback, checkpoint_callback], max_epochs = EPOCHS, check_val_every_n_epoch=1)
trainer.fit(model, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name       | Type     | Params
----------------------------------------
0 | fc1        | Linear   | 38.5 K
1 | fc2        | Linear   | 4.1 K 
2 | drop       | Dropout  | 0     
3 | fc3        | Linear   | 65    
4 | relu       | ReLU     | 0     
5 | sigmoid    | Sigmoid  | 0     
6 | loss       | BCELoss  | 0     
7 | accuracy_t | Accuracy | 0     
8 | accuracy_v | Accuracy | 0     
----------------------------------------
42.7 K    Trainable params
0         Non-trainable params
42.7 K    Total params
0.171     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


valid_acc_epoch tensor(0.6875)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5099)
train_acc_epoch tensor(0.5000)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5051)
train_acc_epoch tensor(0.5000)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5034)
train_acc_epoch tensor(0.5001)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5104)
train_acc_epoch tensor(0.5001)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5084)
train_acc_epoch tensor(0.4907)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5137)
train_acc_epoch tensor(0.4930)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5203)
train_acc_epoch tensor(0.4955)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5271)
train_acc_epoch tensor(0.4978)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5345)
train_acc_epoch tensor(0.5013)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5388)
train_acc_epoch tensor(0.5046)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5418)
train_acc_epoch tensor(0.5067)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5447)
train_acc_epoch tensor(0.5085)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5478)
train_acc_epoch tensor(0.5109)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5504)
train_acc_epoch tensor(0.5148)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5524)
train_acc_epoch tensor(0.5187)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5544)
train_acc_epoch tensor(0.5228)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5564)
train_acc_epoch tensor(0.5270)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5580)
train_acc_epoch tensor(0.5312)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5595)
train_acc_epoch tensor(0.5353)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5608)
train_acc_epoch tensor(0.5392)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5595)
train_acc_epoch tensor(0.5400)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5611)
train_acc_epoch tensor(0.5381)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5625)
train_acc_epoch tensor(0.5392)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5639)
train_acc_epoch tensor(0.5400)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5649)
train_acc_epoch tensor(0.5411)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5660)
train_acc_epoch tensor(0.5422)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5670)
train_acc_epoch tensor(0.5433)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5681)
train_acc_epoch tensor(0.5446)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5693)
train_acc_epoch tensor(0.5459)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5705)
train_acc_epoch tensor(0.5472)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5717)
train_acc_epoch tensor(0.5485)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5728)
train_acc_epoch tensor(0.5498)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5739)
train_acc_epoch tensor(0.5512)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5749)
train_acc_epoch tensor(0.5525)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5759)
train_acc_epoch tensor(0.5538)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5768)
train_acc_epoch tensor(0.5552)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5777)
train_acc_epoch tensor(0.5566)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5785)
train_acc_epoch tensor(0.5580)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5793)
train_acc_epoch tensor(0.5593)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5801)
train_acc_epoch tensor(0.5607)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5803)
train_acc_epoch tensor(0.5615)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5808)
train_acc_epoch tensor(0.5616)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5814)
train_acc_epoch tensor(0.5622)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5819)
train_acc_epoch tensor(0.5627)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5824)
train_acc_epoch tensor(0.5633)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5830)
train_acc_epoch tensor(0.5640)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5835)
train_acc_epoch tensor(0.5646)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5840)
train_acc_epoch tensor(0.5653)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5845)
train_acc_epoch tensor(0.5660)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5850)
train_acc_epoch tensor(0.5667)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5855)
train_acc_epoch tensor(0.5674)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5861)
train_acc_epoch tensor(0.5682)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5866)
train_acc_epoch tensor(0.5690)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5872)
train_acc_epoch tensor(0.5699)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5877)
train_acc_epoch tensor(0.5708)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5882)
train_acc_epoch tensor(0.5717)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5887)
train_acc_epoch tensor(0.5726)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5892)
train_acc_epoch tensor(0.5736)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5896)
train_acc_epoch tensor(0.5745)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5901)
train_acc_epoch tensor(0.5754)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5901)
train_acc_epoch tensor(0.5760)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5902)
train_acc_epoch tensor(0.5762)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5903)
train_acc_epoch tensor(0.5765)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5904)
train_acc_epoch tensor(0.5769)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5905)
train_acc_epoch tensor(0.5773)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5906)
train_acc_epoch tensor(0.5778)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5908)
train_acc_epoch tensor(0.5782)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5910)
train_acc_epoch tensor(0.5787)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5913)
train_acc_epoch tensor(0.5793)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5916)
train_acc_epoch tensor(0.5799)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5918)
train_acc_epoch tensor(0.5805)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5921)
train_acc_epoch tensor(0.5811)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5924)
train_acc_epoch tensor(0.5817)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5926)
train_acc_epoch tensor(0.5824)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5929)
train_acc_epoch tensor(0.5831)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5931)
train_acc_epoch tensor(0.5838)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5934)
train_acc_epoch tensor(0.5846)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5936)
train_acc_epoch tensor(0.5853)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5939)
train_acc_epoch tensor(0.5861)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5942)
train_acc_epoch tensor(0.5869)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5941)
train_acc_epoch tensor(0.5874)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5942)
train_acc_epoch tensor(0.5877)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5942)
train_acc_epoch tensor(0.5881)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5943)
train_acc_epoch tensor(0.5885)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5944)
train_acc_epoch tensor(0.5889)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5944)
train_acc_epoch tensor(0.5893)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5944)
train_acc_epoch tensor(0.5897)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5944)
train_acc_epoch tensor(0.5902)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5945)
train_acc_epoch tensor(0.5906)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5946)
train_acc_epoch tensor(0.5911)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5948)
train_acc_epoch tensor(0.5916)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5950)
train_acc_epoch tensor(0.5921)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5951)
train_acc_epoch tensor(0.5927)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5953)
train_acc_epoch tensor(0.5933)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5956)
train_acc_epoch tensor(0.5939)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5958)
train_acc_epoch tensor(0.5945)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5960)
train_acc_epoch tensor(0.5952)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5963)
train_acc_epoch tensor(0.5958)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5965)
train_acc_epoch tensor(0.5965)


Validating: 0it [00:00, ?it/s]

valid_acc_epoch tensor(0.5967)
train_acc_epoch tensor(0.5972)


In [None]:
def predict_similarity(sent1, sent2):
  sent1, sent2 = re.sub(r'[^a-z ]', '', sent1.lower()), re.sub(r'[^a-z ]', '', sent2.lower())
  docs = tokenizer.texts_to_sequences([sent1, sent2])
  docs = pad_sequences(docs,maxlen=MAX_LEN,padding='post')
  embeddings = torch.Tensor(get_doc_embeddings(docs))
  pred = model(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0)).detach().cpu().numpy()[0]
  result = 'Similar' if pred > 0.5 else 'Not Similar'
  return pred, result

In [None]:
i = 76
print(test_1[i], '\n', test_2[i], '\n', test_targets[i])
sent1 = test_1[i]
sent2 = test_2[i]
predict_similarity(sent1, sent2)

From the start, however, the United States' declared goal was not just to topple Saddam but to stabilize Iraq and install a friendly government. 
 But the United States' ultimate goal was not just to topple Mr. Hussein but to stabilize the country and install a friendly government. 
 1


(array([0.7000551], dtype=float32), 'Similar')