In [1]:
def clean_word(token):
    token = token.lower()
    token = re.sub("[^a-z0-9]*", "", token)
    token = lemmatizer.lemmatize(token)

    return token

In [2]:
def tokenize_sentences(sent, max_length, token2id, UNK=1, PAD=0):
    tokens = [token2id.get(clean_word(token), UNK) for token in sent.split()]

    if len(tokens) < max_length:
        diff = max_length - len(tokens)
        tokens.extend([PAD] * diff)
    elif len(tokens) > max_length:
        tokens = tokens[:max_length]

    return tokens

In [7]:
# Import libraries
from nltk.stem import WordNetLemmatizer
from random import shuffle
import pandas as pd
import numpy as np
import datasets
import nltk
import re

nltk.download('wordnet')
nltk.download('omw-1.4')

# Define constant parameters
GLOVE_PATH = '/home/matin/Workspace/Datasets-and-Models/GloVe/glove.6B.50d.txt'
DATASET_NAME = 'quora'
DATA_LEN = 30000
MAX_LEN = 100
TEST_SIZE = 0.1

# Create variables
token2idx = {'<PAD>': 0, '<UNK>': 1}
lemmatizer = WordNetLemmatizer()
counter = 2
dataset = []
s1, s2 = [], []

# Load dataset
data = datasets.load_dataset(DATASET_NAME, split='train')

# Split data to a fixed size
sentences = data['questions'][:DATA_LEN]
labels = data['is_duplicate'][:DATA_LEN]

# Iterate over sentences to split sentence 1 and 2
for item in sentences:
    s1.append(item['text'][0])
    s2.append(item['text'][1])

# Open GLoVe file for word embedding
with open(GLOVE_PATH, 'r') as f:
    raw_glove = f.read().strip().split('\n')

# Create an empty array to fill with glive embeddings
glove_weights = np.zeros((len(raw_glove), len(raw_glove[0].split())-1), dtype=float)

# Store values of embeddings and create dictionary with words and tokens
for idx, item in enumerate(raw_glove):
    for idy, entity in enumerate(item.split()):
        if idy == 0:
            token2idx[entity] = counter
            counter += 1
        else:
            glove_weights[idx, idy-1] = float(entity)

# Create a dataset with tokens and labels
for idx, (sent1, sent2) in enumerate(zip(s1, s2)):
    tag = 1 if labels[idx] == True else 0
    dataset.append((tokenize_sentences(sent1, MAX_LEN, token2idx),
                   tokenize_sentences(sent2, MAX_LEN, token2idx),
                   tag))
    
# Shuffle dataset and split train and test
shuffle(dataset)
test_index = int(len(dataset)*TEST_SIZE)
train = dataset[test_index:]
test = dataset[:test_index]

# Create x and y of train and test from dataset
x_train1 = np.zeros((len(train), MAX_LEN), dtype=int)
x_train2 = np.zeros((len(train), MAX_LEN), dtype=int)
y_train = np.zeros((len(train)), dtype=int)
x_test1 = np.zeros((len(test), MAX_LEN), dtype=int)
x_test2 = np.zeros((len(test), MAX_LEN), dtype=int)
y_test = np.zeros((len(test)), dtype=int)

for idx, item in enumerate(train):
    x_train1[idx, :] = item[0]
    x_train2[idx, :] = item[1]
    y_train[idx] = item[2]
    
    if idx <= len(test)-1:
        x_test1[idx, :] = test[idx][0]
        x_test2[idx, :] = test[idx][1]
        y_test[idx] = test[idx][2]

[nltk_data] Downloading package wordnet to /home/matin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/matin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Using custom data configuration default
Reusing dataset quora (/home/matin/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)


In [27]:
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Embedding, Lambda, Concatenate
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.backend import abs as ab
from tensorflow.keras.models import Model
from tensorflow.keras import Input

(num_vocab, emb_dim) = glove_weights.shape
lstm = Bidirectional(LSTM(20, dropout=0.2, recurrent_dropout=0.2))
embed = Embedding(input_dim=num_vocab, output_dim=emb_dim, input_length=MAX_LEN, weights=[glove_weights], trainable=False)

input1 = Input(shape=(MAX_LEN,))
e1 = embed(input1)
t1 = lstm(e1)

input2 = Input(shape=(MAX_LEN,))
e2 = embed(input2)
t2 = lstm(e2)

sub = lambda x: ab(x[0] - x[1])
sub_layer = Lambda(function=sub, output_shape=20)([t1, t2])
preds = Dense(1, activation='sigmoid')(sub_layer)
model = Model(inputs=[input1, input2], outputs=preds)

model.compile(loss=binary_crossentropy, optimizer='adam',metrics=['accuracy'])

In [28]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_10 (InputLayer)          [(None, 100)]        0           []                               
                                                                                                  
 input_11 (InputLayer)          [(None, 100)]        0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, 100, 50)      20000000    ['input_10[0][0]',               
                                                                  'input_11[0][0]']               
                                                                                                  
 bidirectional_5 (Bidirectional  (None, 40)          11360       ['embedding_5[0][0]',      

In [None]:
history = model.fit([x_train1, x_train2], y_train, epochs=20, validation_data=([x_test1, x_test2], y_test), batch_size=16)

In [38]:
model.save('./model.h5')