In [1]:

import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk
import os
import smart_open
import collections
import scipy.stats as stats
import tensorflow as tf
import gensim

from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from gensim.test.utils import get_tmpfile

from tensorflow import keras
from tensorflow.keras import Sequential, Model
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam, Adadelta, Nadam #schedules
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM, Conv1D, Conv2D, GlobalAveragePooling1D, Conv2D, ZeroPadding2D
from tensorflow.keras.layers import Bidirectional, GlobalAveragePooling2D, GlobalAveragePooling3D, BatchNormalization, Dropout
from tensorflow.keras.layers import Subtract, Add, Multiply, Activation, Input, Concatenate, Reshape 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras import optimizers

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
#from sklearn.model_selection import GridSearchCV, RandomSearch
from sklearn.ensemble import RandomForestClassifier

from sklearn import svm, tree
import xgboost

Using TensorFlow backend.


In [2]:
os.chdir('/Users/patrickrs/Documents/GitLab/revealapp/00_exploration')

### Get the data

In [3]:
data = pd.read_csv('data/SICK_train.txt', sep="\t")

In [4]:
data

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL
...,...,...,...,...,...
4495,9993,A door is being opened by a man,A bald man in a band is playing guitar in the ...,1.1,NEUTRAL
4496,9997,Someone is boiling okra in a pot,The man is not playing the drums,1.0,NEUTRAL
4497,9998,The man is singing heartily and playing the gu...,A bicyclist is holding a bike over his head in...,1.0,NEUTRAL
4498,9999,A man in blue has a yellow ball in the mitt,A man is jumping rope outside,1.2,NEUTRAL


### Cleaning the data

In [5]:
nltk.download('stopwords')
STOP_WORDS = nltk.corpus.stopwords.words('english')

def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = re.sub("xxxx", "", sentence)
    sentence = re.sub("xxx", "", sentence)
    sentence = re.sub("xx", "", sentence)
    sentence = re.sub("\s\s+", " ", sentence)
       
    ''' stemming of words (seems not to affect accuracy, but should make things faster
    porter = PorterStemmer()
    words = word_tokenize(sentence)
    sentence = " ".join([porter.stem(word) for word in words])
     ''' 
    sentence = sentence.split(" ")
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)  
    sentence = " ".join(sentence)
    
    return sentence

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickrs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
data['sentence_A'] = [clean_sentence(sentence) for sentence in data['sentence_A']]

In [7]:
data['sentence_B'] = [clean_sentence(sentence) for sentence in data['sentence_B']]

In [8]:
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,group kids playing yard old man standing backg...,group boys yard playing man standing background,4.5,NEUTRAL
1,2,group children playing house man standing back...,group kids playing yard old man standing backg...,3.2,NEUTRAL
2,3,young boys playing outdoors man smiling nearby,kids playing outdoors near man smile,4.7,ENTAILMENT
3,5,kids playing outdoors near man smile,group kids playing yard old man standing backg...,3.4,NEUTRAL
4,9,young boys playing outdoors man smiling nearby,group kids playing yard old man standing backg...,3.7,NEUTRAL


### Import word2vec from google

In [9]:
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for content in data:
        corpus_temp = nltk.word_tokenize(content)
        corpus.append(corpus_temp)
    return corpus

corpus = build_corpus(pd.concat([data['sentence_A'], data['sentence_B']]))

In [10]:
corpus

[['group', 'kids', 'playing', 'yard', 'old', 'man', 'standing', 'background'],
 ['group', 'children', 'playing', 'house', 'man', 'standing', 'background'],
 ['young', 'boys', 'playing', 'outdoors', 'man', 'smiling', 'nearby'],
 ['kids', 'playing', 'outdoors', 'near', 'man', 'smile'],
 ['young', 'boys', 'playing', 'outdoors', 'man', 'smiling', 'nearby'],
 ['two', 'dogs', 'fighting'],
 ['brown', 'dog', 'attacking', 'another', 'animal', 'front', 'man', 'pants'],
 ['brown', 'dog', 'attacking', 'another', 'animal', 'front', 'man', 'pants'],
 ['nobody', 'riding', 'bicycle', 'one', 'wheel'],
 ['person', 'riding', 'bicycle', 'one', 'wheel'],
 ['person', 'black', 'motorbike', 'tricks', 'jacket'],
 ['man', 'jersey', 'dunking', 'ball', 'basketball', 'game'],
 ['man', 'jersey', 'dunking', 'ball', 'basketball', 'game'],
 ['player', 'dunking', 'basketball', 'net', 'crowd', 'background'],
 ['two', 'people', 'kickboxing', 'spectators', 'watching'],
 ['two', 'young', 'women', 'sparring', 'kickboxing', 

In [11]:
# Importing pre-trained model, updating vocab 
# and training the model (takes long to run)
def pre_trained_w2v(corpus):
    '''Function to load the GoogleNews pre-trained word2vec and
    train it further on another corpus.
    
    Args:
        corpus: A list of lists where the sublists are sentences
                and the sublist items are words.
    Returns: 
        A word2vec model with a large vocabulary.
    '''
    w2v_model_2 = Word2Vec(size=300, min_count=1)
    w2v_model_2.build_vocab(corpus)
    total_examples = w2v_model_2.corpus_count
    w2v_google_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)
    w2v_model_2.build_vocab([list(w2v_google_model.vocab.keys())], update=True)
    w2v_model_2.intersect_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True, lockf=1.0)
    # intersect_word2vec_format() will let you bring vectors from an external file into a model that's already had its own vocabulary initialized
    # see https://tedboy.github.io/nlps/generated/generated/gensim.models.Word2Vec.intersect_word2vec_format.html
    w2v_model_2.train(corpus, total_examples=total_examples, epochs=w2v_model_2.iter)
    return w2v_model_2
w2v_model_2 = pre_trained_w2v(corpus)



In [12]:
embedding_matrix = np.zeros((len(w2v_model_2.wv.vocab) + 1, 300))
for i, vec in enumerate(w2v_model_2.wv.vectors):
  embedding_matrix[i] = vec
feature_size = 500
tokenizer = Tokenizer(num_words = feature_size)
# fit the tokenizer on our text
tokenizer.fit_on_texts(pd.concat([data['sentence_A'], data['sentence_B']]))
# get all words that the tokenizer knows
word_index = tokenizer.word_index
# put the tokens in a matrix
X1 = tokenizer.texts_to_sequences(data['sentence_A'])
X1 = pad_sequences(X1)
X2 = tokenizer.texts_to_sequences(data['sentence_B'])
X2 = pad_sequences(X2)

# X2 was padded with one column less:
x0 = np.zeros((X2.shape[0], X1.shape[1]))
x0[:,X1.shape[1] - X2.shape[1]:] = X2
X2 = x0

X =  np.concatenate((X1, X2), axis = 1)
# prepare the labels
y = data['SimilarityScore']

KeyError: 'SimilarityScore'

In [None]:
# split in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state = 42)
X1_train = X_train[:, :X1.shape[1]]
X2_train = X_train[:, X1.shape[1]:X_train.shape[1]]
X1_test = X_test[:, :X1.shape[1]]
X2_test = X_test[:, X1.shape[1]:X_train.shape[1]]

# Siamese Model

In [None]:
input_1 = Input(shape=(X1.shape[1],))
input_2 = Input(shape=(X2.shape[1],))


common_embed = Embedding(name = "Sentence_Embedd",
                        input_dim = len(w2v_model_2.wv.vocab) + 1 ,
                        output_dim = 300,
                        input_length  = X1.shape[1],
                        weights = [embedding_matrix],
                        trainable=False)
lstm_1 = common_embed(input_1)
lstm_2 = common_embed(input_2)


common_lstm = LSTM(64,return_sequences=True, activation="relu")
vector_1 = common_lstm(lstm_1)
vector_1 = Flatten()(vector_1)

vector_2 = common_lstm(lstm_2)
vector_2 = Flatten()(vector_2)

x3 = Subtract()([vector_1, vector_2])
x3 = Multiply()([x3, x3])

x1_ = Multiply()([vector_1, vector_1])
x2_ = Multiply()([vector_2, vector_2])
x4 = Subtract()([x1_, x2_])

# https://stackoverflow.com/a/51003359/10650182
# Calculates cosine similarity
x5 = keras.layers.Dot(axes = 1, normalize=True)([vector_1, vector_2])
    
conc = Concatenate(axis = -1)([x5, x4, x3])

x = Dense(100, activation="relu", name='conc_layer')(conc)
x = Dropout(0.2)(x)
out = Dense(1, activation="relu", name = 'out')(x)

model = Model([input_1, input_2], out)

model.compile(loss="mse",  metrics=[tf.keras.metrics.RootMeanSquaredError()], optimizer=Adam(0.00001))


In this network. input_1 and input_2 are pre-processed, Keras-tokenized text sequences which are to be compared for similar intent. These two text sequences are then fed through a common network of a basic embedding layer and an LSTM units. Once the feature vectors are obtained from this common network, a series of similarity measures are computed and are concatenated to be finally input into a Dense layer followed by sigmoid output unit which will finally help in classifying whether the given texts are similar or not.

In [None]:
model.summary()

In [None]:
plot_model(model, to_file='model.png', show_shapes=False)

In [None]:
'''Learning rate schedule
initial_learning_rate = 0.1
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)''' 


In [None]:
callbacks = [
    keras.callbacks.EarlyStopping(
        restore_best_weights=True,
        # Stop training when `val_loss` is no longer improving
        monitor='val_loss',
        # "no longer improving" being defined as "no better than 1e-2 less"
        min_delta=1e-2,
        # "no longer improving" being further defined as "for at least 10 epochs"
        patience=10,
        verbose=1)
            ]

In [None]:
batch = 1024
epochs = 1000
history = model.fit([X1_train, X2_train], y_train, 
                     batch, 
                     epochs = epochs, 
                     callbacks=callbacks,
                     validation_data = ([X1_test, X2_test], y_test)
                    )

In [None]:
print(history.history.keys())

In [None]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
data['sentence_A'][0]

In [None]:
# GRU better than LSTM? https://www.aclweb.org/anthology/R19-1116.pdf