In [40]:

import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk
import os
import smart_open
import collections
import scipy.stats as stats
import tensorflow as tf
import gensim

from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from gensim.test.utils import get_tmpfile

from tensorflow import keras
from tensorflow.keras import Sequential, Model
from tensorflow.keras import backend as K
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam, Adadelta, Nadam, Adagrad, Adamax, Ftrl, RMSprop, SGD #schedules
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM, Conv1D, Conv2D, GlobalAveragePooling1D, Conv2D, ZeroPadding2D
from tensorflow.keras.layers import Bidirectional, GlobalAveragePooling2D, GlobalAveragePooling3D, BatchNormalization, Dropout
from tensorflow.keras.layers import Subtract, Add, Multiply, Activation, Input, Concatenate, Reshape, Dot, GRU, LayerNormalization 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import bert
import tensorflow_hub as hub

from collections import namedtuple
from tqdm import tqdm

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
#from sklearn.model_selection import GridSearchCV, RandomSearch
from sklearn.ensemble import RandomForestClassifier

from sklearn import svm, tree
import xgboost

# Loading cleaned and categorized data

In [24]:
 path = '/Users/patrickrs/Documents/GitLab/revealapp/10_cleaning/src'

current_path = os.getcwd()
os.chdir(path)
%run ./Load+Clean_News.ipynb
%run ./cont_to_cat_News.ipynb
os.chdir(current_path)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickrs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Get the data

In [25]:
# Alias the loaded frame as `data` for the rest of the notebook.
# NOTE(review): `news` is not assigned in any visible cell; presumably it is
# created by the notebooks executed via %run above — confirm.
data = news

In [26]:
# Show `data` as the cell output.
data

Unnamed: 0,sim,SimilarityScore,sentence1,sentence2
0,1,4.000,last year wanted murder,last year sought murder
1,1,5.000,promarket economists dont object corporations blatantly use snob appeal promote products,economists companies openly using attractiveness luxury promote products
2,1,5.000,perhaps importantly ahmadinejad destabilizing influence bernanke,perhaps important ahmadinejad destabilising influence bernanke
3,1,4.667,europe,europe
4,1,4.500,gays modern practices rejected selfindulgent,gay practical modern rejected laws
...,...,...,...,...
649,0,1.800,indian pakistani governments nearly engaged fourth conflict 1999,indian pakistani governments conducted nuclear tests may 1998
650,1,3.800,iguaran stated detainees accused homicide criminal collaboration kidnappings funding terrorism,iguaran stated detainees also involved murders police members antikidnapping group
651,1,2.400,3 suspected extremists released bail,1 suspected extremist provisionally released without bail
652,0,0.800,6 czech hospital employees charged human organ trafficking,accused charged international drug trafficking


# Tokenize Sentences Using Bert Tokenizer

In [34]:
# Build the BERT tokenizer from the TF-Hub layer's own assets: the vocabulary
# file and the lower-casing flag are read from the loaded layer and handed to
# FullTokenizer, which is then used to tokenize the sentences.
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False) # Maybe set to true?
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [35]:
def get_masks(tokens, max_seq_length):
    """Attention mask: 1 for every token, 0 for every padding position.

    The result has ``max_seq_length`` entries when ``tokens`` is not longer
    than that; a longer ``tokens`` list yields one 1 per token and no padding.
    """
    real_count = len(tokens)
    pad_count = max_seq_length - real_count
    return [1] * real_count + [0] * pad_count

def get_segments(tokens, max_seq_length):
    """Segment ids: 0 up to and including the first "[SEP]", 1 afterwards.

    Positions beyond ``len(tokens)`` are filled with 0 up to ``max_seq_length``.
    """
    segment_ids = []
    passed_separator = False
    for tok in tokens:
        # The separator itself still belongs to the segment it closes.
        segment_ids.append(1 if passed_separator else 0)
        passed_separator = passed_separator or tok == "[SEP]"
    padding = [0] * (max_seq_length - len(tokens))
    return segment_ids + padding

In [36]:
def get_ids(tokens, tokenizer, max_seq_length):
    """Map tokens to vocabulary ids via ``tokenizer`` and right-pad with 0.

    Padding is added up to ``max_seq_length``; nothing is truncated here.
    """
    vocab_ids = tokenizer.convert_tokens_to_ids(tokens)
    missing = max_seq_length - len(vocab_ids)
    return vocab_ids + [0] * missing

In [37]:
def create_single_input(sentence, MAX_LEN):
    """Encode one sentence into BERT token ids, attention mask and segment ids.

    The sentence is tokenized with the module-level ``tokenizer``, cut to at
    most ``MAX_LEN`` word pieces and wrapped in "[CLS]" / "[SEP]".

    Args:
        sentence: the sentence passed to ``tokenizer.tokenize``.
        MAX_LEN: maximum number of word pieces kept, not counting the two
            special tokens.

    Returns:
        Tuple ``(ids, masks, segments)`` of lists, each padded to
        ``MAX_LEN + 2`` entries.
    """
    # Derive the padded length from the argument instead of the notebook-global
    # MAX_SEQ_LEN: truncation and padding can then never disagree, and the
    # function no longer needs a global that is assigned in a later cell.
    padded_len = MAX_LEN + 2

    stokens = ["[CLS]"] + tokenizer.tokenize(sentence)[:MAX_LEN] + ["[SEP]"]

    ids = get_ids(stokens, tokenizer, padded_len)
    masks = get_masks(stokens, padded_len)
    segments = get_segments(stokens, padded_len)

    return ids, masks, segments

In [38]:
def create_input_array(sentences):
    """Encode every sentence and stack the results into three int32 arrays.

    Each sentence goes through ``create_single_input`` with a budget of
    ``MAX_SEQ_LEN - 2`` word pieces (``MAX_SEQ_LEN`` is read from module scope
    and must be defined before this function is called).

    Returns:
        ``[input_ids, input_masks, input_segments]`` as ``np.int32`` arrays,
        one row per sentence.
    """
    encoded = [
        create_single_input(text, MAX_SEQ_LEN - 2)
        for text in tqdm(sentences, position=0, leave=True)
    ]
    # Regroup the per-sentence (ids, masks, segments) triples by field.
    per_field = [[triple[field] for triple in encoded] for field in range(3)]
    return [np.asarray(rows, dtype=np.int32) for rows in per_field]

In [41]:
# Encode both sentence columns into [ids, masks, segments] arrays.
# NOTE(review): create_input_array reads MAX_SEQ_LEN, which is only assigned in
# a later cell of this notebook; on a fresh kernel this cell would raise
# NameError. Consider defining MAX_SEQ_LEN in a config cell near the top.
input1=create_input_array(data['sentence1'])
input2=create_input_array(data['sentence2'])

100%|██████████| 654/654 [00:00<00:00, 4896.27it/s]
100%|██████████| 654/654 [00:00<00:00, 5659.99it/s]


In [42]:
# Show the three encoded arrays (ids, masks, segments) for sentence1.
input1

[array([[  101,  2197,  2095, ...,     0,     0,     0],
        [  101, 20877, 17007, ...,     0,     0,     0],
        [  101,  3383, 14780, ...,     0,     0,     0],
        ...,
        [  101,  1017,  6878, ...,     0,     0,     0],
        [  101,  1020,  5569, ...,     0,     0,     0],
        [  101, 25817,  3663, ...,     0,     0,     0]], dtype=int32),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32)]

In [10]:
# split in train and test
# NOTE(review): X, y and X1 are not assigned in any visible cell of this
# notebook; this cell only works with state left over from elsewhere — confirm
# where they come from before relying on it.
# NOTE(review): shuffle=False is combined with random_state=42; presumably the
# seed has no effect on an unshuffled split — confirm against the sklearn docs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state = 42)
# The first X1.shape[1] columns are taken as input 1, the remaining columns as input 2.
X1_train = X_train[:, :X1.shape[1]]
X2_train = X_train[:, X1.shape[1]:X_train.shape[1]]
X1_test = X_test[:, :X1.shape[1]]
# The upper bound below uses X_train's column count for the X_test slice.
X2_test = X_test[:, X1.shape[1]:X_train.shape[1]]

In [43]:
# Load the BERT encoder from TF-Hub with trainable weights. This rebinds
# `bert_layer`, which the tokenizer cell loaded from the same URL with
# trainable=False.
bert_layer=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                          trainable=True)

# Length every encoded sentence is padded to; create_input_array reads this name.
# NOTE(review): it is assigned here, after the cell that calls
# create_input_array — consider moving it to a config cell near the top.
MAX_SEQ_LEN=128 # max is 512
# The three int32 inputs, each of length MAX_SEQ_LEN, that are fed to the BERT layer.
input_word_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                    name="segment_ids")

# The layer call is unpacked into two outputs; only sequence_output is used by
# the model head defined in the next code cell.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

# Siamese Model

In [53]:

# Classification head on top of the BERT sequence output: global average
# pooling, dropout, then a single sigmoid unit.
# (The original lines started with a stray "." before GlobalAveragePooling1D and
# Model, which is a syntax error; both names are imported at the top.)
x = GlobalAveragePooling1D()(sequence_output)
x = Dropout(0.2)(x)
out = Dense(1, activation="sigmoid", name="dense_output")(x)

model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

In [55]:
# Binary cross-entropy objective, tracking accuracy and AUC.
# `learning_rate` replaces the deprecated `lr` alias of the Adam optimizer.
model.compile(loss= "binary_crossentropy", metrics=['acc', keras.metrics.AUC()],
              optimizer = Adam(learning_rate = 0.00001)
             )

In this network, input_1 and input_2 are pre-processed, Keras-tokenized text sequences which are to be compared for similar intent. These two text sequences are fed through a common network consisting of a basic embedding layer and LSTM units. Once the feature vectors are obtained from this common network, a series of similarity measures is computed and concatenated, then passed into a Dense layer followed by a sigmoid output unit, which classifies whether the given texts are similar or not. Note: the model built in the cells above is a single-input BERT encoder with average pooling and a sigmoid output, so this description does not yet match that code.

In [40]:
# Print the layer-by-layer summary of `model`.
model.summary()

In [44]:
# Write a diagram of `model` to model.png (shapes omitted).
# NOTE(review): the recorded output is a NameError for `model`, i.e. this cell
# was last run before the model cell — it depends on execution order.
plot_model(model, to_file='model.png', show_shapes=False)

NameError: name 'model' is not defined

In [56]:
# Train on the encoded sentence1 inputs against the binary `sim` label, holding
# out 20% of the rows for validation.
# The keyword was misspelled `atch_size`; `batch_size` is the intended argument.
# NOTE(review): only input1 (sentence1) is fed to the model; input2 is built
# above but not used in any visible cell — confirm this is intended.
model.fit(input1, data['sim'],
          epochs=5,
          batch_size=32,
          validation_split=0.2,
          shuffle=True)

Train on 523 samples, validate on 131 samples
Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

In [42]:
# Single early-stopping callback for the training run below: it monitors
# val_loss with min_delta=1e-3 and patience=50, and is configured to restore
# the best weights when it stops.
callbacks = [
    keras.callbacks.EarlyStopping(
        restore_best_weights=True,
        monitor='val_loss',
        min_delta=1e-3,
        patience=50,
        verbose=1)
            ]

In [43]:
# Share of entries in y that equal 1 (class balance check).
# NOTE(review): y is not assigned in any visible cell of this notebook — confirm its source.
y[y==1].count()/len(y)

0.8700305810397554

In [44]:
# Train on the two-array split with early stopping and per-class weights.
# NOTE(review): the model built in this notebook takes three inputs (word ids,
# mask, segment ids), but two arrays are passed here, and the recorded output of
# this cell is a ValueError about mismatched target/output shapes. This cell
# appears to belong to a different model definition — confirm before re-running.
batch = 523
epochs = 1000
# `batch` is passed positionally as the third argument of fit
# (presumably the batch_size slot — confirm against the Keras signature).
history = model.fit([X1_train, X2_train], y_train, 
                     batch, 
                     epochs = epochs, 
                     callbacks=callbacks,
                     validation_data = ([X1_test, X2_test], y_test),
                     # NOTE(review): 0.26 and 1.74 look like 2*0.13 and 2*0.87, i.e. derived
                     # from the ~87% positive share printed above — confirm.
                     class_weight = {0: (1/0.26), 1: (1/1.74)}
                    )

ValueError: A target array with shape (523, 1) was passed for an output of shape (None, 300, 1) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.

In [None]:
# Training vs. validation loss per epoch, read from the fit history.
for curve_key in ('loss', 'val_loss'):
    plt.plot(history.history[curve_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss (Binary Cross-Entropy)')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
# The y-axis shows accuracy; the original label was copied from the loss plot.
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# No-skill baseline: a constant 0 for every row of y_test.
# NOTE(review): the class-balance cell reports ~87% of y equal to 1, so 0 may
# not be the majority class — confirm which constant is intended.
ns_probs = [0] * len(y_test)

In [None]:
# keep probabilities for the positive outcome only
#plt.plot(history.history['val_auc*'])

In [None]:
# Model predictions for the held-out split.
# NOTE(review): the model built in this notebook takes three inputs (word ids,
# mask, segment ids), but two arrays are passed here — confirm which model this
# cell is meant for.
y_pred = model.predict([X1_test, X2_test])

In [None]:
import pyroc

In [None]:
# Build a pyroc ROC object from the test labels and the model predictions.
roc = pyroc.ROC(y_test, y_pred)

In [None]:
# Resize the current figure to 8.5 x 3.5 inches, then call the ROC object's
# plot method and keep its return value in `rocc`.
fig = plt.gcf()
fig.set_size_inches(8.5, 3.5, forward=True)
rocc =roc.plot()

In [None]:
# GRU better than LSTM? https://www.aclweb.org/anthology/R19-1116.pdf
# good tut on Malstm and siamese: https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07
# explanation of LSTM: http://colah.github.io/posts/2015-08-Understanding-LSTMs/
# Our old friend Prabhnoor: https://medium.com/@prabhnoor0212/siamese-network-keras-31a3a8f37d04
## How to deal with imbalanced data: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

# Using bert layer: https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b
#                   https://androidkt.com/simple-text-classification-using-bert-in-tensorflow-keras-2-0/

