# Deep Learning models
## ELMo and GloVe:

## Imports and environment configurations:

Run this notebook only after installing the dependencies listed in `requirements.txt`: `pip install -r requirements.txt`.

Note that this notebook was run on Google Colab, where we took advantage of the GPU offered by that service.
In addition, some magic commands such as `%tensorflow_version 1.x` are only available on Colab; that command is the equivalent of `!pip install tensorflow==1.15.2`.

## Imports and environment configurations:

In [None]:
#%tensorflow_version 1.x #Colab magic command
import os
import re
import numpy as np
import pandas as pd
import keras
from keras import regularizers
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from keras import layers
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
from keras.layers import Input, Lambda, Dense, Concatenate, GaussianNoise, Bidirectional, LSTM , Flatten, Permute, RepeatVector
from keras.models import Model

TensorFlow 1.x selected.


Using TensorFlow backend.


In [None]:
def clean_reader(file_path):
    """
    Reads a cleaned tweet file and returns each tweet as a space-separated string.
    Every non-empty line is treated as one tweet whose tokens are separated by
    commas; lines that are empty after stripping are skipped.
    Arguments: file_path (path to a UTF-8 text file, one tweet per line)
    Returns: list of str, one per non-empty line. Every token is followed by a
             single space, so each returned tweet ends with a trailing space.
    """
    # The context manager closes the file even if reading fails; the original
    # left the handle open until it was garbage collected.
    with open(file_path, "r", encoding='utf-8') as handle:
        stripped_lines = [line.strip() for line in handle]
    # Replacing the separators and appending one space gives the same string as
    # concatenating `token + ' '` for every comma-separated token.
    return [line.replace(',', ' ') + ' ' for line in stripped_lines if line != '']

In [None]:
# Read both cleaned training corpora: positive tweets first, negative second.
x_pos, x_neg = [
    clean_reader(corpus_path)
    for corpus_path in ("../cleaned_data/cleaned_train_pos.txt",
                        "../cleaned_data/cleaned_train_neg.txt")
]

In [None]:
# One tokenizer is fitted on both classes so they share a single vocabulary.
tokenizer = Tokenizer(num_words=100000)
for corpus in (x_pos, x_neg):
    tokenizer.fit_on_texts(corpus)

# Convert every tweet into its sequence of integer word ids.
sequences_pos, sequences_neg = [
    tokenizer.texts_to_sequences(texts) for texts in (x_pos, x_neg)
]

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 88699 unique tokens.


In [None]:
# Longest id sequence in each class, then the longer of the two.
max_pos = max(map(len, sequences_pos))
max_neg = max(map(len, sequences_neg))
max_ = max_pos if max_pos >= max_neg else max_neg

# Pad every sequence to that common length; padding='post' pads at the end.
data_pos, data_neg = [
    pad_sequences(id_sequences, maxlen=max_, padding='post')
    for id_sequences in (sequences_pos, sequences_neg)
]

In [None]:
# One label per padded tweet: ones for the positive class, zeros for the negative.
# NOTE(review): neither array is referenced again in the visible notebook, and
# `labels_val` holds the negative-class labels despite its name.
labels_pos = np.ones(data_pos.shape[0])
labels_val = np.zeros(data_neg.shape[0])

### GloVe embeddings

In [None]:
# Parse the pre-trained GloVe file into a {word: vector} lookup. Each line is
# split on whitespace: the first field is the word, the remaining fields are
# its coefficients.
embeddings_index = {}
# `with` guarantees the file is closed even if a malformed line raises while
# parsing; the original only closed it when the loop finished cleanly.
with open('../Resources/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# Row i of the matrix receives the GloVe vector of the word whose tokenizer
# index is i. The matrix has len(word_index) + 1 rows and 300 columns.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    if word not in embeddings_index:
        # No pre-trained vector: the row keeps its all-zero initialisation.
        continue
    embedding_matrix[i] = embeddings_index[word]

- We load this embedding matrix into a Keras `Embedding` layer, which gives us `glove_embedding_layer`.

In [None]:
# Non-trainable Embedding layer whose weights are the GloVe matrix built above.
# input_length is not set on this layer.
glove_embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=300,
    weights=[embedding_matrix],
    trainable=False,
)

### ELMo embeddings

- Now we implement the ELMo embedding layer.

In [None]:
# Initialize the TensorFlow 1.x session and register it with the Keras backend.
# The session has to exist before it can be handed to K.set_session.
sess = tf.Session()
K.set_session(sess)

In [None]:
# Re-read the cleaned training corpora (the same two files that were loaded
# at the start of the notebook): positive tweets first, negative second.
x_pos, x_neg = [
    clean_reader(corpus_path)
    for corpus_path in ("../cleaned_data/cleaned_train_pos.txt",
                        "../cleaned_data/cleaned_train_neg.txt")
]

In [None]:
def _read_stripped_lines(file_path):
    """Return the stripped, non-empty lines of a UTF-8 text file."""
    # Blank lines are dropped to stay row-aligned with clean_reader, which
    # skips them too when building the texts fed to the Keras tokenizer.
    # The context manager closes the handle (the original never closed it).
    with open(file_path, "r", encoding='utf-8') as handle:
        return [line for line in (raw.strip() for raw in handle) if line != '']


def _fit_to_width(tokens, width, pad_token=''):
    """Return a copy of `tokens` truncated or right-padded to exactly `width` items."""
    # The original padded with `while len(elem) != max_`, which never
    # terminates once a row already holds more than max_ tokens.
    return tokens[:width] + [pad_token] * max(0, width - len(tokens))


x_poss = _read_stripped_lines("../cleaned_data/cleaned_train_pos.txt")
x_negg = _read_stripped_lines("../cleaned_data/cleaned_train_neg.txt")

# Number of comma-separated tokens per tweet, measured before padding/truncation.
lengths_x_pos = [len(line.split(',')) for line in x_poss]
lengths_x_neg = [len(line.split(',')) for line in x_negg]

# Token rows of exactly max_ entries each, padded with the empty string.
x_pos_tok = [_fit_to_width(line.split(','), max_) for line in x_poss]
x_neg_tok = [_fit_to_width(line.split(','), max_) for line in x_negg]

# One numpy string array per tweet.
x_pos_tokenized = [np.array(tokens) for tokens in x_pos_tok]
x_neg_tokenized = [np.array(tokens) for tokens in x_neg_tok]

With the tokens signature, the module takes tokenized sentences as input. 

The input tensor is a string tensor with shape [batch_size, max_length] and an int32 tensor with shape [batch_size] corresponding to the sentence length. 

The length input is necessary to exclude padding in the case of sentences with varying length.

In [None]:
# Sanity check of the installed TensorFlow version; this notebook targets 1.15.2.
print(tf.__version__)  ##verify:= 1.15.2 ?

1.15.2


In [None]:
# Load the ELMo module (version 3) from TensorFlow Hub.
elmo_module = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

def ELMoEmbedding3(x, pad_token=''):
    """
    Embeds a batch of tokenized sentences with the ELMo module's 'tokens' signature.
    Arguments: x (string tensor of tokens, one row per sentence; short
                  sentences are right-padded with `pad_token`)
               pad_token (string used as padding in `x`; defaults to the empty
                  string, which is what this notebook pads its token rows with)
    Returns: the 'word_emb' entry of the module's output dictionary
    """
    # sequence_len must be the number of real tokens in each row. The original
    # searched for a '--PAD--' sentinel with argmax, but the rows are padded
    # with '', so nothing ever matched and every length came out as 0.
    # Here the length is the 1-based position of the last non-padding token,
    # which is also correct for rows that contain no padding at all.
    is_token = K.cast(K.not_equal(x, pad_token), 'int32')
    positions = tf.range(1, tf.shape(x)[1] + 1)
    lengths = K.cast(K.max(is_token * positions, axis=-1), 'int32')
    return elmo_module(inputs=dict(tokens=x, sequence_len=lengths),
                       as_dict=True,
                       signature='tokens',
                       )['word_emb']

### ELMo + GloVe Embedding model:

- Define the two sets of inputs, one per embedding branch.

In [None]:
# Create the ELMo input, embedding layer and model.
# The sequence length is taken from max_ (the padded length computed from the
# training data) instead of a hard-coded 48, so the input shape always matches
# the token rows that are fed to it.
input_text = Input(shape=(max_,), dtype=tf.string)
embedding = Lambda(ELMoEmbedding3, output_shape=(max_, 512))(input_text)
elmo = Model(inputs=[input_text], outputs=embedding)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
# GloVe branch input: one row of integer word ids per tweet. The row length is
# taken from max_ (the padded length computed from the training data) instead
# of a hard-coded 48, so it always matches the padded id sequences.
input_glove = Input(shape=(max_,), dtype="int32")
# the second branch operates on the second input
glove_embedding = glove_embedding_layer(input_glove)
glove = Model(inputs=input_glove, outputs=glove_embedding)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [None]:
# Join the outputs of the GloVe and ELMo branches.
c = Concatenate()([glove.output, elmo.output])

# Gaussian noise on the joined embeddings (to help prevent overfitting).
c = GaussianNoise(0.3)(c)

# Bidirectional LSTM that returns its output at every time step.
lstm_units = 64
recurrent_layer = LSTM(
    lstm_units,
    return_sequences=True,
    bias_regularizer=regularizers.l2(1e-4),
    dropout=0.25,
)
x = Bidirectional(recurrent_layer)(c)

# Attention weights: one tanh score per time step, softmax over the steps,
# then repeated and permuted so they can be multiplied with x element-wise.
step_scores = Dense(1, activation='tanh', bias_regularizer=regularizers.l2(1e-4))(x)
flat_scores = Flatten()(step_scores)
step_weights = layers.Activation('softmax')(flat_scores)
repeated_weights = RepeatVector(2 * lstm_units)(step_weights)
attention = Permute([2, 1])(repeated_weights)

# Weight the LSTM outputs, sum them over the time axis, and classify.
senti = layers.Multiply()([x, attention])
senti = Lambda(
    lambda weighted_steps: K.sum(weighted_steps, axis=-2),
    output_shape=(2 * lstm_units,),
)(senti)
ourput_layer = Dense(
    1,
    activation='sigmoid',
    bias_regularizer=regularizers.l2(1e-4),
)(senti)

In [None]:
# The full model takes the inputs of both branches (ELMo tokens and GloVe ids)
# and produces a single output value.
model = Model(
    inputs=[input_text, input_glove],
    outputs=ourput_layer,
)

# Adam optimizer with binary cross-entropy loss; accuracy is tracked as metric.
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 48)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 48)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 48, 300)      26610000    input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 48, 512)      0           input_1[0][0]                    
____________________________________________________________________________________________

In [None]:
# Split sizes used by this run: the first 180000 shuffled rows form the
# training pool and the rest the held-out pool; only the rows after the
# offsets below are actually passed to fit().
TRAIN_POOL_SIZE = 180000
TRAIN_OFFSET = 90000
VALIDATION_OFFSET = 10000
# Fixed seed so the shuffle, and therefore the split, is reproducible.
SPLIT_SEED = 42

# Inputs of the ELMo branch (token rows) and their raw token counts.
x_tokenized = x_pos_tokenized + x_neg_tokenized
x_lengths = lengths_x_pos + lengths_x_neg

# Inputs of the GloVe branch (padded word-id sequences).
sequences_pos = tokenizer.texts_to_sequences(x_pos)
padded_pos = pad_sequences(sequences_pos, maxlen=max_, padding='post')

sequences_neg = tokenizer.texts_to_sequences(x_neg)
padded_neg = pad_sequences(sequences_neg, maxlen=max_, padding='post')

x_padded = np.concatenate([padded_pos, padded_neg])

# Labels follow the actual class sizes instead of a hard-coded 100000 per
# class, so they stay aligned with x_padded whatever the corpus size.
y = len(padded_pos) * [1] + len(padded_neg) * [0]

# Both model inputs and the labels are indexed with the same permutation, so
# they must describe the same rows; fail loudly rather than train on
# misaligned data.
if not (len(x_tokenized) == len(x_lengths) == x_padded.shape[0] == len(y)):
    raise ValueError(
        "Misaligned inputs: %d token rows, %d lengths, %d padded rows, %d labels"
        % (len(x_tokenized), len(x_lengths), x_padded.shape[0], len(y))
    )

indices = np.random.RandomState(SPLIT_SEED).permutation(x_padded.shape[0])
training_idx, test_idx = indices[:TRAIN_POOL_SIZE], indices[TRAIN_POOL_SIZE:]

# Convert the Python lists once (the original rebuilt the token array twice).
x_tokenized_array = np.array(x_tokenized)
x_lengths_array = np.array(x_lengths)
y_array = np.array(y)

x_tokenized_train, x_tokenized_test = x_tokenized_array[training_idx, :], x_tokenized_array[test_idx, :]
x_lengths_train, x_lengths_test = x_lengths_array[training_idx], x_lengths_array[test_idx]
x_padded_train, x_padded_test = x_padded[training_idx, :], x_padded[test_idx, :]
y_train, y_test = y_array[training_idx], y_array[test_idx]

model.fit([x_tokenized_train[TRAIN_OFFSET:], x_padded_train[TRAIN_OFFSET:]],
          y_train[TRAIN_OFFSET:],
          validation_data=([x_tokenized_test[VALIDATION_OFFSET:], x_padded_test[VALIDATION_OFFSET:]],
                           y_test[VALIDATION_OFFSET:]),
          epochs=2,
          batch_size=126, verbose=1)



















Train on 90000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7f5a0afbb550>

In [None]:
test_path = "../cleaned_data/cleaned_test_data.txt"

# Stripped, non-empty lines of the test file. The context manager closes the
# handle (the original never closed it), and blank lines are dropped so the
# token rows built here line up one-to-one with x_test below, which skips
# blank lines as well.
# NOTE(review): a dropped blank line shifts every later row relative to the
# submission Ids; confirm the cleaned test file contains none.
with open(test_path, "r", encoding='utf-8') as test_file:
    tesst = [line for line in (raw.strip() for raw in test_file) if line != '']

# Number of comma-separated tokens per tweet, measured before padding/truncation.
lengths_test = [len(line.split(',')) for line in tesst]

# Token rows of exactly max_ entries, padded with ''. Truncating first avoids
# the original `while len(elem) != 48` loop, which never terminates for a row
# longer than the target, and max_ keeps the width consistent with the
# padded id sequences built from the same tweets.
x_test_tok = []
for line in tesst:
    tokens = line.split(',')[:max_]
    tokens.extend([''] * (max_ - len(tokens)))
    x_test_tok.append(tokens)

x_test_tokenized = [np.array(tokens) for tokens in x_test_tok]

# Space-joined tweets for the Keras tokenizer; clean_reader implements exactly
# the logic that was duplicated inline here.
x_test = clean_reader(test_path)

In [None]:
# Encode the test tweets with the tokenizer fitted on the training tweets,
# then pad them at the end to the same length max_ as the training data.
sequences_test = tokenizer.texts_to_sequences(x_test)
padded_test = pad_sequences(
    sequences=sequences_test,
    padding='post',
    maxlen=max_,
)

In [None]:
# Model outputs for the test set, one row per tweet.
y_pred = model.predict([x_test_tokenized, padded_test])

# Threshold at 0.5: outputs below it become -1, everything else becomes 1.
predictions = [-1 if row[0] < 0.5 else 1 for row in y_pred]


In [None]:
def create_csv_submission(y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: y_pred (predicted class labels, one per test sample, in order)
               name (string name of .csv output file to be created)
    The 'Id' column is generated here as 1, 2, ..., len(y_pred).
    """
    # Imported locally because the notebook's import cell does not import csv;
    # without it the csv.DictWriter call below raised NameError.
    import csv

    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        # One id per prediction. The original zipped against a hard-coded
        # range 1..10000, which silently dropped any prediction past the
        # 10000th.
        for sample_id, label in enumerate(y_pred, start=1):
            writer.writerow({'Id': int(sample_id), 'Prediction': int(label)})

In [None]:
# Write the thresholded test predictions to the submission file.
# Scores noted for this submission: 0.827 accuracy, 0.831 F1.
create_csv_submission(predictions,"ELMo_GloVe_MODEL.csv") #0.827 Acc 0.831 F1