# Deep Learning models
## ELMo embeddings:

## Imports and environment configurations:

Run this notebook only after installing all the requirements listed in `requirements.txt` (`pip install -r requirements.txt`).

Note that this notebook was run on Google Colab in order to take advantage of the GPU offered by that service.
In addition, some magic commands such as `%tensorflow_version 1.x` are only available on Colab; that command is the equivalent of `!pip install tensorflow==1.15.1`. (We use this version of TensorFlow for compatibility and stability reasons.)

In [None]:
#%tensorflow_version 1.x
import csv

import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Lambda, Dense
from keras.models import Model
import keras.backend as K
from sklearn import preprocessing

In [None]:
# Show the TensorFlow version of the running kernel; the notebook was written
# against TensorFlow 1.x, so this is a quick sanity check of the environment.
print(tf.__version__)

1.15.2


In [6]:
# Load the pre-cleaned positive / negative training tweets and the test tweets.
# NOTE(review): `load_cleaned_data` is not defined or imported in any visible cell of
# this notebook, so this cell fails on a fresh kernel — import it from the project
# module that provides it (TODO confirm which one).
# presumably `full=False` selects the smaller sample dataset and `stop_words=True`
# enables stop-word removal (the log below ends with "removing stop words") — verify.
pos_tweets, neg_tweets, test = load_cleaned_data(full=False, stop_words=True)

Ommiting repetitions
Translating emojis
removing numbers
adding <tag> for hashtags
tokenizing
removing pontuations
dealing with slang words
removing stop words


In [None]:
# Build the train / test frames used below; later cells read the 'tweets' column of
# both frames and the 'sign' (label) column of train_df.
# NOTE(review): `create_train_test_dfs` is not defined or imported in any visible cell —
# import it from the project module that provides it (TODO confirm which one).
train_df,test_df = create_train_test_dfs(pos_tweets, neg_tweets, test)

## ELMo model 1.0 (Default mode: sentence):


In [None]:
# Load the ELMo v3 module from TF Hub (downloaded on first use).
# NOTE(review): `trainable=True` is requested, but the model summary further down
# reports 0 parameters for the Lambda layer that wraps this module, so Keras does not
# appear to fine-tune ELMo here — confirm whether that is intended.
embed = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

In [None]:
# for default elmo mode
def create_sentence(x):
    """Join an iterable of string tokens into one space-separated sentence.

    Used for ELMo's default (sentence) mode, which takes whole strings as input.
    """
    return " ".join(x)

In [None]:
# for elmo tokenized mode
def elmo_tokens(x):
    """Package a tokenized tweet as an input dict for ELMo's tokenized mode.

    Returns a dict holding the token sequence itself under "tokens" and its
    length under "sequence_len".
    """
    return {"tokens": x, "sequence_len": len(x)}

In [None]:
# Convert every tweet to the input format of the chosen ELMo mode: swap in
# `create_sentence` for the default (sentence) mode or `elmo_tokens` for the tokenized mode.
# The column is overwritten in place, so the isinstance guard leaves already-joined
# sentences untouched; without it, re-running this cell would join the individual
# *characters* of each sentence with spaces.
train_df['tweets'] = train_df['tweets'].apply(lambda x: x if isinstance(x, str) else create_sentence(x))
test_df['tweets'] = test_df['tweets'].apply(lambda x: x if isinstance(x, str) else create_sentence(x))

In [None]:
# Pull plain NumPy arrays out of the frames: model inputs, labels, and test inputs.
X_train = train_df['tweets'].values
y_train = train_df['sign'].values
X_test = test_df['tweets'].values

In [None]:
# Encoding and decoding our labels.
# The encoder is fitted on the raw 'sign' column rather than on `y_train`, because
# `y_train` is replaced by its one-hot version at the end of this cell; reading the
# raw column keeps the cell safe to re-run.
raw_labels = train_df['sign'].values
le = preprocessing.LabelEncoder()
le.fit(raw_labels)

def encode(le, labels):
    """One-hot encode `labels` using the fitted label encoder `le`.

    Labels are first mapped to integer class ids with `le.transform`, then
    expanded with `keras.utils.to_categorical`.
    """
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def decode(le, one_hot):
    """Map one-hot rows or per-class scores back to the original labels.

    The arg-max along axis 1 selects the class id of each row, which
    `le.inverse_transform` converts back to the label it was fitted on.
    """
    dec = np.argmax(one_hot, axis=1)
    return le.inverse_transform(dec)

y_train = encode(le, raw_labels)

In [None]:
#Train Keras neural model with ELMO Embeddings

def ELMoEmbedding(x):
    """Embed a batch of sentences with the ELMo module's "default" signature.

    `x` is the Keras string input declared with shape (1,) per sample. Only
    axis 1 is squeezed so the batch dimension survives: a bare `tf.squeeze`
    would also collapse a batch containing a single sentence into a scalar,
    which the module cannot embed.

    Returns the module's "default" output (declared as a 1024-wide vector per
    sentence by the Lambda layer that wraps this function).
    """
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    return embed(inputs=sentences,
                 signature="default",
                 as_dict=True)["default"]

# Sentence-level classifier: one raw string per sample -> ELMo sentence vector ->
# 256-unit ReLU layer -> 2 output units (one per one-hot label column).
input_text = Input(shape=(1,), dtype=tf.string)
# The Lambda wraps the TF Hub module; per the summary below it contributes 0 parameters.
embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)
dense = Dense(256, activation='relu')(embedding)
# NOTE(review): two independent sigmoid units with binary cross-entropy are used on
# one-hot targets; softmax + categorical cross-entropy is the more common pairing —
# confirm this is intentional before changing, since the saved weights depend on it.
pred = Dense(2, activation='sigmoid')(dense)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "model_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_29 (InputLayer)        (None, 1)                 0         
_________________________________________________________________
lambda_29 (Lambda)           (None, 1024)              0         
_________________________________________________________________
dense_53 (Dense)             (None, 256)               262400    
_________________________________________________________________
dense_54 (Dense)             (None, 2)                 514       
Total params: 262,914
Trainable params: 262,914
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fitting the model.
# A fresh TF session is opened and registered with Keras; variables and lookup tables
# are initialised inside it before training starts.
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    # 4% of the training data is held out for validation.
    history = model.fit(X_train, y_train, validation_split = 0.04,  epochs=3, batch_size=120, verbose=1)
    # Weights are saved inside the `with` block because the session (and the trained
    # variable values it holds) is closed on exit.
    # NOTE(review): assumes the ./NN_MODELS_WEIGHTS directory already exists — confirm.
    model.save_weights('./NN_MODELS_WEIGHTS/elmo-model_10-12-2020.h5')

Train on 192000 samples, validate on 8000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Predicting tweets sentiment.
# A new session starts with freshly initialised variables, so the trained weights are
# loaded from disk *after* the initialisers have run and before predicting.
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    model.load_weights('./NN_MODELS_WEIGHTS/elmo-model_10-12-2020.h5')
    predicts = model.predict(X_test, batch_size=16)

# Convert the per-class scores back to the original label values.
y_preds = decode(le, predicts)

In [None]:
# Map decoded labels to the -1 / 1 convention used in the submission file.
# Both 0 and -1 are treated as the negative class: this keeps the cell idempotent
# (with `x == 0` alone, re-running it would turn every -1 into 1) and also works if
# the 'sign' labels were already stored as -1 / 1.
y_preds = [-1 if x in (0, -1) else 1 for x in y_preds]

In [None]:
def create_csv_submission(y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle.

    Ids are generated automatically, starting at 1, one per prediction, so the
    file always contains exactly len(y_pred) rows (the previous hard-coded id
    range silently dropped everything after the 10000th prediction).

    Arguments: y_pred (iterable of predicted class labels, each convertible to int)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for row_id, label in enumerate(y_pred, start=1):
            # int() normalises NumPy scalars so they are written as plain integers.
            writer.writerow({'Id': row_id, 'Prediction': int(label)})

In [None]:
# Write the sentence-mode predictions to disk.
# Recorded result for this run (presumably the competition leaderboard score — verify):
create_csv_submission(y_preds,"ELMo_submission_sample_training.csv") #0.777 Accuracy , 0.787 F1

## ELMo model 2.0 (Tokens):


In [8]:
#This was used in Google Colab
#import sys
#sys.path.append('/content/drive/Shareddrives/Project ML/')

In [2]:
#%tensorflow_version 1.x

TensorFlow 1.x selected.


In [3]:
# Import our dependencies
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import keras.layers as layers
from keras.engine import Layer
from keras.layers import Input, Lambda, Dense, Concatenate, GaussianNoise, Bidirectional, LSTM , Flatten, Permute, RepeatVector
from keras.models import Model, load_model
import keras.backend as K
from keras import regularizers

# Initialize session: create one TF 1.x session and register it as the Keras backend
# session, so the Keras calls in the rest of this section run inside it.
sess = tf.Session()
K.set_session(sess)

Using TensorFlow backend.


In [1]:
def clean_reader(file_path):
    """Read a cleaned tweet file and return its tweets as space-separated strings.

    Each non-empty line of the UTF-8 file holds one tweet as comma-separated
    tokens. Lines are stripped, empty lines are skipped, and the commas are
    replaced by single spaces. Every returned tweet ends with a trailing space
    (kept for compatibility with the previous implementation).

    Arguments: file_path (path of the cleaned tweet file)
    Returns: list of strings, one per non-empty line
    """
    # The context manager guarantees the file handle is closed after reading.
    with open(file_path, "r", encoding='utf-8') as handle:
        stripped_lines = [line.strip() for line in handle]
    return [' '.join(line.split(',')) + ' ' for line in stripped_lines if line != '']

In [2]:
# Load the cleaned positive and negative training tweets as space-separated strings.
# Paths are relative to the notebook's working directory.
x_pos = clean_reader("../cleaned_data/cleaned_train_pos.txt")
x_neg = clean_reader("../cleaned_data/cleaned_train_neg.txt")

In [12]:
# Keras tokenizer, used here only to measure tweet lengths and vocabulary size.
tokenizer = Tokenizer(num_words=100000)

# Word counts accumulate across calls, so both corpora share a single vocabulary.
for corpus in (x_pos, x_neg):
    tokenizer.fit_on_texts(corpus)

sequences_pos = tokenizer.texts_to_sequences(x_pos)
sequences_neg = tokenizer.texts_to_sequences(x_neg)

word_index = tokenizer.word_index
vocabulary_size = len(word_index)
print('Found %s unique tokens.' % vocabulary_size)

Found 88699 unique tokens.


In [13]:
# Longest tokenized tweet in each class, then overall; `max_` is the padding width
# applied to the token lists in the next cell.
max_pos = max(map(len, sequences_pos))
max_neg = max(map(len, sequences_neg))
max_ = max_pos if max_pos >= max_neg else max_neg

In [14]:
def _read_stripped_lines(path):
    """Return every line of the UTF-8 file at `path`, stripped of surrounding whitespace."""
    # The context manager closes the handle; the bare open(...) used before leaked it.
    with open(path, "r", encoding='utf-8') as handle:
        return [line.strip() for line in handle]


def _pad_tokens(tokens, width):
    """Return `tokens` as a new list of exactly `width` items, padded with ''.

    Lists longer than `width` are truncated. The previous
    `while len(elem) != max_: elem.append('')` loop never terminated for such lists.
    """
    return tokens[:width] + [''] * (width - len(tokens))


# Raw comma-separated lines (empty lines are kept here, unlike in clean_reader).
x_poss = _read_stripped_lines("../cleaned_data/cleaned_train_pos.txt")
x_negg = _read_stripped_lines("../cleaned_data/cleaned_train_neg.txt")

# Number of comma-separated tokens per tweet, before padding.
lengths_x_pos = [len(line.split(',')) for line in x_poss]
lengths_x_neg = [len(line.split(',')) for line in x_negg]

# Token lists brought to the common width `max_` computed in the previous cell.
x_pos_tok = [_pad_tokens(line.split(','), max_) for line in x_poss]
x_neg_tok = [_pad_tokens(line.split(','), max_) for line in x_negg]

# One NumPy string array per tweet, ready to be stacked into the model input.
x_pos_tokenized = [np.array(tokens) for tokens in x_pos_tok]
x_neg_tokenized = [np.array(tokens) for tokens in x_neg_tok]

In [15]:
# We will load the ELMo module from TF Hub (same URL as `embed` in the sentence-mode
# section; this section keeps its own handle under the name `elmo_module`).
elmo_module = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

def ELMoEmbedding3(x):
    """Embed a batch of pre-tokenized tweets with ELMo's "tokens" signature.

    `x` is a 2-D string tensor (batch, tokens) in which short tweets are padded.
    The true length of each row is the number of tokens that are not padding.
    Both padding conventions are recognised: the empty string '' (used by the
    data-preparation cells of this notebook) and the literal '--PAD--'.
    The previous arg-max lookup of '--PAD--' yielded a length of 0 for every row
    that contains no '--PAD--' token, i.e. for all data padded with ''.

    Returns the module's 'word_emb' output.
    """
    is_real_token = tf.logical_and(tf.not_equal(x, ''), tf.not_equal(x, '--PAD--'))
    lengths = tf.reduce_sum(tf.cast(is_real_token, tf.int32), axis=1)
    return elmo_module(inputs=dict(tokens=x, sequence_len=lengths),
                       as_dict=True,
                       signature='tokens',
                       )['word_emb']

In [16]:
# Token-level classifier: 48 string tokens per tweet -> ELMo word embeddings ->
# bidirectional LSTM -> flatten -> single sigmoid unit.
# NOTE(review): the width 48 is hard-coded here and in the test-data padding cell;
# it presumably equals `max_` computed above — confirm before changing the data.
input_text = Input(shape=(48,) ,dtype=tf.string)
embedding = Lambda(ELMoEmbedding3,output_shape=(48,512))(input_text)

# return_sequences=True keeps one output vector per token (see the (None, 48, 128)
# row in the summary below).
x = Bidirectional(layers.LSTM(64, return_sequences=True,bias_regularizer=regularizers.l2(1e-4),dropout=0.25))(embedding)

# Despite the name, this is a plain Flatten of the per-token LSTM outputs;
# no attention mechanism is applied.
attention = Flatten()(x)

output_layer = layers.Dense(1,activation='sigmoid',bias_regularizer=regularizers.l2(1e-4))(attention)

elmo_model_tokenized = Model(inputs=[input_text],outputs=output_layer)


elmo_model_tokenized.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
elmo_model_tokenized.summary()


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 48)                0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 48, 512)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 48, 128)           295424    
_________________________________________________________________
flatten_1 (Flatten)          (None, 6144)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6145      
Total params: 301,569
Trainable params: 301,569
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Stack positive tweets followed by negative tweets into one (n_tweets, width) array.
x_tokenized = np.asarray(x_pos_tokenized + x_neg_tokenized)

# Labels follow the same order (1 = positive, 0 = negative). Their counts are taken
# from the data instead of a hard-coded 100000 so they cannot drift out of sync
# with the inputs.
y = len(x_pos_tokenized) * [1] + len(x_neg_tokenized) * [0]

# Seeded shuffle so the 90% / 10% split is reproducible across kernel restarts
# (180000 / 20000 rows for the 200000-tweet sample dataset).
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(x_tokenized.shape[0])
n_train = (9 * len(indices)) // 10
training_idx, test_idx = indices[:n_train], indices[n_train:]

x_tokenized_train, x_tokenized_test = x_tokenized[training_idx, :], x_tokenized[test_idx, :]
y_train, y_test = np.array(y)[training_idx], np.array(y)[test_idx]

In [19]:
# Training.
# Only a slice of each split is used: training rows from index 90000 onward and
# held-out rows from index 10000 onward (the log below reports 90000 training and
# 10000 validation samples).
# NOTE(review): presumably a deliberate sub-sample to limit training time — confirm.
elmo_model_tokenized.fit(x_tokenized_train[90000:], 
                         y_train[90000:],
                         validation_data=(x_tokenized_test[10000:], y_test[10000:]),
                         epochs=2,
                         batch_size=256, verbose=1)



















Train on 90000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7faa44b41940>

In [20]:
# Persist the trained weights of the token-level model.
# NOTE(review): assumes the ./NN_MODELS_WEIGHTS directory already exists — confirm.
elmo_model_tokenized.save_weights('./NN_MODELS_WEIGHTS/elmo-model_tokens.h5')

In [24]:
# Read the cleaned test tweets; the context manager closes the file handle, which
# the bare open(...) used before leaked.
with open("../cleaned_data/cleaned_test_data.txt", "r", encoding='utf-8') as test_file:
    tesst = [line.strip() for line in test_file]

# Number of comma-separated tokens per test tweet, before padding.
lengths_test = [len(line.split(',')) for line in tesst]

# Bring every token list to exactly 48 entries, the width declared by the model's
# Input(shape=(48,)): shorter tweets are padded with '' and longer ones truncated.
# The previous `while len(elem) != 48: elem.append('')` loop never terminated for a
# tweet with more than 48 tokens.
x_test_tok = []
for line in tesst:
    tokens = line.split(',')[:48]
    x_test_tok.append(tokens + [''] * (48 - len(tokens)))

# One NumPy string array per tweet, as expected by the predict cell.
x_test_tokenized = [np.array(tokens) for tokens in x_test_tok]

# Space-separated version of the same tweets.
x_test = clean_reader("../cleaned_data/cleaned_test_data.txt")

In [28]:
# Score every padded test tweet; each row of y_pred holds the single sigmoid output.
y_pred = elmo_model_tokenized.predict([x_test_tokenized])

# Threshold at 0.5: scores below it become -1, everything else becomes 1.
predictions = [-1 if score[0] < 0.5 else 1 for score in y_pred]

In [35]:
# Write the token-mode predictions to disk.
# Recorded result for this run (presumably the competition leaderboard score — verify):
create_csv_submission(predictions,"ELMo_tokens_sample_training.csv") #0.824 acc	0.822 F-1