In [159]:
import re
import os 
import time
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy import sparse
import ast

import keras
from keras.layers import Input, Dense

import gensim 
from gensim.models import Word2Vec 

from sklearn import model_selection, preprocessing, metrics
from sklearn.metrics import recall_score

import textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

import data_lake_helper as dl_helper

In [59]:
# Shared DataLake handle (version 'v3'); the cells below use it to load and
# save pickled objects via load_obj / save_obj.
data_lake = dl_helper.DataLake(version='v3')

In [60]:
def prepare_data(df_to_prepare, key='text_normalized'):
    """Split a frame into parallel feature and label lists.

    Parameters
    ----------
    df_to_prepare : DataFrame holding a `key` column and a `category` column.
    key : name of the column to use as model input.

    Returns
    -------
    tuple
        ``(features, labels)`` where both elements are plain Python lists
        in the frame's row order.
    """
    features = df_to_prepare[key].tolist()
    labels = df_to_prepare.category.tolist()
    return features, labels

In [61]:
def load_feature(feature, load_version=None):
    """Load a pickled feature from the data lake into the global `df`.

    The object stored as ``<feature>.pkl`` is assigned to ``df[feature]``,
    so the global frame `df` is mutated in place and nothing is returned.

    Parameters
    ----------
    feature : column name; also the base name of the pickle file.
    load_version : data-lake version to read from. ``None`` (default) reuses
        the notebook-wide `data_lake` handle.
    """
    if load_version is None:
        data_lake_ = data_lake
    else:
        # DataLake is only reachable through the `dl_helper` module alias;
        # the bare name `DataLake` is not defined in this notebook.
        data_lake_ = dl_helper.DataLake(version=load_version)

    df[feature] = data_lake_.load_obj(feature + '.pkl')

# Loading features 

In [121]:
# Base table; feature columns are attached to it on demand by load_feature().
df = data_lake.load_obj('df-cleaned.pkl')

# Name of the text column the tokenizer is fitted on further down.
f_name = 'text_normalized'
load_feature(f_name)

# 'letter_lenght' (sic) is loaded before the split so that both sub-tables
# carry it; it is used later to choose the padding length.
load_feature('letter_lenght')

# The train/test membership of a row is encoded in its `path` value.
df_train_table = df[df.path == 'dataset/train_set/']
df_test_table = df[df.path == 'dataset/test_set/']

#reducing df_train_table for testing purposes
#n = 2500
#df_train_table = df_train_table[:n]

In [122]:
train_x, train_y = prepare_data(df_train_table)
valid_x, valid_y = prepare_data(df_test_table)

# Label-encode the target variable.
# The encoder is fitted on the training labels only and the same mapping is
# re-used for validation. Re-fitting on the validation labels would silently
# produce a different label -> integer mapping whenever the two sets do not
# contain exactly the same categories. With `transform`, a validation label
# that was never seen in training raises ValueError instead.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)


In [123]:
# Sorted list of the distinct category names found in `df`; predict_nn pairs
# these names with the per-class recall values.
# NOTE(review): this presumes the sorted order matches the LabelEncoder's
# integer codes - confirm.
letter_types = sorted(df.category.unique().tolist())

In [124]:
# Choose the fixed sequence length used for padding: the larger of the train
# and test *mean* `letter_lenght`, rounded to an int. This is not the longest
# letter; the `.max()` alternative is left commented out at the end of the line.
load_feature('letter_lenght')
pad_sequence_max_len = round(max(df_train_table.letter_lenght.mean(), df_test_table.letter_lenght.mean()))#.max()
pad_sequence_max_len = int(pad_sequence_max_len)
# The column is only needed to derive the padding length, so drop it from `df`.
del df['letter_lenght']
print(pad_sequence_max_len)

6646


In [125]:
# Create a tokenizer and fit its vocabulary on the text column of the whole
# `df` (train and test rows alike, since both tables are slices of `df`).
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(df[f_name])
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

# Convert each text to a sequence of token indices and pad/truncate every
# sequence to `pad_sequence_max_len` so all input vectors have equal length.
train_seq_x = sequence.pad_sequences(tokenizer.texts_to_sequences(train_x), maxlen=pad_sequence_max_len)
valid_seq_x = sequence.pad_sequences(tokenizer.texts_to_sequences(valid_x), maxlen=pad_sequence_max_len)

In [126]:
# Reverse lookup (token index -> word) derived from the Keras Tokenizer
# vocabulary; it is used to get a sentence back as a list of word tokens.
word_index_ = {index: word for word, index in word_index.items()}

def custom_tokenizer(text_to_tokenize):
    """Tokenize one document with the fitted Keras tokenizer.

    The text is converted to token indices and each index is mapped back to
    its word, so the result is the list of words the tokenizer kept.
    """
    token_ids = tokenizer.texts_to_sequences([text_to_tokenize])[0]
    words = []
    for token_id in token_ids:
        words.append(word_index_[token_id])
    return words

# Word Embeddings 

In [68]:
#Word Embeddings
##########################################

# Load the cached Word2Vec model from the data lake; if that fails, train a
# new model on the tokenized corpus and cache it.

try:
    word2vec_model = data_lake.load_obj('word2vec_data' + '.pkl')

except Exception:
    # `Exception` rather than a bare `except`, so Ctrl-C / kernel interrupts
    # are not mistaken for a cache miss.
    # NOTE(review): narrow this to the exception load_obj raises for a
    # missing file once that is confirmed.
    print("loading pre-trained word-embedding vectors...")
    time_start = time.time()

    # Word2Vec expects a list of lists:
    # - each inner list holds the tokens of one document
    # - the outer list is the corpus
    # The loop variable must not be called `text`: that name is the
    # keras.preprocessing module used by the tokenizer cell, and rebinding
    # it here breaks `text.Tokenizer()` when that cell is re-run.
    corpus_tokens = [custom_tokenizer(document) for document in df['text_normalized']]

    word2vec_model = Word2Vec(corpus_tokens)
    data_lake.save_obj(word2vec_model,'word2vec_data' + '.pkl')

    print(word2vec_model)
    print("loading finished - time: ", time.time() - time_start)


In [113]:
# create token-embedding mapping

# - If mask_zero is set to True, as a consequence, index 0 cannot be used
#   in the vocabulary (input_dim should equal size of vocabulary + 1).
#

not_founded_words = 0
vector_dim = word2vec_model.vector_size
# Row i holds the vector of the word with tokenizer index i. Row 0 (the
# padding index) and rows of words without a vector stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, vector_dim), dtype='float32')

# Look vectors up through `.wv`: indexing the model object directly is
# deprecated and removed in gensim 4, where the old bare `except` would have
# silently counted every word as missing.
word_vectors = word2vec_model.wv

for word, i in word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
    else:
        # The word has no vector in the model (e.g. it fell below Word2Vec's
        # frequency cutoff during training).
        not_founded_words += 1

print('total words', len(word_index))
print('not_founded_words', not_founded_words)
print('embedding_matrix.shape', embedding_matrix.shape)

  del sys.path[0]


total words 157052
not_founded_words 103265
embedding_matrix.shape (157053, 100)


In [114]:
# Remove the feature that we won't use anymore.
# Earlier cells that read df['text_normalized'] (tokenizer fitting, Word2Vec
# training) cannot be re-run after this without reloading the column.
del df['text_normalized']

### t-SNE word embedding

In [None]:
import numpy as np
from sklearn.manifold import TSNE
from numpy import save as numpy_save
from numpy import load as numpy_load
# Imported under an alias: binding this module to the name `preprocessing`
# would shadow sklearn's `preprocessing` (imported at the top of the
# notebook) and break any later `preprocessing.LabelEncoder()` call.
from keras import preprocessing as keras_preprocessing
import seaborn as sns

# Reuse the cached 2-D projection when it exists; otherwise compute it from
# the embedding matrix and cache it for the next run.
try:
    embedding_matrix_tsne = numpy_load('embedding_matrix_tsne.npy')
except (OSError, ValueError):
    # OSError: cache file missing or unreadable; ValueError: file content
    # cannot be loaded. Anything else (e.g. a kernel interrupt) propagates.
    # random_state is fixed so the projection is reproducible across runs.
    embedding_matrix_tsne = TSNE(n_components=2, random_state=0).fit_transform(embedding_matrix)
    numpy_save('embedding_matrix_tsne.npy', embedding_matrix_tsne)

In [None]:
sns.set(rc={'figure.figsize':(11.7,8.27)})

# Each row of `embedding_matrix_tsne` is one vocabulary *word* (row i is the
# word with tokenizer index i), not one training document. The per-document
# category labels therefore cannot be used as `hue`: their length does not
# match the number of plotted points. The points are drawn without
# a category colouring, and the global `train_y` / `encoder` used for model
# training are left untouched.
# x / y are passed by keyword; recent seaborn versions reject positional data.
sns.scatterplot(x=embedding_matrix_tsne[:, 0], y=embedding_matrix_tsne[:, 1]);


# Model Building

In [115]:
# - If mask_zero is set to True, as a consequence, index 0 cannot be used
#   in the vocabulary (input_dim should equal size of vocabulary + 1).
#

def get_embedding_layer_v1(input_layer):
    """Apply a frozen embedding initialised from `embedding_matrix`.

    The layer dimensions are taken from the matrix shape, index 0 is masked
    and the weights are not trainable. Returns the layer's output tensor for
    `input_layer`.
    """
    vocab_rows, embedding_dim = embedding_matrix.shape
    frozen_embedding = layers.Embedding(
        input_dim=vocab_rows,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        mask_zero=True,
        trainable=False,
    )
    return frozen_embedding(input_layer)
    
def get_embedding_layer_v2(input_layer):
    """Apply a trainable embedding that is learned from scratch.

    No pre-trained weights are supplied; the layer has `vector_dim` output
    dimensions, masks index 0 and is trainable. Returns the layer's output
    tensor for `input_layer`.
    """
    # size of vocab, +1 because index 0 cannot be used when mask_zero=True
    vocab_size = len(word_index) + 1
    trainable_embedding = layers.Embedding(
        input_dim=vocab_size,
        output_dim=vector_dim,
        mask_zero=True,
        trainable=True,
    )
    return trainable_embedding(input_layer)

In [133]:
# Sorted distinct category names; the softmax output layer built by create_nn
# gets one neuron per category.
letter_types = sorted(df.category.unique().tolist())
output_layer_neurons = len(letter_types)

def create_nn(layer_provider, embedding_version):
    """Build and compile a sequence classifier around a pluggable layer.

    Architecture: input -> embedding -> SpatialDropout1D(0.3) ->
    `layer_provider.build_layer(...)` -> Dense(50, relu) -> Dropout(0.3) ->
    Dense(softmax over `output_layer_neurons` classes).

    Parameters
    ----------
    layer_provider : object exposing ``build_layer(vector_dim, prev_layer)``
        that returns the output tensor of the custom layer.
    embedding_version : ``'v1'`` selects get_embedding_layer_v1; any other
        value selects get_embedding_layer_v2.

    Returns
    -------
    The compiled Keras model (Adam optimizer, sparse categorical
    cross-entropy loss).
    """
    # Input: one padded token-index sequence of fixed length per sample
    sequence_input = layers.Input((pad_sequence_max_len, ))

    # Word embedding, selected by version
    if embedding_version == 'v1':
        embed = get_embedding_layer_v1
    else:
        embed = get_embedding_layer_v2
    embedded = embed(sequence_input)
    embedded = layers.SpatialDropout1D(0.3)(embedded)

    # Custom layer supplied by the caller
    custom_features = layer_provider.build_layer(vector_dim, embedded)

    # Classification head
    hidden = layers.Dense(50, activation="relu")(custom_features)
    hidden = layers.Dropout(0.3)(hidden)
    class_probabilities = layers.Dense(output_layer_neurons, activation="softmax")(hidden)

    # sparse_categorical_crossentropy because the labels are integer-encoded,
    # not one-hot encoded.
    model = models.Model(inputs=sequence_input, outputs=class_probabilities)
    model.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy')

    return model

In [117]:
def fit_nn(classifier, nn_name, epochs=1, batch_size=128, validation_split=0.2):
    """Return a fitted classifier, loading it from the data lake if possible.

    Parameters
    ----------
    classifier : compiled Keras model to train when nothing can be loaded.
    nn_name : base name of the pickle (``<nn_name>.pkl``) in the data lake.
    epochs, batch_size, validation_split : forwarded to ``classifier.fit``;
        the defaults are the values that used to be hard-coded here.

    Returns
    -------
    The loaded model, or `classifier` after it has been trained in place on
    the global `train_seq_x` / `train_y`.
    """
    try:
        # load_obj is called with the file name only, like every other
        # load_obj call in this notebook. The previous call also passed
        # `classifier` as first argument, so the lookup could never succeed
        # and the bare `except` hid that.
        # NOTE(review): saving is disabled below, so a pickle found here was
        # produced by an earlier run - confirm it is not stale.
        classifier = data_lake.load_obj(nn_name + '.pkl')
    except Exception:
        # Nothing usable is cached: train the model that was passed in.
        classifier.fit(train_seq_x, train_y,
                       epochs=epochs, batch_size=batch_size,
                       validation_split=validation_split, verbose=1)
        #data_lake.save_obj(classifier, nn_name +'.pkl')

    return classifier

In [118]:
def predict_nn(classifier):
    """Print per-class recall and overall accuracy on the validation set.

    Predictions are made on the global `valid_seq_x` and compared with the
    global `valid_y`. Nothing is returned; the metrics are printed.
    """
    class_probabilities = classifier.predict(valid_seq_x)
    # Predicted class = index of the highest probability; computed once and
    # shared by both metrics.
    predicted = class_probabilities.argmax(axis=-1)

    # Pin the label set so recall_score returns exactly one value per letter
    # type, in `letter_types` order. Without `labels`, classes absent from
    # both `valid_y` and the predictions are dropped from the result and the
    # zip below pairs recall values with the wrong names.
    # NOTE(review): assumes integer code i corresponds to letter_types[i]
    # (encoder fitted on data containing every category) - confirm.
    class_ids = list(range(len(letter_types)))
    items_recall = recall_score(valid_y, predicted, labels=class_ids, average=None)

    print('RECALL')
    for item in zip(letter_types, items_recall):
        print(str(item))

    # Arguments in the documented (y_true, y_pred) order.
    accuracy = metrics.accuracy_score(valid_y, predicted)
    print('ACCURACY')
    print(accuracy)

# LSTM 

In [120]:
class LSTMLayer:
    """Layer provider for create_nn that contributes a single LSTM layer."""

    def __init__(self):
        # Identifier of this variant; fit_nn uses it as the pickle base name.
        self.name = 'model_lstm'

    def build_layer(self, vector_dim, prev_layer):
        """Apply an LSTM with `vector_dim` units to `prev_layer` and return its output."""
        recurrent = layers.LSTM(vector_dim)
        return recurrent(prev_layer)

In [92]:
# Build, fit and evaluate the LSTM variant with the 'v1' embedding.
# Author's note: the output recorded below was obtained with train size = 2500.
lstm = create_nn(LSTMLayer(), embedding_version='v1')
lstm = fit_nn(lstm, LSTMLayer().name)
predict_nn(lstm)

RECALL
('AK', 0.0)
('AR', 0.025)
('CL', 0.6109324758842444)
('ER', 0.0)
('FL', 0.0)
('ND', 0.0)
('RL', 0.9069767441860465)
('TM', 0.0)
('UR', 0.0)
('UU', 0.0)
ACCURACY
0.5449620801733478


# GRU

In [128]:
class GRULayer:
    """Layer provider for create_nn that contributes a single GRU layer."""

    def __init__(self):
        # Identifier of this variant; fit_nn uses it as the pickle base name.
        self.name = 'model_gru'

    def build_layer(self, vector_dim, prev_layer):
        """Apply a GRU with `vector_dim` units to `prev_layer` and return its output."""
        recurrent = layers.GRU(vector_dim)
        return recurrent(prev_layer)

# Build, fit and evaluate the GRU variant with the 'v1' embedding.
# Author's note: the output recorded below was obtained with train size = 2500.
gru = create_nn(GRULayer(), embedding_version='v1')
gru = fit_nn(gru, GRULayer().name)
predict_nn(gru)

RECALL
('AK', 0.0)
('AR', 0.075)
('CL', 0.7459807073954984)
('ER', 0.0)
('FL', 0.06930693069306931)
('ND', 1.0)
('RL', 0.005813953488372093)
('TM', 0.0)
('UR', 0.25)
('UU', 0.01282051282051282)
ACCURACY
0.27085590465872156


# bidirectional rnn  

In [131]:
class BidirectionalGRULayer:
    """Layer provider for create_nn that contributes a bidirectional GRU."""

    def __init__(self):
        # Identifier of this variant; fit_nn uses it as the pickle base name.
        self.name = 'model_bidirectional_gru'

    def build_layer(self, vector_dim, prev_layer):
        """Apply a Bidirectional-wrapped GRU with `vector_dim` units to `prev_layer`."""
        inner_gru = layers.GRU(vector_dim)
        bidirectional = layers.Bidirectional(inner_gru)
        return bidirectional(prev_layer)

# Build, fit and evaluate the bidirectional GRU variant with the 'v1' embedding.
# Author's note: the output recorded below was obtained with train size = 2500.
bidir_gru = create_nn(BidirectionalGRULayer(), embedding_version='v1')
bidir_gru = fit_nn(bidir_gru, BidirectionalGRULayer().name)
predict_nn(bidir_gru)

RECALL
('AK', 0.0)
('AR', 0.05)
('CL', 0.8102893890675241)
('ER', 0.0)
('FL', 0.0)
('ND', 0.3333333333333333)
('RL', 0.0)
('TM', 0.0)
('UR', 0.0)
('UU', 0.01282051282051282)
ACCURACY
0.27735644637053086


In [134]:
class BidirectionalLSTMLayer:
    """Layer provider for create_nn that contributes a bidirectional LSTM."""

    def __init__(self):
        # Identifier of this variant; fit_nn uses it as the pickle base name.
        self.name = 'model_bidirectional_lstm'

    def build_layer(self, vector_dim, prev_layer):
        """Apply a Bidirectional-wrapped LSTM with `vector_dim` units to `prev_layer`."""
        inner_lstm = layers.LSTM(vector_dim)
        bidirectional = layers.Bidirectional(inner_lstm)
        return bidirectional(prev_layer)

# Build, fit and evaluate the bidirectional LSTM variant with the 'v1' embedding.
# Author's note: the output recorded below was obtained with train size = 2500.
bidir_lstm = create_nn(BidirectionalLSTMLayer(), embedding_version='v1')
bidir_lstm = fit_nn(bidir_lstm, BidirectionalLSTMLayer().name)
predict_nn(bidir_lstm)

RECALL
('AK', 0.0)
('AR', 0.0)
('CL', 0.8520900321543409)
('ER', 0.0)
('FL', 0.0)
('ND', 0.0)
('RL', 0.0029069767441860465)
('TM', 0.0)
('UR', 0.0)
('UU', 0.0)
ACCURACY
0.28819068255687974


##### Final observations
- We demonstrate that the LSTM, GRU and Bidirectional GRU networks are working properly. They were trained on only the first 2500 items because the training task consumed many resources. However, the training of the NNs was not the best: because only the first 2500 items were taken, it is unknown whether the validation dataset has a distribution similar to that of the training dataset, or even whether some categories are missing.

- RandomSearch or GridSearch could be used to find the best classifier hyperparameters, but since I'm focusing on learning different RNN architectures I will postpone this.

- Cross-validation is missing; it would give a better estimate of how accurately the predictive model will perform in practice.