In [1]:
import re
import os 
import time
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy import sparse
import ast

import seaborn as sns
import matplotlib.pyplot  as plt

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

import keras
from keras.layers import Input, Dense

import gensim 
from gensim.models import Word2Vec 

from sklearn import model_selection, preprocessing, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer

import pickle

import textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

Using TensorFlow backend.


In [2]:
class DataLake:
    
    path = 'features/'
    
    def __init__(self, version):
        self.version = version 
        self.version_path = self.path + version + "/"
        
        #create version folder if not exists
        if not os.path.exists(self.path + self.version):
            try:
                os.makedirs(self.path + self.version)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
    
    def save_obj(self, obj, file_name):
        with open(self.version_path + file_name, 'wb') as output:  # Overwrites any existing file.
            pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
    
    def save_npz(self, obj, file_name):
        sparse.save_npz(self.version_path + file_name, obj)
    
    def load_npz(self, file_name):
        return sparse.load_npz(self.version_path + file_name)
    
    def load_obj(self, obj_name):
        with open(self.version_path + obj_name, 'rb') as input:
            return pickle.load(input)

    def load_config(self, file_name):
        config_path = 'configs/' + self.version + "/"
        file = open(config_path + file_name, "r")
        contents = file.read()
        return ast.literal_eval(contents)
    
data_lake = DataLake(version='v3')

In [3]:
def prepare_data(df_to_prepare, key='text_normalized'):
    train_x = df_to_prepare[key].tolist()
    train_y = df_to_prepare.category.tolist()
    return (train_x, train_y)

In [4]:
def load_feature(feature, load_version=None):

    if load_version is None:
        data_lake_ = data_lake
    else:
        data_lake_ = DataLake(version=load_version)
    
    df[feature] = data_lake_.load_obj(feature + '.pkl')

# Loading features 

In [5]:
df = data_lake.load_obj('df-cleaned.pkl')

f_name = 'text_normalized'
load_feature(f_name)

df_train_table = df[df.path == 'dataset/train_set/']
df_test_table = df[df.path == 'dataset/test_set/']

In [6]:
train_x, train_y = prepare_data(df_train_table)
valid_x, valid_y = prepare_data(df_test_table)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)


In [7]:
letter_types = sorted(df.category.unique().tolist())

In [10]:
# getting letter with higher words count
#
load_feature('letter_lenght')
pad_sequence_max_len = round(df.letter_lenght.mean())#.max()
del df['letter_lenght']
print(pad_sequence_max_len)

6541


In [11]:
# create a tokenizer
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(df[f_name])
word_index = tokenizer.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors
train_seq_x = sequence.pad_sequences(tokenizer.texts_to_sequences(train_x), maxlen=pad_sequence_max_len)
valid_seq_x = sequence.pad_sequences(tokenizer.texts_to_sequences(valid_x), maxlen=pad_sequence_max_len)

In [36]:
#obtengo la sentence tokenizada a partir del Tokenizer de keras
word_index_ = {v:k for k,v in word_index.items()}
 
def custom_tokenizer(text_to_tokenize):
    text_sequences = tokenizer.texts_to_sequences([text_to_tokenize])[0]
    return [word_index_[idx] for idx in text_sequences]

# Word Embeddings 

In [58]:
#Word Embeddings
##########################################

# load the pre-trained word-embedding vectors

try:
    word2vec_model = data_lake.load_obj('word2vec_data' + '.pkl')

except:
    print("loading pre-trained word-embedding vectors...")
    time_start = time.time()
    
    data = []

    for i in range(0,len(df)):
        #creating Word2Vec item with list of list:
        # - inner list represents a document
        # - outer list represents the corpus
        
        text = df.iloc[i]['text_normalized']
        data.append(custom_tokenizer(text))
    
    word2vec_model = Word2Vec(data)
    data_lake.save_obj(word2vec_model,'word2vec_data' + '.pkl')
    
    print(word2vec_model)
    print("loading finished - time: ", time.time() - time_start)


In [48]:
# create token-embedding mapping

# - If mask_zero is set to True, as a consequence, index 0 cannot be used
#   in the vocabulary (input_dim should equal size of vocabulary + 1).
#

not_founded_words = 0
vector_dim = word2vec_model.vector_size
embedding_matrix = np.zeros((len(word_index) + 1, vector_dim), dtype='float32')

for word, i in word_index.items():
    try:
        embedding_matrix[i] = word2vec_model[word]
    except:
        not_founded_words += 1
        pass

print('total words', len(word_index))
print('not_founded_words', not_founded_words)
print('embedding_matrix.shape', embedding_matrix.shape)

  del sys.path[0]


total words 157052
not_founded_words 103265
embedding_matrix.shape (157053, 100)


In [13]:
#removing feature that we wont't use anymore
del df['text_normalized']

# Model Building

In [47]:
# - If mask_zero is set to True, as a consequence, index 0 cannot be used
#   in the vocabulary (input_dim should equal size of vocabulary + 1).
#

def get_embedding_layer_v1(input_layer):
    return layers.Embedding(input_dim=embedding_matrix.shape[0],
                             output_dim=embedding_matrix.shape[1],
                             weights=[embedding_matrix],
                             mask_zero=True,
                             trainable=False)(input_layer)
    
def get_embedding_layer_v2(input_layer):
    
    return layers.Embedding(input_dim=len(word_index) + 1, #size of vocab                  
                             output_dim=vector_dim,
                             mask_zero=True,
                             trainable=True)(input_layer)

In [51]:
def create_nn(layer_provider):
    # Add an Input Layer
    input_layer = layers.Input((pad_sequence_max_len, ))

    # Add the word embedding Layer
    embedding_layer = get_embedding_layer_v1(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add Custom Layer
    custom_layer = layer_provider.build_layer(vector_dim, embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(custom_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

In [52]:
def fit_nn(classifier, nn_name):
    try:
        classifier = data_lake.load_obj(classifier, nn_name + '.pkl')
    except:    
        classifier.fit(train_seq_x, train_y,
                       epochs=3, batch_size=128, validation_split = 0.2, verbose=1)

        data_lake.save_obj(classifier, nn_name +'.pkl')

In [57]:
def predict_nn(classifier):
    score = classifier.predict(valid_seq_x)

    items_recall = recall_score(valid_y, score.argmax(axis=-1), average=None)

    print('RECALL')
    for item in zip(letter_types,items_recall):
        print(str(item))
        
    accuracy = metrics.accuracy_score(score.argmax(axis=-1), valid_y.argmax(axis=-1))
    print('ACCURACY')
    print(accuracy)

# LSTM 

In [56]:
class LSTMLayer:
    
    def __init__(self):
        self.name = 'model_lstm'

    def build_layer(self, vector_dim, prev_layer):
        return layers.LSTM(vector_dim)(prev_layer)

lstm = create_nn(LSTMLayer())
fit_nn(lstm, LSTMLayer().name)
predict_nn(lstm)

Train on 41887 samples, validate on 10472 samples
Epoch 1/3
  128/41887 [..............................] - ETA: 1:19:07 - loss: 0.4065

KeyboardInterrupt: 

# GRU

In [None]:
class GRULayer:
    def __init__(self):
        self.name = 'model_gru'

    def build_layer(self, vector_dim, prev_layer):
        return layers.GRU(vector_dim)(prev_layer)

gru = create_nn(GRULayer())
fit_nn(gru, GRULayer().name)
predict_nn(gru)

# bidirectional rnn  

In [None]:
class BidirectionalGRULayer:
    def __init__(self):
        self.name = 'model_bidirectional_gru'

    def build_layer(self, vector_dim, prev_layer):
        return layers.Bidirectional(layers.GRU(vector_dim))(prev_layer)

bidir_gru = create_nn(BidirectionalGRULayer())
fit_nn(bidir_gru, BidirectionalGRULayer().name)
predict_nn(bidir_gru)