In [1]:
import re
import os 
import time
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy import sparse
import ast

import seaborn as sns
import matplotlib.pyplot  as plt

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

import keras
from keras import Sequential
from keras import layers, models, optimizers
from keras.layers import Input, Dense, AlphaDropout
from keras.optimizers import Adam
from keras.preprocessing import text, sequence

from sklearn import model_selection, preprocessing, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer

import pickle
import textblob, string

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

import data_lake_helper as dl_helper

In [3]:
# Shared data-lake handle; the cells below use it to load and cache artifacts.
DATA_LAKE_VERSION = 'v3'
data_lake = dl_helper.DataLake(version=DATA_LAKE_VERSION)

In [3]:
def prepare_data(df_to_prepare, key='text_normalized'):
    """Pull the model inputs and their labels out of a data frame.

    Args:
        df_to_prepare: frame exposing the ``key`` column and a ``category``
            column.
        key: name of the column holding the input values.

    Returns:
        tuple: ``(inputs, labels)`` as two plain Python lists in row order.
    """
    inputs = df_to_prepare[key].tolist()
    labels = df_to_prepare.category.tolist()
    return inputs, labels

In [4]:
def load_feature(feature, load_version=None):
    """Load a stored feature and attach it to the global ``df`` as a column.

    The object is read from ``<feature>.pkl`` via the data lake's ``load_obj``
    and assigned to ``df[feature]``.

    Args:
        feature: feature name; used both as the file stem and as the key
            written into ``df``.
        load_version: optional data-lake version to read from. When ``None``
            the module-level ``data_lake`` handle is used.

    Returns:
        None. The global ``df`` is modified in place.
    """
    if load_version is None:
        source_lake = data_lake
    else:
        # Only the `dl_helper` module is imported, so the class must be
        # reached through it (a bare `DataLake` raises NameError).
        source_lake = dl_helper.DataLake(version=load_version)

    df[feature] = source_lake.load_obj(feature + '.pkl')

# Loading features 

In [5]:
# Load the cleaned frame and attach the normalized-text feature to it.
df = data_lake.load_obj('df-cleaned.pkl')
f_name = 'text_normalized'
load_feature(f_name)

# Split the rows into train / test tables according to their `path` value.
is_train_row = df.path == 'dataset/train_set/'
is_test_row = df.path == 'dataset/test_set/'
df_train_table = df[is_train_row]
df_test_table = df[is_test_row]

In [6]:
train_x, train_y = prepare_data(df_train_table)
valid_x, valid_y = prepare_data(df_test_table)

# Binarize the target variable. The encoder is fitted on the training labels
# only and then re-used for the test labels, so both label matrices share the
# same column <-> class mapping. Re-fitting on the test labels would rebuild
# the mapping from whichever classes happen to appear in the test set.
encoder = preprocessing.LabelBinarizer()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)

In [7]:
# Distinct category values found in `df`, in sorted order.
distinct_categories = df.category.unique().tolist()
letter_types = sorted(distinct_categories)

In [8]:
# Remove the feature column that we won't use anymore from `df`.
# NOTE(review): df_train_table / df_test_table are indexed with f_name in a
# later cell; presumably they keep the column because they were sliced from
# `df` before this deletion - confirm.
del df[f_name]

## SNN

In [9]:
# Hyper-parameters for the vectorizer and the network; the cells below read
# them by key from `model_params`.
SNN_CONFIG_FILE = 'snn_config.txt'
model_params = data_lake.load_config(SNN_CONFIG_FILE)

In [10]:
LINE_BREAK = '\n'


class BLCounter(TransformerMixin, BaseEstimator):
    """Stateless transformer that counts the line breaks in each text."""

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, texts):
        """Return one single-element row ``[n_line_breaks]`` per input text."""
        rows = []
        for document in texts:
            rows.append([document.count(LINE_BREAK)])
        return rows

class TextCuter(TransformerMixin, BaseEstimator):
    """Stateless transformer that truncates every text to a maximum length.

    Args:
        max_chars: number of leading characters kept from each text. Defaults
            to 15000, the cutoff that was previously hard-coded, so
            ``TextCuter()`` behaves exactly as before.
    """

    def __init__(self, max_chars=15000):
        # Stored under the same name as the argument so that BaseEstimator's
        # get_params / set_params can see it.
        self.max_chars = max_chars

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, texts):
        """Return a list with each text cut to its first ``max_chars`` characters."""
        return [text[:self.max_chars] for text in texts]

In [11]:

# Re-use the cached feature matrices when they can be loaded; otherwise build
# them from the text column and store them for the next run.
try:
    train_x = data_lake.load_npz('snn_xtrain_vect.npz')
    valid_x = data_lake.load_npz('snn_xvalid_vect.npz')

except Exception as cache_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate, and the reason for the rebuild is shown.
    print("cached SNN features unavailable (%r) - vectorizing..." % (cache_error,))

    # Scaled n-gram counts of the (truncated) text.
    # with_mean=False: the scaler only rescales here, it does not center.
    count_features = Pipeline([
        ("TC", TextCuter()),
        ("CV", CountVectorizer(ngram_range=model_params['ngram_range'],
                               max_features=model_params['max_features'],
                               stop_words=model_params['letters_language'])),
        ("SS", StandardScaler(with_mean=False))])

    # Standardized line-break count: one extra column.
    line_break_feature = Pipeline([("BC", BLCounter()), ("SS", StandardScaler())])

    vectorizer = FeatureUnion([
        ("c_vect", count_features),
        ("len", line_break_feature)])

    # Fit on the training table only; the test table is just transformed.
    train_x = vectorizer.fit_transform(df_train_table[f_name])
    valid_x = vectorizer.transform(df_test_table[f_name])

    data_lake.save_npz(train_x, 'snn_xtrain_vect.npz')
    data_lake.save_npz(valid_x, 'snn_xvalid_vect.npz')


In [12]:

# Input width is read from the feature matrix itself instead of assuming
# `max_features + 1`: the vectorizer may produce fewer than `max_features`
# columns, and `max_features` may be None.
n_input_features = train_x.shape[1]
initializer = model_params['neurons_initializer']

pipeline = Sequential()
pipeline.add(AlphaDropout(model_params['input_layer_dropout'],
                          input_shape=(n_input_features,)))

# Hidden stack: each Dense layer is followed by an AlphaDropout layer.
for _ in range(model_params['n_hidden_layers']):
    pipeline.add(Dense(model_params['neurons_per_layer'],
                       activation=model_params['hidden_layers_activation'],
                       bias_initializer=initializer,
                       kernel_initializer=initializer))
    pipeline.add(AlphaDropout(model_params['hidden_layer_dropout']))

# Output layer: one unit per letter type.
pipeline.add(Dense(len(letter_types),
                   bias_initializer=initializer,
                   kernel_initializer=initializer,
                   activation=model_params['output_layer_activation']))

# NOTE(review): `lr` / `decay` are the older Keras spellings of these Adam
# arguments; left untouched to match the Keras version this notebook targets.
pipeline.compile(optimizer=Adam(lr=model_params['optimizer_learning_rate'],
                                decay=model_params['optimizer_learning_decay']),
                 loss=model_params['training_loss'],
                 metrics=model_params['training_metrics_list'])


In [13]:
EPOCHS = 100

# Re-use a previously stored model when it can be loaded; otherwise train the
# network compiled above and store it.
# NOTE(review): the '.pkl' name suggests load_obj unpickles the file - only
# load model files from a trusted location.
try:
    pipeline = data_lake.load_obj('snn_model.pkl')

except Exception:
    # `except Exception` instead of a bare `except:` so that interrupting the
    # kernel (KeyboardInterrupt) during the load does not start a training run.
    print("fitting SNN...")
    time_start = time.time()

    # validation_split=0.2: Keras holds out 20% of the training data for the
    # per-epoch validation metrics.
    pipeline.fit(
        train_x, train_y,
        epochs=EPOCHS, batch_size=128, validation_split=0.2, verbose=1)

    data_lake.save_obj(pipeline, 'snn_model.pkl')

    print("fitting finished - time: ", time.time() - time_start)
    print()

In [14]:
score = pipeline.predict(valid_x)

# Collapse the label matrix and the network outputs to class indices.
true_classes = valid_y.argmax(-1)
predicted_classes = score.argmax(axis=-1)

# Pin the label set so recall_score returns exactly one value per entry of
# letter_types. Without `labels`, classes absent from both vectors are dropped
# and zip() below would silently pair recalls with the wrong names.
# Assumes class index i corresponds to letter_types[i] - TODO confirm.
class_indices = list(range(len(letter_types)))
items_recall = recall_score(true_classes, predicted_classes,
                            labels=class_indices, average=None)

print('RECALL')
for item in zip(letter_types, items_recall):
    print(str(item))

print()
print('ACCURACY')
metrics.accuracy_score(true_classes, predicted_classes)

RECALL
('AK', 0.7391304347826086)
('AR', 1.0)
('CL', 0.9967845659163987)
('ER', 0.8181818181818182)
('FL', 1.0)
('ND', 0.0)
('RL', 0.9854651162790697)
('TM', 0.75)
('UR', 1.0)
('UU', 0.0)

ACCURACY


0.8959913326110509