In [None]:
#https://mlfromscratch.com/model-stacking-explained/#/
#https://machinelearningmastery.com/super-learner-ensemble-in-python/
#https://machinelearningmastery.com/stacking-ensemble-machine-learning-with-python/

In [1]:
import re
import os 
import time
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy import sparse
import ast

import seaborn as sns
import matplotlib.pyplot  as plt

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

import keras
from keras import Sequential
from keras import layers, models, optimizers
from keras.layers import Input, Dense, AlphaDropout
from keras.optimizers import Adam
from keras.preprocessing import text, sequence

from sklearn import model_selection, preprocessing, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer

import pickle
import textblob, string

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

import data_lake_helper as dl_helper

In [2]:
data_lake = dl_helper.DataLake(version='v3')

In [3]:
def prepare_data(df_to_prepare, key='text_normalized'):
    train_x = df_to_prepare[key].tolist()
    train_y = df_to_prepare.category.tolist()
    return (train_x, train_y)

In [4]:
def load_feature(feature, load_version=None):

    if load_version is None:
        data_lake_ = data_lake
    else:
        data_lake_ = DataLake(version=load_version)
    
    df[feature] = data_lake_.load_obj(feature + '.pkl')

# Loading features 

In [46]:
df = data_lake.load_obj('df-cleaned.pkl')

f_name = 'text_normalized'
load_feature(f_name)

df_train_table = df[df.path == 'dataset/train_set/']
df_test_table = df[df.path == 'dataset/test_set/']


In [47]:
train_x, train_y = prepare_data(df_train_table)
valid_x, valid_y = prepare_data(df_test_table)

# label encode the target variable 
encoder = preprocessing.LabelBinarizer()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [48]:
letter_types = sorted(df.category.unique().tolist())

In [14]:
#removing feature that we wont't use anymore
#del df[f_name]

## SNN

In [10]:
LINE_BREAK = '\n'
class BLCounter(TransformerMixin, BaseEstimator):
    def fit(self, x, y=None):
        return self
    def transform(self, texts):
        return [[text.count(LINE_BREAK)] for text in texts]

class TextCuter(TransformerMixin, BaseEstimator):
    def fit(self, x, y=None):
        return self
    def transform(self, texts):
        return [text[:15000] for text in texts]

In [99]:
model_params = data_lake.load_config('snn_config.txt')

snn_vectorizer = FeatureUnion([
  ("c_vect", Pipeline([
       ("TC", TextCuter()),
       ("CV", CountVectorizer(ngram_range=model_params['ngram_range'], 
                              max_features=model_params['max_features'],
                              stop_words=model_params['letters_language'])),
       ("SS", StandardScaler(with_mean=False))])),
  ("len", Pipeline([("BC", BLCounter()), ("SS", StandardScaler())]))])

train_x = snn_vectorizer.fit_transform(df_train_table[f_name])
valid_x = snn_vectorizer.transform(df_test_table[f_name])


FeatureUnion(transformer_list=[('c_vect',
                                Pipeline(steps=[('TC', TextCuter()),
                                                ('CV',
                                                 CountVectorizer(max_features=50000,
                                                                 ngram_range=[1,
                                                                              3],
                                                                 stop_words='english')),
                                                ('SS',
                                                 StandardScaler(with_mean=False))])),
                               ('len',
                                Pipeline(steps=[('BC', BLCounter()),
                                                ('SS', StandardScaler())]))])

In [123]:
#loading already trained SNN
pipeline = data_lake.load_obj('snn_model.pkl')

In [152]:

class SNNtransformer(TransformerMixin, BaseEstimator):
    
    _estimator_type = "classifier"
    
    def fit(self, x, y=None):
        return self
    
    def predict(self, texts):
        texts_t = snn_vectorizer.transform(texts)

        score = pipeline.predict(texts_t).argmax(axis=-1)
        encoder = preprocessing.LabelBinarizer()
        score_bin = encoder.fit_transform(score)
        
        return score_bin


In [153]:
snn_model_pipeline = make_pipeline(SNNtransformer())

In [140]:
snn_predictions = snn_model_pipeline.transform(df_test_table[f_name])
snn_accuracy = metrics.accuracy_score(snn_predictions, valid_y)
print(snn_accuracy)

0.15059588299024917


## RandomForestClassifier 

In [50]:
from sklearn.base import TransformerMixin, BaseEstimator

config = data_lake.load_config('tf_idf_word_vect_config.txt')

tfidf_vect = TfidfVectorizer(analyzer=config['analyzer'],
                             token_pattern=config['token_pattern'],
                             max_features=config['max_features'])
tfidf_vect.fit(df[f_name])


TfidfVectorizer(max_features=5000, token_pattern='\\w{1,}')

In [51]:
class TfIdfMapper(TransformerMixin, BaseEstimator):
    
    #https://github.com/scikit-learn/scikit-learn/issues/17597
    n_features_in_ = None
    
    def fit(self, x, y=None):
        return self
    def transform(self, texts):
        return tfidf_vect.transform(texts)


In [52]:
rforest_pipeline = make_pipeline(TfIdfMapper(), ensemble.RandomForestClassifier())
rforest_pipeline.fit(df_train_table[f_name], train_y)


Pipeline(steps=[('tfidfmapper', TfIdfMapper()),
                ('randomforestclassifier', RandomForestClassifier())])

In [69]:
predictions = rforest_pipeline.predict(df_test_table[f_name])
accuracy = metrics.accuracy_score(predictions, valid_y)
print(accuracy)

0.9014084507042254


## Stacking

In [154]:
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('rforest_pipeline', rforest_pipeline))
    level0.append(('snn_model_pipeline', snn_model_pipeline))

    # define meta learner model
    level1 = LogisticRegression()

    # define the stacking ensemble
    model = ensemble.StackingClassifier(estimators=level0, final_estimator=level1, cv=2)#5)
    return model

In [155]:
stack = get_stacking()
stack.fit(df_train_table[f_name][:1000], df_train_table.category[:1000])

StackingClassifier(cv=2,
                   estimators=[('rforest_pipeline',
                                Pipeline(steps=[('tfidfmapper', TfIdfMapper()),
                                                ('randomforestclassifier',
                                                 RandomForestClassifier())])),
                               ('snn_model_pipeline',
                                Pipeline(steps=[('snntransformer',
                                                 SNNtransformer())]))],
                   final_estimator=LogisticRegression())

In [156]:
stack_predictions = stack.predict(df_test_table[f_name])
accuracy = metrics.accuracy_score(stack_predictions, df_test_table.category)
print(accuracy)

0.856988082340195
