
# Load Libraries / Data

In [56]:
import numpy as np
import pandas as pd
import nltk
import re
from timeit import default_timer as timer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from IPython.display import Image

from sklearn.base import clone
from sklearn.externals import joblib
from sklearn.feature_selection import RFECV, SelectKBest, f_classif, chi2, SelectPercentile
from sklearn.preprocessing import Imputer, MinMaxScaler, StandardScaler, Normalizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_val_score, cross_val_predict, RandomizedSearchCV, ParameterGrid
from sklearn.model_selection import train_test_split, learning_curve, validation_curve, GridSearchCV, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD, NMF
from sklearn.metrics import accuracy_score, auc, f1_score, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier, Perceptron, RidgeClassifier, RidgeClassifierCV, Lasso, LassoCV, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn_pandas import DataFrameMapper

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector
from mlxtend.plotting import plot_decision_regions
from mlxtend.classifier import StackingClassifier, StackingCVClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
import hyperopt.pyll.stochastic

import lightgbm as lgb

import parfit.parfit as pf

from yellowbrick.model_selection import LearningCurve, ValidationCurve
from yellowbrick.classifier import ClassificationReport, ConfusionMatrix

In [57]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
sns.set()
sns.despine()

<Figure size 432x288 with 0 Axes>

In [58]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning)  
warnings.filterwarnings("ignore", category=UserWarning)  

np.set_printoptions(precision=2)

In [59]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Data


In [60]:
random_state = 42

kfold = StratifiedKFold(n_splits=8, shuffle=False, random_state=random_state)

In [61]:
data_raw = pd.read_csv("../../data/labeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)

In [62]:
data_raw.head()
data_raw.info()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [63]:
data_raw.sentiment.value_counts(normalize=True)

1    0.5
0    0.5
Name: sentiment, dtype: float64

# X  y

In [64]:
train_raw = data_raw[:20000]
test_raw = data_raw[20000:]

In [65]:
train_raw.head()
train_raw.info()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
id           20000 non-null object
sentiment    20000 non-null int64
review       20000 non-null object
dtypes: int64(1), object(2)
memory usage: 468.8+ KB


In [66]:
test_raw.head()
test_raw.info()

Unnamed: 0,id,sentiment,review
20000,"""3862_4""",0,"""I just watched it. A couple of laughs, but no..."
20001,"""674_10""",1,"""While to most people watching the movie, this..."
20002,"""8828_10""",1,"""I was so glad I came across this short film. ..."
20003,"""2963_8""",1,"""The creators of south park in their own film ..."
20004,"""2483_1""",0,"""Unspeakably discombobulated turkey, a mix of ..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 20000 to 24999
Data columns (total 3 columns):
id           5000 non-null object
sentiment    5000 non-null int64
review       5000 non-null object
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [67]:
X = train_raw[['review']]
y = train_raw[['sentiment']]

In [68]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=random_state)    

In [69]:
X_train.shape
X_train.head()
y_train.shape 
y_train.head()

(15000, 1)

Unnamed: 0,review
5514,"""There is a DVD published in the UK in 2002 Co..."
1266,"""Frownland is like one of those intensely emba..."
5864,"""I rented this because I'm a bit weary of '80s..."
15865,"""I like bad movies. I like to rent bad movies ..."
12892,"""The story line was very straight forward and ..."


(15000, 1)

Unnamed: 0,sentiment
5514,0
1266,1
5864,0
15865,0
12892,0


In [70]:
X_val.shape 
X_val.head()
y_val.shape
y_val.head()

(5000, 1)

Unnamed: 0,review
10650,"""I can't believe that Steven Segal's career ha..."
2041,"""I wasn't quite sure if this was just going to..."
8668,"""First of all, if you'r a fan of the comic, we..."
1114,"""I really liked this movie, and went back to s..."
13902,"""Yes, CHUNKY, this is the nick-name that Donna..."


(5000, 1)

Unnamed: 0,sentiment
10650,0
2041,1
8668,0
1114,1
13902,0


In [71]:
test_raw.head()

Unnamed: 0,id,sentiment,review
20000,"""3862_4""",0,"""I just watched it. A couple of laughs, but no..."
20001,"""674_10""",1,"""While to most people watching the movie, this..."
20002,"""8828_10""",1,"""I was so glad I came across this short film. ..."
20003,"""2963_8""",1,"""The creators of south park in their own film ..."
20004,"""2483_1""",0,"""Unspeakably discombobulated turkey, a mix of ..."


In [72]:
X_test = test_raw[['review']]
y_test = test_raw[['sentiment']]

In [73]:
X_test.shape
y_test.shape

(5000, 1)

(5000, 1)

In [74]:
X_train.shape
# X_train.info()
type(X_train)

(15000, 1)

pandas.core.frame.DataFrame

In [75]:
y_train.shape
# y_train.info()
type(y_train)

(15000, 1)

pandas.core.frame.DataFrame

In [76]:
X_test.shape
# X_test.info()
type(X_test)

(5000, 1)

pandas.core.frame.DataFrame

In [77]:
y_test.shape
# y_train.info()
type(y_test)

(5000, 1)

pandas.core.frame.DataFrame

# PRE LOADING

In [78]:
def tokenizer_split(text):
    return [porter.stem(w) for w in text.split(' ')]

def tokenizer_porter(text):
    return [porter.stem(w) for w in text.split(' ') if not w in stop_words ]

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

In [79]:
stop_words = nltk.corpus.stopwords.words('english')
porter = PorterStemmer()

In [80]:
%run ../modules/Class_imdb.ipynb
%run ../modules/transformers_imdb.ipynb

In [81]:
%run ../modules/params_imdb.ipynb

In [82]:
master = Super_Analytica()

# KNN

In [84]:
best_params = {
    'vec__max_df': 0.9,
    'vec__min_df': 2,
    'vec__ngram_range': (1, 3),
    'vec__sublinear_tf': True
}

In [85]:
pipe_11 = Pipeline([
    ('col', Col_Extractor(['review'])),
    ('prep', Preprocessor()),
    ('to_array', To_array()),
    ('vec', TfidfVectorizer(max_features=20000, stop_words='english'))
])

In [30]:
param['KNN_HP_1'] = {'vec__ngram_range': hp.choice('vec__ngram_range', [(1, 1), (1, 2), (1, 3)]),
                     'vec__min_df': 1+hp.randint('vec__min_df', 3),
                     'vec__max_df': hp.uniform('vec__max_df', 0.7, 1.0),
                     'vec__sublinear_tf': hp.choice('vec__sublinear_tf', [True, False]),
                     'KNN__n_neighbors': hp.choice('KNN__n_neighbors', [5, 10, 20, 50, 100])
                     }

md_KNN = Analytica('KNN', param['KNN_HP_1'], 'KNN_HP_cv_1')
md_KNN.evaluate_HP_cv(pipe_11, pkl_W=True)

cv score: 0.8635867991710253
cv score: 0.9181418809822073
cv score: 0.9180400187034932
cv score: 0.8812490042883298
cv score: 0.8709427189377432
cv score: 0.8828288012635919
cv score: 0.8815645852696157
cv score: 0.8847189196778402
cv score: 0.859410908060279
cv score: 0.8962396901224139
cv score: 0.9088575776477203
cv score: 0.9084367468127114
cv score: 0.8638321281275992
cv score: 0.8713825393886276
cv score: 0.8808014312794109
cv score: 0.8458000258183851
cv score: 0.9181310201126542
cv score: 0.9113226827087968
cv score: 0.88541751226959
cv score: 0.8824836517796526
test score : 0.9109078642824955
KNN_HP_cv_1  HP_cv and pickled to disk Done in 51.0  minutes


# TREES

In [31]:
clf_T

['DT', 'ADA', 'RF', 'XT']

In [32]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


pipe_9 = Pipeline([
    ('col', Col_Extractor(['review'])),
    ('prep', Preprocessor()),
    ('to_array', To_array()),
    ('vec', TfidfVectorizer(max_features=4000, stop_words='english', tokenizer=LemmaTokenizer())),
    ('kbest', SelectPercentile(chi2))
])

## RF


In [33]:
param['RF_HP_1'] = {'vec__ngram_range': hp.choice('vec__ngram_range', [(1, 1), (1, 2), (1, 3)]),
                    'vec__min_df': 1+hp.randint('vec__min_df', 3),
                    'vec__max_df': hp.uniform('vec__max_df', 0.7, 1.0),
                    'vec__sublinear_tf': hp.choice('vec__sublinear_tf', [True, False]),
                    'kbest__percentile': hp.uniform('kbest__percentile', 50, 100),
                    'RF__n_estimators': hp.choice('XRF__n_estimators', [1, 10, 100]),
                    'RF__min_samples_split': hp.choice("XRF__min_samples_split", [2, 5, 10])
                    }

md_RF = Analytica('RF', param['RF_HP_1'], 'RF_HP_cv_1')
md_RF.evaluate_HP_cv(pipe_9, pkl_W=True)

cv score: 0.8609212186641566
cv score: 0.9151532439440775
cv score: 0.9188926242342879
cv score: 0.8526289751620496
cv score: 0.8646061027964095
cv score: 0.6847619628262527
cv score: 0.6568572147867677
cv score: 0.9150687439716685
cv score: 0.9143412223915055
cv score: 0.6857283289053231
cv score: 0.6862960612167145
cv score: 0.6881330266822392
cv score: 0.9138440839502189
cv score: 0.9175665906172578
cv score: 0.8652829845468245
cv score: 0.8602839437147823
cv score: 0.9182693291399608
cv score: 0.8727119172604663
cv score: 0.6576564926598241
cv score: 0.660696991971852
test score : 0.9243954761685306
RF_HP_cv_1  HP_cv and pickled to disk Done in 110.0  minutes


## XT

In [34]:
param['XT_HP_1'] = {
#     'vec__ngram_range': hp.choice('vec__ngram_range', [(1, 1), (1, 2), (1, 3)]),
#     'vec__min_df': 1+hp.randint('vec__min_df', 3),
#     'vec__max_df': hp.uniform('vec__max_df', 0.7, 1.0),
#     'vec__sublinear_tf': hp.choice('vec__sublinear_tf', [True, False]),
#     'kbest__percentile': hp.uniform('kbest__percentile', 50, 100),
    "XT__n_estimators": hp.choice('XT__n_estimators', [10, 50, 100, 300 ,500, 1000]),
    "XT__max_features": hp.choice("XT__max_features", ['auto', 'sqrt'])  
}
md_XT = Analytica('XT', param['XT_HP_1'], 'XT_HP_cv_1')
md_XT.evaluate_HP_cv(pipe_9, pkl_W=True)

cv score: 0.91955756683169
cv score: 0.9204293417225304
cv score: 0.916253207577916
cv score: 0.916253207577916
cv score: 0.9204293417225304
cv score: 0.9208372615615411
cv score: 0.9204293417225304
cv score: 0.916253207577916
cv score: 0.916253207577916
cv score: 0.91955756683169
cv score: 0.9114242655849374
cv score: 0.9204293417225304
cv score: 0.9208372615615411
cv score: 0.9114242655849374
cv score: 0.9204293417225304
cv score: 0.9114242655849374
cv score: 0.916253207577916
cv score: 0.9204293417225304
cv score: 0.916253207577916
cv score: 0.9208372615615411
test score : 0.9269171524876081
XT_HP_cv_1  HP_cv and pickled to disk Done in 214.0  minutes


# LINEAR

In [35]:
P_algos = clf_L + ['NB', 'SGDC']

In [36]:
pipe_4 = Pipeline([
    ('col', Col_Extractor(['review'])),
    ('prep', Preprocessor()),
    ('to_array', To_array()),
    ('vec', TfidfVectorizer(stop_words='english'))
])

## LGR

In [37]:
param['LGR_HP_1'] = {'vec__ngram_range': hp.choice('vec__ngram_range', [(1, 1), (1, 2), (1, 3)]),
                     'vec__min_df': 1+hp.randint('vec__min_df', 3),
                     'vec__max_df': hp.uniform('vec__max_df', 0.7, 1.0),
                     'vec__sublinear_tf': hp.choice('vec__sublinear_tf', [True, False]),
                     'LGR__C': hp.choice('zz_LGR__C', [1.e-1, 1.e+0, 1.e+1, 1.e+2, 1.e+3])}

md_LGR = Analytica('LGR', param['LGR_HP_1'], 'LGR_HP_cv_1')
md_LGR.evaluate_HP_cv(pipe_4, pkl_W=True)

cv score: 0.9259066385865086
cv score: 0.9449830696733194
cv score: 0.957536945885062
cv score: 0.9499822921034902
cv score: 0.9553372821164023
cv score: 0.9271932609002183
cv score: 0.9579825619023741
cv score: 0.9578279835292726
cv score: 0.9579825619023741
cv score: 0.9431423095658205
cv score: 0.9551329450608764
cv score: 0.9523539659868917
cv score: 0.9480607289980103
cv score: 0.9561318741269375
cv score: 0.9132339973850523
cv score: 0.9569788559798167
cv score: 0.9500875158644093
cv score: 0.9533172179177993
cv score: 0.9576487702763121
cv score: 0.9514621531462295
test score : 0.956640961042153
LGR_HP_cv_1  HP_cv and pickled to disk Done in 40.0  minutes


## NB

In [39]:
param['NB_HP'] = {'vec__ngram_range': hp.choice('vec__ngram_range', [(1, 1), (1, 2), (1, 3)]),
                  'vec__min_df': 1+hp.randint('vec__min_df', 3),
                  'vec__max_df': hp.uniform('vec__max_df', 0.7, 1.0),
                  'vec__sublinear_tf': hp.choice('vec__sublinear_tf', [True, False]),
                  'NB__alpha': hp.choice('zz_alpha', [0.001, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0])} 

md_NB = Analytica('NB', param['NB_HP'], 'NB_HP_cv_1')
md_NB.evaluate_HP_cv(pipe_4, pkl_W=True)

cv score: 0.9197738502322675
cv score: 0.9440684330819463
cv score: 0.9364668165767746
cv score: 0.9476790725229912
cv score: 0.9459125585349135
cv score: 0.9262811140256602
cv score: 0.9341356161862976
cv score: 0.934078929033085
cv score: 0.9467129471242959
cv score: 0.9203149379739433
cv score: 0.929875058211677
cv score: 0.9440665579370997
cv score: 0.9340961873507142
cv score: 0.9461426987685675
cv score: 0.9454166172860267
cv score: 0.9302292196910463
cv score: 0.9035766528617117
cv score: 0.9353902015772754
cv score: 0.9350405820167356
cv score: 0.9197738502322675
test score : 0.9453560254598335
NB_HP_cv_1  HP_cv and pickled to disk Done in 32.0  minutes


## SGDC

In [40]:
param['SGDC_HP_1'] = {'vec__ngram_range': hp.choice('vec__ngram_range', [(1, 1), (1, 2), (1, 3)]),
                 'vec__min_df': 1+hp.randint('vec__min_df', 3),
                 'vec__max_df': hp.uniform('vec__max_df', 0.7, 1.0),
                 'vec__sublinear_tf': hp.choice('vec__sublinear_tf', [True, False]),
                 'SGDC__l1_ratio': hp.uniform('clf__l1_ratio', 0.0, 1.0),
                 'SGDC__alpha': hp.loguniform('clf__alpha', -9*np.log(10), -4*np.log(10)),
                 'SGDC__n_iter': 20 + 5*hp.randint('clf__n_iter', 12)
                }

md_SGDC = Analytica('SGDC', param['SGDC_HP_1'], 'SGDC_HP_cv_1')
md_SGDC.evaluate_HP_cv(pipe_4, pkl_W=True)

cv score: 0.9450426968241976
cv score: 0.9441612729944222
cv score: 0.9358798497349807
cv score: 0.9489867691224106
cv score: 0.9360752317401003
cv score: 0.9357362640214764
cv score: 0.9434793048638149
cv score: 0.9405733497589576
cv score: 0.9454791772284534
cv score: 0.9435095029009354
cv score: 0.9339604382112671
cv score: 0.9442342263945177
cv score: 0.9433021806454284
cv score: 0.9435176254453077
cv score: 0.9369460258045818
cv score: 0.9461913550646146
cv score: 0.9532435781651598
cv score: 0.9500694535962713
cv score: 0.9453354745165232
cv score: 0.9348276056245558
test score : 0.950860875988284
SGDC_HP_cv_1  HP_cv and pickled to disk Done in 31.0  minutes


# PLOTTING

In [41]:
master.get_members_evaluations_all()

dict_keys(['KNN_HP_cv_1', 'RF_HP_cv_1', 'XT_HP_cv_1', 'LGR_HP_cv_1', 'NB_HP_cv_1', 'SGDC_HP_cv_1'])

In [42]:
%%time
master.plot_learning_curve(load_saved=False)

saving l_curve_ image for  KNN_HP_cv_1
saving l_curve_ image for  RF_HP_cv_1
saving l_curve_ image for  XT_HP_cv_1
saving l_curve_ image for  LGR_HP_cv_1
saving l_curve_ image for  NB_HP_cv_1
saving l_curve_ image for  SGDC_HP_cv_1
CPU times: user 40min 31s, sys: 4min 59s, total: 45min 31s
Wall time: 46min 11s


<Figure size 432x288 with 0 Axes>

In [43]:
# %%time
# master.plot_Classification_Report( load_saved=False)

In [44]:
%%time
master.plot_Confusion_Matrix( load_saved=False)

saving c_matrx_ image for KNN_HP_cv_1
saving c_matrx_ image for RF_HP_cv_1
saving c_matrx_ image for XT_HP_cv_1
saving c_matrx_ image for LGR_HP_cv_1
saving c_matrx_ image for NB_HP_cv_1
saving c_matrx_ image for SGDC_HP_cv_1
CPU times: user 5min 38s, sys: 12.7 s, total: 5min 51s
Wall time: 5min 55s


<Figure size 432x288 with 0 Axes>

In [45]:
master.get_members_evaluations_all()

dict_keys(['KNN_HP_cv_1', 'RF_HP_cv_1', 'XT_HP_cv_1', 'LGR_HP_cv_1', 'NB_HP_cv_1', 'SGDC_HP_cv_1'])

In [46]:
print('Done')

Done
