# Load Libraries / Data

In [1]:
import pyprind
import numpy as np
import pandas as pd
import os
import io
import nltk
import re
import eli5
# import pickle
from timeit import default_timer as timer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer


from IPython.display import Image


from sklearn.base import clone
from sklearn.externals import joblib
from sklearn.feature_selection import RFECV, SelectKBest, f_classif, chi2, SelectPercentile, mutual_info_classif
from sklearn.preprocessing import Imputer, MinMaxScaler, StandardScaler, Normalizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_val_score, cross_val_predict, RandomizedSearchCV, ParameterGrid
from sklearn.model_selection import train_test_split, learning_curve, validation_curve, GridSearchCV, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD, NMF
from sklearn.metrics import accuracy_score, auc, f1_score, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier, Perceptron, RidgeClassifier, RidgeClassifierCV, Lasso, LassoCV, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from sklearn_pandas import DataFrameMapper
# from sklearn_pandas import cross_val_score as cross_val_score_df 

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector
from mlxtend.plotting import plot_decision_regions
from mlxtend.classifier import StackingClassifier, StackingCVClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
import hyperopt.pyll.stochastic
   
from xgboost import XGBClassifier
import lightgbm as lgb

import parfit.parfit as pf

from yellowbrick.features import FeatureImportances, PCADecomposition
from yellowbrick.features.manifold import Manifold
from yellowbrick.model_selection import LearningCurve, ValidationCurve
from yellowbrick.classifier import ClassificationReport, ConfusionMatrix, ROCAUC, ClassBalance, ClassPredictionError, DiscriminationThreshold 
from yellowbrick.classifier import DecisionBoundariesVisualizer

In [2]:
# %run ../modules/library_to_import.ipynb

In [3]:
# %run ../modules/Class_imdb.ipynb
# %run ../modules/transformers_imdb.ipynb


In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure created below.
sns.set()
# Alternative themes tried earlier:
# sns.set(style='white', context='notebook', palette='deep')
# sns.set(style='white', palette='deep')
# NOTE: sns.despine() is deliberately not called in this setup cell. It only
# strips spines from axes that already exist; with nothing plotted yet it
# merely created an empty "<Figure ... with 0 Axes>". Call it right after
# drawing a plot instead.

<Figure size 432x288 with 0 Axes>

In [5]:
import warnings
from sklearn.exceptions import DataConversionWarning
# Silence the noisy warning categories for the rest of the notebook.
# The categories are registered in the same order as separate calls would be.
for _category in (DataConversionWarning, DeprecationWarning, FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_category)
del _category  # do not leave the loop variable in the notebook namespace

# Print NumPy floats with two decimal places.
np.set_printoptions(precision=2)

In [6]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one, so a cell with several bare expressions shows all of their results.
InteractiveShell.ast_node_interactivity = "all"

In [7]:
# Single seed reused by the train/validation split and the seeded estimators.
random_state = 42

## Data


In [8]:
# Labeled reviews: tab-separated file whose first row is the header.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside a review are kept
# as literal text instead of being interpreted as field delimiters.
# (Earlier data source: pd.read_csv('../../data/movie_data.csv'))
data_raw = pd.read_csv(
    "../../data/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [9]:
# Preview the first rows, then print column dtypes and non-null counts.
data_raw.head()
data_raw.info()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [10]:
# Class balance: relative frequency of each sentiment label.
data_raw.sentiment.value_counts(normalize=True)

1    0.5
0    0.5
Name: sentiment, dtype: float64

# Features (X) and target (y)

In [11]:
# Show the raw frame again before it is sliced into train/test subsets.
data_raw.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [12]:
# Larger split kept for reference:
# train_raw = data_raw[:20000]
# test_raw = data_raw[20000:]

# Positional slices: the first 2000 rows for training and the rows from
# position 23000 onward for testing (presumably small to keep runs fast).
train_raw = data_raw.iloc[:2000]
test_raw = data_raw.iloc[23000:]

In [13]:
# Preview the training slice and print its dtypes / non-null counts.
train_raw.head()
train_raw.info()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
id           2000 non-null object
sentiment    2000 non-null int64
review       2000 non-null object
dtypes: int64(1), object(2)
memory usage: 47.0+ KB


In [14]:
# Preview the test slice and print its dtypes / non-null counts.
test_raw.head()
test_raw.info()

Unnamed: 0,id,sentiment,review
23000,"""864_2""",0,"""Even though there's a repertoire of over 180 ..."
23001,"""1888_8""",1,"""\""If I wanted to dribble, I'd call a nurse.\""..."
23002,"""7543_8""",1,"""this is a visual adaptation of manga with ver..."
23003,"""2394_7""",1,"""Nice, pleasant, and funny, but not earth-shat..."
23004,"""7694_3""",0,"""Unless you're twelve, this movie really isn't..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 23000 to 24999
Data columns (total 3 columns):
id           2000 non-null object
sentiment    2000 non-null int64
review       2000 non-null object
dtypes: int64(1), object(2)
memory usage: 47.0+ KB


In [15]:
# Both are kept as single-column DataFrames (note the list selector),
# not as Series.
X = train_raw.loc[:, ['review']]
y = train_raw.loc[:, ['sentiment']]

In [16]:
# Hold out 25% of the training slice for validation; seeded so the split
# is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=random_state,
)

In [17]:
# Shapes and first rows of the training features and labels.
X_train.shape
X_train.head()
y_train.shape 
y_train.head()

(1500, 1)

Unnamed: 0,review
1738,"""Harold Pinter rewrites Anthony Schaeffer's cl..."
548,"""The American Humane Association, which is the..."
936,"""a friend gave it to me saying it was another ..."
1389,"""The movie is okay, it has it's moments, the m..."
1607,"""Was this movie stupid? Yup. Did this movie de..."


(1500, 1)

Unnamed: 0,sentiment
1738,0
548,0
936,0
1389,1
1607,0


In [18]:
# Shapes and first rows of the validation features and labels.
X_val.shape 
X_val.head()
y_val.shape
y_val.head()

(500, 1)

Unnamed: 0,review
1860,"""I had never heard of this film before a coupl..."
353,"""This movie is all about subtlety and the diff..."
1333,"""If you loved Deep Cover, you might like this ..."
905,"""I thought it was one of the best sequels I ha..."
1289,"""Today I found \""They All Laughed\"" on VHS on ..."


(500, 1)

Unnamed: 0,sentiment
1860,1
353,1
1333,1
905,1
1289,0


In [19]:
# Test features and labels, kept as single-column DataFrames like X and y.
X_test = test_raw.loc[:, ['review']]
y_test = test_raw.loc[:, ['sentiment']]

In [20]:
# Shapes of the test features and labels.
X_test.shape
y_test.shape

(2000, 1)

(2000, 1)

In [21]:
# 8-fold stratified cross-validation. Folds are taken in row order (no
# shuffling), so no seed is passed: `random_state` has no effect when
# shuffle=False, and newer scikit-learn releases reject that combination
# with a ValueError. The resulting folds are unchanged.
kfold = StratifiedKFold(n_splits=8, shuffle=False)

# PRE LOADING

In [22]:
def tokenizer_split(text):
    """Split `text` on single spaces and stem every resulting piece.

    Relies on the notebook-level `porter` stemmer, which must be defined by
    the time this function is called. Because the split is on a literal
    space, consecutive spaces produce empty-string tokens, which are handed
    to the stemmer like any other token.

    Returns:
        list: one stemmed token per piece, in the original order.
    """
    stemmed = []
    for token in text.split(' '):
        stemmed.append(porter.stem(token))
    return stemmed

def tokenizer_porter(text):
    """Split `text` on single spaces, drop stop words, and stem the rest.

    Relies on the notebook-level `porter` stemmer and `stop_words`
    collection, which must be defined by the time this function is called.
    The stop-word check is applied to the raw (un-stemmed) token.

    Returns:
        list: stemmed tokens that were not stop words, in original order.
    """
    kept = (token for token in text.split(' ') if token not in stop_words)
    return [porter.stem(token) for token in kept]

# TF-IDF vectorizer configured to leave the text as-is: no accent stripping,
# no lowercasing and no custom preprocessor.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

In [23]:
# Shared resources looked up by tokenizer_split / tokenizer_porter at call time.
porter = PorterStemmer()
stop_words = nltk.corpus.stopwords.words('english')

In [24]:
# Execute the shared transformers notebook.
# NOTE(review): presumably this is what defines Col_Extractor, Preprocessor
# and To_array used by the pipelines further down — confirm.
%run ../modules/transformers_imdb.ipynb

In [31]:
# %load params_imdb.py

%run params_imdb.py

# !cat 'params_imdb.py'

# !python 'params_imdb.py'

NameError: name 'KNeighborsClassifier' is not defined

In [26]:
# NOTE(review): looks like a leftover probe checking that the class is
# available; the instance is not stored anywhere — consider deleting the cell.
KNeighborsClassifier()

NameError: name 'clf' is not defined

In [None]:
# %run ../modules/Class_imdb.ipynb

from super_analytica import Super_Analytica
from analytica import Analytica
from analytica_2 import Analytica_2
import lemma_tokenizer

In [None]:
# Shared Super_Analytica instance; the evaluate_CV / plot_* / get_members_*
# calls in the rest of the notebook all go through it.
master = Super_Analytica()

# Selecting Pipeline

In [None]:
# Target feature count for the dimensionality-reduction comparison:
# used as TruncatedSVD(n_components=nc) and as SelectKBest(k=nc).
nc = 500

## Pipeline Test - Dimensionality Reduction

In [27]:
# Conclusion: drop TruncatedSVD from the later pipelines. We can build two feature matrices
# with the same number of features and show that each process still takes a very different time.
# Open question: which percentile yields the same number of features as n_components?

# TF-IDF followed by TruncatedSVD down to `nc` components.
pipe_1 = Pipeline([
    ('col', Col_Extractor(['review'])),
    ('prep', Preprocessor()),
    ('to_array', To_array()),
    ('tf-idf_vec', TfidfVectorizer()),
    ('TruncatedSVD', TruncatedSVD(n_components=nc))
])

# Variation of pipe 1: same front end, but chi2 SelectKBest keeps `nc` features.
# NOTE(review): the name `pipe_2` is assigned again in a later cell with a
# different final step — confirm the reuse is intentional.
pipe_2 = Pipeline([
    ('col', Col_Extractor(['review'])),
    ('prep', Preprocessor()),
    ('to_array', To_array()),
    ('vec', TfidfVectorizer()),  
    ('kbest', SelectKBest(chi2, k=nc))

])

# NOTE(review): both calls use the bare 'LGR' label with no suffix, and the
# member listing further down shows a single 'LGR' entry — the second run may
# overwrite the first; confirm.
master.evaluate_CV(pipe_1, ['LGR']) # is pickling really needed here?
master.evaluate_CV(pipe_2, ['LGR']) 


# Almost the same result in roughly one seventh of the time

LGR   Cross Validation Done in  7.0  minutes  /-->acc : 0.9399058344319137 

LGR   Cross Validation Done in  1.0  minutes  /-->acc : 0.9277069410813374 



# Cross-Validation

## Non Parametric 

- Os pipes 2 e 3 têm redução de dimensionalidade, para algoritmos não paramétricos
- NonP_algos = clf_T + ['KNN'], com clf_T = ['DT', 'ADA', 'RF', 'XT']

In [None]:
# TfidfVectorizer followed by chi2 percentile-based feature selection.
# NOTE(review): the last step is labelled 'kbest' but holds a SelectPercentile
# (with its default percentile) — the label is kept as-is because it is part
# of the pipeline's parameter names.
pipe_2 = Pipeline([
    ('col', Col_Extractor(['review'])),
    ('prep', Preprocessor()),
    ('to_array', To_array()),
    ('vec', TfidfVectorizer()),  
    ('kbest', SelectPercentile(chi2))

])

# Same idea as pipe_2, with the TF-IDF step spelled out as
# CountVectorizer + TfidfTransformer.
pipe_3 = Pipeline([
    ('col', Col_Extractor(['review'])),
    ('prep', Preprocessor()),
    ('to_array', To_array()),
    ('counts', CountVectorizer()),
    ('tf-idf', TfidfTransformer()),
    ('kbest', SelectPercentile(chi2))
])

# NOTE(review): `clf_T` is not defined in this notebook; presumably it comes
# from params_imdb.py — confirm.
NonP_algos = clf_T + ['KNN'] # Non Parametric 

In [None]:
# Sanity check: every requested algorithm key must exist in `clf_all`,
# so `clf_all` has to be defined before this cell runs.
chosen = NonP_algos
assert set(chosen) <= set(clf_all)

In [30]:
# Registry of candidate classifiers, keyed by the short names that the
# algorithm lists (e.g. NonP_algos) are checked against.
clf_all = {
    'KNN': KNeighborsClassifier(n_jobs=-1),
    'SGDC': SGDClassifier(
        loss='modified_huber',
        penalty='elasticnet',
        random_state=random_state,
    ),
    'NB': MultinomialNB(),

    # Linear models
    'LSVC': LinearSVC(random_state=random_state),
    'LGR': LogisticRegression(random_state=random_state),
    # 'LDA': LinearDiscriminantAnalysis(),
    'RDG': RidgeClassifier(random_state=random_state),

    # Tree-based models
    'DT': DecisionTreeClassifier(random_state=random_state),
    'ADA': AdaBoostClassifier(
        DecisionTreeClassifier(random_state=random_state),
        random_state=random_state,
        learning_rate=0.1,
    ),
    'RF': RandomForestClassifier(random_state=random_state),
    'XT': ExtraTreesClassifier(random_state=random_state),
    'LGBM': lgb.LGBMClassifier(objective='binary', metric='auc'),
}



In [None]:
# Cross-validate every non-parametric algorithm on pipe_2 and pipe_3.
# NOTE(review): the `_apagar` ("delete" in Portuguese) suffix and pkl_W=False
# suggest throwaway runs whose results are not pickled — confirm.
master.evaluate_CV(pipe_2, NonP_algos, '_pp2_apagar', pkl_W=False)
master.evaluate_CV(pipe_3, NonP_algos, '_pp3_apagar', pkl_W=False)

In [None]:
# Quick check of the registry's type and of the non-parametric algorithm list.
type(clf_all)
NonP_algos

## Parametric

- Os pipes 4 e 5 não têm redução de dimensionalidade, para algoritmos paramétricos
- P_algos = clf_L + ['NB', 'SGDC'], com clf_L = ['LSVC', 'LGR', 'RDG']



In [None]:
# TF-IDF features with no feature-selection / dimensionality-reduction step.
pipe_4 = Pipeline([
    ('col', Col_Extractor(['review'])),
    ('prep', Preprocessor()),
    ('to_array', To_array()),
    ('tf-idf_vec', TfidfVectorizer())
])


# Same as pipe_4, with the TF-IDF step spelled out as
# CountVectorizer + TfidfTransformer.
pipe_5 = Pipeline([
    ('col', Col_Extractor(['review'])),
    ('prep', Preprocessor()),
    ('to_array', To_array()),
    ('counts', CountVectorizer()),
    ('tf-idf', TfidfTransformer())
])

# NOTE(review): `clf_L` is not defined in this notebook; presumably it comes
# from params_imdb.py — confirm.
P_algos = clf_L + ['NB', 'SGDC']

In [None]:
# Cross-validate every parametric algorithm on pipe_4 and pipe_5.
# NOTE(review): the `_apagar` ("delete" in Portuguese) suffix and pkl_W=False
# suggest throwaway runs whose results are not pickled — confirm.
master.evaluate_CV(pipe_4, P_algos, '_pp4_apagar', pkl_W=False)
master.evaluate_CV(pipe_5, P_algos, '_pp5_apagar', pkl_W=False)

# PLOTTING

In [32]:
# List the labels of every evaluation currently registered on `master`.
master.get_members_evaluations_all()

dict_keys(['LGR', 'DT_pp2', 'ADA_pp2', 'RF_pp2', 'XT_pp2', 'KNN_pp2', 'DT_pp3', 'ADA_pp3', 'RF_pp3', 'XT_pp3', 'KNN_pp3', 'LSVC_pp4', 'LGR_pp4', 'RDG_pp4', 'NB_pp4', 'SGDC_pp4', 'LSVC_pp5', 'LGR_pp5', 'RDG_pp5', 'NB_pp5', 'SGDC_pp5'])

In [None]:
%%time
# Timed because plotting is slow.
# NOTE(review): load_saved=False presumably forces regeneration instead of
# reusing saved images — confirm.
master.plot_learning_curve(load_saved=False)

In [34]:
%%time
# Per the log printed by this call, a classification-report image is saved
# for each registered member.
master.plot_Classification_Report( load_saved=False)

saving c_reprt_ image for LGR
saving c_reprt_ image for DT_pp2
saving c_reprt_ image for ADA_pp2
saving c_reprt_ image for RF_pp2
saving c_reprt_ image for XT_pp2
saving c_reprt_ image for KNN_pp2
saving c_reprt_ image for DT_pp3
saving c_reprt_ image for ADA_pp3
saving c_reprt_ image for RF_pp3
saving c_reprt_ image for XT_pp3
saving c_reprt_ image for KNN_pp3
saving c_reprt_ image for LSVC_pp4
saving c_reprt_ image for LGR_pp4
saving c_reprt_ image for RDG_pp4
saving c_reprt_ image for NB_pp4
saving c_reprt_ image for SGDC_pp4
saving c_reprt_ image for LSVC_pp5
saving c_reprt_ image for LGR_pp5
saving c_reprt_ image for RDG_pp5
saving c_reprt_ image for NB_pp5
saving c_reprt_ image for SGDC_pp5
CPU times: user 3min 30s, sys: 17.8 s, total: 3min 48s
Wall time: 3min 56s


<Figure size 432x288 with 0 Axes>

In [35]:
%%time
# Per the log printed by this call, a confusion-matrix image is saved for
# each registered member.
master.plot_Confusion_Matrix( load_saved=False)

saving c_matrx_ image for LGR
saving c_matrx_ image for DT_pp2
saving c_matrx_ image for ADA_pp2
saving c_matrx_ image for RF_pp2
saving c_matrx_ image for XT_pp2
saving c_matrx_ image for KNN_pp2
saving c_matrx_ image for DT_pp3
saving c_matrx_ image for ADA_pp3
saving c_matrx_ image for RF_pp3
saving c_matrx_ image for XT_pp3
saving c_matrx_ image for KNN_pp3
saving c_matrx_ image for LSVC_pp4
saving c_matrx_ image for LGR_pp4
saving c_matrx_ image for RDG_pp4
saving c_matrx_ image for NB_pp4
saving c_matrx_ image for SGDC_pp4
saving c_matrx_ image for LSVC_pp5
saving c_matrx_ image for LGR_pp5
saving c_matrx_ image for RDG_pp5
saving c_matrx_ image for NB_pp5
saving c_matrx_ image for SGDC_pp5
CPU times: user 3min 28s, sys: 19.7 s, total: 3min 47s
Wall time: 3min 55s


<Figure size 432x288 with 0 Axes>

In [36]:
# Re-list the registered evaluations after plotting.
master.get_members_evaluations_all()

dict_keys(['LGR', 'DT_pp2', 'ADA_pp2', 'RF_pp2', 'XT_pp2', 'KNN_pp2', 'DT_pp3', 'ADA_pp3', 'RF_pp3', 'XT_pp3', 'KNN_pp3', 'LSVC_pp4', 'LGR_pp4', 'RDG_pp4', 'NB_pp4', 'SGDC_pp4', 'LSVC_pp5', 'LGR_pp5', 'RDG_pp5', 'NB_pp5', 'SGDC_pp5'])

In [37]:
# Marker printed when the notebook has run through to the end.
print('Done')

Done
