In [2]:
#Imports and packages needed
import pandas as pd 
import numpy as np
import script.functions as func
import pickle
import autoreload 
%load_ext autoreload
%autoreload 2

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from hpsklearn import HyperoptEstimator, multinomial_nb, tfidf
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from hyperopt import hp, Trials, tpe, fmin, space_eval

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
#Import Data 
train = pd.read_csv('../data/clean_train.csv', index_col=0)
test = pd.read_csv('../data/clean_test.csv', index_col=0)

#Separate dependent and independent variables and split data
X = train['Phrase']
y = train['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

## Naive Bayes Classification

In [76]:
#Optimizing using hyper-opt sklearn
estimator = HyperoptEstimator(classifier= multinomial_nb('nb'),
                              preprocessing = [tfidf('tf')], 
                             algo=tpe.suggest)
estimator.fit(X_train, y_train)

100%|██████████| 1/1 [00:03<00:00,  3.47s/trial, best loss: 0.4835557673975215]
100%|██████████| 2/2 [00:02<00:00,  2.63s/trial, best loss: 0.45527486495074676]
100%|██████████| 3/3 [00:02<00:00,  2.58s/trial, best loss: 0.45527486495074676]
100%|██████████| 4/4 [00:03<00:00,  3.15s/trial, best loss: 0.45527486495074676]
100%|██████████| 5/5 [00:00<00:00,  1.06trial/s, best loss: 0.45527486495074676]
100%|██████████| 6/6 [00:02<00:00,  2.57s/trial, best loss: 0.45527486495074676]
100%|██████████| 7/7 [00:01<00:00,  1.95s/trial, best loss: 0.45527486495074676]
100%|██████████| 8/8 [00:01<00:00,  1.93s/trial, best loss: 0.45527486495074676]
100%|██████████| 9/9 [00:02<00:00,  2.73s/trial, best loss: 0.45527486495074676]
100%|██████████| 10/10 [00:01<00:00,  2.00s/trial, best loss: 0.45527486495074676]


In [96]:
print(f'Train Score: {estimator.score(X_train, y_train)}')
print(f'Test Score: {estimator.score(X_test, y_test)}')

Train Score: 0.7751561084894418
Test Score: 0.5530505243088656


In [91]:
#Optimizing using hyper-opt
pipe = Pipeline([('tf_vec', TfidfVectorizer()),
                ('nb', MultinomialNB())])

space = {}
space['tf_vec__ngram_range'] = hp.choice('tf_vec__ngram_range', [(1,1), (1,2), (1,3)])
space['tf_vec__stop_words'] = hp.choice('tf_vec__stop_words', [None, 'english'])
space['tf_vec__min_df'] = hp.randint('tf_vec__min_df', 3)
space['tf_vec__max_df'] = hp.uniform('tf_vec__max_df', 0.7, 1.0)
space['nb__alpha'] = hp.loguniform('nb__alpha', 0,1)

def objective(params):
    pipe.set_params(**params)
    score = cross_val_score(pipe, X_train, y_train, cv=5)
    return 1 - score.mean()

trials = Trials()
best = fmin(objective,
            space,
            algo = tpe.suggest,
           max_evals=10,
           trials=trials)

100%|██████████| 10/10 [00:42<00:00,  4.26s/trial, best loss: 0.44212458726097004]


In [92]:
best_params = space_eval(space, best)
pipe.set_params(**best_params)
pipe.fit(X_train, y_train)
print(f'Train Score: {pipe.score(X_train, y_train)}')
print(f'Test Score: {pipe.score(X_test, y_test)}')

Train Score: 0.6374310818755263
Test Score: 0.5600095328884652


In [97]:
#Pickle the models
pickle.dump(estimator, open( "../models/nb_1.pkl", "wb" ))
pickle.dump(pipe, open( "../models/nb_2.pkl", "wb" ))

## LSTM Modeling

In [105]:
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input

In [91]:
#Preprocessing 
X_train, X_train_df = func.encode_and_pad(pd.DataFrame(X_train), 30000, 200)
X_test, X_test_df = func.encode_and_pad(pd.DataFrame(X_test), 30000, 200)

In [95]:
X_train.shape

(200,)

In [110]:
model = Sequential()
model.add(Embedding(30000, 128, input_shape=(None,)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(5, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, None, 128)         3840000   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
bidirectional_33 (Bidirectio (None, None, 128)         98816     
_________________________________________________________________
dropout_3 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
bidirectional_34 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dense_13 (Dense)             (None, 5)                 645       
Total params: 4,038,277
Trainable params: 4,038,277
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
result = model.fit(X_train, y_train, batch_size=500, epochs=30, validation_data=(X_test, y_test))

Train on 62937 samples, validate on 20980 samples
Epoch 1/30
 3000/62937 [>.............................] - ETA: 9:08 - loss: 1.5082 - acc: 0.4690