The purpose of this notebook is to train the best models we found, with their best parameters, on various hashing vectorizer configurations. The resulting combination of vectorizer and model will be chosen to pickle and deploy.

In [None]:
# Load the cleaned dataset, using the 'id' column as the DataFrame index
import pandas as pd

cleaned_data_path = '../../data/cleaned_data.csv'
train_data = pd.read_csv(cleaned_data_path, index_col='id')

In [None]:
# Hash the comment text into a feature matrix with a default HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# NOTE(review): despite the "tfid" prefix, this is a plain hashing vectorizer;
# the names are kept because later cells refer to them.
tfid_vectorizer = HashingVectorizer()
raw_comments = train_data.comment_text
tfid_tranformed_train_data = tfid_vectorizer.fit_transform(raw_comments)

In [None]:
# Select columns and split data
from sklearn.model_selection import train_test_split

# The pipelines fitted further down start with their own HashingVectorizer
# step ('vect'), and the grid search tunes that step's parameters. They must
# therefore receive the raw comment text: feeding them the already-hashed
# sparse matrix makes the 'vect' step try to tokenize sparse rows, so every
# cross-validation fit fails.
y = train_data.target
X = train_data.comment_text

# Hold out 20% of the rows for testing; stratify=y keeps the label
# distribution the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=1, stratify=y)


In [None]:
# Pipeline code: grid-search HashingVectorizer settings for two fixed classifiers
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Vectorizer instance handed to both pipelines: no accent stripping, no
# lowercasing and no custom preprocessor.
hasher = HashingVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

# Only the vectorizer step ('vect') is tuned; classifier settings stay fixed.
vectorizer_grid = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    # Default, and value close to number of actual features
    'vect__n_features': [2**20, 600000],
    'vect__alternate_sign': [True, False],
}
param_grid = [vectorizer_grid]

# Best SGD w/ hashing vect
sgd_settings = {
    'random_state': 0,
    'n_jobs': -1,
    'learning_rate': 'optimal',
    'early_stopping': False,
    'class_weight': None,
    'loss': 'squared_hinge',
    'penalty': 'l2',
    'shuffle': False,
}
sdg_pipeline = Pipeline(steps=[
    ('vect', hasher),
    ('sdg', SGDClassifier(**sgd_settings)),
])

# Best LR w/ hashing vect
# NOTE(review): the classifier step is named 'sdg' even though it holds a
# LogisticRegression; the name is kept so existing step lookups still work.
lr_settings = {
    'random_state': 0,
    'n_jobs': -1,
    'C': 1.0,
    'penalty': 'l2',
    'solver': 'saga',
}
lr_pipeline = Pipeline(steps=[
    ('vect', hasher),
    ('sdg', LogisticRegression(**lr_settings)),
])

# Both searches share the same settings: accuracy scoring, 5-fold CV,
# verbose progress output, and all available cores.
search_options = {
    'scoring': 'accuracy',
    'cv': 5,
    'verbose': 2,
    'n_jobs': -1,
}

gs_sdg_hash = GridSearchCV(sdg_pipeline, param_grid, **search_options)

gs_lr_hash = GridSearchCV(lr_pipeline, param_grid, **search_options)

In [None]:
# Run the grid search for the SGD pipeline on the training split.
# NOTE(review): the pipeline's first step is a HashingVectorizer, so X_train
# is expected to hold raw text documents, not an already-vectorized matrix.
gs_sdg_hash.fit(X_train, y_train)

In [None]:
# Run the grid search for the logistic-regression pipeline on the training split.
# NOTE(review): the pipeline's first step is a HashingVectorizer, so X_train
# is expected to hold raw text documents, not an already-vectorized matrix.
gs_lr_hash.fit(X_train, y_train)

In [None]:
# Report the winning vectorizer settings for the SGD pipeline, its mean
# cross-validated accuracy, and its accuracy on the held-out test split.
sdg_best_params = gs_sdg_hash.best_params_
print('Best parameter set: %s ' % (sdg_best_params,))
sdg_cv_accuracy = gs_sdg_hash.best_score_
print('CV Accuracy: %.3f' % (sdg_cv_accuracy,))

best_sdg = gs_sdg_hash.best_estimator_
sdg_test_accuracy = best_sdg.score(X_test, y_test)
print('Test Accuracy: %.3f' % (sdg_test_accuracy,))

In [None]:
# Report the winning vectorizer settings for the logistic-regression pipeline,
# its mean cross-validated accuracy, and its accuracy on the held-out test split.
lr_best_params = gs_lr_hash.best_params_
print('Best parameter set: %s ' % (lr_best_params,))
lr_cv_accuracy = gs_lr_hash.best_score_
print('CV Accuracy: %.3f' % (lr_cv_accuracy,))

best_lr = gs_lr_hash.best_estimator_
lr_test_accuracy = best_lr.score(X_test, y_test)
print('Test Accuracy: %.3f' % (lr_test_accuracy,))