In [3]:
# Load the cleaned dataset; the `id` column becomes the DataFrame index.
import pandas as pd

train_data = pd.read_csv(
    '../data/cleaned_data.csv',
    index_col='id',
)

In [4]:
# Select columns and split data
from sklearn.model_selection import train_test_split
# Labels come from the `target` column, features from the comment strings.
y = train_data['target']
# pandas reads empty CSV fields back as NaN, and TfidfVectorizer raises
# "np.nan is an invalid document" on such rows, so any comment that is
# missing after the CSV round-trip is turned into an empty string.
X = train_data['comment_text'].fillna('')

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=1, stratify=y)


In [5]:
# Pipeline code
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# TF-IDF vectorizer with accent stripping, lowercasing and custom
# preprocessing all switched off (presumably because the input CSV is
# already cleaned -- confirm against the cleaning notebook).
vectorizer = TfidfVectorizer(strip_accents=None,
                             lowercase=False,
                             preprocessor=None)

# Grid keys follow the '<pipeline step>__<parameter>' convention, so they must
# match the step names 'vect' and 'sdg' used in the Pipeline below.
# 1 x 4 x 2 x 2 x 2 = 32 candidates.
param_grid = {
    'vect__ngram_range': [(1, 1)],
    # 'log_loss' is the current name of the logistic loss: the old spelling
    # 'log' was deprecated in scikit-learn 1.1 and removed in 1.3.
    # NOTE(review): on scikit-learn < 1.1 this entry must be 'log' instead.
    'sdg__loss': ['hinge', 'squared_hinge', 'modified_huber', 'log_loss'],
    'sdg__penalty': ['l2', 'l1'],
    'sdg__shuffle': [True, False],
    'sdg__class_weight': ['balanced', None],
}

# `tfidf` refers to the full pipeline only; the vectorizer keeps its own name
# so the same identifier is never bound to two different kinds of object.
tfidf = Pipeline([
    ('vect', vectorizer),
    ('sdg', SGDClassifier(random_state=0,
                          n_jobs=-1,
                          learning_rate='optimal',
                          early_stopping=False)),
])

# 5-fold cross-validated search over param_grid, scored by accuracy and run
# on all available cores; verbose=2 logs progress for each fit.
gs_tfidf = GridSearchCV(tfidf, param_grid,
                        scoring='accuracy',
                        cv=5,
                        verbose=2,
                        n_jobs=-1)

In [6]:
# Run the grid search on the training split only; the held-out test split
# is not touched here.
gs_tfidf.fit(X_train, y=y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


KeyboardInterrupt: 

In [None]:
# Report the winning hyper-parameter combination and its mean CV accuracy.
cv_best_params = gs_tfidf.best_params_
cv_best_accuracy = gs_tfidf.best_score_

# The one-element tuple makes the params dict a single %s argument.
print('Best parameter set: %s ' % (cv_best_params,))
print('CV Accuracy: %.3f' % (cv_best_accuracy,))


Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)} 
CV Accuracy: 0.946


In [None]:
# Take the best pipeline found by the search and score it on the held-out
# test split.
clf = gs_tfidf.best_estimator_
test_accuracy = clf.score(X_test, y_test)
print('Test Accuracy: %.3f' % (test_accuracy,))


Test Accuracy: 0.946
