# Tune Logistic Regression with tfidf Vect

In [1]:
# import cleaned data
import pandas as pd
train_data = pd.read_csv('../data/cleaned_data.csv', index_col='id')

In [2]:
# Select columns and split data
from sklearn.model_selection import train_test_split
y = train_data.target
X = train_data.comment_text

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                    test_size=.2, random_state=1, stratify=y)

# print("Number of features: ", X_train.shape[1])
# print("Number of train samples: ", y_train.shape[0])
# print("Number of train samples: ", y_test.shape[0])

In [5]:
# Pipeline code
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               #'vect__stop_words': [stop, None],
               'clf__solver': ['sag', 'saga', ],
               'clf__penalty': ['l2', 'none'],
               'clf__C': [1.0, 10.0]},
            #   {'vect__ngram_range': [(1, 1)],
            #    #'vect__stop_words': [stop, None],
            #    'vect__use_idf':[False],
            #    'vect__norm':[None],
            #    'clf__penalty': ['l1', 'l2'],
            #    'clf__C': [1.0, 10.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

In [6]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=0,
                                                           solver='liblinear'))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 10.0], 'clf__penalty': ['l2', 'none'],
                          'clf__solver': ['sag', 'saga'],
                          'vect__ngram_range': [(1, 1)]}],
             scoring='accuracy', verbose=2)

In [7]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)


Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'clf__solver': 'saga', 'vect__ngram_range': (1, 1)} 
CV Accuracy: 0.945


In [8]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))


Test Accuracy: 0.945
