In [1]:
"""Script to fine tune the hyperparameters"""
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from time import time
from pprint import pprint
from sklearn.model_selection import train_test_split

In [3]:
# Hyperparameter tuning for the ExtraTreesClassifier pipeline: load inputs
# and build the train/test split.
#
# - The pipeline is a previously created model (see add_features_gt_average.ipynb).
# - The `gtavg` label is True when an article's upvotes exceed the average
#   upvotes of the loaded subreddit data.
# - Article titles are the model input.

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load a pickle that comes from a trusted source.
pipeline = joblib.load('datascience.pkl')
data = pd.read_csv('processed_datascience.csv.bz2')

mean_ups = data['ups'].mean()
data['gtavg'] = data['ups'] > mean_ups

# Hold out 20% of the rows for the final check; the fixed seed keeps the
# split repeatable between runs.
train_X, test_X, train_y, test_y = train_test_split(
    data['title'],
    data['gtavg'],
    test_size=0.20,
    random_state=25,
)

# Candidate values for each hyperparameter of the pipeline's 'clf' step.
# Keys follow the `<step name>__<parameter name>` form used to address
# parameters of a step inside a pipeline.
parameters = {
    'clf__n_estimators': (120, 300),
    'clf__max_depth': (5, 8, 15, 25, 30, None),
    'clf__min_samples_split': (2, 5, 10, 15, 100),
    'clf__min_samples_leaf': (1, 2, 5, 10),
    'clf__max_features': ('log2', 'sqrt', None),
}

# Grid search over every combination in `parameters`; n_jobs=-1 runs the
# fits on all available processors and verbose=1 prints progress messages.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

# Announce what is about to be searched.
print("Performing grid search...")
step_names = [step[0] for step in pipeline.steps]
print("pipeline:", step_names)
print("parameters:")
pprint(parameters)

# Fit every candidate on the training split and time the whole search.
t0 = time()
grid_search.fit(train_X, train_y)
elapsed = time() - t0
print("done in %0.3fs" % elapsed)
print()

# Report the best score found and, for each tuned hyperparameter, the value
# carried by the best estimator.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for key in sorted(parameters):
    print("\t%s: %r" % (key, best_parameters[key]))


# Predict labels for the held-out test titles with the fitted grid search.
y = grid_search.predict(test_X)

# Bind the score to `accuracy`: the print below reads that name, so the
# return value of accuracy_score must not be discarded.
accuracy = accuracy_score(y_true=test_y, y_pred=y)
print('Accuracy: {:03.1f}%'.format(accuracy * 100))

Performing grid search...
pipeline: ['union', 'clf']
parameters:
{'clf__max_depth': (5, 8, 15, 25, 30, None),
 'clf__max_features': ('log2', 'sqrt', None),
 'clf__min_samples_leaf': (1, 2, 5, 10),
 'clf__min_samples_split': (2, 5, 10, 15, 100),
 'clf__n_estimators': (120, 300, 500, 800, 1200)}
Fitting 3 folds for each of 1800 candidates, totalling 5400 fits


KeyboardInterrupt: 