In [1]:
# Import Statements
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the competition's train and test splits from the local data directory.
# Both files are read with default CSV settings, in (train, test) order.
train, test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

In [3]:
# Sanity check: (rows, columns) of each split, shown as a pair (train, test).
train.shape, test.shape

((2586, 3), (288, 2))

In [4]:
# Preview the first rows of the training split (id, description, category).
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [5]:
# Preview the first rows of the test split (id, description; no category column).
test.head()

Unnamed: 0,id,description
0,955,"Think carnival aromas—the good ones, anyway—me..."
1,3532,"A blend of three bourbons, between 6 and 12 ye..."
2,1390,"The nose is focused on cereal, hints of fresh ..."
3,1024,Swiss-based Chapter 7 released this 19 year ol...
4,1902,Valkyrie replaces the current Dark Origins exp...


In [8]:
# Define pipeline components.
# TF-IDF features over the raw description text, dropping English stop words.
vect = TfidfVectorizer(stop_words='english')
# Fixed seed so the forest (and every grid-search score built on it) is
# reproducible under Restart Kernel -> Run All.
rfc = RandomForestClassifier(random_state=42)

# Step names 'vect' and 'clf' are the prefixes used by the grid-search
# parameter keys ('vect__...', 'clf__...'). The classifier step must reference
# `rfc`: the name `clf` is never defined in this notebook and raised NameError.
pipe = Pipeline([('vect', vect), ('clf', rfc)])

In [9]:
# Assemble the two-stage text-classification pipeline.
# The step names become the prefixes of the tunable parameter keys.
pipe = Pipeline(steps=[
    ('vect', vect),  # vectorizer: text -> TF-IDF features
    ('clf', rfc),    # classifier: random forest
])

In [12]:
%%time

parameters = {
    'vect__max_df': ( 0.8, 0.9, 0.95, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (200, 500, 1000),
    'clf__n_estimators':(50, 100, 200),
    'clf__max_depth':(5, 10, 20)
}

grid_search = GridSearchCV(pipe,parameters, cv=4, n_jobs=-1, verbose=3)
grid_search.fit(train['description'], train['category'])

Fitting 4 folds for each of 216 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  4.7min finished


CPU times: user 7.47 s, sys: 1.21 s, total: 8.68 s
Wall time: 4min 45s


In [13]:
# Mean cross-validated score of the best candidate from the first grid search.
grid_search.best_score_

0.8808971384377416

In [16]:
%%time

parameters2 = {
    'vect__max_df': (0.95, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (200, 1000),
    'clf__n_estimators':(50, 200),
    'clf__max_depth':(5, 20)
}

grid_search2 = GridSearchCV(pipe, parameters2, cv=4, n_jobs=-1, verbose=3)
grid_search2.fit(train['description'], train['category'])

Fitting 4 folds for each of 32 candidates, totalling 128 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   42.9s
[Parallel(n_jobs=-1)]: Done 128 out of 128 | elapsed:   47.8s finished


CPU times: user 2.56 s, sys: 239 ms, total: 2.8 s
Wall time: 49.6 s


In [17]:
# Mean cross-validated score of the best candidate from the second (coarser) grid search.
grid_search2.best_score_

0.8793503480278422