In [1]:
# Import Statements
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# load competition data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
train.shape, test.shape

((2586, 3), (288, 2))

In [4]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [5]:
test.head()

Unnamed: 0,id,description
0,955,"Think carnival aromas—the good ones, anyway—me..."
1,3532,"A blend of three bourbons, between 6 and 12 ye..."
2,1390,"The nose is focused on cereal, hints of fresh ..."
3,1024,Swiss-based Chapter 7 released this 19 year ol...
4,1902,Valkyrie replaces the current Dark Origins exp...


In [6]:
# define pipeline components
vect = TfidfVectorizer(stop_words='english')
rfc = RandomForestClassifier()
svd = TruncatedSVD(algorithm='randomized', n_iter=15, random_state=42)

In [7]:
# Pipe
pipe = Pipeline([('vect', vect), ('svd', svd), ('rfc', rfc)])

In [13]:
%%time

params = { 
    'vect__max_df': (.9, .95, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (200, 500, 1000),
    'svd__n_components': (10, 50, 100),
    'rfc__n_estimators': (50, 100, 200),
    'rfc__max_depth': (5, 25, 100)
}

# Fit
rand_search = RandomizedSearchCV(pipe, params, n_iter=200, iid=False, cv=3, random_state=42, n_jobs=3, verbose=4)
rand_search.fit(train['description'], train['category'])

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   15.9s
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:   59.7s
[Parallel(n_jobs=3)]: Done 215 tasks      | elapsed:  2.3min
[Parallel(n_jobs=3)]: Done 386 tasks      | elapsed:  4.1min
[Parallel(n_jobs=3)]: Done 600 out of 600 | elapsed:  6.1min finished


CPU times: user 6.05 s, sys: 893 ms, total: 6.94 s
Wall time: 6min 6s


In [14]:
rand_search.best_score_

0.8847400001796509