In [3]:
# Import Statements
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

In [4]:
# load competition data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [5]:
train.shape, test.shape

((2586, 3), (288, 2))

In [6]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [7]:
test.head()

Unnamed: 0,id,description
0,955,"Think carnival aromas—the good ones, anyway—me..."
1,3532,"A blend of three bourbons, between 6 and 12 ye..."
2,1390,"The nose is focused on cereal, hints of fresh ..."
3,1024,Swiss-based Chapter 7 released this 19 year ol...
4,1902,Valkyrie replaces the current Dark Origins exp...


In [8]:
# define pipeline components
vect = TfidfVectorizer(stop_words='english')
svd = TruncatedSVD(algorithm='randomized', n_iter=15, random_state=42)
clf = SGDClassifier(early_stopping=True, random_state=42)

In [9]:
# Pipe
pipe = Pipeline([('vect', vect), ('svd', svd), ('clf', clf)])

In [97]:
# vect_test = TfidfVectorizer(stop_words='english', max_df=0.001)
# sparse = vect_test.fit_transform(train['description'])

In [98]:
# dtm = pd.DataFrame(sparse.todense(), columns=vect_test.get_feature_names())
# print(dtm.shape)
# dtm.head()

(2586, 5260)


Unnamed: 0,00,005,011,035,070,076,08,080,09,10042,...,yum,yuzu,zapping,zero,zestiness,zigzag,zin,zinginess,zings,zippy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# svd_test = TruncatedSVD(algorithm='randomized', n_iter=15, random_state=42, n_components=100)

# trunc = pd.DataFrame(svd_test.fit_transform(dtm))
# print(trunc.shape)
# trunc.head()

(2586, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.205803,-0.121454,-0.038693,-0.106037,0.02943,-0.037292,0.140523,0.02137,0.012332,-0.012559,...,-0.028711,0.023045,0.006259,-0.018904,0.04384,-0.010877,0.015947,-0.01719,-0.004033,-0.01901
1,0.101602,-0.02035,-0.031952,0.088345,-0.017834,-0.066574,-0.001879,0.009665,-0.001589,-0.023107,...,0.023465,-0.00047,0.009562,0.022751,-0.027146,-0.038213,-0.031908,0.044621,0.044816,-0.038817
2,0.18847,-0.077965,0.040524,-0.123018,0.142918,-0.064508,0.104494,-0.011252,0.017774,-0.03845,...,0.055049,-0.05328,0.010775,-0.001342,-0.01311,-0.01931,-0.045327,-0.027777,0.052944,0.016628
3,0.22417,-0.039588,-0.120368,-0.020921,0.097833,-0.029783,-0.123222,0.006342,-0.014826,0.155845,...,-0.013688,0.032345,-0.019474,-0.030495,0.003388,-0.071878,-0.013212,-0.012354,0.032698,0.013783
4,0.140977,-0.060083,0.049328,-0.076552,-0.043988,-0.068143,0.026502,0.103619,-0.08928,0.191584,...,0.026624,0.034101,-0.005253,0.044789,-0.00963,-0.004549,-0.013607,-0.004227,-0.003575,-0.022326


In [11]:
%%time

params = { 
    # 'vect__max_df': [0.001, 0.01, 1],
    'vect__min_df': [0.01, 0.02, 0.03],
    'vect__ngram_range' : [(1, 1), (1, 2), (1, 3)],
    'svd__n_components': [50, 100, 150],
    'clf__max_iter': [1000, 2000, 3000, 4000, 5000]
}

# Fit
rand_search = RandomizedSearchCV(pipe, params, n_iter=200, iid=False, cv=4, random_state=42, n_jobs=3, verbose=4)
rand_search.fit(train['description'], train['category'])

Fitting 4 folds for each of 135 candidates, totalling 540 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  1.0min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  6.4min
[Parallel(n_jobs=3)]: Done 215 tasks      | elapsed: 23.1min
[Parallel(n_jobs=3)]: Done 386 tasks      | elapsed: 49.9min
[Parallel(n_jobs=3)]: Done 540 out of 540 | elapsed: 87.5min finished


CPU times: user 39 s, sys: 1.65 s, total: 40.7 s
Wall time: 1h 27min 48s


In [12]:
rand_search.best_score_

0.911831594236135

In [13]:
rand_search.get_params()

{'cv': 4,
 'error_score': 'raise-deprecating',
 'estimator__memory': None,
 'estimator__steps': [('vect',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words='english', strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('svd', TruncatedSVD(algorithm='randomized', n_components=2, n_iter=15,
          random_state=42, tol=0.0)),
  ('clf', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
          early_stopping=True, epsilon=0.1, eta0=0.0, fit_intercept=True,
          l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
          n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
          po

In [15]:
pred = rand_search.predict(test['description'])

In [17]:
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')
print(submission.shape)
submission.head()

(288, 2)


Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [18]:
# Save your Submission File
# Best to Use an Integer or Timestamp for different versions of your model
submission.to_csv('./data/submission_03.csv', index=False)