In [13]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
rfc = RandomForestClassifier()
from sklearn.tree import DecisionTreeClassifier

In [15]:
import spacy
nlp = spacy.load("en_core_web_lg")

def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [16]:
X = get_word_vectors(train.description)

rfc.fit(X, train.category)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
rfc.score(X,train.category)

0.9899458623356535

In [18]:
x_test = get_word_vectors(test.description)


In [19]:
pred = rfc.predict(x_test)

In [20]:
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')

In [21]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [43]:
vect = TfidfVectorizer(stop_words='english')
clf = RandomForestClassifier()

pipe = Pipeline([('vect', vect), ('clf', clf)])

param = {
    'vect__max_df': (0.2,0.5,0.75, 1.0),
    'vect__min_df': (.02,.035, .05, .1),
    'vect__max_features': (100, 250, 500,1000,1250),
    'clf__max_depth':(1,3,5,7,9,11,None),
    'clf__min_samples_leaf': (1,2,3),
   # 'clf__max_leaf_nodes': (None,2,3,4,5,7,8,9,10)
    
}

In [45]:
grid_search = GridSearchCV(pipe,param, cv=2, n_jobs=-1, verbose=1)
grid_search.fit(train.description, train.category)

Fitting 2 folds for each of 1680 candidates, totalling 3360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   56.9s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3360 out of 3360 | elapsed:  4.1min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.2, 0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.035, 0.05, 0.1), 'vect__max_features': (100, 250, 500, 1000, 1250), 'clf__max_depth': (1, 3, 5, 7, 9, 11, None), 'clf__min_samples_leaf': (1, 2, 3)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [46]:
grid_search.best_score_

0.8406805877803558

In [38]:
random_search = RandomizedSearchCV(pipe,param, cv=20, n_jobs=-1, verbose=1)
random_search.fit(train.description, train.category)

Fitting 20 folds for each of 10 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   18.3s finished


RandomizedSearchCV(cv=20, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'vect__max_df': (0.2, 0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.035, 0.05, 0.1), 'vect__max_features': (100, 250, 500, 1000, 1250), 'clf__max_depth': (1, 3, 5, 7, 9, 11, None), 'clf__min_samples_leaf': (1, 2, 3)},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [39]:
random_search.best_score_

0.8395204949729311

In [134]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD( 
                   algorithm='arpack',
                   n_iter=5)

lsi = Pipeline([('vect', vect), ('svd', svd)])


# Pipe
pipe = Pipeline([('lsi', lsi), ('clf', clf)])

In [143]:
params = { 
    'lsi__svd__n_components': [2,4,8,10,11,50,150],

    'lsi__vect__max_df':[.82,.9, .95, 1.0]    

}

In [144]:
grid_search = RandomizedSearchCV(pipe,params, cv=2, n_jobs=-1, verbose=1)
grid_search.fit(train.description, train.category)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.1s finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'lsi__svd__n_components': [2, 4, 8, 10, 11, 50, 150], 'lsi__vect__max_df': [0.82, 0.9, 0.95, 1.0]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [145]:
grid_search.best_score_

0.8646558391337974