In [1]:
import pandas as pd

# Load the training and test tables from the working directory.
# Later cells read `description` and `category` from train, and `id` and `description` from test.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
# Baseline random forest with library-default settings; a later cell fits it on document vectors.
rfc = RandomForestClassifier()
from sklearn.tree import DecisionTreeClassifier

In [3]:
import spacy
# Load the "en_core_web_lg" spaCy pipeline once; get_word_vectors() reads `.vector` from the docs it produces.
nlp = spacy.load("en_core_web_lg")

def get_word_vectors(docs):
    """Return one spaCy document vector per text in ``docs``.

    Args:
        docs: Iterable of strings (for example a pandas Series of descriptions).

    Returns:
        list: ``doc.vector`` for every input text, in input order.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which is
    # faster than calling nlp() once per document, and yields docs in order.
    return [doc.vector for doc in nlp.pipe(docs)]

In [16]:
# Embed every training description as a spaCy document vector (one vector per row).
X = get_word_vectors(train.description)

# Fit the baseline random forest on those vectors against the category labels.
rfc.fit(X, train.category)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
# NOTE: this scores the forest on the same rows it was fitted on, so the value is
# training accuracy, not an estimate of performance on unseen data.
rfc.score(X,train.category)

0.9899458623356535

In [18]:
# Embed the test descriptions with the same helper used for the training descriptions.
x_test = get_word_vectors(test.description)


In [19]:
# Predict a category for every test row with the baseline forest.
pred = rfc.predict(x_test)

In [20]:
# Assemble the submission table: test ids next to the predicted categories, stored as int64.
submission = pd.DataFrame({'id': test['id'], 'category': pred}).astype({'category': 'int64'})

In [21]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [35]:
# TF-IDF text features (English stop words removed) feeding a random-forest classifier.
vect = TfidfVectorizer(stop_words='english')
clf = RandomForestClassifier()
pipe = Pipeline(steps=[('vect', vect), ('clf', clf)])

# Search space for the pipeline; keys follow the `<step>__<parameter>` naming used by the pipeline steps.
param = dict(
    vect__max_df=(0.2, 0.5, 0.75, 1.0),
    vect__min_df=(0.02, 0.035, 0.05, 0.1),
    vect__max_features=(100, 250, 500, 1000, 1250),
    clf__max_depth=(1, 3, 5, 7, 9, 11, None),
    clf__min_samples_leaf=(1, 2, 3),
)

In [45]:
# Exhaustive search over every combination in `param`, scored with 2-fold cross-validation.
grid_search = GridSearchCV(pipe,param, cv=2, n_jobs=-1, verbose=1)
grid_search.fit(train.description, train.category)

Fitting 2 folds for each of 1680 candidates, totalling 3360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   56.9s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3360 out of 3360 | elapsed:  4.1min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.2, 0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.035, 0.05, 0.1), 'vect__max_features': (100, 250, 500, 1000, 1250), 'clf__max_depth': (1, 3, 5, 7, 9, 11, None), 'clf__min_samples_leaf': (1, 2, 3)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [46]:
grid_search.best_score_

0.8406805877803558

In [38]:
# Randomised search: sample a subset of the settings in `param` (n_iter defaults to 10)
# and score each with 20-fold cross-validation. The recorded output of this cell is a
# RandomizedSearchCV run (10 candidates, 200 fits); an exhaustive GridSearchCV here
# would instead fit every combination in `param` 20 times.
random_search = RandomizedSearchCV(pipe, param, cv=20, n_jobs=-1, verbose=1)
random_search.fit(train.description, train.category)

Fitting 20 folds for each of 10 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   18.3s finished


RandomizedSearchCV(cv=20, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'vect__max_df': (0.2, 0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.035, 0.05, 0.1), 'vect__max_features': (100, 250, 500, 1000, 1250), 'clf__max_depth': (1, 3, 5, 7, 9, 11, None), 'clf__min_samples_leaf': (1, 2, 3)},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [39]:
random_search.best_score_

0.8395204949729311

In [9]:
from sklearn.decomposition import TruncatedSVD

# Dimensionality reduction step for the TF-IDF matrix, using the ARPACK solver.
# NOTE(review): scikit-learn documents `n_iter` as applying to the randomized solver only,
# so it appears to have no effect with algorithm='arpack' — confirm and consider dropping it.
svd = TruncatedSVD( 
                   algorithm='arpack',
                   n_iter=5)

# Latent semantic indexing sub-pipeline: TF-IDF followed by truncated SVD.
lsi = Pipeline([('vect', vect), ('svd', svd)])


# Full pipeline: LSI features feeding the classifier. This rebinds `pipe`,
# replacing the TF-IDF + classifier pipeline assigned earlier in the notebook.
pipe = Pipeline([('lsi', lsi), ('clf', clf)])

In [10]:
# Grid for the LSI pipeline: number of SVD components and the TF-IDF document-frequency cap.
params = dict(
    lsi__svd__n_components=[2, 4, 8, 10, 11, 50, 150],
    lsi__vect__max_df=[0.82, 0.9, 0.95, 1.0],
)

In [11]:
# Exhaustive search over `params` for the LSI pipeline, scored with 2-fold cross-validation.
# This rebinds `grid_search`, replacing the earlier TF-IDF + random-forest search result.
grid_search = GridSearchCV(pipe,params, cv=2, n_jobs=-1, verbose=1)
grid_search.fit(train.description, train.category)

Fitting 2 folds for each of 28 candidates, totalling 56 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  56 out of  56 | elapsed:    9.7s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'lsi__svd__n_components': [2, 4, 8, 10, 11, 50, 150], 'lsi__vect__max_df': [0.82, 0.9, 0.95, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [12]:
grid_search.best_score_

0.8677494199535963

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [5]:
# One instance of each candidate classifier, to be passed to model().
gnb = GaussianNB()
KNN = KNeighborsClassifier(n_neighbors=1)
MNB = MultinomialNB()
BNB = BernoulliNB()
LR = LogisticRegression()
SDG = SGDClassifier()  # name kept as-is (sic): later cells pass `SDG` to model()
# Bound to `svc` rather than `SVC`: rebinding `SVC` shadows the imported class, so
# re-running this cell would fail with "'SVC' object is not callable". No visible
# cell reads the instance under the old name.
svc = SVC()
LSVC = LinearSVC()
NSVC = NuSVC()

In [29]:
def model(model, cv=8, param_grid=None):
    """Grid-search a TF-IDF + classifier pipeline on the training descriptions.

    Args:
        model: Unfitted estimator used as the final ``clf`` step of the pipeline.
        cv: Cross-validation setting forwarded to ``GridSearchCV``. Defaults to
            8, the value that was previously hard-coded.
        param_grid: Optional grid to search instead of the default TF-IDF grid;
            keys use the ``vect__...`` / ``clf__...`` step prefixes.

    Returns:
        The ``GridSearchCV`` object after fitting on the module-level
        ``train.description`` and ``train.category``.
    """
    if param_grid is None:
        # Default search space: TF-IDF vocabulary settings only.
        param_grid = {
            'vect__max_df': (0.2, 0.5, 0.75, 1.0),
            'vect__min_df': (.02, .035, .05, .1),
            'vect__max_features': (100, 250, 500, 1000, 1250),
        }

    pipe = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', model),
    ])

    grid_search = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1, verbose=1)
    grid_search.fit(train.description, train.category)
    return grid_search

In [18]:
# Grid-search the TF-IDF pipeline with the LinearSVC instance as the classifier.
# NOTE: the recorded output reports 2 folds while model() as defined in this notebook
# uses cv=8, so this output comes from an earlier version of the helper.
models = model(LSVC)

Fitting 2 folds for each of 80 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   12.1s finished


In [19]:
models.best_score_

0.8843774168600155

In [21]:
# Same search with the 1-nearest-neighbour classifier (KNN); `models` is overwritten.
# NOTE: the recorded 2-fold output predates the cv=8 setting now in model().
models = model(KNN)
models.best_score_

Fitting 2 folds for each of 80 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   17.4s finished


0.7962103634957464

In [22]:
# Same search with multinomial naive Bayes (MNB); `models` is overwritten.
# NOTE: the recorded 2-fold output predates the cv=8 setting now in model().
models = model(MNB)
models.best_score_

Fitting 2 folds for each of 80 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   10.3s finished


0.845707656612529

In [23]:
# Same search with Bernoulli naive Bayes (BNB); `models` is overwritten.
# NOTE: the recorded 2-fold output predates the cv=8 setting now in model().
models = model(BNB)
models.best_score_

Fitting 2 folds for each of 80 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   11.2s finished


0.8812838360402165

In [25]:
# Same search with the SGDClassifier instance (bound to `SDG`); `models` is overwritten.
# NOTE: the recorded 2-fold output predates the cv=8 setting now in model().
models = model(SDG)
models.best_score_

Fitting 2 folds for each of 80 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    9.8s finished


0.8781902552204176

In [33]:
# Search with the SGDClassifier instance; the fitted search is kept as `ss`
# and is used later as one of the voters in the ensemble.
ss = model(SDG)
ss.best_score_

Fitting 8 folds for each of 80 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:   52.7s finished


0.9048723897911833

In [31]:
# Search with LinearSVC; kept as `model1`, which later produces submission8 and one ensemble vote.
model1 = model(LSVC)
model1.best_score_

Fitting 8 folds for each of 80 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:   56.7s finished


0.9079659706109822

In [32]:
# Search with Bernoulli naive Bayes; kept as `model2` and used later as an ensemble voter.
model2 = model(BNB)
model2.best_score_

Fitting 8 folds for each of 80 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:   54.7s finished


0.8975251353441609

In [36]:

# Search with `clf`, the RandomForestClassifier instance created earlier in the notebook.
model3 = model(clf)
model3.best_score_

Fitting 8 folds for each of 80 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:  1.2min finished


0.8619489559164734

In [37]:
!pip install xgboost



In [40]:
pip install xgboost

'C:\Users\WIN' is not recognized as an internal or external command,
operable program or batch file.


Note: you may need to restart the kernel to use updated packages.


In [42]:
conda install -c conda-forge xgboost

usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: xgboost



Note: you may need to restart the kernel to use updated packages.


In [45]:
from sklearn.ensemble import GradientBoostingClassifier

# NOTE: despite the `xg` / `xgb` names, this is scikit-learn's GradientBoostingClassifier,
# not the xgboost package.
xg = GradientBoostingClassifier()

# Grid-search the TF-IDF pipeline with gradient boosting; `xgb` is used later as an ensemble voter.
xgb = model(xg)
xgb.best_score_

Fitting 8 folds for each of 80 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:  9.7min finished


0.8820572312451663

In [46]:
# Predict test categories with the fitted LinearSVC search (`model1`); this overwrites the
# earlier random-forest `pred`.
pred = model1.predict(test.description)

In [47]:
# Assemble the LinearSVC submission: test ids next to predicted categories, stored as int64.
submission = pd.DataFrame({'id': test['id'], 'category': pred}).astype({'category': 'int64'})

In [48]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('submission8.csv', index=False)

In [49]:
# Test predictions from the SGDClassifier search (`ss`), the second ensemble voter.
pred2= ss.predict(test.description)

In [50]:
# Test predictions from the Bernoulli naive Bayes search (`model2`), the third ensemble voter.
pred3 = model2.predict(test.description)

In [52]:
# Test predictions from the gradient boosting search (`xgb`), the fourth ensemble voter.
pred4 = xgb.predict(test.description)

In [53]:
from scipy import stats

In [57]:
# Majority vote across the four classifiers' predictions for each test row.
# With an even number of voters ties are possible: Series.mode() returns the modes in
# ascending order, so .iloc[0] picks the smallest tied label, the same tie-break
# scipy.stats.mode applies. This avoids indexing the stats.mode result with [0][0],
# which raises on SciPy releases that return scalars for 1-D input.
mode = [
    pd.Series(votes).mode().iloc[0]
    for votes in zip(pred, pred2, pred3, pred4)
]

In [59]:
# Build the ensemble submission (test ids + majority-vote categories as int64) and write it to disk.
submission = pd.DataFrame({'id': test['id'], 'category': mode}).astype({'category': 'int64'})
submission.to_csv('submission9.csv', index=False)