In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [51]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv", low_memory=False)
df_sample = pd.read_csv("sample_submission.csv")

In [52]:
df_train.location.isna().sum()

2533

In [53]:
len(df_train)

7613

In [54]:
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [55]:
df_train.location.fillna(0, inplace=True)

In [56]:
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,0,Our Deeds are the Reason of this #earthquake M...,1
1,4,,0,Forest fire near La Ronge Sask. Canada,1
2,5,,0,All residents asked to 'shelter in place' are ...,1
3,6,,0,"13,000 people receive #wildfires evacuation or...",1
4,7,,0,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,0,Two giant cranes holding a bridge collapse int...,1
7609,10870,,0,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,0,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,0,Police investigating after an e-bike collided ...,1


In [57]:
df_train.head(61)

Unnamed: 0,id,keyword,location,text,target
0,1,,0,Our Deeds are the Reason of this #earthquake M...,1
1,4,,0,Forest fire near La Ronge Sask. Canada,1
2,5,,0,All residents asked to 'shelter in place' are ...,1
3,6,,0,"13,000 people receive #wildfires evacuation or...",1
4,7,,0,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
56,80,ablaze,South Africa,TRUCK ABLAZE : R21. VOORTREKKER AVE. OUTSIDE O...,1
57,81,ablaze,"Sao Paulo, Brazil",Set our hearts ablaze and every city was a gif...,0
58,82,ablaze,hollywoodland,They sky was ablaze tonight in Los Angeles. I'...,0
59,83,ablaze,"Edmonton, Alberta - Treaty 6",How the West was burned: Thousands of wildfire...,1


In [58]:
df_train.dropna(inplace=True, axis=0)

In [59]:
len(df_train)

7552

In [60]:
df_train.isna().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

In [61]:
df_train.shape

(7552, 5)

In [62]:
from sklearn.model_selection import train_test_split

In [63]:
X = df_train.text
y = df_train.target

In [64]:
X.shape, y.shape

((7552,), (7552,))

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
count_vect = CountVectorizer()

In [68]:
X_train_counts = count_vect.fit_transform(X_train)

In [69]:
X_train_counts

<6041x18552 sparse matrix of type '<class 'numpy.int64'>'
	with 88648 stored elements in Compressed Sparse Row format>

In [70]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(6041, 18552)

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

In [72]:
vectorizer = TfidfVectorizer()
X_test_tfidf = vectorizer.fit_transform(X_test)

In [73]:
X_train_tfidf.shape, X_test_tfidf.shape

((6041, 18552), (1511, 6974))

In [74]:
from sklearn.svm import LinearSVC

In [75]:
clf = LinearSVC()

In [76]:
clf.fit(X_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [77]:
from sklearn.pipeline import Pipeline

In [78]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [79]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [80]:
predictions = text_clf.predict(X_test)

In [81]:
from sklearn.metrics import confusion_matrix, classification_report

In [82]:
print(confusion_matrix(y_test, predictions))

[[723 151]
 [163 474]]


In [83]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       874
           1       0.76      0.74      0.75       637

    accuracy                           0.79      1511
   macro avg       0.79      0.79      0.79      1511
weighted avg       0.79      0.79      0.79      1511



In [84]:
from sklearn import metrics

In [85]:
metrics.accuracy_score(y_test, predictions)

0.7921906022501655

In [102]:
X_kag = pd.read_csv("test.csv", sep=',')
X_kag

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [106]:
original_test = pd.DataFrame(X_kag)

In [139]:
len(original_test)

3263

In [142]:
kag_preds = text_clf.predict(X_kag.text)

In [144]:
len(kag_preds)

3263

In [145]:
ids = original_test.id

In [146]:
len(ids)

3263

In [147]:
ids

0           0
1           2
2           3
3           9
4          11
        ...  
3258    10861
3259    10865
3260    10868
3261    10874
3262    10875
Name: id, Length: 3263, dtype: int64

In [148]:
ids_list = []
for i in ids:
    ids_list.append(i)
ids_list[-1]

10875

In [149]:
len(kag_preds)

3263

In [153]:
ans = pd.DataFrame({'id':ids_list,
                   'target':kag_preds})

In [151]:
ans

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [154]:
ans.to_csv("new_linear_svc_export.csv", index=False)

In [50]:
df.to_csv("KAGGLE_LINEARSVC_01.06.20.csv", index=False)

In [270]:
accuracy_scores = {"LinearSVC": 0.7921906022501655,
                  "GaussianNB": 0.7921906022501655,
                  "KNEIGHBORS": 0.7809397749834547,
                  "Random FOrest Classifier": 0.7796161482461945,
                   "Random Forest Regressor": 0.7782925215089345,
                   "CatBoostClassifier": 0.786896095301125
                  }

In [220]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_tfidf.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [227]:
gaus_clf = Pipeline([('tfidf', TfidfVectorizer()), ('gnb', LinearSVC())])

In [231]:
gaus_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('gnb',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [233]:
gaus_preds = gaus_clf.predict(X_test)

In [235]:
metrics.accuracy_score(y_test, gaus_preds)

0.7921906022501655

In [236]:
0.7921906022501655 == 0.7921906022501655

True

In [244]:
from sklearn.neighbors import KNeighborsClassifier

In [239]:
nbrs = NearestNeighbors()

In [246]:
nbrs_clf = Pipeline([('tfidf', TfidfVectorizer()), ('nbrs', KNeighborsClassifier())])

In [247]:
nbrs_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nbrs',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None

In [248]:
nbrs_preds = nbrs_clf.predict(X_test)

In [249]:
metrics.accuracy_score(y_test, nbrs_preds)

0.7809397749834547

In [251]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [252]:
text_clf_rand = Pipeline([('tfidf', TfidfVectorizer()), ('nbrs', RandomForestClassifier())])

In [254]:
text_clf_rand.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [255]:
rfclf_preds = text_clf_rand.predict(X_test)

In [256]:
metrics.accuracy_score(y_test, rfclf_preds)

0.7796161482461945

In [258]:
text_reg_rand = Pipeline([('tfidf', TfidfVectorizer()), ('nbrs', RandomForestClassifier())])

In [259]:
text_reg_rand.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [260]:
reg_preds = text_reg_rand.predict(X_test)

In [261]:
metrics.accuracy_score(y_test, reg_preds)

0.7782925215089345

In [263]:
from catboost import CatBoostClassifier

In [264]:
cbclf = Pipeline([('tfidf', TfidfVectorizer()), ('nbrs', CatBoostClassifier())])

In [265]:
cbclf.fit(X_train, y_train)

Learning rate set to 0.022206
0:	learn: 0.6898704	total: 164ms	remaining: 2m 43s
1:	learn: 0.6872955	total: 238ms	remaining: 1m 58s
2:	learn: 0.6845109	total: 314ms	remaining: 1m 44s
3:	learn: 0.6820593	total: 391ms	remaining: 1m 37s
4:	learn: 0.6794369	total: 467ms	remaining: 1m 32s
5:	learn: 0.6771215	total: 543ms	remaining: 1m 29s
6:	learn: 0.6747648	total: 621ms	remaining: 1m 28s
7:	learn: 0.6722928	total: 697ms	remaining: 1m 26s
8:	learn: 0.6699116	total: 770ms	remaining: 1m 24s
9:	learn: 0.6671953	total: 845ms	remaining: 1m 23s
10:	learn: 0.6649096	total: 920ms	remaining: 1m 22s
11:	learn: 0.6624643	total: 991ms	remaining: 1m 21s
12:	learn: 0.6603837	total: 1.06s	remaining: 1m 20s
13:	learn: 0.6583623	total: 1.14s	remaining: 1m 20s
14:	learn: 0.6563050	total: 1.21s	remaining: 1m 19s
15:	learn: 0.6548137	total: 1.29s	remaining: 1m 19s
16:	learn: 0.6532255	total: 1.38s	remaining: 1m 19s
17:	learn: 0.6516238	total: 1.46s	remaining: 1m 19s
18:	learn: 0.6495772	total: 1.54s	remaining:

159:	learn: 0.5534775	total: 12.3s	remaining: 1m 4s
160:	learn: 0.5531828	total: 12.4s	remaining: 1m 4s
161:	learn: 0.5526656	total: 12.5s	remaining: 1m 4s
162:	learn: 0.5522566	total: 12.6s	remaining: 1m 4s
163:	learn: 0.5520013	total: 12.6s	remaining: 1m 4s
164:	learn: 0.5516271	total: 12.7s	remaining: 1m 4s
165:	learn: 0.5514196	total: 12.8s	remaining: 1m 4s
166:	learn: 0.5511450	total: 12.9s	remaining: 1m 4s
167:	learn: 0.5508565	total: 12.9s	remaining: 1m 4s
168:	learn: 0.5504774	total: 13s	remaining: 1m 3s
169:	learn: 0.5501525	total: 13.1s	remaining: 1m 3s
170:	learn: 0.5497657	total: 13.1s	remaining: 1m 3s
171:	learn: 0.5494813	total: 13.2s	remaining: 1m 3s
172:	learn: 0.5491642	total: 13.3s	remaining: 1m 3s
173:	learn: 0.5487854	total: 13.4s	remaining: 1m 3s
174:	learn: 0.5485145	total: 13.4s	remaining: 1m 3s
175:	learn: 0.5482125	total: 13.5s	remaining: 1m 3s
176:	learn: 0.5478843	total: 13.6s	remaining: 1m 3s
177:	learn: 0.5476002	total: 13.7s	remaining: 1m 3s
178:	learn: 0.

321:	learn: 0.5121808	total: 24.5s	remaining: 51.7s
322:	learn: 0.5119413	total: 24.6s	remaining: 51.6s
323:	learn: 0.5117561	total: 24.7s	remaining: 51.5s
324:	learn: 0.5115836	total: 24.8s	remaining: 51.4s
325:	learn: 0.5113390	total: 24.8s	remaining: 51.3s
326:	learn: 0.5111070	total: 24.9s	remaining: 51.3s
327:	learn: 0.5109225	total: 25s	remaining: 51.2s
328:	learn: 0.5106985	total: 25.1s	remaining: 51.1s
329:	learn: 0.5104995	total: 25.1s	remaining: 51s
330:	learn: 0.5103315	total: 25.2s	remaining: 51s
331:	learn: 0.5101642	total: 25.3s	remaining: 50.9s
332:	learn: 0.5099426	total: 25.4s	remaining: 50.8s
333:	learn: 0.5097972	total: 25.4s	remaining: 50.7s
334:	learn: 0.5094890	total: 25.5s	remaining: 50.7s
335:	learn: 0.5092720	total: 25.6s	remaining: 50.6s
336:	learn: 0.5090270	total: 25.7s	remaining: 50.5s
337:	learn: 0.5088113	total: 25.7s	remaining: 50.4s
338:	learn: 0.5086504	total: 25.8s	remaining: 50.4s
339:	learn: 0.5084492	total: 25.9s	remaining: 50.3s
340:	learn: 0.5081

480:	learn: 0.4770980	total: 36.6s	remaining: 39.5s
481:	learn: 0.4768255	total: 36.6s	remaining: 39.4s
482:	learn: 0.4765024	total: 36.7s	remaining: 39.3s
483:	learn: 0.4763331	total: 36.8s	remaining: 39.2s
484:	learn: 0.4760808	total: 36.9s	remaining: 39.1s
485:	learn: 0.4758534	total: 36.9s	remaining: 39.1s
486:	learn: 0.4755937	total: 37s	remaining: 39s
487:	learn: 0.4754168	total: 37.1s	remaining: 38.9s
488:	learn: 0.4751914	total: 37.2s	remaining: 38.8s
489:	learn: 0.4749935	total: 37.3s	remaining: 38.8s
490:	learn: 0.4747274	total: 37.3s	remaining: 38.7s
491:	learn: 0.4744992	total: 37.4s	remaining: 38.6s
492:	learn: 0.4742467	total: 37.5s	remaining: 38.5s
493:	learn: 0.4739554	total: 37.6s	remaining: 38.5s
494:	learn: 0.4737052	total: 37.6s	remaining: 38.4s
495:	learn: 0.4735140	total: 37.7s	remaining: 38.3s
496:	learn: 0.4731916	total: 37.8s	remaining: 38.2s
497:	learn: 0.4729683	total: 37.8s	remaining: 38.2s
498:	learn: 0.4727635	total: 37.9s	remaining: 38.1s
499:	learn: 0.47

641:	learn: 0.4442886	total: 48.8s	remaining: 27.2s
642:	learn: 0.4440882	total: 48.8s	remaining: 27.1s
643:	learn: 0.4439373	total: 48.9s	remaining: 27s
644:	learn: 0.4438352	total: 49s	remaining: 27s
645:	learn: 0.4436895	total: 49.1s	remaining: 26.9s
646:	learn: 0.4435240	total: 49.1s	remaining: 26.8s
647:	learn: 0.4433502	total: 49.2s	remaining: 26.7s
648:	learn: 0.4432489	total: 49.3s	remaining: 26.7s
649:	learn: 0.4430609	total: 49.4s	remaining: 26.6s
650:	learn: 0.4428845	total: 49.5s	remaining: 26.5s
651:	learn: 0.4427026	total: 49.5s	remaining: 26.4s
652:	learn: 0.4425354	total: 49.6s	remaining: 26.4s
653:	learn: 0.4423644	total: 49.7s	remaining: 26.3s
654:	learn: 0.4422293	total: 49.8s	remaining: 26.2s
655:	learn: 0.4421551	total: 49.9s	remaining: 26.1s
656:	learn: 0.4419663	total: 49.9s	remaining: 26.1s
657:	learn: 0.4418148	total: 50s	remaining: 26s
658:	learn: 0.4417182	total: 50.1s	remaining: 25.9s
659:	learn: 0.4415592	total: 50.2s	remaining: 25.8s
660:	learn: 0.4413856	

803:	learn: 0.4198287	total: 1m 1s	remaining: 14.9s
804:	learn: 0.4196939	total: 1m 1s	remaining: 14.8s
805:	learn: 0.4195052	total: 1m 1s	remaining: 14.7s
806:	learn: 0.4194030	total: 1m 1s	remaining: 14.7s
807:	learn: 0.4193565	total: 1m 1s	remaining: 14.6s
808:	learn: 0.4192520	total: 1m 1s	remaining: 14.5s
809:	learn: 0.4191340	total: 1m 1s	remaining: 14.4s
810:	learn: 0.4190155	total: 1m 1s	remaining: 14.4s
811:	learn: 0.4189504	total: 1m 1s	remaining: 14.3s
812:	learn: 0.4187772	total: 1m 1s	remaining: 14.2s
813:	learn: 0.4187113	total: 1m 1s	remaining: 14.1s
814:	learn: 0.4185772	total: 1m 1s	remaining: 14.1s
815:	learn: 0.4185076	total: 1m 1s	remaining: 14s
816:	learn: 0.4183542	total: 1m 2s	remaining: 13.9s
817:	learn: 0.4181712	total: 1m 2s	remaining: 13.8s
818:	learn: 0.4180087	total: 1m 2s	remaining: 13.8s
819:	learn: 0.4178105	total: 1m 2s	remaining: 13.7s
820:	learn: 0.4176459	total: 1m 2s	remaining: 13.6s
821:	learn: 0.4175521	total: 1m 2s	remaining: 13.5s
822:	learn: 0.

961:	learn: 0.3989099	total: 1m 13s	remaining: 2.9s
962:	learn: 0.3987521	total: 1m 13s	remaining: 2.82s
963:	learn: 0.3985547	total: 1m 13s	remaining: 2.75s
964:	learn: 0.3983813	total: 1m 13s	remaining: 2.67s
965:	learn: 0.3982133	total: 1m 13s	remaining: 2.6s
966:	learn: 0.3980625	total: 1m 13s	remaining: 2.52s
967:	learn: 0.3979522	total: 1m 13s	remaining: 2.44s
968:	learn: 0.3977294	total: 1m 13s	remaining: 2.37s
969:	learn: 0.3976046	total: 1m 14s	remaining: 2.29s
970:	learn: 0.3974862	total: 1m 14s	remaining: 2.21s
971:	learn: 0.3973548	total: 1m 14s	remaining: 2.14s
972:	learn: 0.3972211	total: 1m 14s	remaining: 2.06s
973:	learn: 0.3970538	total: 1m 14s	remaining: 1.98s
974:	learn: 0.3968960	total: 1m 14s	remaining: 1.91s
975:	learn: 0.3967840	total: 1m 14s	remaining: 1.83s
976:	learn: 0.3966244	total: 1m 14s	remaining: 1.75s
977:	learn: 0.3965618	total: 1m 14s	remaining: 1.68s
978:	learn: 0.3964483	total: 1m 14s	remaining: 1.6s
979:	learn: 0.3963162	total: 1m 14s	remaining: 1.

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nbrs',
                 <catboost.core.CatBoostClassifier object at 0x1a24809990>)],
         verbose=False)

In [266]:
cbclf_preds = cbclf.predict(X_test)

In [268]:
metrics.accuracy_score(y_test, cbclf_preds)

0.786896095301125

In [274]:
from catboost import CatBoost

In [275]:
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

In [276]:
model = CatBoost()

In [277]:
model.fit(X_train_tfidf, y_train)

Learning rate set to 0.053757
0:	learn: 0.4917419	total: 92.3ms	remaining: 1m 32s
1:	learn: 0.4895844	total: 176ms	remaining: 1m 27s
2:	learn: 0.4874599	total: 247ms	remaining: 1m 22s
3:	learn: 0.4852030	total: 321ms	remaining: 1m 19s
4:	learn: 0.4833599	total: 399ms	remaining: 1m 19s
5:	learn: 0.4815071	total: 470ms	remaining: 1m 17s
6:	learn: 0.4797251	total: 539ms	remaining: 1m 16s
7:	learn: 0.4780246	total: 612ms	remaining: 1m 15s
8:	learn: 0.4763516	total: 687ms	remaining: 1m 15s
9:	learn: 0.4746688	total: 761ms	remaining: 1m 15s
10:	learn: 0.4730872	total: 834ms	remaining: 1m 14s
11:	learn: 0.4717372	total: 909ms	remaining: 1m 14s
12:	learn: 0.4704617	total: 983ms	remaining: 1m 14s
13:	learn: 0.4695168	total: 1.06s	remaining: 1m 14s
14:	learn: 0.4683126	total: 1.14s	remaining: 1m 15s
15:	learn: 0.4672580	total: 1.23s	remaining: 1m 15s
16:	learn: 0.4664007	total: 1.3s	remaining: 1m 15s
17:	learn: 0.4656256	total: 1.38s	remaining: 1m 15s
18:	learn: 0.4648786	total: 1.47s	remaining:

160:	learn: 0.4153696	total: 12.3s	remaining: 1m 4s
161:	learn: 0.4151142	total: 12.4s	remaining: 1m 3s
162:	learn: 0.4148951	total: 12.4s	remaining: 1m 3s
163:	learn: 0.4147055	total: 12.5s	remaining: 1m 3s
164:	learn: 0.4144508	total: 12.6s	remaining: 1m 3s
165:	learn: 0.4142226	total: 12.7s	remaining: 1m 3s
166:	learn: 0.4139470	total: 12.7s	remaining: 1m 3s
167:	learn: 0.4137025	total: 12.8s	remaining: 1m 3s
168:	learn: 0.4135028	total: 12.9s	remaining: 1m 3s
169:	learn: 0.4132631	total: 13s	remaining: 1m 3s
170:	learn: 0.4130137	total: 13.1s	remaining: 1m 3s
171:	learn: 0.4127354	total: 13.2s	remaining: 1m 3s
172:	learn: 0.4125278	total: 13.2s	remaining: 1m 3s
173:	learn: 0.4123051	total: 13.3s	remaining: 1m 3s
174:	learn: 0.4121018	total: 13.4s	remaining: 1m 3s
175:	learn: 0.4118573	total: 13.5s	remaining: 1m 3s
176:	learn: 0.4116157	total: 13.5s	remaining: 1m 2s
177:	learn: 0.4113303	total: 13.6s	remaining: 1m 2s
178:	learn: 0.4110820	total: 13.7s	remaining: 1m 2s
179:	learn: 0.

322:	learn: 0.3829910	total: 24.8s	remaining: 52.1s
323:	learn: 0.3828350	total: 24.9s	remaining: 52s
324:	learn: 0.3826830	total: 25s	remaining: 51.9s
325:	learn: 0.3825694	total: 25.1s	remaining: 51.8s
326:	learn: 0.3823743	total: 25.1s	remaining: 51.7s
327:	learn: 0.3822259	total: 25.2s	remaining: 51.6s
328:	learn: 0.3820016	total: 25.3s	remaining: 51.6s
329:	learn: 0.3818509	total: 25.3s	remaining: 51.5s
330:	learn: 0.3817060	total: 25.4s	remaining: 51.4s
331:	learn: 0.3815903	total: 25.5s	remaining: 51.3s
332:	learn: 0.3814919	total: 25.6s	remaining: 51.2s
333:	learn: 0.3813547	total: 25.7s	remaining: 51.2s
334:	learn: 0.3811737	total: 25.8s	remaining: 51.1s
335:	learn: 0.3810644	total: 25.8s	remaining: 51.1s
336:	learn: 0.3809134	total: 25.9s	remaining: 51s
337:	learn: 0.3808169	total: 26s	remaining: 50.9s
338:	learn: 0.3806629	total: 26.1s	remaining: 50.8s
339:	learn: 0.3805179	total: 26.1s	remaining: 50.7s
340:	learn: 0.3803722	total: 26.2s	remaining: 50.6s
341:	learn: 0.380129

483:	learn: 0.3625554	total: 37.8s	remaining: 40.3s
484:	learn: 0.3624648	total: 37.9s	remaining: 40.2s
485:	learn: 0.3624011	total: 38s	remaining: 40.2s
486:	learn: 0.3623533	total: 38.1s	remaining: 40.1s
487:	learn: 0.3621744	total: 38.1s	remaining: 40s
488:	learn: 0.3620863	total: 38.2s	remaining: 39.9s
489:	learn: 0.3620316	total: 38.3s	remaining: 39.9s
490:	learn: 0.3619038	total: 38.4s	remaining: 39.8s
491:	learn: 0.3617857	total: 38.5s	remaining: 39.7s
492:	learn: 0.3616788	total: 38.6s	remaining: 39.7s
493:	learn: 0.3615453	total: 38.7s	remaining: 39.6s
494:	learn: 0.3614213	total: 38.7s	remaining: 39.5s
495:	learn: 0.3612877	total: 38.8s	remaining: 39.4s
496:	learn: 0.3611853	total: 38.9s	remaining: 39.4s
497:	learn: 0.3609874	total: 39s	remaining: 39.3s
498:	learn: 0.3608671	total: 39.1s	remaining: 39.2s
499:	learn: 0.3607468	total: 39.2s	remaining: 39.2s
500:	learn: 0.3606617	total: 39.2s	remaining: 39.1s
501:	learn: 0.3605084	total: 39.3s	remaining: 39s
502:	learn: 0.360403

642:	learn: 0.3472657	total: 51.1s	remaining: 28.3s
643:	learn: 0.3472112	total: 51.1s	remaining: 28.3s
644:	learn: 0.3471343	total: 51.2s	remaining: 28.2s
645:	learn: 0.3470296	total: 51.3s	remaining: 28.1s
646:	learn: 0.3469856	total: 51.4s	remaining: 28s
647:	learn: 0.3469221	total: 51.4s	remaining: 27.9s
648:	learn: 0.3468341	total: 51.5s	remaining: 27.9s
649:	learn: 0.3467324	total: 51.6s	remaining: 27.8s
650:	learn: 0.3466351	total: 51.7s	remaining: 27.7s
651:	learn: 0.3465300	total: 51.8s	remaining: 27.6s
652:	learn: 0.3463897	total: 51.9s	remaining: 27.6s
653:	learn: 0.3462768	total: 51.9s	remaining: 27.5s
654:	learn: 0.3461665	total: 52s	remaining: 27.4s
655:	learn: 0.3460633	total: 52.1s	remaining: 27.3s
656:	learn: 0.3460075	total: 52.2s	remaining: 27.2s
657:	learn: 0.3458970	total: 52.2s	remaining: 27.1s
658:	learn: 0.3458312	total: 52.3s	remaining: 27.1s
659:	learn: 0.3457982	total: 52.4s	remaining: 27s
660:	learn: 0.3456682	total: 52.5s	remaining: 26.9s
661:	learn: 0.3456

804:	learn: 0.3340720	total: 1m 3s	remaining: 15.4s
805:	learn: 0.3340130	total: 1m 3s	remaining: 15.3s
806:	learn: 0.3339717	total: 1m 3s	remaining: 15.2s
807:	learn: 0.3339395	total: 1m 3s	remaining: 15.2s
808:	learn: 0.3338109	total: 1m 3s	remaining: 15.1s
809:	learn: 0.3337582	total: 1m 4s	remaining: 15s
810:	learn: 0.3336985	total: 1m 4s	remaining: 14.9s
811:	learn: 0.3336690	total: 1m 4s	remaining: 14.9s
812:	learn: 0.3336228	total: 1m 4s	remaining: 14.8s
813:	learn: 0.3335757	total: 1m 4s	remaining: 14.7s
814:	learn: 0.3335041	total: 1m 4s	remaining: 14.6s
815:	learn: 0.3334316	total: 1m 4s	remaining: 14.5s
816:	learn: 0.3333910	total: 1m 4s	remaining: 14.5s
817:	learn: 0.3332733	total: 1m 4s	remaining: 14.4s
818:	learn: 0.3331502	total: 1m 4s	remaining: 14.3s
819:	learn: 0.3330782	total: 1m 4s	remaining: 14.2s
820:	learn: 0.3329614	total: 1m 4s	remaining: 14.1s
821:	learn: 0.3329106	total: 1m 4s	remaining: 14.1s
822:	learn: 0.3328250	total: 1m 5s	remaining: 14s
823:	learn: 0.33

961:	learn: 0.3233313	total: 1m 16s	remaining: 3.01s
962:	learn: 0.3232493	total: 1m 16s	remaining: 2.93s
963:	learn: 0.3231606	total: 1m 16s	remaining: 2.85s
964:	learn: 0.3230935	total: 1m 16s	remaining: 2.77s
965:	learn: 0.3230638	total: 1m 16s	remaining: 2.69s
966:	learn: 0.3229748	total: 1m 16s	remaining: 2.62s
967:	learn: 0.3229417	total: 1m 16s	remaining: 2.54s
968:	learn: 0.3229025	total: 1m 16s	remaining: 2.46s
969:	learn: 0.3228664	total: 1m 16s	remaining: 2.38s
970:	learn: 0.3228425	total: 1m 17s	remaining: 2.3s
971:	learn: 0.3227661	total: 1m 17s	remaining: 2.22s
972:	learn: 0.3226988	total: 1m 17s	remaining: 2.14s
973:	learn: 0.3226651	total: 1m 17s	remaining: 2.06s
974:	learn: 0.3225662	total: 1m 17s	remaining: 1.98s
975:	learn: 0.3225192	total: 1m 17s	remaining: 1.9s
976:	learn: 0.3224866	total: 1m 17s	remaining: 1.82s
977:	learn: 0.3224149	total: 1m 17s	remaining: 1.75s
978:	learn: 0.3223874	total: 1m 17s	remaining: 1.67s
979:	learn: 0.3222728	total: 1m 17s	remaining: 1

<catboost.core.CatBoost at 0x1a26be7250>

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="  (0, 3759)	0.38238602120282206
  (0, 1428)	0.0803760159530479
  (0, 3002)	0.08635618739881304
  (0, 2957)	0.30170652179285884
  (0, 4215)	0.3069881381852508
  (0, 3184)	0.24663419572758954
  (0, 1449)	0.28138096484055686
  (0, 1072)	0.3195862770209015
  (0, 2947)	0.32731369513755276
  (0, 1588)	0.33645329090582615
  (0, 2625)	0.34763925208985474
  (0, 6265)	0.2669597526798915
  (1, 142)	0.30357030831148446
  (1, 6442)	0.30357030831148446
  (1, 4259)	0.30357030831148446
  (1, 5688)	0.30357030831148446
  (1, 2854)	0.28743416512506764
  (1, 4469)	0.24371310296260745
  (1, 6960)	0.22907889748687302
  (1, 6115)	0.2322643271729809
  (1, 6558)	0.20637313724658282
  (1, 2403)	0.20999350706311207
  (1, 3839)	0.2139764910130811
  (1, 5425)	0.2759853893354411
  (1, 2460)	0.1240533204488741
  :	:
  (1508, 5990)	0.12684663063411666
  (1509, 232)	0.3784956138914816
  (1509, 316)	0.3784956138914816
  (1509, 3227)	0.3784956138914816
  (1509, 3441)	0.2939077565527633
  (1509, 5431)	0.2939077565527633
  (1509, 1254)	0.32398359539002675
  (1509, 4969)	0.3038648312883775
  (1509, 3545)	0.2939077565527633
  (1509, 3003)	0.18340244604115147
  (1509, 3119)	0.12279455389039566
  (1509, 1146)	0.18010898298445768
  (1509, 1428)	0.1591165357175241
  (1509, 3002)	0.08547759685363819
  (1510, 6292)	0.38526001232319906
  (1510, 2798)	0.364781689665671
  (1510, 1030)	0.364781689665671
  (1510, 5036)	0.3297737658546885
  (1510, 4522)	0.31524416470123406
  (1510, 466)	0.294765842043706
  (1510, 1252)	0.31524416470123406
  (1510, 1616)	0.2907232244143646
  (1510, 5967)	0.30397413048280253
  (1510, 1428)	0.08098011741945013
  (1510, 3002)	0.0870052354863794": Cannot convert obj   (0, 3759)	0.38238602120282206
  (0, 1428)	0.0803760159530479
  (0, 3002)	0.08635618739881304
  (0, 2957)	0.30170652179285884
  (0, 4215)	0.3069881381852508
  (0, 3184)	0.24663419572758954
  (0, 1449)	0.28138096484055686
  (0, 1072)	0.3195862770209015
  (0, 2947)	0.32731369513755276
  (0, 1588)	0.33645329090582615
  (0, 2625)	0.34763925208985474
  (0, 6265)	0.2669597526798915
  (1, 142)	0.30357030831148446
  (1, 6442)	0.30357030831148446
  (1, 4259)	0.30357030831148446
  (1, 5688)	0.30357030831148446
  (1, 2854)	0.28743416512506764
  (1, 4469)	0.24371310296260745
  (1, 6960)	0.22907889748687302
  (1, 6115)	0.2322643271729809
  (1, 6558)	0.20637313724658282
  (1, 2403)	0.20999350706311207
  (1, 3839)	0.2139764910130811
  (1, 5425)	0.2759853893354411
  (1, 2460)	0.1240533204488741
  :	:
  (1508, 5990)	0.12684663063411666
  (1509, 232)	0.3784956138914816
  (1509, 316)	0.3784956138914816
  (1509, 3227)	0.3784956138914816
  (1509, 3441)	0.2939077565527633
  (1509, 5431)	0.2939077565527633
  (1509, 1254)	0.32398359539002675
  (1509, 4969)	0.3038648312883775
  (1509, 3545)	0.2939077565527633
  (1509, 3003)	0.18340244604115147
  (1509, 3119)	0.12279455389039566
  (1509, 1146)	0.18010898298445768
  (1509, 1428)	0.1591165357175241
  (1509, 3002)	0.08547759685363819
  (1510, 6292)	0.38526001232319906
  (1510, 2798)	0.364781689665671
  (1510, 1030)	0.364781689665671
  (1510, 5036)	0.3297737658546885
  (1510, 4522)	0.31524416470123406
  (1510, 466)	0.294765842043706
  (1510, 1252)	0.31524416470123406
  (1510, 1616)	0.2907232244143646
  (1510, 5967)	0.30397413048280253
  (1510, 1428)	0.08098011741945013
  (1510, 3002)	0.0870052354863794 to float

In [280]:
cb = Pipeline([('tfidf', TfidfVectorizer()), ('cb', CatBoost())])

In [282]:
cb.fit(X_train, y_train)

Learning rate set to 0.053757
0:	learn: 0.4917419	total: 99.9ms	remaining: 1m 39s
1:	learn: 0.4895844	total: 176ms	remaining: 1m 27s
2:	learn: 0.4874599	total: 262ms	remaining: 1m 27s
3:	learn: 0.4852030	total: 338ms	remaining: 1m 24s
4:	learn: 0.4833599	total: 415ms	remaining: 1m 22s
5:	learn: 0.4815071	total: 492ms	remaining: 1m 21s
6:	learn: 0.4797251	total: 572ms	remaining: 1m 21s
7:	learn: 0.4780246	total: 652ms	remaining: 1m 20s
8:	learn: 0.4763516	total: 727ms	remaining: 1m 20s
9:	learn: 0.4746688	total: 804ms	remaining: 1m 19s
10:	learn: 0.4730872	total: 884ms	remaining: 1m 19s
11:	learn: 0.4717372	total: 962ms	remaining: 1m 19s
12:	learn: 0.4704617	total: 1.04s	remaining: 1m 19s
13:	learn: 0.4695168	total: 1.12s	remaining: 1m 18s
14:	learn: 0.4683126	total: 1.19s	remaining: 1m 18s
15:	learn: 0.4672580	total: 1.27s	remaining: 1m 18s
16:	learn: 0.4664007	total: 1.36s	remaining: 1m 18s
17:	learn: 0.4656256	total: 1.44s	remaining: 1m 18s
18:	learn: 0.4648786	total: 1.51s	remaining

160:	learn: 0.4153696	total: 12.5s	remaining: 1m 5s
161:	learn: 0.4151142	total: 12.6s	remaining: 1m 5s
162:	learn: 0.4148951	total: 12.7s	remaining: 1m 5s
163:	learn: 0.4147055	total: 12.8s	remaining: 1m 5s
164:	learn: 0.4144508	total: 12.8s	remaining: 1m 4s
165:	learn: 0.4142226	total: 12.9s	remaining: 1m 4s
166:	learn: 0.4139470	total: 13s	remaining: 1m 4s
167:	learn: 0.4137025	total: 13.1s	remaining: 1m 4s
168:	learn: 0.4135028	total: 13.1s	remaining: 1m 4s
169:	learn: 0.4132631	total: 13.2s	remaining: 1m 4s
170:	learn: 0.4130137	total: 13.3s	remaining: 1m 4s
171:	learn: 0.4127354	total: 13.4s	remaining: 1m 4s
172:	learn: 0.4125278	total: 13.4s	remaining: 1m 4s
173:	learn: 0.4123051	total: 13.5s	remaining: 1m 4s
174:	learn: 0.4121018	total: 13.6s	remaining: 1m 4s
175:	learn: 0.4118573	total: 13.7s	remaining: 1m 3s
176:	learn: 0.4116157	total: 13.7s	remaining: 1m 3s
177:	learn: 0.4113303	total: 13.8s	remaining: 1m 3s
178:	learn: 0.4110820	total: 13.9s	remaining: 1m 3s
179:	learn: 0.

322:	learn: 0.3829910	total: 25s	remaining: 52.4s
323:	learn: 0.3828350	total: 25.1s	remaining: 52.3s
324:	learn: 0.3826830	total: 25.1s	remaining: 52.2s
325:	learn: 0.3825694	total: 25.2s	remaining: 52.1s
326:	learn: 0.3823743	total: 25.3s	remaining: 52.1s
327:	learn: 0.3822259	total: 25.4s	remaining: 52s
328:	learn: 0.3820016	total: 25.4s	remaining: 51.9s
329:	learn: 0.3818509	total: 25.5s	remaining: 51.8s
330:	learn: 0.3817060	total: 25.6s	remaining: 51.7s
331:	learn: 0.3815903	total: 25.7s	remaining: 51.7s
332:	learn: 0.3814919	total: 25.8s	remaining: 51.6s
333:	learn: 0.3813547	total: 25.8s	remaining: 51.5s
334:	learn: 0.3811737	total: 25.9s	remaining: 51.4s
335:	learn: 0.3810644	total: 26s	remaining: 51.3s
336:	learn: 0.3809134	total: 26.1s	remaining: 51.3s
337:	learn: 0.3808169	total: 26.2s	remaining: 51.2s
338:	learn: 0.3806629	total: 26.2s	remaining: 51.1s
339:	learn: 0.3805179	total: 26.3s	remaining: 51.1s
340:	learn: 0.3803722	total: 26.4s	remaining: 51s
341:	learn: 0.380129

481:	learn: 0.3627938	total: 37s	remaining: 39.8s
482:	learn: 0.3626691	total: 37.1s	remaining: 39.7s
483:	learn: 0.3625554	total: 37.2s	remaining: 39.6s
484:	learn: 0.3624648	total: 37.2s	remaining: 39.5s
485:	learn: 0.3624011	total: 37.3s	remaining: 39.5s
486:	learn: 0.3623533	total: 37.4s	remaining: 39.4s
487:	learn: 0.3621744	total: 37.5s	remaining: 39.3s
488:	learn: 0.3620863	total: 37.5s	remaining: 39.2s
489:	learn: 0.3620316	total: 37.6s	remaining: 39.1s
490:	learn: 0.3619038	total: 37.7s	remaining: 39.1s
491:	learn: 0.3617857	total: 37.8s	remaining: 39s
492:	learn: 0.3616788	total: 37.8s	remaining: 38.9s
493:	learn: 0.3615453	total: 37.9s	remaining: 38.8s
494:	learn: 0.3614213	total: 38s	remaining: 38.8s
495:	learn: 0.3612877	total: 38.1s	remaining: 38.7s
496:	learn: 0.3611853	total: 38.1s	remaining: 38.6s
497:	learn: 0.3609874	total: 38.2s	remaining: 38.5s
498:	learn: 0.3608671	total: 38.3s	remaining: 38.4s
499:	learn: 0.3607468	total: 38.4s	remaining: 38.4s
500:	learn: 0.3606

640:	learn: 0.3474570	total: 49s	remaining: 27.4s
641:	learn: 0.3473508	total: 49s	remaining: 27.3s
642:	learn: 0.3472657	total: 49.1s	remaining: 27.3s
643:	learn: 0.3472112	total: 49.2s	remaining: 27.2s
644:	learn: 0.3471343	total: 49.2s	remaining: 27.1s
645:	learn: 0.3470296	total: 49.3s	remaining: 27s
646:	learn: 0.3469856	total: 49.4s	remaining: 26.9s
647:	learn: 0.3469221	total: 49.5s	remaining: 26.9s
648:	learn: 0.3468341	total: 49.6s	remaining: 26.8s
649:	learn: 0.3467324	total: 49.6s	remaining: 26.7s
650:	learn: 0.3466351	total: 49.7s	remaining: 26.6s
651:	learn: 0.3465300	total: 49.8s	remaining: 26.6s
652:	learn: 0.3463897	total: 49.9s	remaining: 26.5s
653:	learn: 0.3462768	total: 49.9s	remaining: 26.4s
654:	learn: 0.3461665	total: 50s	remaining: 26.3s
655:	learn: 0.3460633	total: 50.1s	remaining: 26.3s
656:	learn: 0.3460075	total: 50.1s	remaining: 26.2s
657:	learn: 0.3458970	total: 50.2s	remaining: 26.1s
658:	learn: 0.3458312	total: 50.3s	remaining: 26s
659:	learn: 0.3457982	

802:	learn: 0.3342323	total: 1m 1s	remaining: 15s
803:	learn: 0.3341394	total: 1m 1s	remaining: 14.9s
804:	learn: 0.3340720	total: 1m 1s	remaining: 14.8s
805:	learn: 0.3340130	total: 1m 1s	remaining: 14.7s
806:	learn: 0.3339717	total: 1m 1s	remaining: 14.7s
807:	learn: 0.3339395	total: 1m 1s	remaining: 14.6s
808:	learn: 0.3338109	total: 1m 1s	remaining: 14.5s
809:	learn: 0.3337582	total: 1m 1s	remaining: 14.4s
810:	learn: 0.3336985	total: 1m 1s	remaining: 14.4s
811:	learn: 0.3336690	total: 1m 1s	remaining: 14.3s
812:	learn: 0.3336228	total: 1m 1s	remaining: 14.2s
813:	learn: 0.3335757	total: 1m 1s	remaining: 14.1s
814:	learn: 0.3335041	total: 1m 1s	remaining: 14.1s
815:	learn: 0.3334316	total: 1m 2s	remaining: 14s
816:	learn: 0.3333910	total: 1m 2s	remaining: 13.9s
817:	learn: 0.3332733	total: 1m 2s	remaining: 13.8s
818:	learn: 0.3331502	total: 1m 2s	remaining: 13.8s
819:	learn: 0.3330782	total: 1m 2s	remaining: 13.7s
820:	learn: 0.3329614	total: 1m 2s	remaining: 13.6s
821:	learn: 0.33

961:	learn: 0.3233313	total: 1m 12s	remaining: 2.88s
962:	learn: 0.3232493	total: 1m 13s	remaining: 2.81s
963:	learn: 0.3231606	total: 1m 13s	remaining: 2.73s
964:	learn: 0.3230935	total: 1m 13s	remaining: 2.65s
965:	learn: 0.3230638	total: 1m 13s	remaining: 2.58s
966:	learn: 0.3229748	total: 1m 13s	remaining: 2.5s
967:	learn: 0.3229417	total: 1m 13s	remaining: 2.43s
968:	learn: 0.3229025	total: 1m 13s	remaining: 2.35s
969:	learn: 0.3228664	total: 1m 13s	remaining: 2.28s
970:	learn: 0.3228425	total: 1m 13s	remaining: 2.2s
971:	learn: 0.3227661	total: 1m 13s	remaining: 2.12s
972:	learn: 0.3226988	total: 1m 13s	remaining: 2.05s
973:	learn: 0.3226651	total: 1m 13s	remaining: 1.97s
974:	learn: 0.3225662	total: 1m 13s	remaining: 1.9s
975:	learn: 0.3225192	total: 1m 14s	remaining: 1.82s
976:	learn: 0.3224866	total: 1m 14s	remaining: 1.74s
977:	learn: 0.3224149	total: 1m 14s	remaining: 1.67s
978:	learn: 0.3223874	total: 1m 14s	remaining: 1.59s
979:	learn: 0.3222728	total: 1m 14s	remaining: 1.

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('cb', <catboost.core.CatBoost object at 0x1a2630b7d0>)],
         verbose=False)

In [283]:
cb_preds = cb.predict(X_test)