In [7]:
import pandas as pd

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

from hyperparameter_tuning import random_search
import datetime

from feature_builder import process_dataset, add_text_embeddings
import nltk

from sklearn.model_selection import cross_val_score
from sklearn import metrics

import nbimporter
import Exporter as exporter

In [8]:
x_train = pd.read_csv('train.csv')
y_train=x_train[['id','target']]
test=pd.read_csv('test.csv')

In [9]:
params = {'depth':[2,3,4,5,6,7,8],
          'iterations':[800,900,1000,1100,1200],
          'learning_rate':[0.01,0.02,0.03,0.05,0.08,0.1,0.2], 
          'l2_leaf_reg':[1,2,3,4,5,6],
          'border_count':[32,5,10,20,50,100,200],
          'random_strength':[42],
          'thread_count': [0,1,4]
         }

Variables procesadas

In [10]:
x_train_features = process_dataset(x_train,use_spacy=True,use_manual_features=False)

Embeddings loaded!
Percentage of words covered in the embeddings = 0.4875485193423176
Embeddings loaded!
Percentage of words covered in the embeddings = 0.5959707770644233


In [11]:
x_train_feat, x_test_feat, y_train_feat, y_test_feat = train_test_split(x_train_features, y_train, test_size = .33, random_state = 17)

<h3>Busqueda de hiperparametros</h3>

In [12]:
catboost = CatBoostClassifier(verbose=False)

In [28]:
grid = GridSearchCV(estimator=catboost, param_grid=params, cv = 3, n_jobs=-1)

In [29]:
grid_res = grid.fit(x_train_feat, y_train_feat)

TypeError: unhashable type: 'numpy.ndarray'

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
random_search(x_train_features,y_train['target'],catboost,params,5,20)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 31.2min finished


<h3>Pruebas locales</h3>

In [None]:
x_train_feat, x_test_feat, y_train_feat, y_test_feat = train_test_split(x_train_features, y_train, test_size = .33, random_state = 17)

In [None]:
catboost.fit(x_train_feat, y_train_feat['target'])

In [None]:
predictions_features = catboost.predict(x_test_feat)
accuracy_score(y_test_feat['target'], predictions_features)

In [None]:
f1_score(y_test_feat['target'], predictions_features)

In [5]:
scores = cross_val_score(catboost, x_train_features, y_train['target'], cv=5, scoring='f1_macro')

In [9]:
print(scores)
print(scores.mean())

[0.78162458 0.7526497  0.76699247 0.76744869 0.79811896]
0.7733668832955798


<h3>Export</h3>

In [12]:
catboost = CatBoostClassifier(verbose=False)

In [15]:
BC_ctb = BaggingClassifier(base_estimator= catboost, n_estimators=10, random_state=0)

In [16]:
test_features=process_dataset(test)

Embeddings loaded!
Percentage of words covered in the embeddings = 0.5653890824622532
Embeddings loaded!
Percentage of words covered in the embeddings = 0.6646569049669825


In [18]:
exporter.export_model_csv(BC_ctb,x_train_features,test_features,test,y_train['target'],'Catboost')