# Imports

In [75]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Train

In [76]:
train_set = pd.read_csv('train/train_encoded.csv')

In [77]:
X, y = train_set.iloc[:,:-1],train_set.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Ensamble

## Modelo 1

In [81]:
model1 = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='manhattan')
model2 = CatBoostClassifier(silent=True)
model3 = XGBClassifier(objective ='reg:logistic', subsample=0.3, colsample_bytree = 0.7,\
                           learning_rate = 0.1, max_depth = 37, n_estimators = 60)
model4 = LGBMClassifier(max_depth=20, metric='binary_logloss', learning_rate=0.1)
model5 = LGBMClassifier()
model6 = KNeighborsClassifier(n_neighbors=49, weights='distance', metric='manhattan')
model7 = CatBoostClassifier(silent=True,learning_rate=0.1, subsample=0.6, n_estimators=57, random_state=42)
model8 = MLPClassifier(activation='tanh', alpha=0.5, beta_1=0.1, beta_2=0.7, early_stopping=True,\
                      hidden_layer_sizes=(50,), learning_rate='invscaling', learning_rate_init= 0.5,\
                      max_iter=600, momentum=0.8, power_t=0.1, random_state=42, solver='lbfgs', tol=0.01,\
                      validation_fraction= 0.3)
model9 = MLPClassifier()
model10 = KNeighborsClassifier(n_neighbors=17, weights='distance', metric='manhattan')
model11 = KNeighborsClassifier(n_neighbors=109, weights='distance', metric='manhattan')

In [82]:
estimators = [('knn5', model1), ('catboost', model2), ('xgb', model3), ('lgbm', model4), ('lgbm2', model5),\
             ('knn49', model6), ('catboost_2000', model7), ('mlperceptron', model8), ('mlp', model9),\
             ('knn17', model10), ('knn109', model11)]
model = VotingClassifier(estimators=estimators, voting='hard')

model.fit(X_train, y_train)

VotingClassifier(estimators=[('knn5',
                              KNeighborsClassifier(metric='manhattan',
                                                   weights='distance')),
                             ('catboost',
                              <catboost.core.CatBoostClassifier object at 0x7feb8954a670>),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constrai...
                              <catboost.core.CatBoostClassifier object at 0x7feb88e8cee0>),
                             ('mlperceptron',
                              MLPClassifier(activation='tanh', alpha=0.5

In [83]:
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score %f" % (f1))

F1-Mean Score 0.803494


## Modelo 2

In [99]:
model1 = KNeighborsClassifier(n_neighbors=109, weights='distance', metric='manhattan')
model2 = CatBoostClassifier(silent=True)
model3 = XGBClassifier(objective ='reg:logistic', subsample=0.3, colsample_bytree = 0.7,\
                           learning_rate = 0.1, max_depth = 40, n_estimators = 60)
model4 = LGBMClassifier(max_depth=20, metric='binary_logloss', learning_rate=0.1)
model5 = MLPClassifier(activation='tanh', alpha=0.5, beta_1=0.1, beta_2=0.7, early_stopping=True,\
                      hidden_layer_sizes=(50,), learning_rate='invscaling', learning_rate_init= 0.5,\
                      max_iter=600, momentum=0.8, power_t=0.1, random_state=42, solver='lbfgs', tol=0.01,\
                      validation_fraction= 0.3)

In [100]:
estimators = [('knn', model1), ('catboost', model2), ('xgb', model3), ('lgbm', model4), ('mlp', model5)]
model = VotingClassifier(estimators=estimators, voting='hard')

model.fit(X_train, y_train)

VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(metric='manhattan',
                                                   n_neighbors=109,
                                                   weights='distance')),
                             ('catboost',
                              <catboost.core.CatBoostClassifier object at 0x7feb88ea69a0>),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            inter...
                                            verbosity=None)),
                             ('lgbm',
                              LGBMClassifier(max_depth=20,

In [101]:
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score %f" % (f1))

F1-Mean Score 0.813474


# Test

In [102]:
test_set = pd.read_csv('test/test_encoded.csv')

In [103]:
col_id = test_set['id']
test_set.drop('id', axis=1, inplace=True)

In [104]:
preds = model.predict(test_set)
preds

array([0, 1, 0, ..., 1, 1, 1])

In [105]:
test_set['id'] = col_id

In [106]:
test_set['target'] = preds.astype('int64')
test_set[['id', 'target']]

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,0
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [107]:
test_set[['id', 'target']].to_csv('result.csv', header=True, index=False)