# Imports

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Train

In [36]:
train_set = pd.read_csv('train/train_encoded.csv')

In [37]:
X, y = train_set.iloc[:,:-1],train_set.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Ensamble

## Modelo 1

In [38]:
estimators = []

estimators.append(('knn5', KNeighborsClassifier(n_neighbors=5, weights='distance', metric='manhattan')))
estimators.append(('knn49', KNeighborsClassifier(n_neighbors=49, weights='distance', metric='manhattan')))

estimators.append(('catboost', CatBoostClassifier(silent=True)))
estimators.append(('catboost57', CatBoostClassifier(silent=True,learning_rate=0.1, subsample=0.6,\
                                               n_estimators=57, random_state=42)))

estimators.append(('xgb', XGBClassifier(objective ='reg:logistic', subsample=0.3, colsample_bytree = 0.7,\
                           learning_rate = 0.1, max_depth = 37, n_estimators = 60)))

estimators.append(('lgbm20', LGBMClassifier(max_depth=20, metric='binary_logloss', learning_rate=0.1)))
estimators.append(('lgbm', LGBMClassifier()))

estimators.append(('mlp', MLPClassifier(activation='tanh', alpha=0.5, beta_1=0.1, beta_2=0.7, early_stopping=True,\
                      hidden_layer_sizes=(50,), learning_rate='invscaling', learning_rate_init= 0.5,\
                      max_iter=1000, momentum=0.8, power_t=0.1, random_state=42, solver='lbfgs', tol=0.01,\
                      validation_fraction= 0.3)))
estimators.append(('mlp2', MLPClassifier()))

estimators.append(('adaboost10', AdaBoostClassifier(n_estimators=10, random_state=42)))
estimators.append(('adaboost_lgbm', AdaBoostClassifier(base_estimator=LGBMClassifier(max_depth=10, metric='binary_logloss'),\
                                                       random_state=111)))

In [39]:
model = VotingClassifier(estimators=estimators, voting='hard')

model.fit(X_train, y_train)

VotingClassifier(estimators=[('knn5',
                              KNeighborsClassifier(metric='manhattan',
                                                   weights='distance')),
                             ('knn49',
                              KNeighborsClassifier(metric='manhattan',
                                                   n_neighbors=49,
                                                   weights='distance')),
                             ('catboost',
                              <catboost.core.CatBoostClassifier object at 0x7f2747ef4e20>),
                             ('catboost57',
                              <catboost.core.CatBoostClassifier object at 0x7f2747ef4a90>),
                             ('xgb',
                              XGBClassifier(base...
                                            learning_rate='invscaling',
                                            learning_rate_init=0.5,
                                            max_iter=1000, momentum=0

In [40]:
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score %f" % (f1))

F1-Mean Score 0.829093


## Modelo 2

In [41]:
estimators = []

estimators.append(('catboost', CatBoostClassifier(silent=True, )))

estimators.append(('xgb', XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.5,\
                           learning_rate = 0.01, max_depth = 100, n_estimators = 100)))

estimators.append(('lgbm', LGBMClassifier(max_depth=20, metric='binary_logloss', learning_rate=0.1)))

estimators.append(('adaboost10', AdaBoostClassifier(n_estimators=10, random_state=42)))
estimators.append(('knn3', KNeighborsClassifier(n_neighbors=3, metric='manhattan')))

In [42]:
model = VotingClassifier(estimators=estimators, voting='hard')

model.fit(X_train, y_train)

VotingClassifier(estimators=[('catboost',
                              <catboost.core.CatBoostClassifier object at 0x7f2750966a90>),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.5, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.01,
                                            max_delta_step=None, max_depth=100,
                                            min_chi...
                                            objective='reg:logistic',
                                            random_state=None, reg_alpha=None,
                                      

In [43]:
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score %f" % (f1))

F1-Mean Score 0.828137


## Modelo 3

In [44]:
estimators = []

estimators.append(('catboost', CatBoostClassifier(silent=True)))
estimators.append(('catboost1000', CatBoostClassifier(silent=True, iterations=1000, learning_rate=0.1,\
                                                      subsample=0.6, random_state=42)))

estimators.append(('xgb', XGBClassifier(objective ='reg:logistic', subsample=0.4, colsample_bytree = 0.8,\
                           learning_rate = 0.1, max_depth = 15, n_estimators = 10)))

estimators.append(('lgbm', LGBMClassifier(max_depth=5, metric='binary_logloss', learning_rate=0.1)))

estimators.append(('adaboost', AdaBoostClassifier(n_estimators=1, random_state=42)))

estimators.append(('knn', KNeighborsClassifier(n_neighbors=109, weights='distance', metric='manhattan')))
estimators.append(('knn3', KNeighborsClassifier(n_neighbors=17, weights='distance', metric='manhattan')))

In [45]:
model = VotingClassifier(estimators=estimators, voting='hard')

model.fit(X_train, y_train)

VotingClassifier(estimators=[('catboost',
                              <catboost.core.CatBoostClassifier object at 0x7f275097abb0>),
                             ('catboost1000',
                              <catboost.core.CatBoostClassifier object at 0x7f275097ac10>),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.8, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_con...
                                            subsample=0.4, tree_method=None,
                                            validate_parameters=None,
                                            verbosity=None)),
                             ('lgbm',
            

In [46]:
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score %f" % (f1))

F1-Mean Score 0.829561


# Test

In [47]:
test_set = pd.read_csv('test/test_encoded.csv')

In [48]:
col_id = test_set['id']
test_set.drop('id', axis=1, inplace=True)

In [49]:
preds = model.predict(test_set)
preds

array([0, 0, 1, ..., 1, 1, 1])

In [50]:
test_set['id'] = col_id

In [51]:
test_set['target'] = preds.astype('int64')
test_set[['id', 'target']]

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [52]:
test_set[['id', 'target']].to_csv('result.csv', header=True, index=False)