# Imports

In [9]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
import sklearn
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

# Train

In [3]:
train_set = pd.read_csv('train/train_encoded_sin_location.csv')

In [4]:
X, y = train_set.iloc[:,:-1],train_set.iloc[:,-1]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Bagging

In [19]:
lgbm_model = lgb.LGBMClassifier()

model = BaggingClassifier(base_estimator=lgbm_model)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=42)
results = model_selection.cross_val_score(model, X, y, cv=cv, scoring='f1')

print(results.mean())

0.6439449378562999


In [20]:
lgbm_model = lgb.LGBMClassifier(objective='binary', num_leaves=40, learning_rate=0.1, max_depth=100)

model = BaggingClassifier(base_estimator=lgbm_model)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=42)
results = model_selection.cross_val_score(model, X, y, cv=cv, scoring='f1')

print(results.mean())

0.6443063087931455


In [21]:
lgbm_model = lgb.LGBMClassifier(objective='binary', num_leaves=40, learning_rate=0.1, max_depth=100)

model = BaggingClassifier(base_estimator=lgbm_model)

cv = RepeatedStratifiedKFold(random_state=42)
results = model_selection.cross_val_score(model, X, y, cv=cv, scoring='f1')

print(results.mean())

0.6368600910002535


In [9]:
seed = 7
num_trees = 100
kfold = model_selection.KFold(n_splits=2)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)

results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results.mean())

0.4791895022596453


In [7]:
kfold = model_selection.KFold(n_splits=3)
estimators = []

model1 = lgb.LGBMClassifier(objective='binary', num_leaves=20, learning_rate=0.1)
estimators.append(('lgbm', model1))

model2 = xgb.XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.5,\
                           learning_rate = 0.01, max_depth = 100, n_estimators = 100)
estimators.append(('xgb', model2))

model3 = CatBoostClassifier(silent=True)
estimators.append(('catboost', model3))

model4 = AdaBoostClassifier(n_estimators=100, random_state=42)
estimators.append(('adaboost', model4))

ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

0.5943740028715023


0.6081618329246296

In [8]:
kfold = model_selection.KFold(n_splits=2)
estimators = []

model1 = lgb.LGBMClassifier()
estimators.append(('lgbm', model1))

model2 = xgb.XGBClassifier()
estimators.append(('xgb', model2))

model3 = CatBoostClassifier(silent=True)
estimators.append(('catboost', model3))

ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

0.590698799857165


In [6]:
kfold = model_selection.KFold(n_splits=10)
estimators = []


model4 = AdaBoostClassifier(n_estimators=100, random_state=42)
estimators.append(('adaboost', model4))

model1 = lgb.LGBMClassifier(objective='binary', num_leaves=20, learning_rate=0.1)
estimators.append(('lgbm', model1))

model3 = CatBoostClassifier(silent=True)
estimators.append(('catboost', model3))

model2 = xgb.XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.5,\
                           learning_rate = 0.01, max_depth = 100, n_estimators = 100)
estimators.append(('xgb', model2))

ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

0.6081618329246296


## Old model

In [11]:
model = BaggingClassifier(base_estimator=lgb.LGBMClassifier(objective='binary', num_leaves=40,\
                                                           learning_rate=0.1, max_depth=100))

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(model, X, y, scoring='f1', cv=cv)

print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
print(n_scores)

Accuracy: 0.642 (0.019)
[0.67802385 0.64802632 0.6422629  0.63681592 0.63356164 0.64321608
 0.65051903 0.6271777  0.63481229 0.63497453 0.6609589  0.64965986
 0.60988075 0.65313029 0.6746167  0.66126418 0.65897858 0.63439065
 0.62605753 0.58741259 0.6409396  0.6272578  0.64371773 0.63959391
 0.64418212 0.66438356 0.64493997 0.6523888  0.60899654 0.66225166]
