# Imports

In [1]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Train

In [2]:
train_set = pd.read_csv('train/train_encoded.csv')

In [29]:
X, y = train_set.iloc[:,:-1],train_set.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [4]:
def binary_preds(preds):
    len_preds = len(preds)
    for i in range(len_preds):
        if preds[i] >= 0.5:
            preds[i] = 1
        else:
            preds[i] = 0

In [18]:
params = {
    'learning_rate': [0.3, 0.1, 0.01, 0.001],
    'num_leaves': [10, 20, 40, 70, 150, 300],
    'max_depth': [20, 50, 70, 100, 200],
    'metric': ['binary_logloss'],
    'sub_feature': [0.2, 0.3, 0.5, 0.7, 0.8],
}

## Grid Search

In [21]:
grid_model = GridSearchCV(estimator=lgb.LGBMClassifier(), param_grid=params, cv=7)
result = grid_model.fit(X_train, y_train)

In [22]:
grid_model.best_params_

{'learning_rate': 0.1,
 'max_depth': 50,
 'metric': 'binary_logloss',
 'num_leaves': 40,
 'sub_feature': 0.5}

In [23]:
preds = grid_model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.814046


## Random search

In [19]:
lgbm_random = RandomizedSearchCV(estimator = lgb.LGBMClassifier(),\
                               param_distributions=params, n_iter=50, cv=7, random_state=111)
result = lgbm_random.fit(X_train, y_train)

In [20]:
lgbm_random.best_params_

{'sub_feature': 0.5,
 'num_leaves': 70,
 'metric': 'binary_logloss',
 'max_depth': 20,
 'learning_rate': 0.1}

In [21]:
preds = lgbm_random.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.795112


## Pruebas manuales

In [30]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 40,\
          'learning_rate': 0.1, 'sub_feature': 0.5, 'max_depth': 50}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.799420


In [31]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.5, 'max_depth': 200}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.810411


In [32]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.2, 'max_depth': 20}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.807088


In [33]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.4, 'max_depth': 20}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.787912


In [34]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 40,\
          'learning_rate': 0.1, 'sub_feature': 0.6, 'max_depth': 100}

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.804648


In [35]:
params = {'objective': 'binary', 'metric': 'accuracy', 'num_leaves': 20,\
          'learning_rate': 0.1, 'feature_fraction': 0.9}

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.808207


In [36]:
gbm = lgb.train({}, train_set=lgb_train, num_boost_round=100, valid_sets=lgb_eval)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.811630


Best score w/ test_size 0.4: 81,1 (6)

Best score w/ test_size 0.3: 80,5 (1)

Best score w/ test_size 0.2: 81,2 (6)

Best score w/ test_size 0.1: 84,7 (2)

## Test

In [37]:
test_set = pd.read_csv('test/test_encoded.csv')

In [38]:
col_id = test_set['id']
test_set.drop('id', axis=1, inplace=True)

In [39]:
preds = gbm.predict(test_set)
binary_preds(preds)
preds

array([0., 1., 1., ..., 1., 1., 1.])

In [40]:
test_set['id'] = col_id

In [41]:
test_set['target'] = preds.astype('int64')
test_set[['id', 'target']]

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [42]:
test_set[['id', 'target']].to_csv('result.csv', header=True, index=False)