# Imports

In [1]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Train

In [20]:
train_set = pd.read_csv('train/train_encoded.csv')

In [21]:
train_set

Unnamed: 0,len_location,len_keyword,len_text,len_text_original,diff_len_text,keyword_value_sin_stemming,keyword_value_con_stemming,keyword_value_mult,keyword_value_diff,keyword_sum,...,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram,text_value_mult,text_value_diff,text_norm_value_mult,text_norm_value_diff,target
0,0,0,43,69,26,3.267254,0.03008,0.098278,3.237175,1.0,...,0.039129,0.000000,0.000000,0.000000,0.000000,6.328488,27.200895,1.034131,12.431300,1
1,0,0,36,38,2,3.267254,0.03008,0.098278,3.237175,1.0,...,0.039828,0.000000,0.000000,0.000000,0.000000,2.183374,14.948923,0.482472,7.746799,1
2,0,0,72,133,61,3.267254,0.03008,0.098278,3.237175,1.0,...,0.058736,0.000000,0.000000,0.000000,0.000000,15.902513,50.867533,2.458198,21.204076,1
3,0,0,50,65,15,3.267254,0.03008,0.098278,3.237175,1.0,...,0.040829,0.000000,0.000000,0.000000,0.000000,4.867260,24.192476,0.804374,11.435785,1
4,0,0,52,88,36,3.267254,0.03008,0.098278,3.237175,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,9.930953,38.388048,1.239480,15.629609,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,50,83,33,3.267254,0.03008,0.098278,3.237175,1.0,...,0.081231,0.200066,0.076401,0.175942,0.074368,6.251264,23.519079,0.907812,10.779492,1
7609,0,0,82,125,43,3.267254,0.03008,0.098278,3.237175,1.0,...,0.085929,0.255415,0.090223,0.228933,0.070228,10.658496,33.534944,1.475837,15.424894,1
7610,0,0,60,65,5,3.267254,0.03008,0.098278,3.237175,1.0,...,0.112936,0.238182,0.086771,0.206868,0.077030,8.823204,30.397755,2.377698,15.240896,1
7611,0,0,96,137,41,3.267254,0.03008,0.098278,3.237175,1.0,...,0.111862,0.433861,0.109249,0.406871,0.112182,25.300413,51.636186,2.709726,21.319579,1


In [22]:
X, y = train_set.iloc[:,:-1],train_set.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [23]:
def binary_preds(preds):
    len_preds = len(preds)
    for i in range(len_preds):
        if preds[i] >= 0.5:
            preds[i] = 1
        else:
            preds[i] = 0

In [33]:
params = {
    'learning_rate': [0.3, 0.1, 0.01, 0.001],
    'num_leaves': [10, 20, 40, 70, 150, 300],
    'max_depth': [20, 50, 70, 100, 200],
    'metric': ['binary_logloss'],
    'sub_feature': [0.2, 0.3, 0.5, 0.7, 0.8],
}

## Grid Search

In [21]:
grid_model = GridSearchCV(estimator=lgb.LGBMClassifier(), param_grid=params, cv=7)
result = grid_model.fit(X_train, y_train)

In [22]:
grid_model.best_params_

{'learning_rate': 0.1,
 'max_depth': 50,
 'metric': 'binary_logloss',
 'num_leaves': 40,
 'sub_feature': 0.5}

In [23]:
preds = grid_model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.814046


## Random search

In [7]:
lgbm_random = RandomizedSearchCV(estimator = lgb.LGBMClassifier(),\
                               param_distributions=params, n_iter=30, cv=7, random_state=111)
result = lgbm_random.fit(X_train, y_train)

In [8]:
lgbm_random.best_params_

{'sub_feature': 0.5,
 'num_leaves': 20,
 'metric': 'binary_logloss',
 'max_depth': 200,
 'learning_rate': 0.1}

In [9]:
preds = lgbm_random.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.811377


## Pruebas manuales

In [24]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 40,\
          'learning_rate': 0.1, 'sub_feature': 0.5, 'max_depth': 50}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.825928


In [25]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.5, 'max_depth': 200}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.823880


In [26]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.2, 'max_depth': 20}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.830831


In [27]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.4, 'max_depth': 20}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.826128


In [28]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 40,\
          'learning_rate': 0.1, 'sub_feature': 0.6, 'max_depth': 100}

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.821571


In [29]:
params = {'objective': 'binary', 'metric': 'accuracy', 'num_leaves': 20,\
          'learning_rate': 0.1, 'feature_fraction': 0.9}

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.827063


In [30]:
gbm = lgb.train({}, train_set=lgb_train, num_boost_round=100, valid_sets=lgb_eval)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.822674


## Test

In [14]:
test_set = pd.read_csv('test/test_encoded.csv')

In [15]:
col_id = test_set['id']
test_set.drop('id', axis=1, inplace=True)

In [16]:
preds = gbm.predict(test_set)
binary_preds(preds)
preds

array([0., 1., 1., ..., 1., 1., 1.])

In [17]:
test_set['id'] = col_id

In [18]:
test_set['target'] = preds.astype('int64')
test_set[['id', 'target']]

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [19]:
test_set[['id', 'target']].to_csv('result.csv', header=True, index=False)