# Imports

In [1]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Train

In [2]:
train_set = pd.read_csv('train/train_encoded.csv')

In [3]:
train_set

Unnamed: 0,len_location,len_keyword,len_text,len_text_original,diff_len_text,keyword_value,keyword_sum,keyword_min,keyword_max,keyword_mean,...,text_median_w2v,text_value_1_gram,text_norm_value_1_gram,text_value_2_gram,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram,target
0,0,0,43,69,26,3.267254,1.0,1.0,1.0,1.0,...,0.121727,0.227191,0.077967,0.056394,0.040931,0.000000,0.000000,0.000000,0.000000,1
1,0,0,36,38,2,3.267254,1.0,1.0,1.0,1.0,...,0.055061,0.143954,0.061261,0.056181,0.042664,0.000000,0.000000,0.000000,0.000000,1
2,0,0,72,133,61,3.267254,1.0,1.0,1.0,1.0,...,0.095317,0.313103,0.117570,0.055537,0.055537,0.000000,0.000000,0.000000,0.000000,1
3,0,0,50,65,15,3.267254,1.0,1.0,1.0,1.0,...,0.082558,0.197538,0.078924,0.059248,0.043484,0.000000,0.000000,0.000000,0.000000,1
4,0,0,52,88,36,3.267254,1.0,1.0,1.0,1.0,...,0.086994,0.265268,0.083965,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,50,83,33,3.267254,1.0,1.0,1.0,1.0,...,0.090719,0.252617,0.087472,0.230477,0.095556,0.200416,0.077877,0.179269,0.062804,1
7609,0,0,82,125,43,3.267254,1.0,1.0,1.0,1.0,...,0.158652,0.317482,0.090036,0.290268,0.092334,0.254335,0.087856,0.226830,0.074986,1
7610,0,0,60,65,5,3.267254,1.0,1.0,1.0,1.0,...,0.046951,0.290424,0.150312,0.259849,0.107001,0.231129,0.087631,0.200353,0.075999,1
7611,0,0,96,137,41,3.267254,1.0,1.0,1.0,1.0,...,0.085732,0.489610,0.136034,0.463238,0.137259,0.440136,0.121416,0.403074,0.118529,1


In [41]:
X, y = train_set.iloc[:,:-1],train_set.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [32]:
def binary_preds(preds):
    len_preds = len(preds)
    for i in range(len_preds):
        if preds[i] >= 0.5:
            preds[i] = 1
        else:
            preds[i] = 0

In [33]:
params = {
    'learning_rate': [0.3, 0.1, 0.01, 0.001],
    'num_leaves': [10, 20, 40, 70, 150, 300],
    'max_depth': [20, 50, 70, 100, 200],
    'metric': ['binary_logloss'],
    'sub_feature': [0.2, 0.3, 0.5, 0.7, 0.8],
}

## Grid Search

In [21]:
grid_model = GridSearchCV(estimator=lgb.LGBMClassifier(), param_grid=params, cv=7)
result = grid_model.fit(X_train, y_train)

In [22]:
grid_model.best_params_

{'learning_rate': 0.1,
 'max_depth': 50,
 'metric': 'binary_logloss',
 'num_leaves': 40,
 'sub_feature': 0.5}

In [23]:
preds = grid_model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.814046


## Random search

In [7]:
lgbm_random = RandomizedSearchCV(estimator = lgb.LGBMClassifier(),\
                               param_distributions=params, n_iter=30, cv=7, random_state=111)
result = lgbm_random.fit(X_train, y_train)

In [8]:
lgbm_random.best_params_

{'sub_feature': 0.5,
 'num_leaves': 20,
 'metric': 'binary_logloss',
 'max_depth': 200,
 'learning_rate': 0.1}

In [9]:
preds = lgbm_random.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.811377


## Pruebas manuales

## Este es

In [49]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 40,\
          'learning_rate': 0.1, 'sub_feature': 0.5, 'max_depth': 50}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.818110


In [43]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.5, 'max_depth': 200}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.816822


In [44]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.2, 'max_depth': 20}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.808042


In [45]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.4, 'max_depth': 20}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.821656


In [46]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 40,\
          'learning_rate': 0.1, 'sub_feature': 0.6, 'max_depth': 100}

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.813822


In [47]:
params = {'objective': 'binary', 'metric': 'accuracy', 'num_leaves': 20,\
          'learning_rate': 0.1, 'feature_fraction': 0.9}

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.812504


In [48]:
gbm = lgb.train({}, train_set=lgb_train, num_boost_round=100, valid_sets=lgb_eval)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.816161


## Test

In [50]:
test_set = pd.read_csv('test/test_encoded.csv')

In [51]:
col_id = test_set['id']
test_set.drop('id', axis=1, inplace=True)

In [52]:
preds = gbm.predict(test_set)
binary_preds(preds)
preds

array([0., 1., 1., ..., 1., 1., 1.])

In [53]:
test_set['id'] = col_id

In [54]:
test_set['target'] = preds.astype('int64')
test_set[['id', 'target']]

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [55]:
test_set[['id', 'target']].to_csv('result.csv', header=True, index=False)