# Imports

In [32]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Train

In [33]:
train_set = pd.read_csv('train/train_encoded.csv')

In [43]:
train_set

Unnamed: 0,len_location,len_keyword,len_text,len_text_original,diff_len_text,keyword_value,keyword_sum,keyword_min,keyword_max,keyword_mean,...,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v,text_value_1_gram,text_value_2_gram,text_value_3_gram,text_value_4_gram,target
0,0,0,43,69,26,3.267254,1.0,1.0,1.0,1.0,...,8.481658,0.000850,0.615265,0.154212,0.121727,0.226814,0.055999,0.000000,0.000000,1
1,0,0,36,38,2,3.267254,1.0,1.0,1.0,1.0,...,0.790154,-0.103024,0.308946,0.079015,0.055061,0.140340,0.059299,0.000000,0.000000,1
2,0,0,72,133,61,3.267254,1.0,1.0,1.0,1.0,...,29.265612,-0.044882,1.000000,0.139360,0.095317,0.323950,0.061498,0.000000,0.000000,1
3,0,0,50,65,15,3.267254,1.0,1.0,1.0,1.0,...,2.733583,-0.039545,0.317247,0.097628,0.082558,0.199979,0.056540,0.000000,0.000000,1
4,0,0,52,88,36,3.267254,1.0,1.0,1.0,1.0,...,13.153737,-0.041156,1.000000,0.125274,0.086994,0.258605,0.000000,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,50,83,33,3.267254,1.0,1.0,1.0,1.0,...,3.762588,-0.026625,0.312744,0.104516,0.090719,0.258534,0.227854,0.203221,0.177035,1
7609,0,0,82,125,43,3.267254,1.0,1.0,1.0,1.0,...,25.374036,-0.006511,1.000000,0.211450,0.158652,0.309828,0.291124,0.264497,0.238252,1
7610,0,0,60,65,5,3.267254,1.0,1.0,1.0,1.0,...,13.296097,-0.082743,1.000000,0.201456,0.046951,0.291732,0.260418,0.234472,0.199919,1
7611,0,0,96,137,41,3.267254,1.0,1.0,1.0,1.0,...,22.747623,-0.091922,1.000000,0.119724,0.085732,0.484819,0.463784,0.438507,0.400071,1


In [34]:
X, y = train_set.iloc[:,:-1],train_set.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [35]:
def binary_preds(preds):
    len_preds = len(preds)
    for i in range(len_preds):
        if preds[i] >= 0.5:
            preds[i] = 1
        else:
            preds[i] = 0

In [16]:
params = {
    'learning_rate': [0.3, 0.1, 0.01, 0.001],
    'num_leaves': [10, 20, 40, 70, 150, 300],
    'max_depth': [20, 50, 70, 100, 200],
    'metric': ['binary_logloss'],
    'sub_feature': [0.2, 0.3, 0.5, 0.7, 0.8],
}

## Grid Search

In [21]:
grid_model = GridSearchCV(estimator=lgb.LGBMClassifier(), param_grid=params, cv=7)
result = grid_model.fit(X_train, y_train)

In [22]:
grid_model.best_params_

{'learning_rate': 0.1,
 'max_depth': 50,
 'metric': 'binary_logloss',
 'num_leaves': 40,
 'sub_feature': 0.5}

In [23]:
preds = grid_model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.814046


## Random search

In [19]:
lgbm_random = RandomizedSearchCV(estimator = lgb.LGBMClassifier(),\
                               param_distributions=params, n_iter=50, cv=7, random_state=111)
result = lgbm_random.fit(X_train, y_train)

In [20]:
lgbm_random.best_params_

{'sub_feature': 0.5,
 'num_leaves': 70,
 'metric': 'binary_logloss',
 'max_depth': 20,
 'learning_rate': 0.1}

In [21]:
preds = lgbm_random.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.795112


## Pruebas manuales

In [36]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 40,\
          'learning_rate': 0.1, 'sub_feature': 0.5, 'max_depth': 50}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.820480


In [37]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.5, 'max_depth': 200}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.815503


In [38]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.2, 'max_depth': 20}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.820721


In [39]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 20,\
          'learning_rate': 0.1, 'sub_feature': 0.4, 'max_depth': 20}

gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.822378


In [40]:
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 40,\
          'learning_rate': 0.1, 'sub_feature': 0.6, 'max_depth': 100}

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval, verbose_eval=False)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)

f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.819878


In [41]:
params = {'objective': 'binary', 'metric': 'accuracy', 'num_leaves': 20,\
          'learning_rate': 0.1, 'feature_fraction': 0.9}

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.811416


In [42]:
gbm = lgb.train({}, train_set=lgb_train, num_boost_round=100, valid_sets=lgb_eval)
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

binary_preds(preds)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.822212


Best score w/ test_size 0.4: 81,1 (6)

Best score w/ test_size 0.3: 80,5 (1)

Best score w/ test_size 0.2: 81,2 (6)

Best score w/ test_size 0.1: 84,7 (2)

## Test

In [37]:
test_set = pd.read_csv('test/test_encoded.csv')

In [38]:
col_id = test_set['id']
test_set.drop('id', axis=1, inplace=True)

In [39]:
preds = gbm.predict(test_set)
binary_preds(preds)
preds

array([0., 1., 1., ..., 1., 1., 1.])

In [40]:
test_set['id'] = col_id

In [41]:
test_set['target'] = preds.astype('int64')
test_set[['id', 'target']]

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [42]:
test_set[['id', 'target']].to_csv('result.csv', header=True, index=False)