In [1]:
# load libraries
import numpy as np
import pandas as pd
from datetime import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm

In [2]:
# Read the prepared training and test tables from the DERIVED folder.
train_csv = "../DERIVED/train.csv"
test_csv = "../DERIVED/test.csv"
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Report the (rows, columns) shape of the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame.shape)
# Use df_train.dtypes / df_test.dtypes to inspect the column types if needed.

(7377418, 49)
(2556790, 49)


In [4]:
# transform columns
import gc

# Keep the test ids for the submission file, then drop them from the features.
ids = df_test['id'].values
df_test.drop(['id'], axis=1, inplace=True)

# Feature columns are every training column except the label; the test frame
# is re-selected so its columns follow the same order.
cols = df_train.columns.tolist()
cols.remove('target')
df_test = df_test[cols]

# Missing values are replaced by the sentinel -1 in both frames.
df_train, df_test = df_train.fillna(-1), df_test.fillna(-1)
gc.collect()

# Columns whose training dtype is object are converted to strings and
# label-encoded. The encoder is fitted on the distinct values of train and
# test together so that both frames share one integer code per value.
for col in tqdm(cols):
    if df_train[col].dtype != 'object':
        continue

    df_train[col] = df_train[col].apply(str)
    df_test[col] = df_test[col].apply(str)

    encoder = LabelEncoder()
    encoder.fit(list(df_train[col].unique()) + list(df_test[col].unique()))
    df_train[col] = encoder.transform(df_train[col])
    df_test[col] = encoder.transform(df_test[col])

# Convert to plain arrays and release the data frames.
y = df_train['target'].values
X = np.array(df_train.drop(['target'], axis=1))
X_test = np.array(df_test)
del df_train, df_test
gc.collect()

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=123
)
del X, y
gc.collect()

# LightGBM datasets; the validation set is the only entry in the watch list.
d_train = lgb.Dataset(X_train, label=y_train)
d_valid = lgb.Dataset(X_valid, label=y_valid)
watchlist = [d_valid]

100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [01:21<00:00,  1.69s/it]


In [5]:
# light gbm parameters
# Every key appears exactly once. The dict previously listed 'metric' twice
# ('binary_logloss' first, 'auc' last); Python keeps only the last value for a
# repeated key, so 'auc' was the metric actually in effect and is the one kept.
params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting': 'gbdt',
        'learning_rate': 0.1 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        #'min_sum_hessian_in_leaf': 100,
        'max_depth': 10,
        }

In [6]:
# create log files
import sys
logfilename = "../LOGS/log_model_build_Lightgbm_" + str(dt.now().strftime('%Y%m%d_%H%M%S')) + ".txt"
%logstart -o logfilename
sys.stdout = open(logfilename, 'a')

Activating auto-logging. Current session state plus future input saved.
Filename       : logfilename
Mode           : backup
Output logging : True
Raw input log  : False
Timestamping   : False
State          : active


In [7]:
# Train the LightGBM booster on the training split. The single hold-out set in
# `watchlist` is used for evaluation and early stopping (this is not k-fold
# cross-validation). A timestamp is printed before and after training so the
# duration can be read from the output.
print(str(dt.now()) + "\n")
bst = lgb.train(
    params,
    d_train,
    num_boost_round=1501,
    valid_sets=watchlist,
    early_stopping_rounds=50,
    verbose_eval=100,
)
print(str(dt.now()) + "\n")

In [8]:
# Stop sending print output to the log file by pointing sys.stdout back at
# sys.__stdout__.
# NOTE(review): the file object previously assigned to sys.stdout is not
# closed explicitly here. Also, under a Jupyter kernel sys.__stdout__ may be
# the process-level stream rather than the notebook's output area, so later
# print() output might not appear in the notebook — TODO confirm.
sys.stdout = sys.__stdout__

In [9]:
# make predictions
# Pass the best iteration explicitly: training uses early stopping, and older
# LightGBM releases otherwise predict with every boosted round, including the
# ones added after the best validation score. A value <= 0 means "use all
# rounds", so this is also safe when early stopping did not trigger.
p_test = bst.predict(X_test, num_iteration=bst.best_iteration)

# One row per test id with its predicted probability, ordered by id.
subm = pd.DataFrame({'id': ids, 'target': p_test})
subm = subm.sort_values(['id'])

# Written twice: a gzip-compressed file with probabilities rounded to six
# decimals, and a plain CSV at full precision.
subm.to_csv('../SUBMISSION/submission_10_lightgbm.csv.gz', compression = 'gzip', index=False, float_format = '%.6f')
subm.to_csv('../SUBMISSION/submission_10_lightgbm.csv', index = False)
print('Done!')

In [None]:
# Table of split-based feature importances, one row per feature column,
# displayed with the largest importance first.
split_counts = bst.feature_importance(importance_type='split')
d = pd.DataFrame({'Variable': cols, 'Importance': split_counts})
d.sort_values(['Importance'], ascending = False)