In [1]:
import numpy as np
import pandas as pd
import math

import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
train = pd.read_csv('../data/train.csv', index_col='id')
test = pd.read_csv('../data/test.csv', index_col='id')
submission = pd.read_csv('../data/sample_submission.csv', index_col='id')

In [4]:
train.columns

Index(['X00', 'X01', 'X02', 'X03', 'X04', 'X05', 'X06', 'X07', 'X08', 'X09',
       'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19',
       'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29',
       'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39',
       'Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08', 'Y09',
       'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17', 'Y18'],
      dtype='object')

In [5]:
trainNotNull = train[train['Y18'].notnull()]
trainNotNull = trainNotNull.drop(['Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08', 'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17'], axis=1)
trainNotNull.head()

Unnamed: 0_level_0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,X31,X32,X33,X34,X35,X36,X37,X38,X39,Y18
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4320,19.3,987.7,0.9,2.2,0.0,1007.7,988.0,20.8,1007.8,1007.4,...,18.0,18.7,1008.3,0.0,353.5,0.0,84.0,77.6,0.0,20.5
4321,19.0,987.6,1.9,2.2,0.0,1007.7,988.0,20.6,1007.8,1007.4,...,17.7,19.0,1008.3,0.0,303.1,0.0,84.4,79.2,0.0,20.5
4322,19.1,987.6,2.0,1.4,0.0,1007.8,988.1,20.5,1007.9,1007.4,...,17.4,19.2,1008.3,0.0,307.3,0.0,84.9,79.9,0.0,20.5
4323,19.2,987.7,1.8,1.5,0.0,1007.9,988.1,20.5,1007.9,1007.5,...,17.5,19.3,1008.4,0.0,315.6,0.0,84.9,80.7,0.0,20.5
4324,19.2,987.8,1.4,1.4,0.0,1007.9,988.1,20.5,1007.8,1007.6,...,17.4,19.5,1008.4,0.0,321.2,0.0,84.9,80.9,0.0,20.5


In [6]:
columnList = trainNotNull.columns
target_var = ['Y18']
input_var = list(filter(lambda x:x != 'Y18', columnList))

In [7]:
X_train = trainNotNull[input_var]
y_train = trainNotNull[target_var]

In [9]:
from sklearn.model_selection import GridSearchCV
import xgboost as XGBRegressor

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(1000,2000,500),
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=5, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

{'colsample_bylevel': 1, 'colsample_bytree': 1, 'max_depth': 3, 'n_estimators': 1000}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)


In [19]:
xgbr = xgb.XGBRegressor(n_estimators=2000, learning_rate=0.05, colsample_bylevel=1,  colsample_bytree=1, max_depth=3)
xgbr.fit(X_train, y_train, eval_set=[(X_train, y_train)], verbose=True, early_stopping_rounds=5)

[0]	validation_0-rmse:25.6059
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:24.3493
[2]	validation_0-rmse:23.1536
[3]	validation_0-rmse:22.0171
[4]	validation_0-rmse:20.9375
[5]	validation_0-rmse:19.9121
[6]	validation_0-rmse:18.9366
[7]	validation_0-rmse:18.0096
[8]	validation_0-rmse:17.1294
[9]	validation_0-rmse:16.2917
[10]	validation_0-rmse:15.496
[11]	validation_0-rmse:14.7392
[12]	validation_0-rmse:14.0201
[13]	validation_0-rmse:13.336
[14]	validation_0-rmse:12.6846
[15]	validation_0-rmse:12.0658
[16]	validation_0-rmse:11.4779
[17]	validation_0-rmse:10.9205
[18]	validation_0-rmse:10.3887
[19]	validation_0-rmse:9.88364
[20]	validation_0-rmse:9.40311
[21]	validation_0-rmse:8.94813
[22]	validation_0-rmse:8.51415
[23]	validation_0-rmse:8.1019
[24]	validation_0-rmse:7.7096
[25]	validation_0-rmse:7.33668
[26]	validation_0-rmse:6.98303
[27]	validation_0-rmse:6.64599
[28]	validation_0-rmse:6.32548
[29]	validation_0-rmse:6.02033
[30]	validation_0-rm

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [27]:
from lightgbm import LGBMRegressor

param = {
    'max_depth':[10,50,200],
    'num_leaves': [10, 128, 160],
    'min_child_samples': [60, 100, 200]
    
}
lgbm_model = LGBMRegressor()
lgbm_grid_search = GridSearchCV(estimator=lgbm_model, param_grid=param, cv=5, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

lgbm_grid_search.fit(X_train, y_train)
print(lgbm_grid_search.best_params_)
print(lgbm_grid_search.best_estimator_)

{'max_depth': 10, 'min_child_samples': 100, 'num_leaves': 10}
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=10,
              min_child_samples=100, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=10, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


In [21]:
lgbmr = LGBMRegressor(colsample_bylevel= 0.5, colsample_bytree= 0.7, max_depth= 3, n_estimators= 1500)
lgbmr.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bylevel=0.5,
              colsample_bytree=0.7, importance_type='split', learning_rate=0.1,
              max_depth=3, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=1500, n_jobs=-1, num_leaves=31,
              objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

In [28]:
xgbrPred = xgbr.predict(test)

In [29]:
xgbrPred

array([20.87361 , 20.848234, 20.91702 , ..., 26.66345 , 26.54755 ,
       26.66345 ], dtype=float32)

In [22]:
lgbmrPred = lgbmr.predict(test)

In [30]:
lgbmrPred

array([20.56786152, 20.63438558, 20.44054575, ..., 26.56063923,
       26.23965685, 26.41305126])

In [31]:
sumPred = (xgbrPred + lgbmrPred) * 0.5

In [32]:
sumPred

array([20.72073553, 20.74130988, 20.67878328, ..., 26.61204474,
       26.39360353, 26.53825075])

In [33]:
outputDf = pd.DataFrame()
outputDf['id'] = test.index
outputDf['Y18'] = sumPred

In [34]:
outputDf.to_csv('submission.csv', index=False)