In [78]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [79]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [80]:
train.head(10)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5
5,6,2579,132,6,300,-15,67,230,237,140,...,0,0,0,0,0,0,0,0,0,2
6,7,2606,45,7,270,5,633,222,225,138,...,0,0,0,0,0,0,0,0,0,5
7,8,2605,49,4,234,7,573,222,230,144,...,0,0,0,0,0,0,0,0,0,5
8,9,2617,45,9,240,56,666,223,221,133,...,0,0,0,0,0,0,0,0,0,5
9,10,2612,59,10,247,11,636,228,219,124,...,0,0,0,0,0,0,0,0,0,5


Feature Creation for train

In [81]:
train['Hillshade_mean'] = train['Hillshade_9am']+train['Hillshade_Noon']+train['Hillshade_3pm']
train['slope_hyd'] = (train['Horizontal_Distance_To_Hydrology']**2+train['Vertical_Distance_To_Hydrology']**2)**0.5
train.slope_hyd=train.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any
train['log_elevation']= np.log(train.Elevation)
train['Hillshade_9am_sq'] = train['Hillshade_9am']**2
train['Hillshade_noon_sq'] = train['Hillshade_Noon']**2
train['Hillshade_3pm_sq'] = train['Hillshade_3pm']**2

train['cosine_slope'] = np.cos(train['Slope'])

a = pd.cut(train['Aspect'],bins=[-1, 45, 90, 135, 180, 225, 270, 315, 360],
           labels=['Aspect_1','Aspect_2','Aspect_3','Aspect_4','Aspect_5','Aspect_6','Aspect_7','Aspect_8'])
b = pd.get_dummies(a)
train.merge(b,right_index=True,left_index=True)
 

train['interaction_9amnoon'] = train['Hillshade_9am']*train['Hillshade_Noon']
train['interaction_noon3pm'] = train['Hillshade_3pm']*train['Hillshade_Noon']
train['interaction_9am3pm'] = train['Hillshade_3pm']*train['Hillshade_9am']

train['HF1'] = train['Horizontal_Distance_To_Hydrology']+train['Horizontal_Distance_To_Fire_Points']
train['HF2'] = abs(train['Horizontal_Distance_To_Hydrology']-train['Horizontal_Distance_To_Fire_Points'])
train['HR1'] = abs(train['Horizontal_Distance_To_Hydrology']+train['Horizontal_Distance_To_Roadways'])
train['HR2'] = abs(train['Horizontal_Distance_To_Hydrology']-train['Horizontal_Distance_To_Roadways'])
train['FR1'] = abs(train['Horizontal_Distance_To_Fire_Points']+train['Horizontal_Distance_To_Roadways'])
train['FR2'] = abs(train['Horizontal_Distance_To_Fire_Points']-train['Horizontal_Distance_To_Roadways'])
train['ele_vert'] = train.Elevation-train.Vertical_Distance_To_Hydrology

train['slope_hyd'] = (train['Horizontal_Distance_To_Hydrology']**2+train['Vertical_Distance_To_Hydrology']**2)**0.5
train.slope_hyd=train.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any

#Mean distance to Amenities 
train['Mean_Amenities']=(train.Horizontal_Distance_To_Fire_Points + train.Horizontal_Distance_To_Hydrology + train.Horizontal_Distance_To_Roadways) / 3 
#Mean Distance to Fire and Water 
train['Mean_Fire_Hyd']=(train.Horizontal_Distance_To_Fire_Points + train.Horizontal_Distance_To_Hydrology) / 2

Feature Creation for test

In [82]:
test['Hillshade_mean'] = test['Hillshade_9am']+test['Hillshade_Noon']+test['Hillshade_3pm']
test['slope_hyd'] = (test['Horizontal_Distance_To_Hydrology']**2+test['Vertical_Distance_To_Hydrology']**2)**0.5
test.slope_hyd=test.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any
test['log_elevation']= np.log(test.Elevation)
test['Hillshade_9am_sq'] = test['Hillshade_9am']**2
test['Hillshade_noon_sq'] = test['Hillshade_Noon']**2
test['Hillshade_3pm_sq'] = test['Hillshade_3pm']**2

test['cosine_slope'] = np.cos(test['Slope'])

a = pd.cut(test['Aspect'],bins=[-1, 45, 90, 135, 180, 225, 270, 315, 360],
           labels=['Aspect_1','Aspect_2','Aspect_3','Aspect_4','Aspect_5','Aspect_6','Aspect_7','Aspect_8'])
b = pd.get_dummies(a)
test.merge(b,right_index=True,left_index=True)

test['interaction_9amnoon'] = test['Hillshade_9am']*test['Hillshade_Noon']
test['interaction_noon3pm'] = test['Hillshade_3pm']*test['Hillshade_Noon']
test['interaction_9am3pm'] = test['Hillshade_3pm']*test['Hillshade_9am']

test['HF1'] = test['Horizontal_Distance_To_Hydrology']+test['Horizontal_Distance_To_Fire_Points']
test['HF2'] = abs(test['Horizontal_Distance_To_Hydrology']-test['Horizontal_Distance_To_Fire_Points'])
test['HR1'] = abs(test['Horizontal_Distance_To_Hydrology']+test['Horizontal_Distance_To_Roadways'])
test['HR2'] = abs(test['Horizontal_Distance_To_Hydrology']-test['Horizontal_Distance_To_Roadways'])
test['FR1'] = abs(test['Horizontal_Distance_To_Fire_Points']+test['Horizontal_Distance_To_Roadways'])
test['FR2'] = abs(test['Horizontal_Distance_To_Fire_Points']-test['Horizontal_Distance_To_Roadways'])
test['ele_vert'] = test.Elevation-test.Vertical_Distance_To_Hydrology

test['slope_hyd'] = (test['Horizontal_Distance_To_Hydrology']**2+test['Vertical_Distance_To_Hydrology']**2)**0.5
test.slope_hyd=test.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any

#Mean distance to Amenities 
test['Mean_Amenities']=(test.Horizontal_Distance_To_Fire_Points + test.Horizontal_Distance_To_Hydrology + test.Horizontal_Distance_To_Roadways) / 3 
#Mean Distance to Fire and Water 
test['Mean_Fire_Hyd']=(test.Horizontal_Distance_To_Fire_Points + test.Horizontal_Distance_To_Hydrology) / 2

In [83]:
# a = pd.cut(train['Aspect'],bins=[-1, 45, 90, 135, 180, 225, 270, 315, 360],
#            labels=['Aspect_1','Aspect_2','Aspect_3','Aspect_4','Aspect_5','Aspect_6','Aspect_7','Aspect_8'])
# b = pd.get_dummies(a)
# train.merge(b,right_index=True,left_index=True)

In [84]:
X_train = train.drop(['Id','Cover_Type'],1)
Y_train = train['Cover_Type']
X_test = test.drop(['Id'],1)

In [85]:
scaler = StandardScaler()
X_train_tf = pd.DataFrame(scaler.fit_transform(X_train),index=X_train.index, columns=X_train.columns)
X_test_tf = pd.DataFrame(scaler.transform(X_test),index=X_test.index, columns=X_test.columns)

In [86]:
X_train_tf.head(10)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,interaction_9am3pm,HF1,HF2,HR1,HR2,FR1,FR2,ele_vert,Mean_Amenities,Mean_Fire_Hyd
0,-0.367095,-0.95998,-1.597132,0.146639,-0.834074,-0.908681,0.271454,0.571653,0.281259,4.334805,...,0.690441,4.165194,4.390517,-0.848236,-0.959357,1.702247,5.505993,-0.246574,1.673476,4.165194
1,-0.381461,-0.914559,-1.715424,-0.072337,-0.932054,-0.999246,0.238732,0.703225,0.346627,4.28571,...,0.760204,4.078395,4.38309,-0.968254,-1.016605,1.619137,5.580595,-0.246574,1.571055,4.078395
2,0.130912,-0.160577,-0.887379,0.194243,0.227369,1.106379,0.696843,0.834797,-0.002005,4.191156,...,0.538108,4.036731,4.234547,1.089401,1.098462,2.902092,2.30939,0.098281,2.847589,4.036731
3,0.085421,-0.015231,0.17725,0.070474,1.092853,1.038455,0.827731,0.834797,-0.285268,4.272981,...,0.190113,4.092283,4.34224,1.005533,1.04895,2.902092,2.512851,-0.075352,2.835485,4.092283
4,-0.369489,-1.014485,-1.715424,-0.353198,-0.850404,-0.998491,0.238732,0.659368,0.324838,4.237524,...,0.730228,3.98118,4.38866,-1.010188,-0.970188,1.594299,5.519557,-0.246574,1.519379,3.98118
5,-0.407798,-0.224166,-1.242255,0.346574,-1.079023,-1.243015,0.565954,0.79094,0.106943,4.10933,...,0.621224,3.986388,4.121283,-1.138159,-0.974056,1.372194,5.726409,-0.251397,1.371335,3.986388
6,-0.343152,-1.014485,-1.123963,0.203764,-0.752425,-0.815853,0.304176,0.264652,0.063364,4.313894,...,0.408121,4.155646,4.358023,-0.750631,-0.873486,1.750011,5.340963,-0.234516,1.725617,4.155646
7,-0.345547,-0.978148,-1.478839,0.032391,-0.719765,-0.861135,0.304176,0.483939,0.194101,4.288437,...,0.589613,4.100095,4.36545,-0.820039,-0.892053,1.707979,5.377134,-0.241751,1.667889,4.100095
8,-0.316816,-1.014485,-0.887379,0.060953,0.0804,-0.790947,0.336898,0.089223,-0.045584,4.302984,...,0.275,4.11919,4.374734,-0.748462,-0.824748,1.760042,5.290098,-0.330979,1.721427,4.11919
9,-0.328787,-0.887307,-0.769087,0.094276,-0.654445,-0.813589,0.500509,0.001508,-0.241689,4.290256,...,0.086014,4.113114,4.355238,-0.765091,-0.853372,1.739026,5.308183,-0.234516,1.704202,4.113114


## **************Start
## Light GBM       boosting_type='gbdt'

In [None]:
m5=lgb.LGBMClassifier(boosting_type='gbdt',
                       objective='multiclass',
                       num_leaves=63,
                       n_estimators=2000, 
                       max_depth=50,
                       learning_rate=0.1,
                       min_data_in_leaf=500,
                       colsample_bytree=0.7,
                       min_split_gain=0.1,
                       classes=[1,2,3,4,5,6,7]
                       )

m5.fit(X_train_tf, Y_train)

In [None]:
Y_pred=m5.predict(X_train_tf)
accuracy=accuracy_score(Y_pred,Y_train)
accuracy

### Cross Val with n_estimators=[200,300,400, 1000]

Best 400
Better 1000, but not a significant change

Default = 100

In [16]:
result=cross_val_score(lgb.LGBMClassifier(boosting_type='gbdt',
                       objective='multiclass',
                       num_leaves=63,
                       n_estimators=1000, 
                       max_depth=50,
                       learning_rate=0.1,
                       min_data_in_leaf=500,
                       colsample_bytree=0.7,
                       min_split_gain=0.1,
                       classes=[1,2,3,4,5,6,7]
                       )
                       ,X=X_train_tf,y=Y_train,scoring='accuracy',cv=5,n_jobs=4)

result
# Cross Val Outcome
# n_estimators=200
# array([ 0.71858466,  0.73478836,  0.74702381,  0.75429894,  0.82903439])
# n_estimators=300
# array([ 0.73247354,  0.74702381,  0.75231481,  0.76653439,  0.84325397])
# n_estimators=400
# array([ 0.74603175,  0.75330688,  0.75198413,  0.77149471,  0.84722222])
# n_estimators=1000
# array([ 0.7473545 ,  0.75628307,  0.75529101,  0.77149471,  0.84887566])

array([ 0.7473545 ,  0.75628307,  0.75529101,  0.77149471,  0.84887566])

### Cross Val with min_split_gain= [0.0, 0.05, 0.2, 0.3]

Best 0.05


Default = 0.0

In [15]:
result=cross_val_score(lgb.LGBMClassifier(boosting_type='gbdt',
                       objective='multiclass',
                       num_leaves=63,
                       n_estimators=400, 
                       max_depth=50,
                       learning_rate=0.1,
                       min_data_in_leaf=500,
                       colsample_bytree=0.7,
                       min_split_gain=0.05,
                       classes=[1,2,3,4,5,6,7]
                       )
                       ,X=X_train_tf,y=Y_train,scoring='accuracy',cv=5,n_jobs=4)

result

# Cross Val Outcome
# min_split_gain=0.0
# array([ 0.74933862,  0.74966931,  0.75793651,  0.7744709 ,  0.84920635])
# min_split_gain=0.05
# array([ 0.75066138,  0.75231481,  0.75165344,  0.77281746,  0.85019841])
# min_split_gain=0.2
# array([ 0.73115079,  0.74239418,  0.75727513,  0.76719577,  0.84093915])
# min_split_gain=0.3
# array([ 0.7228836 ,  0.73511905,  0.75330688,  0.75496032,  0.8369709 ])

array([ 0.74933862,  0.74966931,  0.75793651,  0.7744709 ,  0.84920635])

### Cross Val with colsample_bytree= [0.6, 0.7, 0.8, 0.9]
Best 0.9 but very close to 0.7


Default = 1 (all)

In [29]:
result=cross_val_score(lgb.LGBMClassifier(boosting_type='gbdt',
                       objective='multiclass',
                       num_leaves=63,
                       n_estimators=400, 
                       max_depth=50,
                       learning_rate=0.1,
                       min_data_in_leaf=500,
                       colsample_bytree=1.0,
                       min_split_gain=0.05,
                       classes=[1,2,3,4,5,6,7]
                       )
                       ,X=X_train_tf,y=Y_train,scoring='accuracy',cv=5,n_jobs=4)

result

# Cross Val Outcome
# colsample_bytree= 1.0
# array([ 0.74834656,  0.75165344,  0.75760582,  0.77480159,  0.84722222])
# colsample_bytree= 0.9
# array([ 0.75297619,  0.75628307,  0.75859788,  0.77513228,  0.85019841])
# colsample_bytree= 0.8
# array([ 0.74768519,  0.75066138,  0.76256614,  0.76917989,  0.84986772])
# colsample_bytree= 0.7
# array([ 0.75066138,  0.75231481,  0.75165344,  0.77281746,  0.85019841])
# colsample_bytree= 0.6
# array([ 0.74272487,  0.74537037,  0.75396825,  0.77414021,  0.8505291 ])

array([ 0.74834656,  0.75165344,  0.75760582,  0.77480159,  0.84722222])

### Cross Val with max_depth= [10, 20, 40, 50, 60]

Best is achieved @ 20


Default= -1 (No limit)

In [26]:
result=cross_val_score(lgb.LGBMClassifier(boosting_type='gbdt',
                       objective='multiclass',
                       num_leaves=63,
                       n_estimators=400, 
                       max_depth=20,
                       learning_rate=0.1,
                       min_data_in_leaf=500,
                       colsample_bytree=0.9,
                       min_split_gain=0.05,
                       classes=[1,2,3,4,5,6,7]
                       )
                       ,X=X_train_tf,y=Y_train,scoring='accuracy',cv=5,n_jobs=4)

result

# Cross Val Outcome
# max_depth=10
# array([ 0.74603175,  0.75      ,  0.75628307,  0.76752646,  0.84887566])
# max_depth=[20,40,50,60]
# array([ 0.75297619,  0.75628307,  0.75859788,  0.77513228,  0.85019841])

array([ 0.74603175,  0.75      ,  0.75628307,  0.76752646,  0.84887566])

### Cross Val with num_leaves= [20, 31, 40, 63]

Default seems to work well

Default= 31

In [87]:
result=cross_val_score(lgb.LGBMClassifier(boosting_type='gbdt',
                       objective='multiclass',
                       num_leaves=31,
                       n_estimators=400, 
                       max_depth=20,
                       learning_rate=0.1,
                       min_data_in_leaf=500,
                       colsample_bytree=0.9,
                       min_split_gain=0.05,
                       classes=[1,2,3,4,5,6,7]
                       )
                       ,X=X_train_tf,y=Y_train,scoring='accuracy',cv=5,n_jobs=4)

result

# Cross Val Outcome
# num_leaves= [20,]
# array([ 0.74537037,  0.75297619,  0.75958995,  0.77612434,  0.85185185])
# num_leaves= [31,40,63]
# array([ 0.75297619,  0.75628307,  0.75859788,  0.77513228,  0.85019841])

array([ 0.7473545 ,  0.75033069,  0.76124339,  0.7718254 ,  0.84821429])

### Cross Val with bagging

Bagging is not giving a better result than the current configuration


Default is no bagging
bagging_freq= None
bagging_fraction=0

In [95]:

result=cross_val_score(lgb.LGBMClassifier(boosting_type='gbdt',
                       objective='multiclass',
                       num_leaves=31,
                       n_estimators=400, 
                       max_depth=20,
                       learning_rate=0.1,
                       min_data_in_leaf=500,
                       colsample_bytree=0.9,
                       min_split_gain=0.05,
                       bagging_freq=3,
                       bagging_fraction=0.9,
                       classes=[1,2,3,4,5,6,7]
                       )
                       ,X=X_train_tf,y=Y_train,scoring='accuracy',cv=5,n_jobs=4)

result

# Cross Val Outcome
# Best so far
# array([ 0.74537037,  0.75297619,  0.75958995,  0.77612434,  0.85185185])

# bagging_freq=5
# bagging_fraction=0.8
# array([ 0.73710317,  0.73974868,  0.75396825,  0.75892857,  0.83399471])

# bagging_freq=5
# bagging_fraction=0.9
# array([ 0.74503968,  0.75165344,  0.75661376,  0.77149471,  0.84457672])

# bagging_freq=8
# bagging_fraction=0.9
# array([ 0.73875661,  0.74570106,  0.75595238,  0.77050265,  0.84292328])

# bagging_freq=3
# bagging_fraction=0.9
# array([ 0.72982804,  0.75396825,  0.75628307,  0.77314815,  0.84259259])

array([ 0.72982804,  0.75396825,  0.75628307,  0.77314815,  0.84259259])

### **************Start
## Light GBM       boosting_type='dart'

When boosting_type='rf' then colsample_bytree=1.0

In [88]:
result=cross_val_score(lgb.LGBMClassifier(boosting_type='dart',
                       objective='multiclass',
                       num_leaves=31,
                       n_estimators=400, 
                       max_depth=20,
                       learning_rate=0.1,
                       min_data_in_leaf=500,
                       colsample_bytree=1.0,
                       min_split_gain=0.05,
                       classes=[1,2,3,4,5,6,7]
                       )
                       ,X=X_train_tf,y=Y_train,scoring='accuracy',cv=5,n_jobs=4)

result
# Cross Val Outcome
# boosting_type='gbdt'
# array([ 0.74537037,  0.75297619,  0.75958995,  0.77612434,  0.85185185])
# boosting_type='dart'
# array([ 0.68584656,  0.70899471,  0.73974868,  0.74041005,  0.8042328 ])

array([ 0.68584656,  0.70899471,  0.73974868,  0.74041005,  0.8042328 ])

In [91]:
lgb.LGBMClassifier?

## Previous Attempt

In [None]:
import lightgbm as lgb
d_train = lgb.Dataset(X_train_tf,label=Y_train_temp)
params = {}
params['learning_rate'] = 0.003
params['device'] = 'cpu'
params['boosting_type'] = 'dart'
#params['boosting_type'] = 'rf'
#params['boosting_type'] = 'gbdt'
params['application'] = 'multiclass'
params['num_class']= 7
params['metric'] = 'multi_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 30
params['min_data'] = 100
params['max_depth'] = 15
clf = lgb.train(params, d_train, 500)