In [39]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
# Read the training and test tables from CSV files; the relative paths are
# resolved against the notebook's current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

Feature Creation for train

In [3]:
train['Hillshade_mean'] = train['Hillshade_9am']+train['Hillshade_Noon']+train['Hillshade_3pm']
train['slope_hyd'] = (train['Horizontal_Distance_To_Hydrology']**2+train['Vertical_Distance_To_Hydrology']**2)**0.5
train.slope_hyd=train.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any
train['log_elevation']= np.log(train.Elevation)
train['Hillshade_9am_sq'] = train['Hillshade_9am']**2
train['Hillshade_noon_sq'] = train['Hillshade_Noon']**2
train['Hillshade_3pm_sq'] = train['Hillshade_3pm']**2

train['cosine_slope'] = np.cos(train['Slope'])

a = pd.cut(train['Aspect'],bins=[-1, 45, 90, 135, 180, 225, 270, 315, 360],
           labels=['Aspect_1','Aspect_2','Aspect_3','Aspect_4','Aspect_5','Aspect_6','Aspect_7','Aspect_8'])
b = pd.get_dummies(a)
train.merge(b,right_index=True,left_index=True)
 

train['interaction_9amnoon'] = train['Hillshade_9am']*train['Hillshade_Noon']
train['interaction_noon3pm'] = train['Hillshade_3pm']*train['Hillshade_Noon']
train['interaction_9am3pm'] = train['Hillshade_3pm']*train['Hillshade_9am']

train['HF1'] = train['Horizontal_Distance_To_Hydrology']+train['Horizontal_Distance_To_Fire_Points']
train['HF2'] = abs(train['Horizontal_Distance_To_Hydrology']-train['Horizontal_Distance_To_Fire_Points'])
train['HR1'] = abs(train['Horizontal_Distance_To_Hydrology']+train['Horizontal_Distance_To_Roadways'])
train['HR2'] = abs(train['Horizontal_Distance_To_Hydrology']-train['Horizontal_Distance_To_Roadways'])
train['FR1'] = abs(train['Horizontal_Distance_To_Fire_Points']+train['Horizontal_Distance_To_Roadways'])
train['FR2'] = abs(train['Horizontal_Distance_To_Fire_Points']-train['Horizontal_Distance_To_Roadways'])
train['ele_vert'] = train.Elevation-train.Vertical_Distance_To_Hydrology

train['slope_hyd'] = (train['Horizontal_Distance_To_Hydrology']**2+train['Vertical_Distance_To_Hydrology']**2)**0.5
train.slope_hyd=train.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any

#Mean distance to Amenities 
train['Mean_Amenities']=(train.Horizontal_Distance_To_Fire_Points + train.Horizontal_Distance_To_Hydrology + train.Horizontal_Distance_To_Roadways) / 3 
#Mean Distance to Fire and Water 
train['Mean_Fire_Hyd']=(train.Horizontal_Distance_To_Fire_Points + train.Horizontal_Distance_To_Hydrology) / 2

Feature Creation for test

In [4]:
test['Hillshade_mean'] = test['Hillshade_9am']+test['Hillshade_Noon']+test['Hillshade_3pm']
test['slope_hyd'] = (test['Horizontal_Distance_To_Hydrology']**2+test['Vertical_Distance_To_Hydrology']**2)**0.5
test.slope_hyd=test.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any
test['log_elevation']= np.log(test.Elevation)
test['Hillshade_9am_sq'] = test['Hillshade_9am']**2
test['Hillshade_noon_sq'] = test['Hillshade_Noon']**2
test['Hillshade_3pm_sq'] = test['Hillshade_3pm']**2

test['cosine_slope'] = np.cos(test['Slope'])

a = pd.cut(test['Aspect'],bins=[-1, 45, 90, 135, 180, 225, 270, 315, 360],
           labels=['Aspect_1','Aspect_2','Aspect_3','Aspect_4','Aspect_5','Aspect_6','Aspect_7','Aspect_8'])
b = pd.get_dummies(a)
test.merge(b,right_index=True,left_index=True)

test['interaction_9amnoon'] = test['Hillshade_9am']*test['Hillshade_Noon']
test['interaction_noon3pm'] = test['Hillshade_3pm']*test['Hillshade_Noon']
test['interaction_9am3pm'] = test['Hillshade_3pm']*test['Hillshade_9am']

test['HF1'] = test['Horizontal_Distance_To_Hydrology']+test['Horizontal_Distance_To_Fire_Points']
test['HF2'] = abs(test['Horizontal_Distance_To_Hydrology']-test['Horizontal_Distance_To_Fire_Points'])
test['HR1'] = abs(test['Horizontal_Distance_To_Hydrology']+test['Horizontal_Distance_To_Roadways'])
test['HR2'] = abs(test['Horizontal_Distance_To_Hydrology']-test['Horizontal_Distance_To_Roadways'])
test['FR1'] = abs(test['Horizontal_Distance_To_Fire_Points']+test['Horizontal_Distance_To_Roadways'])
test['FR2'] = abs(test['Horizontal_Distance_To_Fire_Points']-test['Horizontal_Distance_To_Roadways'])
test['ele_vert'] = test.Elevation-test.Vertical_Distance_To_Hydrology

test['slope_hyd'] = (test['Horizontal_Distance_To_Hydrology']**2+test['Vertical_Distance_To_Hydrology']**2)**0.5
test.slope_hyd=test.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any

#Mean distance to Amenities 
test['Mean_Amenities']=(test.Horizontal_Distance_To_Fire_Points + test.Horizontal_Distance_To_Hydrology + test.Horizontal_Distance_To_Roadways) / 3 
#Mean Distance to Fire and Water 
test['Mean_Fire_Hyd']=(test.Horizontal_Distance_To_Fire_Points + test.Horizontal_Distance_To_Hydrology) / 2

In [5]:
# Split the frames into model inputs and target.
# ``Id`` is dropped from both frames and ``Cover_Type`` (the target) from train.
# ``axis`` is passed by keyword: the positional form ``drop(labels, 1)`` was
# deprecated in pandas 1.3 and is rejected from pandas 2.0 onwards.
X_train = train.drop(['Id', 'Cover_Type'], axis=1)
Y_train = train['Cover_Type']
X_test = test.drop(['Id'], axis=1)

In [6]:
# Standardise the features. The scaler is fitted on the training inputs only
# and the same fitted scaler is then applied to both train and test.
# Each result is wrapped back into a DataFrame that keeps the original index
# and column labels.
scaler = StandardScaler().fit(X_train)
X_train_tf, X_test_tf = (
    pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)
    for frame in (X_train, X_test)
)

In [86]:
# Show the first ten rows of the scaled training features as a sanity check.
X_train_tf.iloc[:10]

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,interaction_9am3pm,HF1,HF2,HR1,HR2,FR1,FR2,ele_vert,Mean_Amenities,Mean_Fire_Hyd
0,-0.367095,-0.95998,-1.597132,0.146639,-0.834074,-0.908681,0.271454,0.571653,0.281259,4.334805,...,0.690441,4.165194,4.390517,-0.848236,-0.959357,1.702247,5.505993,-0.246574,1.673476,4.165194
1,-0.381461,-0.914559,-1.715424,-0.072337,-0.932054,-0.999246,0.238732,0.703225,0.346627,4.28571,...,0.760204,4.078395,4.38309,-0.968254,-1.016605,1.619137,5.580595,-0.246574,1.571055,4.078395
2,0.130912,-0.160577,-0.887379,0.194243,0.227369,1.106379,0.696843,0.834797,-0.002005,4.191156,...,0.538108,4.036731,4.234547,1.089401,1.098462,2.902092,2.30939,0.098281,2.847589,4.036731
3,0.085421,-0.015231,0.17725,0.070474,1.092853,1.038455,0.827731,0.834797,-0.285268,4.272981,...,0.190113,4.092283,4.34224,1.005533,1.04895,2.902092,2.512851,-0.075352,2.835485,4.092283
4,-0.369489,-1.014485,-1.715424,-0.353198,-0.850404,-0.998491,0.238732,0.659368,0.324838,4.237524,...,0.730228,3.98118,4.38866,-1.010188,-0.970188,1.594299,5.519557,-0.246574,1.519379,3.98118
5,-0.407798,-0.224166,-1.242255,0.346574,-1.079023,-1.243015,0.565954,0.79094,0.106943,4.10933,...,0.621224,3.986388,4.121283,-1.138159,-0.974056,1.372194,5.726409,-0.251397,1.371335,3.986388
6,-0.343152,-1.014485,-1.123963,0.203764,-0.752425,-0.815853,0.304176,0.264652,0.063364,4.313894,...,0.408121,4.155646,4.358023,-0.750631,-0.873486,1.750011,5.340963,-0.234516,1.725617,4.155646
7,-0.345547,-0.978148,-1.478839,0.032391,-0.719765,-0.861135,0.304176,0.483939,0.194101,4.288437,...,0.589613,4.100095,4.36545,-0.820039,-0.892053,1.707979,5.377134,-0.241751,1.667889,4.100095
8,-0.316816,-1.014485,-0.887379,0.060953,0.0804,-0.790947,0.336898,0.089223,-0.045584,4.302984,...,0.275,4.11919,4.374734,-0.748462,-0.824748,1.760042,5.290098,-0.330979,1.721427,4.11919
9,-0.328787,-0.887307,-0.769087,0.094276,-0.654445,-0.813589,0.500509,0.001508,-0.241689,4.290256,...,0.086014,4.113114,4.355238,-0.765091,-0.853372,1.739026,5.308183,-0.234516,1.704202,4.113114


## Start of model experiments
## ExtraTreesClassifier

In [12]:
# Extra-trees model scored with 5-fold cross-validated accuracy.
# random_state fixes the seed used to build the trees so that re-running the
# cell reproduces the same scores; without it every run gives different numbers.
m1 = ExtraTreesClassifier(n_estimators=500,
                          criterion='entropy',  # 'gini' was also tried, see the recorded scores below
                          max_features='log2',
                          max_depth=31,
                          min_samples_split=5,
                          random_state=42,
                         )
result = cross_val_score(m1, X=X_train_tf, y=Y_train, scoring='accuracy', cv=5, n_jobs=4)
result

# Scores recorded from earlier, unseeded runs (a seeded re-run will differ slightly):
# criterion='gini'
# array([ 0.78703704,  0.77149471,  0.77943122,  0.8098545 ,  0.87268519])
# criterion='entropy'
# array([ 0.7853836 ,  0.76884921,  0.77843915,  0.81580688,  0.87334656])

array([ 0.7853836 ,  0.76884921,  0.77843915,  0.81580688,  0.87334656])

## ExtraTreesClassifier: effect of min_samples_split

In [21]:
# Same extra-trees setup with min_samples_split lowered from 5 to 3.
# random_state fixes the seed used to build the trees so that re-running the
# cell reproduces the same scores; without it every run gives different numbers.
m1 = ExtraTreesClassifier(n_estimators=500,
                          criterion='entropy',
                          max_features='log2',
                          max_depth=31,
                          min_samples_split=3,
                          random_state=42,
                         )
result = cross_val_score(m1, X=X_train_tf, y=Y_train, scoring='accuracy', cv=5, n_jobs=4)
result

# Scores recorded from earlier, unseeded runs (a seeded re-run will differ slightly):
# min_samples_split=5
# array([ 0.7853836 ,  0.76884921,  0.77843915,  0.81580688,  0.87334656])
# min_samples_split=3
# array([ 0.79464286,  0.77480159,  0.7827381 ,  0.8207672 ,  0.88062169])

array([ 0.79464286,  0.77480159,  0.7827381 ,  0.8207672 ,  0.88062169])

In [37]:
# Same extra-trees setup (min_samples_split=3) with class_weight='balanced' added.
# random_state fixes the seed used to build the trees so that re-running the
# cell reproduces the same scores; without it every run gives different numbers.
m1 = ExtraTreesClassifier(n_estimators=500,
                          criterion='entropy',
                          max_features='log2',
                          max_depth=31,
                          min_samples_split=3,
                          class_weight='balanced',
                          random_state=42,
                         )
result = cross_val_score(m1, X=X_train_tf, y=Y_train, scoring='accuracy', cv=5, n_jobs=4)
result

array([ 0.78240741,  0.75793651,  0.78108466,  0.81911376,  0.86805556])

In [28]:
# AdaBoost (SAMME) on top of an extra-trees base model, scored with the same
# 5-fold cross-validated accuracy as the cells above.
#
# The base estimator is defined here instead of reusing whichever `m1` happens
# to be in the kernel. This cell was executed as In [28], i.e. before the
# class_weight='balanced' cell (In [37]) that now precedes it in the notebook,
# so a top-to-bottom run would otherwise silently boost a different model.
# NOTE(review): the configuration below is the min_samples_split=3, unweighted
# variant, chosen because the recorded scores of this cell line up with that
# run -- confirm this is the intended base model.
boost_base = ExtraTreesClassifier(n_estimators=500,
                                  criterion='entropy',
                                  max_features='log2',
                                  max_depth=31,
                                  min_samples_split=3,
                                  random_state=42)

# random_state is set on both estimators so the scores are reproducible.
m2 = AdaBoostClassifier(boost_base,
                        n_estimators=500,
                        learning_rate=0.01,
                        algorithm='SAMME',
                        random_state=42)

result = cross_val_score(m2, X=X_train_tf, y=Y_train, scoring='accuracy', cv=5, n_jobs=4)
result

array([ 0.79563492,  0.7744709 ,  0.7827381 ,  0.8207672 ,  0.87665344])

In [16]:
# Interactive IPython help lookup (`?` suffix) for ExtraTreesClassifier.
# It only displays documentation and assigns nothing; it is a leftover from
# development and can be removed from a finished notebook.
ExtraTreesClassifier?

In [40]:
# AdaBoost (SAMME) on top of a random-forest base model, scored with the same
# 5-fold cross-validated accuracy as the cells above.
# Note that `m2` is rebound here to the random forest.
# random_state is set on both estimators so that re-running the cell
# reproduces the same scores.
m2 = RandomForestClassifier(n_estimators=150,
                            max_depth=50,
                            max_features=20,  # integer: number of features considered at each split
                            criterion='entropy',
                            n_jobs=4,
                            random_state=42)

m4 = AdaBoostClassifier(m2,
                        n_estimators=500,
                        learning_rate=0.01,
                        algorithm='SAMME',
                        random_state=42)

result = cross_val_score(m4, X=X_train_tf, y=Y_train, scoring='accuracy', cv=5, n_jobs=4)
result

array([ 0.78670635,  0.77546296,  0.78505291,  0.80886243,  0.86574074])