In [None]:
import os
import csv
import sys 
import numpy as np 
import pandas as pd 
from math import sqrt
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics 
from sklearn.grid_search import RandomizedSearchCV 
from sklearn.metrics import confusion_matrix 
from sklearn import preprocessing 
from scipy.stats import randint as sp_randint
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.ensemble import AdaBoostClassifier 

# Directory that holds train.csv / test.csv and receives the submission file.
path = '/home/nate_black/Dropbox/MIDS/ML/Project/'

from time import time
# Wall-clock start; the elapsed time is reported at the end of the script.
t0 = time()

print("Let's grab for our ingredients to start making a delicious dish!")

# Training set: split off the target and the row ids, keep only features.
# `.values` is used instead of `.as_matrix()`, which pandas deprecated and
# later removed; both return the same underlying NumPy array.
train_data = pd.read_csv(os.path.join(path, 'train.csv'), header=0)
train_labels = train_data['Cover_Type'].values
train_id = train_data['Id'].values
train_data.drop(['Cover_Type', 'Id'], axis=1, inplace=True)

# Test set: the ids are kept so they can be paired with the predictions.
test_data = pd.read_csv(os.path.join(path, 'test.csv'), header=0)
test_id = test_data['Id'].values
test_data.drop(['Id'], axis=1, inplace=True)


print("Now, let's use some of our ingredients to create a wonderful mix!")

#Azimuth Adjustment
def adjust(x):
    """Return ``x`` shifted by 180.

    180 is added unless the sum would exceed 360, in which case 180 is
    subtracted instead.  The comparison is strict, so an input of exactly
    180 maps to 360 rather than to 0.
    """
    # Guard clause: shifting forward would overshoot 360, so shift back.
    if x + 180 > 360:
        return x - 180
    return x + 180

#Feature Engineering
def _add_engineered_features(frame):
    """Add the engineered columns to ``frame`` in place.

    The three raw hillshade columns are dropped once ``Energy`` has been
    derived from them.  Columns are appended in a fixed order so that the
    training and test frames end up with identical layouts.  Uses the
    module-level ``adjust`` helper for the aspect shift.
    """
    elevation = frame['Elevation']
    water_h = frame['Horizontal_Distance_To_Hydrology']
    water_v = frame['Vertical_Distance_To_Hydrology']
    fire = frame['Horizontal_Distance_To_Fire_Points']
    road = frame['Horizontal_Distance_To_Roadways']

    frame['Aspect2'] = frame['Aspect'].map(adjust)

    # Mean hillshade over 9am-noon and noon-3pm, each weighted by 10800
    # (presumably the number of seconds in a 3-hour interval -- confirm).
    morning = (frame['Hillshade_9am'] + frame['Hillshade_Noon']) / 2
    afternoon = (frame['Hillshade_Noon'] + frame['Hillshade_3pm']) / 2
    frame['Energy'] = morning * 10800 + afternoon * 10800
    frame.drop('Hillshade_9am', axis=1, inplace=True)
    frame.drop('Hillshade_Noon', axis=1, inplace=True)
    frame.drop('Hillshade_3pm', axis=1, inplace=True)

    # NOTE: despite its name this flag is simply "vertical distance to
    # hydrology is negative".
    frame['Above_Sea_Level'] = water_v < 0

    frame['Vertical_To_Water'] = elevation - water_v
    frame['Horizontal_To_Water'] = elevation - water_h * 0.2
    frame['Pythagorean_To_Water'] = (water_h ** 2 + water_v ** 2) ** 0.5

    # Pairwise sums and absolute differences of the horizontal distances.
    frame['Water_And_Fire'] = water_h + fire
    frame['Water_Less_Fire'] = abs(water_h - fire)
    frame['Water_And_Roadway'] = abs(water_h + road)
    frame['Water_Less_Roadway'] = abs(water_h - road)
    frame['Fire_And_Roadway'] = abs(fire + road)
    frame['Fire_Less_Roadway'] = abs(fire - road)

    frame['Is_Roadway_Closer_than_Water'] = road < water_h
    frame['Is_Firepoint_Closer_than_Water'] = fire < water_h


# One shared helper keeps the train and test transformations from drifting.
_add_engineered_features(train_data)
_add_engineered_features(test_data)

# `.values` replaces the deprecated (and later removed) `.as_matrix()`.
train_matrix = train_data.values
test_X = test_data.values
# Integer square root of the feature count; not referenced again in the
# visible part of this script.
n_features = int(sqrt(train_matrix.shape[1]))
 
print("With our best magnifying glass, lets find the right combination of ingredients and get ready to bake!")

# History of the (now disabled) tuning experiments:
#   * A RandomizedSearchCV with n_iter=20 over an ExtraTreesClassifier drew
#     n_estimators from sp_randint(10, 1000), max_features from
#     sp_randint(0, 20), min_samples_leaf and min_samples_split from
#     sp_randint(1, 10), and criterion from ["gini", "entropy"].
#   * A parameter set noted as good: n_estimators=35, max_features=17,
#     min_samples_split=3, min_samples_leaf=3, bootstrap=False,
#     criterion="entropy".
#   * Another candidate: n_estimators=858, max_features=7,
#     min_samples_leaf=1, min_samples_split=2, criterion='entropy'.
#   * Wrapping the forest in AdaBoostClassifier(clfa, n_estimators=600) was
#     also tried and left disabled.

# Settings actually used for the final model.
extra_trees_settings = {
    'n_estimators': 600,
    'max_features': 0.3,
    'criterion': 'entropy',
}
clfa = ExtraTreesClassifier(**extra_trees_settings)
clfa.fit(train_matrix, train_labels)

print('With the oven preheated and our ingredients ready, pop it in the oven!')

# Predict the test rows and cast the labels to int for the submission file.
raw_predictions = clfa.predict(test_X)
predictions = raw_predictions.astype(int)
print(predictions.shape)
 
print("Just need to cut our delicious baked good into slices...")

# Write one (Id, Cover_Type) row per test sample below a header row.
# The `with` block closes the file even if a write raises.  Mode "wb" is
# what the Python 2 csv module expects; under Python 3 this would need to
# be opened as text with newline="" instead.
with open(os.path.join(path, "sampleSubmissionGSCV_New1.csv"), "wb") as prediction_write:
    open_file_object = csv.writer(prediction_write)
    open_file_object.writerow(["Id", "Cover_Type"])
    open_file_object.writerows(zip(test_id, predictions))
print('Ready to eat! (Be very careful! Contents hot!)')

print('Wait... how long did that take?')

# Elapsed wall-clock time since t0 was taken before the data was loaded.
print("... in %0.3fs" % (time() - t0))

Let's grab for our ingredients to start making a delicious dish!
Now, let's use some of our ingredients to create a wonderful mix!
With our best magnifying glass, lets find the right combination of ingredients and get ready to bake!
With the oven preheated and our ingredients ready, pop it in the oven!