In [1]:
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Read the training set first, then the test set, from the shared data directory.
df, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
# Keep the test identifiers aside; they label the rows of the submission file.
test_ids = test['Id']

def r(x):
    """Shift an angle by 180 degrees.

    Returns ``x + 180`` unless that sum is greater than 360, in which case
    ``x - 180`` is returned instead. Note that an input of exactly 180 maps
    to 360 (not 0), because the comparison is strict.
    """
    shifted = x + 180
    return x - 180 if shifted > 360 else shifted

def _add_engineered_features(frame):
    """Add the derived feature columns to ``frame`` in place.

    New columns:
      * EVDtH     -- Elevation minus Vertical_Distance_To_Hydrology
      * EHDtH     -- Elevation minus 0.2 * Horizontal_Distance_To_Hydrology
      * Highwater -- True where Vertical_Distance_To_Hydrology is negative
      * Aspect2   -- Aspect passed through ``r`` (shifted by 180 degrees)

    Additionally, rows whose Hillshade_3pm equals 0 are overwritten with the
    median of that frame's own Hillshade_3pm column.
    """
    frame['EVDtH'] = frame.Elevation - frame.Vertical_Distance_To_Hydrology
    frame['EHDtH'] = frame.Elevation - frame.Horizontal_Distance_To_Hydrology*0.2
    frame['Highwater'] = frame.Vertical_Distance_To_Hydrology < 0
    frame['Aspect2'] = frame.Aspect.map(r)
    # `.ix` was removed in pandas 1.0; `.loc` performs the same boolean-mask
    # assignment. The median is computed over the whole column (zeros included)
    # before any value is replaced.
    frame.loc[frame.Hillshade_3pm == 0, 'Hillshade_3pm'] = frame.Hillshade_3pm.median()


## Add new columns for training data
_add_engineered_features(df)

## Add new columns for test data
_add_engineered_features(test)

# Every column except the target, the row identifier and 'is_train' is a model input.
features = [col for col in df.columns if col not in ['Cover_Type','Id','is_train']]

X_train = df[features]
X_test = test[features]
y_training = df['Cover_Type']

len(X_test)

In [2]:
## Make GBM
# Hyper-parameters: learning rate, tree depth, number of boosting stages and
# the max_features setting (None).
rt, md, ne, mf = .36, 5, 400, None
gbm = GradientBoostingClassifier(
    n_estimators=ne,
    max_depth=md,
    learning_rate=rt,
    max_features=mf,
)
# Left as the final expression so the notebook displays the fitted estimator.
gbm.fit(X_train, y_training)

GradientBoostingClassifier(init=None, learning_rate=0.36, loss='deviance',
              max_depth=5, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=400,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [5]:
## Make Nearest Neighbor
# A random forest is fitted first; its feature importances are used below to
# weight the standardized features that the k-NN model sees.
clf = RandomForestClassifier()
clf.fit(X_train, y_training)

# Standardize using statistics from the training set only, then apply the
# same transform to the test set.
# `np.float` was removed in NumPy 1.24; `np.float64` is the dtype it resolved to.
scaler = StandardScaler()
scaled_train = scaler.fit_transform(X_train.astype(np.float64))
scaled_test = scaler.transform(X_test.astype(np.float64))

# One weight per feature: (importance + 0.005) * 5. The 0.005 offset gives
# even a zero-importance feature a non-zero weight.
feature_importances = [(importance + 0.005)*5 for importance in clf.feature_importances_]

# Apply all column weights in a single broadcast multiply rather than
# rewriting each DataFrame column in turn.
weights = np.asarray(feature_importances)
scaled_train = pd.DataFrame(scaled_train * weights, columns=features)
scaled_test = pd.DataFrame(scaled_test * weights, columns=features)

neighbor = KNeighborsClassifier(n_neighbors=5) #weights="distance"
neighbor_fitted = neighbor.fit(scaled_train, y_training)

MemoryError: 

In [None]:
## Make SVM
# Standardize the features (fit on the training set, reuse on the test set).
# These copies carry no importance weighting; they are stored under separate
# names (scaled_train2 / scaled_test2) from the k-NN inputs.
# `np.float` was removed in NumPy 1.24; `np.float64` is the dtype it resolved to.
scaler = StandardScaler()
scaled_train2 = scaler.fit_transform(X_train.astype(np.float64))
scaled_test2 = scaler.transform(X_test.astype(np.float64))

scaled_train2 = pd.DataFrame(scaled_train2, columns=features)
scaled_test2 = pd.DataFrame(scaled_test2, columns=features)

# RBF-kernel support vector classifier; max_iter=-1 places no cap on solver iterations.
rbf_svc = svm.SVC(C=1500, kernel="rbf", max_iter=-1) #5.5, 200
svm_fitted = rbf_svc.fit(scaled_train2, y_training)

In [None]:
## Combine predictions
gbm_predict = gbm.predict(X_test)
nn_predict = neighbor_fitted.predict(scaled_test)
svm_predict = svm_fitted.predict(scaled_test2)

# Majority vote across the three models, one test row at a time.
# np.unique returns its values sorted and np.argmax picks the first maximum,
# so when all three models disagree the smallest class label is chosen.
voted_predict = []
for votes in zip(gbm_predict, svm_predict, nn_predict):
    values, counts = np.unique(np.array(votes), return_counts=True)
    voted_predict.append(values[np.argmax(counts)])

# Write the submission file: a header line followed by one "Id,Cover_Type"
# line per test row.
with open('output.csv', "w") as outfile:
    outfile.write("Id,Cover_Type\n")
    # Pair ids and votes positionally; each record needs its own trailing
    # newline, otherwise every prediction ends up on a single line.
    for row_id, val in zip(test_ids, voted_predict):
        outfile.write(str(row_id) + "," + str(val) + "\n")
