In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn import metrics
from sklearn.learning_curve import learning_curve
from sklearn.grid_search import RandomizedSearchCV

# --- Load the raw CSVs and carve out a train / dev split -------------------
import os

# Directory containing train.csv and test.csv. The default is the original
# location; set COVER_TYPE_DATA_DIR to run the notebook from another machine.
DATA_DIR = os.environ.get('COVER_TYPE_DATA_DIR',
                          '/home/nate_black/Dropbox/MIDS/ML/Project')
# Fixed seed so the row shuffle (and therefore the split below) is repeatable.
SEED = 0
# Share of the shuffled rows that goes to the training split.
TRAIN_FRACTION = 0.8

frame = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
# Shuffle the rows with a private, seeded generator instead of the global one.
frame = frame.reindex(np.random.RandomState(SEED).permutation(frame.index))
print(frame.shape)
test_frame = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
print(test_frame.shape)

# Keep the test identifiers before the 'Id' column is removed.
test_ids = test_frame['Id'].values

# 'Id' is a row identifier, not a feature; `axis` is passed by keyword because
# newer pandas releases no longer accept it positionally.
frame = frame.drop(['Id'], axis=1)
test_frame = test_frame.drop(['Id'], axis=1)

# Separate the label column from the feature columns.
frame_cover_type = frame['Cover_Type'].values
frame = frame.drop(['Cover_Type'], axis=1)

X = frame.values
y = frame_cover_type

# First TRAIN_FRACTION of the shuffled rows -> train, remainder -> dev.
# (`y` is already one-dimensional, so no np.hstack flattening is needed.)
n = int(X.shape[0] * TRAIN_FRACTION)
y_train, X_train = y[:n], X[:n, :]
y_dev, X_dev = y[n:], X[n:]

(15120, 56)
(565892, 55)


In [2]:
# --- Build standardised feature frames for train and test ------------------
# The CSVs are read from the current working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Feature-only views: drop the label and the row identifier. The test frame is
# re-indexed by the training columns so both matrices share one column order.
train_features = train_data.drop(['Cover_Type', 'Id'], axis=1)
test_features = test_data.drop(['Id'], axis=1)[list(train_features.columns)]

min_max_scaler = preprocessing.MinMaxScaler()
standard_scaler = preprocessing.StandardScaler()

# Fit BOTH scalers on the training features only, then apply the very same
# transformation to the test features. Standardising the test set with its own
# mean/std would put train and test on different scales.
# The scalers receive 2-D (rows x columns) arrays and work column by column.
train_scaled_values = standard_scaler.fit_transform(
    min_max_scaler.fit_transform(train_features.values.astype(float)))
test_scaled_values = standard_scaler.transform(
    min_max_scaler.transform(test_features.values.astype(float)))

# Wrap the arrays back into frames that keep the original index and columns.
# No scratch variables named `x` / `y` are used, so the label vector `y`
# defined in the loading cell is not overwritten.
train_data_scaled = pd.DataFrame(train_scaled_values,
                                 index=train_features.index,
                                 columns=train_features.columns)
test_data_scaled = pd.DataFrame(test_scaled_values,
                                index=test_features.index,
                                columns=test_features.columns)

In [3]:
from sklearn.svm import SVC

# --- Support-vector classifier ---------------------------------------------
# NOTE: the model is evaluated on the same rows it was fitted on, so the
# confusion matrix below describes training fit, not generalisation.
svc_train_labels = train_data['Cover_Type'].values

# probability=True enables predict_proba; random_state pins the internal data
# shuffling used for the probability estimates so reruns give the same output.
svc = SVC(C=6, gamma=0.02, probability=True, random_state=0)
svc.fit(train_data_scaled, svc_train_labels)

# A single prediction pass; the earlier discarded svc.score(...) call repeated
# this work without using the result.
pred = svc.predict(train_data_scaled)
metrics.confusion_matrix(svc_train_labels, pred)

array([[1651,  303,    1,    0,   74,    9,  122],
       [ 388, 1420,   50,    1,  228,   58,   15],
       [   0,   10, 1516,  209,   26,  399,    0],
       [   0,    0,   46, 2077,    0,   37,    0],
       [   5,  108,   68,    0, 1937,   42,    0],
       [   1,   17,  342,  129,   14, 1657,    0],
       [  98,    5,    0,    0,    0,    0, 2057]])

In [4]:
# --- Random forest ----------------------------------------------------------
# random_state fixes the bootstrap / feature sampling so reruns reproduce the
# same forest.
rfc = RandomForestClassifier(n_estimators=400, max_features=10, max_depth=20,
                             criterion='entropy', random_state=0)
rfc.fit(train_data_scaled, train_data['Cover_Type'].values)

# Despite its name, `pred_dev` holds predictions for the TRAINING rows (the
# same rows used for fitting); the name is kept because later cells read it.
pred_dev = rfc.predict(train_data_scaled)
metrics.confusion_matrix(train_data['Cover_Type'], pred_dev)

array([[2159,    0,    0,    0,    1,    0,    0],
       [   0, 2154,    0,    0,    6,    0,    0],
       [   0,    0, 2160,    0,    0,    0,    0],
       [   0,    0,    0, 2160,    0,    0,    0],
       [   0,    0,    0,    0, 2160,    0,    0],
       [   0,    0,    0,    0,    0, 2160,    0],
       [   0,    0,    0,    0,    0,    0, 2160]])

In [64]:
# --- Combine the SVC and random-forest predictions -------------------------
rf_prob = rfc.predict_proba(train_data_scaled)
sv_prob = svc.predict_proba(train_data_scaled)

# Confidence cut-off (percent) and the exclusive upper bound on the forest's
# label for which the SVC is allowed to override it.
CONFIDENCE_THRESHOLD = 50.0
OVERRIDE_LABEL_BOUND = 3

# Probability (in percent) that each model assigns to the label it actually
# predicted. The column is located through `classes_` rather than taking the
# row maximum, because SVC's predict_proba may rank classes differently from
# predict; the row maximum could then belong to another class.
row_positions = np.arange(len(pred))
svc_confidence = sv_prob[row_positions, np.searchsorted(svc.classes_, pred)] * 100
rf_confidence = rf_prob[row_positions, np.searchsorted(rfc.classes_, pred_dev)] * 100

df = pd.DataFrame(train_data['Cover_Type'].values, columns=['Actual_Cover_Type'])
df['SVC_Prediction'] = pred
df['RF_Prediction'] = pred_dev
df['SVC_Prob'] = svc_confidence
df['RF_Prob'] = rf_confidence

# Take the SVC label only where the SVC is confident, the forest is not, and
# the forest predicted a label below OVERRIDE_LABEL_BOUND; otherwise keep the
# forest's label.
svc_overrides = ((df['SVC_Prob'] > CONFIDENCE_THRESHOLD)
                 & (df['RF_Prob'] < CONFIDENCE_THRESHOLD)
                 & (df['RF_Prediction'] < OVERRIDE_LABEL_BOUND))
df['Combo'] = np.where(svc_overrides, df['SVC_Prediction'], df['RF_Prediction'])



In [65]:
# First five rows on which the SVC and the random forest disagree.
df.loc[df['SVC_Prediction'].ne(df['RF_Prediction'])].head()

Unnamed: 0,Actual_Cover_Type,SVC_Prediction,RF_Prediction,SVC_Prob,RF_Prob,Combo
5,2,5,2,55.507239,75.0,2
15,5,2,5,89.333614,86.75,5
39,5,2,5,90.423881,70.75,5
43,5,1,5,75.798356,64.75,5
51,1,2,1,57.830246,70.5,1


In [66]:
# Confusion matrix and accuracy of the combined prediction against the labels
# of the same frame the models were fitted on.
# print(...) with one argument prints the same text on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
combo_true_labels = train_data['Cover_Type']
print(metrics.confusion_matrix(combo_true_labels, df['Combo']))
print(metrics.accuracy_score(combo_true_labels, df['Combo']))

[[2160    0    0    0    0    0    0]
 [   0 2160    0    0    0    0    0]
 [   0    0 2160    0    0    0    0]
 [   0    0    0 2160    0    0    0]
 [   0    0    0    0 2160    0    0]
 [   0    0    0    0    0 2160    0]
 [   0    0    0    0    0    0 2160]]
1.0


In [5]:
# Label every row of the scaled test features with the fitted random forest.
rf_pred_test = rfc.predict(X=test_data_scaled)

In [6]:
# --- Write the predictions file ---------------------------------------------
# Two columns: the test row 'Id' and the random forest's predicted 'Cover_Type'.
out = pd.DataFrame(test_data.Id, columns=['Id'])
out['Cover_Type'] = rf_pred_test

# index=False keeps the frame's index out of the file. `index_col` is a
# read_csv option, not a to_csv one (current pandas rejects it with a
# TypeError), so it is not passed here.
out.to_csv('scaled_rf.csv', index=False)