In [4]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'white' style to subsequent plots.
sns.set_style('white')

In [2]:
# Column names follow the data set documentation: 10 quantitative, numeric features come first,
# then 4 qualitative binary features for wilderness area, then 40 qualitative binary features
# for soil type, and last the outcome variable, 'Cover', which is an integer 1-7.

# Soil type columns are zero-padded to two digits: Soil_Type_01 ... Soil_Type_40.
soil_types = ['Soil_Type_{:02d}'.format(n) for n in range(1, 41)]

# The ten numeric features, in file order.
col_names = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

# Append the binary indicator columns and finally the target.
col_names += ['Wild_Area_{}'.format(n) for n in range(1, 5)]
col_names += soil_types
col_names += ['Cover']

In [3]:
# Load in the data.
#
# The file location defaults to the original author's local path, but it can be
# overridden by setting the COVTYPE_DATA_PATH environment variable, so the
# notebook can be run on another machine without editing this cell.

DATA_PATH = os.environ.get(
    'COVTYPE_DATA_PATH',
    '/Users/maxcalabro/Coding/Thinkful/Unit 3 Capstone/Land Cover Type Classification/covtype.data',
)

# Column names are supplied via col_names; header=None makes it explicit that
# the first row of the file is data rather than a header.
raw_data = pd.read_csv(DATA_PATH, names=col_names, header=None)
raw_data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40,Cover
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [5]:
# Add a new column holding the cosine of the Aspect column.
# Aspect is converted from degrees to radians first, because np.cos expects radians.

raw_data['Aspect_Cos'] = np.cos(np.deg2rad(raw_data['Aspect']))

In [6]:
# Split the data frame first.
#
# 30% of the rows are held out for testing. A fixed random_state makes the
# partition, and therefore every score computed from it, reproducible across
# kernel restarts.

data_train, data_test = train_test_split(raw_data, test_size=0.3, random_state=42)

# Keep independent copies of both partitions.
# NOTE(review): presumably these are the "no resampling" baselines; they are
# not used in the cells visible here - confirm before removing.
data_train_nosamp = data_train.copy()
data_test_nosamp = data_test.copy()

In [8]:
# And finally, we'll create our X and Y arrays for processing.
#
# Each feature matrix is selected using the columns of its own frame. Selecting
# the test features with a mask built from the training frame's columns only
# works while both frames share identical columns.

X_train = data_train.loc[:, data_train.columns != 'Cover']
Y_train = data_train['Cover']
X_test = data_test.loc[:, data_test.columns != 'Cover']
Y_test = data_test['Cover']

# Cheap invariants: the target must not leak into the features, and train/test
# must expose the same features in the same order.
assert 'Cover' not in X_train.columns and 'Cover' not in X_test.columns
assert list(X_train.columns) == list(X_test.columns)

print('Sanity check on array shapes:\n', X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

Sanity check on array shapes:
 (406708, 55) (406708,) (174304, 55) (174304,)


In [7]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE, SelectKBest
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix, classification_report

from sklearn.neural_network import MLPClassifier

In [9]:
# Baseline network: two hidden layers of 50 units each.
# random_state fixes the weight initialisation and batch sampling so the score
# below is reproducible from one run to the next.
# NOTE(review): in the cells visible here X_train is passed to the network
# unscaled. scikit-learn's documentation recommends standardising features
# before fitting an MLP - consider adding a StandardScaler step.
mlp = MLPClassifier(hidden_layer_sizes=(50, 50), activation='relu', random_state=42)
mlp.fit(X_train, Y_train)
mlp.score(X_test, Y_test)

0.80671390776675156

In [19]:
mlp = MLPClassifier(hidden_layer_sizes=(20, 20, 20), activation='relu')
% timeit -r1 -n1 mlp.fit(X_train, Y_train)
mlp.score(X_test, Y_test)

4min 49s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


0.75758445015604925

In [20]:
mlp = MLPClassifier(hidden_layer_sizes=(50, 50, 50), activation='relu')
% timeit -r1 -n1 mlp.fit(X_train, Y_train)
mlp.score(X_test, Y_test)

16min 29s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


0.8200385533321094

In [21]:
mlp = MLPClassifier(hidden_layer_sizes=(100, 100), activation='relu')
% timeit -r1 -n1 mlp.fit(X_train, Y_train)
mlp.score(X_test, Y_test)

19min 28s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


0.84388195336882688

With one exception (the small 20-20-20 network scored below our initial 50-50 network), accuracy has improved as we've made the neural net larger, which makes sense. It's also taking longer and longer to run: training time has grown from under 5 minutes to almost 20. With our KNN and random forest analyses from a previous run on this data set, we were able to achieve up to 96% accuracy on the test set. Let's run one more, even denser neural network and see how well we can do without (hopefully) taking all day to run.

In [22]:
mlp = MLPClassifier(hidden_layer_sizes=(200, 100, 100), activation='relu')
% timeit -r1 -n1 mlp.fit(X_train, Y_train)
mlp.score(X_test, Y_test)

1h 6s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


0.86463305489260145

Slowly getting better, but at this rate it IS going to take all day to get up into the high-90% accuracy range. Also, this data set has only about 580,000 points (roughly 407,000 of them in the training set), and a neural network would benefit from more.