# Purpose

Learning more about neural networks (NNs) through the example of predicting forest cover type. As a baseline we're using a simple random forest with default parameters. Right now the random forest is doing better, with an accuracy of ~0.95, whereas the NN is currently sitting at ~0.87.

## To do

Grid over batch sizes and epochs

Try different structures / depths for the NN

Study the confusion matrix to see what errors we're making

See if there's a connection between the errors and the features, in order to guide possible engineering of new features

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skew
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.metrics import CategoricalAccuracy
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import accuracy_score


In [None]:
# Load the raw cover-type table.
# (Inspected earlier: no missing data, no unexpected data types.)
df0 = pd.read_csv('cover_data.csv')

# Target labels, shifted down by one so they run from 0 to 6 -- the range
# the sparse categorical cross-entropy loss needs.
y = df0['class'] - 1

# Feature table: everything except the target column.
df1 = df0.drop(columns=['class'])

In [None]:
# Partition the features by cardinality: columns with at least 50 distinct
# values go to num_cols, all others to binary_cols.
# Note that both are DataFrames (column subsets of df1), not lists of names;
# iterating over them yields the column names.
distinct_counts = df1.nunique()
num_cols = df1.loc[:, distinct_counts >= 50]
binary_cols = df1.loc[:, distinct_counts < 50]

In [None]:
# One histogram per numeric feature, titled with its sample skewness, to
# judge which transformation (if any) each feature needs.
for col in num_cols.columns:
    values = df1[col]
    plt.figure(figsize=(4, 3))
    plt.hist(values)
    plt.title(f"{col}, skew = {skew(values)!s}")
    plt.show()

# Reading of the histograms above:
#   square          -> Elevation, Hillshade_9am, Hillshade_Noon
#   square root     -> Slope and the four distance features
#   bimodal (needs something else, nothing applied yet) -> Aspect
#   no transformation -> Hillshade_3pm

left_skewed_transform_cols = [
    'Elevation',
    'Hillshade_9am',
    'Hillshade_Noon',
]
right_skewed_transform_cols = [
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
]
bimodal_cols = ['Aspect']

In [None]:
# df2 holds the skew-corrected features; df1 is left untouched so the two
# can be compared afterwards.
df2 = df1.copy()

# Left-skewed features are squared (all of them in one vectorised step).
df2[left_skewed_transform_cols] = np.square(df2[left_skewed_transform_cols])

# Right-skewed features get a square root. Each column is first shifted so
# its minimum maps to 1, which keeps the argument positive even if the
# column contains negative values.
# (sqrt worked better than log and log1p.)
for col in right_skewed_transform_cols:
    shifted = df2[col] - np.min(df2[col]) + 1
    df2[col] = np.sqrt(shifted)

In [None]:
# Side-by-side histograms of every numeric feature: the untransformed values
# (df1) on the left, the transformed values (df2) on the right. Each panel's
# x-label carries the sample skewness of the data it shows.
# Verdict: not perfect, but certainly better.
for col in num_cols:
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

    # Title the whole figure: plt.title() would only label the current axes,
    # i.e. the right-hand panel.
    fig.suptitle(col)

    axs[0].hist(df1[col])
    axs[0].set_xlabel(col + ', ' + str(skew(df1[col])))

    axs[1].hist(df2[col])
    axs[1].set_xlabel(col + ', ' + str(skew(df2[col])))

    plt.show()

In [None]:
# Split first, standardise second.
# The scaler's mean and variance must come from the training rows only;
# fitting it on the full table would let test-set statistics leak into the
# features the models are trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df2, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler().fit(X_train_raw)

# transform() returns a bare array, so re-attach the column names and the
# original row index (keeps X_* aligned with y_*).
X_train = pd.DataFrame(
    scaler.transform(X_train_raw),
    columns=df2.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=df2.columns,
    index=X_test_raw.index,
)

# Standardised version of the full table, scaled with the training-set
# statistics. Kept under its original name for any cell that refers to it.
df3 = pd.DataFrame(scaler.transform(df2), columns=df2.columns, index=df2.index)

## Random Forest

In [None]:
# Baseline: a random forest with default hyper-parameters.
# random_state pins the forest's internal randomness so the baseline score
# is reproducible from run to run (the train/test split is seeded with 42 as
# well); n_jobs=-1 builds the trees on all available cores and does not
# change the fitted model.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

#If we want to do a grid search

#rf_param_grid = {
#    'n_estimators': [50, 100],
#    'max_depth': [None, 10, 20],
#    'min_samples_split': [2, 10],
#    'min_samples_leaf': [1, 4],
#    #'max_features': ['sqrt', 'log2']  # 'auto' was removed in scikit-learn 1.3
#}

#rf_grid_search = GridSearchCV(rf_clf, rf_param_grid,cv=2,scoring='accuracy',verbose=1)
#rf_grid_search.fit(X_train,y_train)
#best_rf_clf = rf_grid_search.best_estimator_

rf_clf.fit(X_train, y_train)

# Accuracy of the (untuned) baseline on the held-out test split
print(rf_clf.score(X_test, y_test))

#~0.95 without any parameter optimization, not too bad...

## NN

In [None]:
# Feed-forward classifier: two ReLU hidden layers (64 and 32 units) followed
# by a softmax layer with one unit per class.
n_features = X_train.shape[1]

# Labels are 0-based integers, so the output layer needs max(label) + 1
# units. Derived from the data instead of hard-coding 7.
n_classes = int(y.max()) + 1

hidden_layer_1 = layers.Dense(64, activation="relu")
hidden_layer_2 = layers.Dense(32, activation="relu")
output_layer = layers.Dense(n_classes, activation="softmax")

model = keras.Sequential()
# Declare the input shape with an explicit Input object; passing `input_dim`
# to the first Dense layer is a legacy spelling that newer Keras warns about.
model.add(keras.Input(shape=(n_features,)))
for layer in (hidden_layer_1, hidden_layer_2, output_layer):
    model.add(layer)

# Sparse loss/metric: the targets are integer class ids, not one-hot vectors.
model.compile(
    optimizer="Adam",
    loss="sparse_categorical_crossentropy",
    metrics=[SparseCategoricalAccuracy()],
)

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

In [None]:
# Training configuration
EPOCHS = 100
BATCH_SIZE = 1024

# Fit the network. The test split is passed as validation data, so the
# val_* entries recorded in `history` are per-epoch test-set metrics.
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
    verbose=1,
)

# Hard predictions: for each test row, the class with the largest softmax
# output.
probabilities = model.predict(X_test)
y_pred = probabilities.argmax(axis=1)
y_true = y_test.to_numpy()

# Test accuracy -- 0.866 when last run, still a long way behind the random forest
accuracy_score(y_true, y_pred)

In [None]:
# Accuracy per epoch on the training data and on the validation (test) split
fig, ax = plt.subplots()
for key, label in (
    ('sparse_categorical_accuracy', 'train'),
    ('val_sparse_categorical_accuracy', 'test'),
):
    ax.plot(history.history[key], label=label)
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Loss per epoch on the training data and on the validation (test) split
fig, ax = plt.subplots()
for key, label in (('loss', 'train'), ('val_loss', 'test')):
    ax.plot(history.history[key], label=label)
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Confusion matrix of the network's test-set predictions
# (rows: true class, columns: predicted class; classes are the 0-based labels).
cm = confusion_matrix(y_true, y_pred)

# Render it as an annotated seaborn heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()