# Problem Session 12
## Neural Networks

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import keras
from keras import layers

### Car Sales Regression Problem

First run the following code to clean the data.

In [None]:
# Load the raw car-sales table; the path is relative to the current working directory.
cars = pd.read_csv("../../Data/car_sales.csv")
# Drop every row that contains at least one missing value, modifying `cars` in place.
cars.dropna(inplace = True)

def clean_column(text):
    """Return the leading whitespace-delimited token of ``text`` as a float.

    For example, ``"23.4 kmpl"`` becomes ``23.4``.
    """
    first_token = text.split(maxsplit=1)[0]
    return float(first_token)

## Cleaning the mileage, engine and max_power columns
## (clean_column keeps the leading token of each entry and converts it to a float)
cars['mileage'] = cars['mileage'].apply(clean_column)
cars['engine'] = cars['engine'].apply(clean_column)
cars['max_power'] = cars['max_power'].apply(clean_column)

## creating the age column (2020 minus the value in the year column)
cars['age'] = 2020 - cars['year']

## performing the log transform (base 10) on selling_price and km_driven
cars['log_sell'] = np.log10(cars['selling_price'])
cars['log_km'] = np.log10(cars['km_driven'])

## making one-hot encoded variables for transmission, dealer and owner
## automatic is 0 for 'Manual' transmissions and 1 for everything else
cars['automatic'] = 1
cars.loc[cars.transmission=='Manual', 'automatic'] = 0

## dtype=int keeps the owner indicators as 0/1 integers, matching automatic and dealer;
## recent pandas versions return booleans from get_dummies by default
cars[['first_owner', 'second_owner', 'third_owner']] = pd.get_dummies(cars['owner'], dtype=int)[['First Owner',
                                                                                                 'Second Owner',
                                                                                                 'Third Owner']]

## dealer is 0 for 'Individual' sellers and 1 for everything else
cars['dealer'] = 1
cars.loc[cars.seller_type == 'Individual', 'dealer'] = 0

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the shuffled split repeatable.
cars_train, cars_test = train_test_split(
    cars.copy(),
    test_size=.2,
    shuffle=True,
    random_state=440,
)
                                    

In [None]:
# Remind yourself about what the data looks like.

# Predictor columns used by the models below.
features = [
    'max_power',
    'age',
    'engine',
    'log_km',
    'seats',
    'dealer',
    'automatic',
    'mileage',
]

# Response column, kept as a one-element list so it can be combined with `features`.
target = ['log_sell']

# Display ten randomly chosen training rows, restricted to the modelling columns.
cars_train[[*features, *target]].sample(n=10)

In [None]:
# Convert everything to numpy arrays

X_train = cars_train[features].values
y_train = cars_train[target].values
X_test = cars_test[features].values
y_test = cars_test[target].values

# Carve a 20% validation set out of the training data. The fixed seed makes the split,
# and therefore the validation scores compared below, repeatable across runs.
X_tt, X_val, y_tt, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 440)

Train a vanilla linear regression model on `(X_tt, y_tt)`.

In [None]:
# Fill in: fit a linear regression model on (X_tt, y_tt). Later cells call lr.predict(...).
lr = 

Now try to train a feed-forward neural network that achieves a better validation MSE than the linear regression model.

Some things to consider:
* How will you address normalization of the input data, if at all?
* How many intermediate layers will you use?  What dimensions will they be?
* What activation functions should you use in the hidden layer?
* What final activation function should you use, if any?
* What loss function should you use?
* What are some other hyperparameters you could play with?

If you are not able to beat linear regression after playing for 20 minutes you should move on.  If you successfully train a neural network and observe the loss going down: count this as a victory!

We will use the Keras [early stopping callback](https://keras.io/api/callbacks/early_stopping/) to halt training once the loss stops improving.

In [None]:
# Mine took about 4 seconds to run.

# Fill in: build the feed forward network here.
model = 

# Stop training once the monitored quantity has failed to improve for 3 consecutive epochs.
# NOTE(review): monitor='loss' watches the training loss even though validation data is
# passed to fit below; consider monitor='val_loss' if stopping should track validation performance.
callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Fill in: choose an optimizer and a loss function.
model.compile(optimizer = , loss = )

# Fill in: number of epochs and batch size. `history` is what the loss plot further down reads.
history = model.fit(X_tt, y_tt, epochs = , validation_data = (X_val, y_val), batch_size = , callbacks=[callback])

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Validation MSE of the neural network, followed by that of the linear regression baseline.
print(mean_squared_error(y_true=y_val, y_pred=model.predict(X_val)))
print(mean_squared_error(y_true=y_val, y_pred=lr.predict(X_val)))

In [None]:
# Alternatively use model.evaluate, which reports the compiled loss on the validation set.
model.evaluate(x=X_val, y=y_val)

In [None]:
## Plotting the training and validation loss
plt.figure(figsize = (8,6))

# One point per recorded epoch, numbered from 1.
epochs = len(history.history['val_loss'])
epoch_numbers = range(1, epochs + 1)

# Training curve first, validation curve second.
for history_key, series_label in (('loss', "Training Loss"),
                                  ('val_loss', "Validation Loss")):
    plt.scatter(epoch_numbers, history.history[history_key], label = series_label)

plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

plt.xlabel("Epoch", fontsize=12)
plt.ylabel("Loss Function Value", fontsize=12)

plt.legend(fontsize=12)

plt.show()

Once you are satisfied, check to see if the test MSE of the neural network model still beats linear regression.

In [None]:
# Test loss of the neural network as reported by Keras, then the test MSE of linear regression.
print(model.evaluate(x=X_test, y=y_test))
print(mean_squared_error(y_true=y_test, y_pred=lr.predict(X_test)))

### Forest Cover Classification Problem

In this problem we will build a feed forward neural network to classify forest cover type.

In [None]:
# Uncomment the next line to install ucimlrepo into this kernel's environment.
#%pip install ucimlrepo

In [None]:
# This cell took about 7 minutes to run for me!

from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 31 in the UCI repository.
covertype = fetch_ucirepo(id=31)

# Features and targets (as pandas dataframes).
X, y = covertype.data.features, covertype.data.targets

# Print the dataset metadata, then the per-variable information.
print(covertype.metadata)
print(covertype.variables)

In [None]:
from sklearn.model_selection import train_test_split

# Two successive 80/20 splits with the same seed: train/test first, then train/validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=216
)
X_tt, X_val, y_tt, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=216
)

# Peek at five random rows of the features used for fitting.
X_tt.sample(n=5)

In [None]:
# Making everything into numpy arrays
X_train, X_tt, X_val, X_test = (
    frame.values for frame in (X_train, X_tt, X_val, X_test)
)

# Adjusting class labels to go from 0 to 6 instead of 1 to 7.  Keras expects this.
# Each label frame is flattened to one dimension before the shift.
y_train, y_tt, y_val, y_test = (
    labels.values.reshape(-1) - 1 for labels in (y_train, y_tt, y_val, y_test)
)

Neural Networks are not a good choice for tabular data like this:  something like XGBoost is much better.  A "default settings" random forest classifier gets around 95% accuracy for this problem.

The first dense feed forward NN I wrote down got 78% validation accuracy.  Can you beat that?

Some things to consider:
* Since this is a multiclass classification problem you want to use 'softmax' as your final activation function.
    * Discussion question:  why do we need a final activation function?  Why not use 'sigmoid'?
* Since your class labels are encoded as the integers $0,1,2,3,4,5,6$, you want to use  ['sparse_categorical_crossentropy'](https://www.tensorflow.org/api_docs/python/tf/keras/losses/sparse_categorical_crossentropy) as your loss and ['sparse_categorical_accuracy'](https://www.tensorflow.org/api_docs/python/tf/keras/metrics/sparse_categorical_accuracy) as your metric when you compile the model.

In [None]:
# Mine took about 5 minutes to run.

clf = 

callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)

clf.compile(optimizer = , loss = "sparse_categorical_crossentropy", metrics = 'sparse_categorical_accuracy')

history = clf.fit(X_tt, y_tt, epochs = , validation_data = (X_val, y_val), batch_size = , callbacks=[callback])

In [None]:
# Loss and compiled metrics on the validation set; labels are passed as a flat 1-D array.
clf.evaluate(x=X_val, y=y_val.reshape(-1))

In [None]:
# Loss and compiled metrics on the held-out test set; labels are passed as a flat 1-D array.
clf.evaluate(x=X_test, y=y_test.reshape(-1))