# Underfitting and overfitting

![under_over_fitting](https://i.imgur.com/eP0gppr.png)


![under_over_fit](https://i.imgur.com/eUF6mfo.png)

## Illustration with examples

The dataset used here (the Cleveland heart-disease data) contains 13 attributes and one target variable, which encodes the presence of heart disease in the patient as an integer value from 0 (no presence) to 4 (severe heart disease).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras.metrics

from keras.layers import Dense
from keras.models import Sequential

In [None]:
#location of the heart-disease data file (processed.cleveland.data), read over HTTP
DATASET_URL = 'https://raw.githubusercontent.com/ne1s0n/coding_excercises/master/data/processed.cleveland.data'

### Reading the data

In [None]:
#labels for the 13 attributes plus the target, assigned to the columns by position
column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'disease_severity',
]

#NOTE(review): read_csv takes the first line of the file as header by default,
#and the labels assigned below then overwrite it. If the remote file has no
#header row, its first record is silently lost -- confirm, and pass
#header=None if that is the case.
heart_data = pd.read_csv(DATASET_URL)
heart_data.columns = column_names
#optional column subsetting, currently disabled
# heart_data = heart_data.iloc[:,[2,3,4,5,6,7,8,9,10,11,12]]
print(heart_data)

### Preprocessing

Remove missing data (both `NaN` values and the `?` placeholder), clean and subset

In [None]:
#first drop the rows holding a proper missing value (NaN) ...
heart_data = heart_data.dropna()

#... then the rows where 'ca' or 'thal' hold the '?' placeholder
has_placeholder = heart_data[['ca', 'thal']].eq('?').any(axis=1)
heart_data = heart_data[~has_placeholder]

#number of records left after cleaning
print(len(heart_data))

In [None]:
#number of records for each value of the target variable
heart_data['disease_severity'].value_counts()

#### Normalize features

(this is suboptimal: do you remember why?)

In [None]:
#the target is kept as a one-column dataframe (note the double brackets)
target_variable = heart_data[["disease_severity"]]

#the feature matrix is every other column, in the original order
features = heart_data.drop(columns='disease_severity')

print("Size of target variable: ", target_variable.shape)
print("Size of feature matrix: ", features.shape)

In [None]:
#inspect the feature matrix (all columns except the target)
print(features)

In [None]:
#the "utils" subpackage is very useful, take a look at it when you have time
from tensorflow.keras.utils import to_categorical

#one-hot encode the integer severity labels: one column per class.
#No num_classes is passed, so the number of columns is inferred from the
#largest label present in the data
target_multi_cat = to_categorical(target_variable)

#the conversion to a Pandas dataframe is disabled, so target_multi_cat stays
#in the array format returned by to_categorical
# target_multi_cat = pd.DataFrame(target_multi_cat)

#let's take a look
print(target_multi_cat)

In [None]:
#force 'ca' and 'thal' to float so that arithmetic on them works
#(presumably they were read as strings because of the '?' placeholders -- verify)
features = features.astype({'ca':float, 'thal':float})

In [None]:
#z-score each column: subtract its mean, divide by its standard deviation.
#Both statistics are computed on the whole feature matrix as it is at this point
column_means = features.mean()
column_stds = features.std()
features = (features - column_means) / column_stds
print(features)

### Training and test sets

In [None]:
## setting seeds, so that the random steps of the notebook are repeatable
#NumPy's global generator (scikit-learn falls back on it when no random_state is given)
np.random.seed(600)

#Python's built-in generator
import random
random.seed(13)

#TensorFlow's global seed, plus deterministic implementations of its ops
import tensorflow as tf
tf.random.set_seed(13) ## 166
tf.config.experimental.enable_op_determinism()

In [None]:
#train_test_split performs the random split into train and test sets
from sklearn.model_selection import train_test_split

#hold out 20% of the records; features and targets are split along the same rows
split = train_test_split(features, target_multi_cat, test_size=0.20)
features_train, features_test, target_train, target_test = split

#print the shape of each piece to get an idea of the resulting data structure
for piece in split:
    print(piece.shape)

In [None]:
#records per class in the training set: column sums of the one-hot targets
class_labels = ['0', '1', '2', '3', '4']
df = pd.DataFrame(target_train, columns=class_labels)
df.sum()

In [None]:
#records per class in the test set: column sums of the one-hot targets
class_labels = ['0', '1', '2', '3', '4']
df = pd.DataFrame(target_test, columns=class_labels)
df.sum()

### Tiny model

In [None]:
## Configuration options used when building, compiling and fitting the model
input_shape = (features_train.shape[1],) ## tuple that specifies the number of features
num_classes = 5 ## the target takes the integer values 0 to 4
loss_function = 'categorical_crossentropy' ## the targets are one-hot encoded
optimizer_used = 'rmsprop' ## or keras.optimizers.adam(lr=0.001)? maybe for softmax regression?
num_epochs = 20 ## number of passes over the training data

In [None]:
# tiny model: a single hidden layer of 4 ReLU units feeding a softmax output
# layer with one unit per class
model = Sequential([
    Dense(units=4, input_shape=input_shape, activation='relu'),
    Dense(num_classes, activation='softmax'),
])
model.summary()

In [None]:
#the model is declared, but we still need to compile it to actually
#build all the data structures.
#Metrics are passed as a list: this is the documented form, and recent Keras
#versions raise a ValueError when given a bare metric object
model.compile(optimizer=optimizer_used, loss=loss_function, metrics=[keras.metrics.CategoricalAccuracy()])

In [None]:
#train the model; the held-out data is evaluated at the end of every epoch,
#and the per-epoch losses are kept in the returned history object
history = model.fit(
    features_train,
    target_train,
    epochs=num_epochs,
    validation_data=(features_test, target_test),
    verbose=1,
)

In [None]:
def plot_loss_history(h, title):
    """Plot the per-epoch training loss and, when available, the validation loss.

    Parameters
    ----------
    h : object exposing a ``history`` dict (e.g. the value returned by ``model.fit``)
        ``h.history['loss']`` is required. ``h.history['val_loss']`` is drawn
        only when present, so a run fitted without validation data is plotted
        instead of raising ``KeyError``.
    title : str
        Title shown above the plot.
    """
    plt.plot(h.history['loss'], label = "Train loss")
    # the validation curve exists only if validation data was given to fit()
    if 'val_loss' in h.history:
        plt.plot(h.history['val_loss'], label = "Validation loss")
    plt.xlabel('Epochs')
    plt.title(title)
    plt.legend()
    plt.show()

#loss curves of the model trained above
#NOTE(review): the title says "Logistic", but the model has a hidden layer -- confirm the intended label
plot_loss_history(history, 'Logistic ({} epochs)'.format(num_epochs))

In [None]:
from sklearn.metrics import confusion_matrix

#predicted class probabilities, one row per test record
predictions = model.predict(features_test)
print("predictions:")
print(predictions[:4])

#most probable class of each record, kept as a column vector
predicted_classes = np.argmax(predictions, axis=1).reshape(-1, 1)

#true class of each record, recovered from the one-hot targets
target_classes = np.argmax(target_test, axis=1)

#rows are true classes, columns are predicted classes
con_mat_df = confusion_matrix(target_classes, predicted_classes, labels=[0, 1, 2, 3, 4])
print("\nConfusion matrix:")
print(con_mat_df)

In [None]:
from sklearn.metrics import accuracy_score

#fraction of test records whose predicted class equals the true class
accuracy = accuracy_score(target_classes, predicted_classes)
print("Overall accuracy is: ", accuracy)

#confusion matrix normalized over the true classes (each row divided by its total).
#The labels are pinned, as for the raw matrix, so that the result is always
#5x5 even when a class never shows up in the test set
confusion_matrix(target_classes, predicted_classes, labels = [0,1,2,3,4], normalize='true')

### Small model


In [None]:
num_epochs = 100

# small model: two hidden ReLU layers (4 and 8 units) followed by a softmax
# output layer. Only the first layer declares the input shape: every later
# layer takes its input size from the layer before it, so passing the shape of
# the feature matrix there is redundant and misleading
model = Sequential()
model.add(Dense(units=4, input_shape=input_shape, activation='relu'))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

In [None]:
#metrics are passed as a list: this is the documented form, and recent Keras
#versions raise a ValueError when given a bare metric object
model.compile(optimizer=optimizer_used, loss=loss_function, metrics=[keras.metrics.CategoricalAccuracy()])
#silent training (verbose = 0); the per-epoch losses are kept in history
history = model.fit(features_train, target_train, epochs=num_epochs, validation_data=(features_test, target_test), verbose = 0)

In [None]:
#loss curves of the model trained above
#NOTE(review): the title says "Logistic", but the model has hidden layers -- confirm the intended label
plot_loss_history(history, 'Logistic ({} epochs)'.format(num_epochs))

In [None]:
#predicted class probabilities, reduced to the most probable class per record
#(kept as a column vector)
predictions = model.predict(features_test)
predicted_classes = np.argmax(predictions, axis=1).reshape(-1, 1)

#true class of each record, recovered from the one-hot targets
target_classes = np.argmax(target_test, axis=1)

#rows are true classes, columns are predicted classes
con_mat_df = confusion_matrix(target_classes, predicted_classes, labels=[0, 1, 2, 3, 4])
print("\nConfusion matrix:")
print(con_mat_df)

In [None]:
#fraction of test records whose predicted class equals the true class
accuracy = accuracy_score(target_classes, predicted_classes)
print("Overall accuracy is: ", accuracy)

#confusion matrix normalized over the true classes (each row divided by its total).
#The labels are pinned, as for the raw matrix, so that the result is always
#5x5 even when a class never shows up in the test set
confusion_matrix(target_classes, predicted_classes, labels = [0,1,2,3,4], normalize='true')

### Medium model

In [None]:
num_epochs = 50

# medium model: three hidden ReLU layers (16, 32 and 16 units) followed by a
# softmax output layer. Only the first layer declares the input shape: every
# later layer takes its input size from the layer before it, so passing the
# shape of the feature matrix there is redundant and misleading
model = Sequential()
model.add(Dense(units=16, input_shape=input_shape, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

In [None]:
#metrics are passed as a list: this is the documented form, and recent Keras
#versions raise a ValueError when given a bare metric object
model.compile(optimizer=optimizer_used, loss=loss_function, metrics=[keras.metrics.CategoricalAccuracy()])
#silent training (verbose = 0); the per-epoch losses are kept in history
history = model.fit(features_train, target_train, epochs=num_epochs, validation_data=(features_test, target_test), verbose = 0)

In [None]:
#training and validation loss per epoch for the model trained above
plot_loss_history(history, 'Softmax ({} epochs)'.format(num_epochs))

In [None]:
#predicted class probabilities, reduced to the most probable class per record
#(kept as a column vector)
predictions = model.predict(features_test)
predicted_classes = np.argmax(predictions, axis=1).reshape(-1, 1)

#true class of each record, recovered from the one-hot targets
target_classes = np.argmax(target_test, axis=1)

#rows are true classes, columns are predicted classes
con_mat_df = confusion_matrix(target_classes, predicted_classes, labels=[0, 1, 2, 3, 4])
print("\nConfusion matrix:")
print(con_mat_df)

In [None]:
#fraction of test records whose predicted class equals the true class
accuracy = accuracy_score(target_classes, predicted_classes)
print("Overall accuracy is: ", accuracy)

#confusion matrix normalized over the true classes (each row divided by its total).
#The labels are pinned, as for the raw matrix, so that the result is always
#5x5 even when a class never shows up in the test set
confusion_matrix(target_classes, predicted_classes, labels = [0,1,2,3,4], normalize='true')

### Large model

In [None]:
# large model: four hidden ReLU layers (32, 64, 128 and 64 units) followed by
# a softmax output layer. Only the first layer declares the input shape: every
# later layer takes its input size from the layer before it, so passing the
# shape of the feature matrix there is redundant and misleading
model = Sequential()
model.add(Dense(units=32, input_shape=input_shape, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

In [None]:
num_epochs = 100

#metrics are passed as a list: this is the documented form, and recent Keras
#versions raise a ValueError when given a bare metric object
model.compile(optimizer=optimizer_used, loss=loss_function, metrics=[keras.metrics.CategoricalAccuracy()])
#silent training (verbose = 0); the per-epoch losses are kept in history
history = model.fit(features_train, target_train, epochs=num_epochs, validation_data=(features_test, target_test), verbose = 0)

In [None]:
#training and validation loss per epoch for the model trained above
plot_loss_history(history, 'Softmax ({} epochs)'.format(num_epochs))

In [None]:
#predicted class probabilities, reduced to the most probable class per record
#(kept as a column vector)
predictions = model.predict(features_test)
predicted_classes = np.argmax(predictions, axis=1).reshape(-1, 1)

#true class of each record, recovered from the one-hot targets
target_classes = np.argmax(target_test, axis=1)

#fraction of test records assigned to their true class
accuracy = accuracy_score(target_classes, predicted_classes)
print("Overall accuracy is: ", accuracy)

#rows are true classes, columns are predicted classes
con_mat_df = confusion_matrix(target_classes, predicted_classes, labels=[0, 1, 2, 3, 4])
print("\nConfusion matrix:")
print(con_mat_df)

In [None]:
#confusion matrix normalized over the true classes: each row is divided by its
#total, giving the fraction of records of a true class assigned to each predicted class
con_mat_df = confusion_matrix(target_classes, predicted_classes, labels = [0,1,2,3,4], normalize='true')
print("\nConfusion matrix:")
print(con_mat_df)

## A second example [OPTIONAL]

In [None]:
#location of the DNA methylation data file; note that this rebinds DATASET_URL
DATASET_URL = 'https://raw.githubusercontent.com/ne1s0n/coding_excercises/master/data/DNA_methylation_data.csv'

In [None]:
# pandas can read a csv directly from a url
bat_data = pd.read_csv(DATASET_URL)
print(bat_data)

In [None]:
# keep only the columns at (0-based) positions 1 and 3 to 9
bat_data = bat_data.iloc[:,[1,3,4,5,6,7,8,9]]
# preview the first rows and report the number of records
print(bat_data.head())
print("N. of records is: ",len(bat_data))


Removing missing data

In [None]:
# drop every row that contains at least one missing value
bat_data = bat_data.dropna()