In [None]:
%load_ext autoreload
%autoreload 2

import os
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from context import ml_project
from ml_project.io import DataHandler
from ml_project.train import neural_nets
from sklearn.preprocessing import StandardScaler


# Suppress sklearn warnings by swapping warnings.warn for a no-op, and tell
# numpy to ignore divide-by-zero / invalid-value floating point errors.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (no-op stand-in for warnings.warn)."""
    return None


warnings.warn = warn
_ = np.seterr(divide='ignore', invalid='ignore')

# Print the TensorFlow version in use.
print(f'Tensorflow version: {tf.__version__}')

# How-To

1) Download the zip archive containing the data  
2) Create a directory inside `PROJECT_ROOT_DIR/data` and give it a suitable name `DIR_NAME`, e.g. "task3_data"  
3) Extract the files from the zip archive into that `DIR_NAME` directory  
4) Set the correct `DIR_NAME` in the following cell (the directory name is enough, no absolute path needed)

In [None]:
# Name of the data directory for this task (directory name only, not a full path).
DIR_NAME = 'task3_data'

# Load Data and aggregate feature matrix

In [None]:
# Load the labelled training data and the test data through the project's DataHandler.
# NOTE(review): DataHandler presumably resolves the file names inside the DIR_NAME data directory — confirm.
data_handler = DataHandler(DIR_NAME)
train_data = data_handler.load_train_data('train.h5', 'h5')
final_test_data = data_handler.load_test_data('test.h5', 'h5')  # only used for submission
# Keep the test index; it is later stored together with the final predictions.
final_indices = final_test_data.index

In [None]:
# Value passed as `test_size` to train_test_split for the held-out split.
HELD_OUT_TEST_SET_SIZE = 0.2  # used for out of sample classifier performance evaluation

In [None]:
# Split the labelled data into a training part and a held-out test part.
# Column 'y' holds the labels; all remaining columns are used as features.
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(['y'], axis=1),
                                                    train_data['y'],
                                                    test_size=HELD_OUT_TEST_SET_SIZE,
                                                    random_state=42)

# Convert to plain numpy arrays. `.as_matrix()` was removed in pandas 1.0;
# `.values` returns the same arrays and works on both old and new pandas versions.
X_train, X_test, y_train, y_test = X_train.values, X_test.values, y_train.values, y_test.values

In [None]:
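# Optional feature standardization with StandardScaler (uncomment to enable).
# The scaler is fit on the training split only and then applied to the other sets.
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)
#final_test_data = scaler.transform(final_test_data)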

# Helper functions to build, compile and fit models

In [None]:
def three_layer_model(units_first, units_second, units_third, n_final_classes):
    """Build an uncompiled dense classifier with three hidden ReLU layers.

    The first hidden layer is a plain Dense layer. The second and third hidden
    layers use an L2 kernel regularizer (0.01) and are each followed by a
    Dropout layer (rate 0.3). The output layer is a softmax over the classes.

    Args:
        units_first: Number of units in the first hidden layer.
        units_second: Number of units in the second hidden layer.
        units_third: Number of units in the third hidden layer.
        n_final_classes: Number of units of the softmax output layer.

    Returns:
        A keras.Sequential model (not yet compiled).
    """
    stack = [keras.layers.Dense(units_first, activation=tf.nn.relu)]

    # The two regularized hidden layers share the same structure.
    for n_units in (units_second, units_third):
        stack.append(keras.layers.Dense(n_units, activation=tf.nn.relu,
                                        kernel_regularizer=keras.regularizers.l2(0.01)))
        stack.append(keras.layers.Dropout(0.3, noise_shape=None, seed=None))

    stack.append(keras.layers.Dense(n_final_classes, activation=tf.nn.softmax))
    return keras.Sequential(stack)

def compile_model(model):
    """Compile `model` in place and return it.

    Uses the 'adam' optimizer, the 'sparse_categorical_crossentropy' loss and
    tracks 'accuracy' as metric.

    Args:
        model: Object exposing a Keras-style `compile` method.

    Returns:
        The same `model` object, after `compile` was called on it.
    """
    compile_options = {
        'optimizer': 'adam',
        'loss': 'sparse_categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    model.compile(**compile_options)
    return model
    
def fit_model(model, epochs, callbacks):
    """Train `model` on the notebook-level training split.

    Reads the global arrays `X_train` and `y_train`; `validation_split=0.2`
    makes Keras set aside a fraction of them as validation data.

    Args:
        model: Compiled Keras model to train.
        epochs: Maximum number of training epochs.
        callbacks: List of Keras callbacks passed on to `model.fit`.

    Returns:
        The History object returned by `model.fit` (previously discarded).
    """
    # `workers` only concerns generator / Sequence inputs and is no longer an
    # argument of `fit` in Keras 3, so it is not passed for the in-memory
    # arrays used here.
    return model.fit(X_train, y_train, epochs=epochs, validation_split=0.2, verbose=1, shuffle=True,
                     callbacks=callbacks)

# Define callbacks and tensorboard logging

In [None]:
from common import DATA_DIR_PATH
from tensorflow.keras import callbacks

In [None]:
# Directory (inside this task's data directory) used as root for the TensorBoard logs.
LOG_DIR = os.path.join(DATA_DIR_PATH, DIR_NAME, 'logs')
# Callbacks for training: TensorBoard logging and early stopping.
tensorboard = callbacks.TensorBoard()
# patience=10: epochs without improvement of the monitored quantity before training stops.
early_stopping = callbacks.EarlyStopping(patience=10)

# Build models

In [None]:
# You can define multiple models and compare them
"""
models = [('small',  compile_model(three_layer_model(128, 128, 128, 5))),
          ('medium', compile_model(three_layer_model(256, 256, 256, 5))),
          ('large',  compile_model(three_layer_model(512, 512, 512, 5)))]
"""
# Or, once you have decided which one to use, define just that one.
# Arguments of three_layer_model: units of the three hidden layers, then the number of output classes.
# `models` is a list of (name, compiled model) pairs consumed by the cells below.
models = [('baseline', compile_model(three_layer_model(128, 128, 128, 5)))]

# Fit

In [None]:
# Upper bound on the number of epochs; the early stopping callback may end training sooner.
EPOCHS = 300

for name, model in models:
    print(f'Fitting model {name}...')
    # One log directory per model and run, tagged with the model name and a date/time tag.
    log_path = os.path.join(LOG_DIR, name + '_' + neural_nets.get_date_time_tag())
    # Create a fresh TensorBoard callback for every run and pass the log
    # directory to its constructor instead of mutating `log_dir` on a shared,
    # already constructed callback.
    run_tensorboard = callbacks.TensorBoard(log_dir=log_path)
    fit_model(model, epochs=EPOCHS, callbacks=[run_tensorboard, early_stopping])

In [None]:
# NOTE(review): plot_history is a project helper (ml_project.train.neural_nets); presumably it plots
# the training curves of each fitted (name, model) pair — confirm against its definition.
neural_nets.plot_history(models)  # can add more models as needed

In [None]:
def evaluate_models(models):
    """Evaluate every model on the held-out test split.

    Reads the global arrays `X_test` and `y_test`.

    Args:
        models: Iterable of (name, compiled and fitted model) pairs.

    Returns:
        A DataFrame with one row per model name and the columns
        'test accuracy' and 'test loss'.
    """
    # model.evaluate returns (loss, accuracy) for the compiled loss and metric.
    scores = {name: model.evaluate(X_test, y_test, verbose=0) for name, model in models}
    per_model = {
        name: {'test accuracy': accuracy, 'test loss': loss}
        for name, (loss, accuracy) in scores.items()
    }
    return pd.DataFrame(per_model).T

# Evaluate all models on the held-out split; the bare name displays the resulting table.
results_df = evaluate_models(models)
results_df

In [None]:
def select_best_model(results_df, models):
    """Return the name and model object with the highest test accuracy.

    Args:
        results_df: DataFrame indexed by model name with a 'test accuracy' column.
        models: Iterable of (name, model) pairs.

    Returns:
        Tuple (best_model_name, best_model); best_model is None when no entry
        of `models` carries the best name.
    """
    best_per_column = results_df.idxmax()
    best_model_name = best_per_column['test accuracy']
    # First model whose name matches, or None if there is no match.
    best_model = next(
        (candidate for candidate_name, candidate in models if candidate_name == best_model_name),
        None,
    )
    return best_model_name, best_model

# Pick the model with the highest held-out test accuracy; `model` is used by the cells below.
best_model_name, model = select_best_model(results_df, models)
print(f'Best model: {best_model_name}')

# Performance evaluation on held out test data
This is the section where we get a sense of how well our trained model is doing on the part of the training set we did not touch during training.

In [None]:
from scikitplot.metrics import plot_confusion_matrix
from scikitplot.metrics import plot_roc

In [None]:
# Plot the confusion matrix and the ROC curves for the held-out test split.
# Plotting stays best-effort (a failure does not abort the notebook), but the
# error is reported instead of being silently swallowed.
try:
    y_pred_proba = model.predict(X_test)
    # Predicted class = index of the highest class probability per sample.
    y_pred = [np.argmax(proba) for proba in y_pred_proba]
    _ = plot_confusion_matrix(y_test, y_pred, figsize=(12, 8))
    _ = plot_roc(y_test, y_pred_proba, figsize=(8, 8))
except Exception as exc:  # not a bare except, so KeyboardInterrupt / SystemExit still propagate
    print(f'Could not create the evaluation plots: {exc!r}')

# Perform prediction on provided test data set
Now we perform predictions on the provided, unlabelled data set for submission

In [None]:
# Predicted class per sample = index of the highest predicted class probability.
# NOTE(review): final_test_data is passed as loaded, whereas X_train / X_test were converted
# to arrays after the split — confirm model.predict accepts this input type.
y_pred_final = [np.argmax(proba) for proba in model.predict(final_test_data)]
# Ids stored with the predictions: the index of the provided test data.
y_pred_ids = final_indices

# Store the data
Putting everything into the right format and storing the results in the working data directory

In [None]:
# Hand the predictions and their ids to the DataHandler for storage.
# NOTE(review): output location and file format are decided inside store_results_task3 — not visible here.
data_handler.store_results_task3(y_pred_final, y_pred_ids)