### Validation of different models

Here, we're going to compare the performance of various models to each other.

In [None]:
#Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
import os
from collections import Counter
import json
from pathlib import Path
from datetime import datetime
import pickle

#Tensorflow impocarts
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import CSVLogger

Function for loading models

In [None]:
def load_trained_model(modelname, weightsname):
    json_file = open('{}.json'.format(modelname), 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    model.load_weights("{}.h5".format(weightsname)) # load weights into new model
    return model

Load the validation datasets:

In [None]:
#Load tokenizer, word_index, vocab_size:
fp_tokenizer = 'tokenizer_cuDNN_230203-023518.pickle'
fp_wordindex = 'wordindex_cuDNN_230203-023518.json'
with open(fp_tokenizer, "rb") as fp:
    tokenizer = pickle.load(fp)
with open(fp_wordindex, "rb") as fp:
    word_index = json.load(fp)
vocab_size = len(word_index) + 1

#Load datasets
val_set_route = tf.data.Dataset.load('Data/val_set_route')
val_set_ndays = tf.data.Dataset.load('Data/val_set_ndays')
val_set_all = tf.data.Dataset.load('Data/val_set_all')

val_ datasets = [val_set_route, val_set_ndays, val_set_all]

Load other variables for comparison:

In [None]:
#Load the output_index
with open('Data/output_index.json', "rb") as fp:
    output_index = json.load(fp)

#load the hyperparameters
with open('Data/hyper_params.json', "rb") as fp:
    hyper_params = json.load(fp)

#### Define functions for evaluation

Our network is sequence to sequence, so to get a prediction for the last tick in a sequence, we need to predict the entire sequence and then take the last element. The dimmension of the last element will be `vocab_size_routes` and the values will be probabilities of each route, so we simply use `argmax` to get the max. probability route:

In [None]:
#Evaluation metrics for models
def get_model_eval_metrics(model, test_set):
    
    #Get predicitons and true values
    y_true = np.array([])
    y_preds = np.array([])

    for X,Y in test_set:
        x=X
        y=Y

        #Get predictions
        preds = model.predict(x)
        preds = np.argmax(preds, axis=2)[:,-1] #argmaxes for every user in a batch

        #Get true
        trues = y.numpy()[:,-1] #true vals for every user in a batch

        #Append
        y_preds = np.append(y_preds, preds)
        
        
        y_true = np.append(y_true, trues)

    #Convert to integers
    y_preds = y_preds.astype(int)
    y_true = y_true.astype(int)
    
    report = classification_report(y_true, y_preds, output_dict=True)
    df_report = pd.DataFrame(report).transpose()[['precision','recall']]

    # #Get the metrics
    accuracy = df_report.loc['accuracy'].precision
    precision = df_report.loc['weighted avg'].precision
    recall = df_report.loc['weighted avg'].recall 
    
    return accuracy, precision, recall

We also need evaluation metrics for the baseline data:

In [None]:
#Functions for baseline eval metrics

def convert_baseline_to_pred(df_baseline,tokenizer, col='target'):
    y_pred = tokenizer.texts_to_sequences([str(x) for x in df_baseline[col].values])
    y_pred = [item for sublist in y_pred for item in sublist]
    y_pred =np.array(y_pred)
    return y_pred

def get_model_eval_metrics(df_baseline, test_set, tokenizer, col):
    
    #Get predicitons and true values
    y_true = np.array([])
    y_preds = np.array([])

    for X,Y in test_set:
        #Get true
        trues = Y.numpy()[:,-1] #true vals for every user in a batch
        #Append        
        y_true = np.append(y_true, trues)

    #Get baseline prediction
    y_preds = convert_baseline_to_pred(df_baseline, tokenizer, col)

    #Convert to integers
    y_preds = y_preds.astype(int)
    y_true = y_true.astype(int)
    
    
    #metrics
    report = classification_report(y_true, y_preds, output_dict=True)
    df_report = pd.DataFrame(report).transpose()[['precision','recall']]

    # #Get the metrics
    accuracy = df_report.loc['accuracy'].precision
    precision = df_report.loc['weighted avg'].precision
    recall = df_report.loc['weighted avg'].recall 
    
    return accuracy, precision, recall

### Calculate confusion matrix for model based on test data:

Load the baseline data and prepare:

In [None]:
df_baseline = pd.read_json('baselines.json')
df_baseline = df_baseline.iloc[ouput_index] #filter by index 
df_baseline = df_baseline.iloc[0:n_records//10] #take the same size

Load all models

In [None]:
#Load all models with weights
routerecrnn_routes = load_trained_model('Data/route_rec_rnn_route_023518',
                                        'Data/route_rec_rnn_weights_route_023518')
routerecrnn_days = load_trained_model('Data/route_rec_rnn_ndays_174708',
                                      'Data/route_rec_rnn_weights_ndays_174708')
routerecrnn_all = load_trained_model('Data/route_rec_rnn_all_092653',
                                     'Data/route_rec_rnn_weights_all_092653')

models = [routerecrnn_routes, routerecrnn_days, routerecrnn_all]

Calculate evaluation metrics:

In [18]:
#Calculate accuracy, precision, and recall for each model on each dataset
mat_models = np.zeros((3,3)) 
for k in range(3):
    mat_models[k,:] = get_model_eval_metrics(models[k], datasets[k])
    
#Calculate accuracy, precision, and recall for each baseline on one dataset (same true values)
mat_baselines = np.zeros((3,3)) 
colnames = ['second', 'popular', 'popular_similar']
for k in range(3):
    mat_baselines[k,:] = get_model_eval_metrics_baseline(df_baseline, datasets[0], tokenizer, col=colnames[k])



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [26]:
#Output eval metric table

mat_df={}
modelnames = ['Route_Model', 'Ndays_Model','AllFeatures_Model']
for k in range(len(modelnames)):
    mat_df[modelnames[k]] = (mat_models)[k,:]
for k in range(len(colnames)):
    mat_df[colnames[k]] = (mat_baselines)[k,:]   

pd.DataFrame(mat_df, index=['Accuracy', 'Precision', 'Recall'])

Unnamed: 0,Route_Model,Ndays_Model,AllFeatures_Model,second,popular,popular_similar
Accuracy,0.825784,0.944251,0.944251,0.045296,0.052265,0.066202
Precision,0.827526,0.945993,0.947735,0.043554,0.041504,0.052846
Recall,0.825784,0.944251,0.944251,0.045296,0.052265,0.066202
