# Evaluate on Test Data

## Setup

In [1]:
from importlib import reload

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas

from hot_dust import preprocess, model

In [2]:
# "reload" to get changes in preprocess.py without restarting the kernel
for _module in (preprocess, model):
    reload(_module)
# Re-bind the imported names after the reload so they refer to the reloaded modules.
from hot_dust.model import to_tensorflow
from hot_dust.preprocess import (
    prepare_training_data,
    sensitivity_analysis,
    split_training_data,
)

In [3]:
# Build the dataset, split it, and unpack the three splits returned by to_tensorflow.
ds = prepare_training_data()
splits = split_training_data(ds)
train, validate, test = to_tensorflow(splits)

# Use the cardinality of the test split as its batch size.
# NOTE(review): presumably this yields the whole test set as a single batch — confirm.
batch_size = test.cardinality()
test = test.batch(batch_size)

# Display the dataset as the cell output.
ds

## Model Summary

In [4]:
# Load the saved network from disk (compile=False is passed to the loader)
# and print its layer summary.
network_path = "data/network"
network = tf.keras.models.load_model(network_path, compile=False)
network.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 9)]               0         
                                                                 
 normalization (Normalizatio  (None, 9)                19        
 n)                                                              
                                                                 
 dense (Dense)               (None, 16)                160       
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 2)                 34        
                                                                 
Total params: 485
Trainable params: 466
Non-trainable params: 19
______________________________________________________________

## Metrics

In [5]:
# Calculate metrics (e.g. MSE, R2) on test data and tabulate

# Calculate R2 
def get_R2(y_true, y_pred):
    """Return the coefficient of determination (R^2) of a set of predictions.

    R^2 = 1 - SS_res / SS_tot, where SS_res is the residual sum of squares
    (true minus predicted) and SS_tot is the total sum of squares of the true
    values about their mean.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values. Must have exactly the same shape as ``y_true``.

    Returns
    -------
    float
        The R^2 score, or NaN when it is undefined (empty input, or true
        values with zero variance).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different shapes. Without this check
        NumPy would broadcast e.g. (N,) against (N, 1) into an (N, N) array and
        silently return a meaningless score.
    """
    # Accept any array-like (lists included) and accumulate in double precision.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        return float("nan")

    SS_total = np.sum((y_true - np.mean(y_true)) ** 2)  # total sum of squares
    SS_res = np.sum((y_true - y_pred) ** 2)  # residual sum of squares

    # With zero variance in the true values the ratio is undefined; avoid dividing by zero.
    if SS_total == 0:
        return float("nan")
    return float(1 - (SS_res / SS_total))

# Gather element 1 of every item yielded by the test dataset as the true values.
y = np.concatenate([batch[1] for batch in test.as_numpy_iterator()])  # y_true
y_hat = network.predict(test, verbose=0)  # y_pred
# Column 0 of the predictions is used as dust, column 1 as temperature.
y_hat_dust, y_hat_temp = y_hat[:, 0], y_hat[:, 1]

# Mean squared error of each output
MSE_dust = np.mean((y[:, 0] - y_hat_dust) ** 2)
MSE_temp = np.mean((y[:, 1] - y_hat_temp) ** 2)

# R2 of each output
R2_dust = get_R2(y[:, 0], y_hat_dust)
R2_temp = get_R2(y[:, 1], y_hat_temp)

# Report the metrics, rounded to four decimals
print("Metrics: ")
print(f"Mean Squared Error (Dust): {MSE_dust:.4f}")
print(f"R2 (Dust): {R2_dust:.4f}")
print(f"Mean Squared Error (Temperature): {MSE_temp:.4f}")
print(f"R2 (Temperature): {R2_temp:.4f}")

Metrics: 
Mean Squared Error (Dust): 0.0717
R2 (Dust): 0.9221
Mean Squared Error (Temperature): 0.0717
R2 (Temperature): 0.9221


In [7]:
# Display the true values gathered from the test dataset.
# NOTE(review): the saved output of this cell is a 1-D array, while the other
# cells index y[:, 0] and y[:, 1] — confirm the shape of y before trusting the
# saved metrics and plots.
y

array([-1.37375   ,  0.32983604, -0.565881  , ..., -1.11436   ,
       -2.63855   , -0.625554  ], dtype=float32)

## Plots

In [6]:
# Scatter plots of y_hat against y, one figure per output, each with a
# one-to-one line overlaid

for target_column, predicted, plot_title in (
    (0, y_hat_dust, "Predicted Values v. True Dust Values"),
    (1, y_hat_temp, "Predicted Values v. True Temperature Values"),
):
    # predicted values against the matching column of the true values
    plt.scatter(y[:, target_column], predicted, alpha=0.5)
    # labels
    plt.title(plot_title, fontsize=25)
    plt.xlabel("True Values (y)", fontsize=20)
    plt.ylabel("Predicted Values (y_hat)", fontsize=20)
    # one-to-one reference line through the origin
    plt.axline([0, 0], slope=1, color="red")
    plt.show()



IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [8]:
# Histograms of the residuals (predicted minus true), one figure per output

num_bins = 100  # number of histogram bins, shared by both figures

# Dust residuals
residual_dust = y_hat_dust - y[:, 0]
plt.hist(residual_dust, bins=num_bins)
plt.xlabel("Residuals", fontsize=20)
plt.ylabel("Frequency", fontsize=20)
plt.title("Histogram of the Dust Residuals", fontsize=25)
plt.show()

# Temperature residuals
residual_temp = y_hat_temp - y[:, 1]
plt.hist(residual_temp, bins=num_bins)
plt.xlabel("Residuals", fontsize=20)
plt.ylabel("Frequency", fontsize=20)
plt.title("Histogram of the Temperature Residuals", fontsize=25)
plt.show()

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
# Plot of the residuals against y

# Dust residuals against the true dust values (column 0 of y)
plt.scatter(y[:, 0], residual_dust, alpha=0.5)
# labels
plt.xlabel("True Values (y)", fontsize=20)
plt.ylabel("Residuals", fontsize=20)
plt.title("Residuals v. True Dust Values", fontsize=25)
plt.show()

# Temperature residuals against the true temperature values.
# The x-axis must be column 1 of y, the same column residual_temp is computed
# from; plotting against column 0 would pair temperature residuals with dust.
plt.scatter(y[:, 1], residual_temp, alpha=0.5)
# labels
plt.xlabel("True Values (y)", fontsize=20)
plt.ylabel("Residuals", fontsize=20)
plt.title("Residuals v. True Temperature Values", fontsize=25)
plt.show()

## Heat Maps 

In [None]:
# Heat maps (hexagonal binning) of y_hat against y, one figure per output

for target_column, predicted, plot_title in (
    (0, y_hat_dust, "Predicted Values v. True Dust Values"),
    (1, y_hat_temp, "Predicted Values v. True Temperature Values"),
):
    # hexbin of predicted values against the matching column of the true values
    plt.hexbin(y[:, target_column], predicted, cmap="viridis")
    # let matplotlib choose the axis scaling
    plt.axis("auto")
    # labels
    plt.title(plot_title)
    plt.xlabel("True Values (y)")
    plt.ylabel("Predicted Values (y_hat)")
    # one-to-one reference line through the origin
    plt.axline([0, 0], slope=1, color="red")
    plt.colorbar()
    plt.show()

In [None]:
# Heat maps (hexagonal binning) of the residuals against y, one figure per output

for target_column, residuals, plot_title in (
    (0, residual_dust, "Residuals v. True Dust Values"),
    (1, residual_temp, "Residuals v. True Temperature Values"),
):
    # hexbin of the residuals against the matching column of the true values
    plt.hexbin(y[:, target_column], residuals, cmap="viridis")
    # let matplotlib choose the axis scaling
    plt.axis("auto")
    # labels
    plt.title(plot_title)
    plt.xlabel("True Values (y)")
    plt.ylabel("Residuals")
    # horizontal reference line at a residual of zero
    plt.axline([0, 0], slope=0, color="red")
    plt.colorbar()
    plt.show()

## Sensitivity Analysis

In [None]:
# Run the sensitivity analysis on the dataset with the loaded network.
# NOTE(review): sensitivity_analysis is imported from hot_dust.preprocess and its
# body is not shown here; presumably `percentage` is the size of the input
# perturbation — confirm against its definition. The call is marked as not yet working.
percentage = 2 # in %; 2%, 5%, 10%
sensitivity_analysis(ds, network, percentage) #TODO get this working