## Model Evaluation

In [1]:
import sys
sys.path.insert(0, "../src/")
%load_ext autoreload
%autoreload 2

In [5]:
import numpy as np
import altair as alt
import pandas as pd
from PIL import Image
from glob import glob
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from debug import generate_plot_from_csv

# Baseline Model

In [6]:
# We've already saved the predictions, so just load them back
baseline_shha_train_224 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHA", ds_type="train",
    cropsize=224
)

baseline_shha_train_300 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHA", ds_type="train",
    cropsize=300
)

baseline_shha_train_448 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHA", ds_type="train",
    cropsize=448
)

baseline_shha_train_600 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHA", ds_type="train",
    cropsize=600
)

In [7]:
ah = alt.hconcat(baseline_shha_train_224, baseline_shha_train_300)
av = alt.hconcat(baseline_shha_train_448 | baseline_shha_train_600)
chart = alt.vconcat(ah, av)
chart

## Dataset: SHHA 

Next, We plot the baseline model performance on the test set

In [8]:
baseline_shha_test_224 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHA", ds_type="test",
    cropsize=224
)

baseline_shha_test_300 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHA", ds_type="test",
    cropsize=300
)

baseline_shha_test_448 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHA", ds_type="test",
    cropsize=448
)

baseline_shha_test_600 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHA", ds_type="test",
    cropsize=600
)

In [9]:
ah = alt.hconcat(baseline_shha_test_224, baseline_shha_test_300)
av = alt.hconcat(baseline_shha_test_448 | baseline_shha_test_600)
chart = alt.vconcat(ah, av)
chart

# Dataset: SHHB

Now, let's see how does the model generalises to unseen dataset. 
We will plot baseline model performance on both train and test set.

Train set

In [10]:
baseline_shhb_train_224 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHB", ds_type="train",
    cropsize=224
)

baseline_shhb_train_300 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHB", ds_type="train",
    cropsize=300
)

baseline_shhb_train_448 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHB", ds_type="train",
    cropsize=448
)

baseline_shhb_train_600 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHB", ds_type="test",
    cropsize=600
)


ah = alt.hconcat(baseline_shhb_train_224, baseline_shhb_train_300)
av = alt.hconcat(baseline_shhb_train_448 | baseline_shhb_train_600)
chart = alt.vconcat(ah, av)
chart

Plot baseline model performance on the test set

In [11]:

baseline_shhb_test_224 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHB", ds_type="test",
    cropsize=224
)

baseline_shhb_test_300 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHB", ds_type="test",
    cropsize=300
)

baseline_shhb_test_448 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHB", ds_type="test",
    cropsize=448
)

baseline_shhb_test_600 = generate_plot_from_csv(
    name="vgg16baseline", dataset="SHHB", ds_type="test",
    cropsize=600
)

ah = alt.hconcat(baseline_shhb_test_224, baseline_shhb_test_300)
av = alt.hconcat(baseline_shhb_test_448 | baseline_shhb_test_600)
chart = alt.vconcat(ah, av)
chart

# VGG16 With Decoder

Next, we evaluate the VGG16WithDecoder model and plot its performance against the same set of datasets

In [12]:
crop_sizes = [224, 300, 448, 600]

def layout_plots(plots):
    assert len(plots) == 4
    
    ah = alt.hconcat(plots[0], plots[1])
    av = alt.hconcat(plots[2], plots[3])
    chart = alt.vconcat(ah, av)
    return chart    

In [13]:
plots = []
for cropsize in crop_sizes:
    plot = generate_plot_from_csv(
        name="vgg16decoder", dataset="SHHA", ds_type="train",
        cropsize=cropsize
    )
    plots.append(plot)
    

layout_plots(plots)

## SHHA TEST

In [14]:
plots = []
for cropsize in crop_sizes:
    plot = generate_plot_from_csv(
        name="vgg16decoder", dataset="SHHA", ds_type="test",
        cropsize=cropsize
    )
    plots.append(plot)
    

layout_plots(plots)

## Dataet: SHHB

In [15]:
plots = []
for cropsize in crop_sizes:
    plot = generate_plot_from_csv(
        name="vgg16decoder", dataset="SHHB", ds_type="train",
        cropsize=cropsize
    )
    plots.append(plot)
    

layout_plots(plots)

And, then on test set

In [16]:
plots = []
for cropsize in crop_sizes:
    plot = generate_plot_from_csv(
        name="vgg16decoder", dataset="SHHB", ds_type="test",
        cropsize=cropsize
    )
    plots.append(plot)
    

layout_plots(plots)

# Model Metrics

Next, we review the model metrics on each dataset

In [17]:
from glob import glob
import pandas as pd

In [22]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
# Load CSV Files
fnames = glob("predictions/debug-448/*.csv")

# Initialise Dataframe
df_metrics = pd.DataFrame(columns=["model", "input_size","dataset", "ds_type", "mse", "mae"])
for fname in fnames:
    splits = fname.split("/")[-1].split("_")
    df = pd.read_csv(fname)
    mse = mean_squared_error(df.true_labels.values, df.predicted_labels.values)
    mae = mean_absolute_error(df.true_labels.values, df.predicted_labels.values)    
    name = splits[0]
    size = splits[4][:-4]
    dataset = splits[1]
    ds_type = splits[2]
    name = name.replace("vgg16","").upper()
    df_metrics = df_metrics.append(
        {"model": name,
         "input_size": size,
         "dataset": dataset, 
         "ds_type": ds_type.upper(), 
         "mse": mse, 
         "mae": mae,
#          "group": f"{dataset}-{ds_type}".upper()
         "group": f"{name}-{dataset}-{ds_type}".upper()         
        },
        ignore_index=True
    )

In [23]:
import seaborn as sns
cm = sns.light_palette("salmon", as_cmap=True, reverse=False)
# cm = sns.diverging_palette(150, 10, as_cmap=True)

In [24]:
# Review the list of unique groups
df_metrics.group.unique()

array(['DEBUG-448\\BASELINE-SHHA-TEST', 'DEBUG-448\\BASELINE-SHHA-TRAIN',
       'DEBUG-448\\BASELINE-SHHB-TEST', 'DEBUG-448\\BASELINE-SHHB-TRAIN',
       'DEBUG-448\\DECODER-SHHA-TEST', 'DEBUG-448\\DECODER-SHHA-TRAIN',
       'DEBUG-448\\DECODER-SHHB-TEST', 'DEBUG-448\\DECODER-SHHB-TRAIN'],
      dtype=object)

In [25]:
def plot_df(groupname):
    return df_metrics[df_metrics.group == groupname] \
        .sort_values(['model','dataset','ds_type', 'input_size']) \
        .style.background_gradient(cmap=cm)

In [27]:
plot_df('BASELINE-SHHA-TRAIN')

Unnamed: 0,model,input_size,dataset,ds_type,mse,mae,group


In [28]:
plot_df('BASELINE-SHHA-TEST')

Unnamed: 0,model,input_size,dataset,ds_type,mse,mae,group


In [29]:
# Display All
df_metrics \
        .sort_values(['model','dataset','ds_type', 'input_size']) \
        .style.background_gradient(cmap=cm)

Unnamed: 0,model,input_size,dataset,ds_type,mse,mae,group
0,DEBUG-448\BASELINE,224,SHHA,TEST,706.488665,13.137432,DEBUG-448\BASELINE-SHHA-TEST
1,DEBUG-448\BASELINE,300,SHHA,TEST,2048.874582,21.816246,DEBUG-448\BASELINE-SHHA-TEST
2,DEBUG-448\BASELINE,400,SHHA,TEST,4271.480666,34.004042,DEBUG-448\BASELINE-SHHA-TEST
3,DEBUG-448\BASELINE,448,SHHA,TEST,5052.967156,40.714459,DEBUG-448\BASELINE-SHHA-TEST
4,DEBUG-448\BASELINE,600,SHHA,TEST,9930.256555,58.471941,DEBUG-448\BASELINE-SHHA-TEST
5,DEBUG-448\BASELINE,224,SHHA,TRAIN,780.762986,10.836662,DEBUG-448\BASELINE-SHHA-TRAIN
6,DEBUG-448\BASELINE,300,SHHA,TRAIN,2079.275907,21.233841,DEBUG-448\BASELINE-SHHA-TRAIN
7,DEBUG-448\BASELINE,400,SHHA,TRAIN,1952.542745,22.136019,DEBUG-448\BASELINE-SHHA-TRAIN
8,DEBUG-448\BASELINE,448,SHHA,TRAIN,143.558304,5.788115,DEBUG-448\BASELINE-SHHA-TRAIN
9,DEBUG-448\BASELINE,600,SHHA,TRAIN,5319.818017,39.301404,DEBUG-448\BASELINE-SHHA-TRAIN


# Plot Metrics

The table is fine for individual group but it gets difficult to compare so we instead plot the metrics.

### MSE

Plot Baseline and VGG+Decoder MSE on SHHA dataset

In [30]:
mse_plot = alt.Chart(df_metrics[df_metrics.dataset == 'SHHA']).mark_bar().encode(
    alt.X('input_size', title=""),
    alt.Y('mse', title="MSE"),
    alt.Color('ds_type', legend=alt.Legend(orient='bottom'), title="Model"),
    alt.Column('group', title=""),
)

mse_plot

### MAE

Plot Baseline and VGG+Decoder MAE on SHHA dataset

In [31]:
mae_plot = alt.Chart(df_metrics[df_metrics.dataset == 'SHHA']).mark_bar().encode(
    alt.X('input_size', title=""),
    alt.Y('mae', title="MAE"),
    alt.Color('model', legend=alt.Legend(orient='bottom')),
    alt.Column('group', title="")
)

mae_plot

# Heatmap

Can we identify the sizes where it misses the most?

In [32]:
def plot_histogram(fname="predictions/debug-448/vgg16baseline_SHHA_train_predictions_224.csv", step=50):
    df = pd.read_csv(fname)
    df['diff'] = (df['true_labels'] - df['predicted_labels'])
    max_count = max(df.true_labels.values)
    step = int(max_count / 10.)
#     print(step)
#     df.head()
    heatmap = alt.Chart(df).mark_rect().encode(
        alt.X('true_labels:Q', bin=alt.Bin(extent=[0, max_count], step=step), title="True"),
        alt.Y('predicted_labels:Q', bin=alt.Bin(extent=[0, max_count], step=step), title="Predicted"),
        alt.Color('sum(diff)', title="Difference")
    )

    points = alt.Chart(df).mark_circle(
        color='black',
        size=5
    ).encode(
        alt.X('true_labels:Q'),
        alt.Y('predicted_labels:Q'),
    )

    return (heatmap + points).properties(
        width=500,
        height=400
    ).properties(
        title="Heatmap: True vs. Predicted"
    ).interactive()

In [33]:
plot_histogram("predictions/debug-448/vgg16baseline_SHHA_train_predictions_224.csv")

In [34]:
plot_histogram("predictions/debug-448/vgg16baseline_SHHA_train_predictions_448.csv")

In [35]:
plot_histogram("predictions/debug-448/vgg16baseline_SHHA_test_predictions_448.csv")

In [36]:
plot_histogram("predictions/debug-448/vgg16baseline_SHHA_train_predictions_600.csv")

# Conclusion

The model was trained using 448 x 448PX input images that were center cropped. We observe that:
1. model is able to overfit the training data when input size is kept to 448px
2. model performance start to degrade, even on training data, as we increase the input size
3. model is also able to perform quite well on unseen data as long as we keep the input size to 448px
4. model performs well when crowd count is small, evident in SHHB dataset

Why does model perform better for smaller crowds?

Because when we resize and center-crop, the max amount to crowd it sees is limited to < 1000
