# Visuals to help in model calibration
For use with a parameter grid search.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns

In [None]:
# Location of the shared drive and the model run to inspect.
google_root = "Q:"
data_path = r"\Shared drives\Pandemic Data"
model_name = "slf_model"
run_name = "slf_grid_broad"
# Number of runs expected per sample. Only needed when samples have differing
# numbers of runs (e.g. after two rounds of parameter sampling).
total_runs = 80

# Raw f-string: the backslash in front of {model_name} is a literal Windows
# path separator, not an (invalid) "\{" escape sequence.
data_dir = rf"{google_root}{data_path}\{model_name}"

# Relative paths used further down are resolved against the model directory.
os.chdir(data_dir)

In [None]:
# Input files are read from the "noTWN" subdirectory of the model inputs.
# input_dir = "inputs"
input_dir = f"{data_dir}/inputs/noTWN"

# Directory holding the summary statistics of the run named by `run_name`.
stats_dir = f"{data_dir}/outputs/summary_stats/{run_name}"

In [None]:
# Load the first-records validation table. The first row is the header and
# the first CSV column becomes the index.
validation_csv = input_dir + "/first_records_validation.csv"
validation_df = pd.read_csv(validation_csv, header=0, index_col=0)

In [None]:
# Summary statistics of the selected run, read from `stats_dir`.
# Presumably one row per (sample, run) pair: later cells use the "sample" and
# "run_num" columns — verify against the file.
stats = pd.read_csv(f"{stats_dir}/summary_stats_wPrecisionRecallF1FBetaAggProb.csv")

In [None]:
# Optional step: only relevant when samples were run a differing number of
# times. Keeps the samples that have exactly `total_runs` rows.


def _has_expected_run_count(sample_rows):
    """Return True when a sample's group holds exactly `total_runs` rows."""
    return len(sample_rows) == total_runs


stats = stats.groupby("sample").filter(_has_expected_run_count)

In [None]:
# Per-sample aggregation spec: max of start/alpha/lamda (presumably constant
# within a sample — verify), and mean and std of the fbeta score.
fbeta_col = "count_known_countries_time_window_fbeta"
agg_dict = {param: ["max"] for param in ("start", "alpha", "lamda")}
agg_dict[fbeta_col] = ["mean", "std"]

agg_df = stats.groupby("sample").agg(agg_dict)

# Flatten the (column, statistic) MultiIndex into "column_statistic" names.
agg_df.columns = [f"{column}_{statistic}" for column, statistic in agg_df.columns]

In [None]:
# Give the aggregated columns short names; the mean fbeta becomes "fbeta".
agg_df = agg_df.rename(
    columns={
        "start_max": "start",
        "alpha_max": "alpha",
        "lamda_max": "lamda",
        "count_known_countries_time_window_fbeta_mean": "fbeta",
    }
)

# Standard error of the mean = std / sqrt(n), where n is the number of runs
# that actually contributed to each sample. This was previously hard-coded to
# 50, which does not match `total_runs`. agg_df is indexed by "sample" at this
# point (the index is only reset in a later cell), so the division aligns on
# the sample id.
runs_per_sample = stats.groupby("sample")["count_known_countries_time_window_fbeta"].count()
agg_df["st_err"] = agg_df["count_known_countries_time_window_fbeta_std"] / np.sqrt(runs_per_sample)

In [None]:
# Turn the "sample" index back into a regular column, then display the five
# samples with the highest mean fbeta.
agg_df = agg_df.reset_index()
agg_df.sort_values(by='fbeta', ascending=False).head(5)

## Visualizing data

### Assessing run convergence

In [None]:
# The 20 rows of agg_df (one per sample) with the highest mean fbeta.
top20 = agg_df.sort_values('fbeta',ascending=False).head(20)

In [None]:
# Convergence of the top 20 samples: for each sample, track how the mean fbeta
# (and its spread) evolves as more and more runs are included.
fbeta_col = "count_known_countries_time_window_fbeta"
samples = agg_df.sort_values('fbeta', ascending=False).head(20).reset_index()['sample'].tolist()

# range() excludes its stop value, so stop at max + 1; otherwise the final
# run never enters the running statistics.
runs = list(range(0, stats['run_num'].max() + 1))

samples_df = pd.DataFrame({'runs': runs})
for i, sample in enumerate(samples, start=1):
    # Select this sample's rows once instead of re-filtering `stats` per run.
    sample_scores = stats.loc[stats['sample'] == sample, ['run_num', fbeta_col]]
    sample_fbeta = []
    stdev = []
    sterr = []
    for run in runs:
        scores = sample_scores.loc[sample_scores['run_num'] <= run, fbeta_col]
        sample_fbeta.append(scores.mean())  # running mean over runs <= run
        stdev.append(scores.std())          # sample standard deviation
        sterr.append(scores.sem())          # standard error of the mean: std / sqrt(n)
    # Column order (sample, stdev, sterr) is relied on by the plotting cell,
    # which picks every third column.
    samples_df[f"sample {i}"] = sample_fbeta
    samples_df[f"stdev {i}"] = stdev
    samples_df[f"sterr {i}"] = sterr

samples_df.set_index("runs", inplace=True)

# Average only the running-mean columns; the stdev/sterr columns must not be
# mixed into the across-sample mean.
mean_columns = [f"sample {i}" for i in range(1, len(samples) + 1)]
samples_df["all samples"] = samples_df[mean_columns].mean(axis=1)


In [None]:
plt.style.use('ggplot')

# One colour per plotted line: the first five in maroon, the remaining
# fifteen in dark slate grey.
line_colors = ['maroon'] * 5 + ['darkslategrey'] * 15

# Every third column between 'sample 1' and 'sample 20' is a running-mean
# column; the columns in between hold the stdev/sterr values.
running_means = samples_df.loc[:, 'sample 1':'sample 20':3]
ax = running_means.plot(
    ylim=[0.6, 0.7],
    color=line_colors,
    ylabel="fbeta",
    title="Mean fbeta convergence \n for the top 20 parameter samples",
    legend=False,
)
# Optional error bands, currently disabled:
# for i in range(1, len(samples)):
#     ax.fill_between(samples_df.index, samples_df[f"sample {i+1}"]+samples_df[f"sterr {i+1}"], samples_df[f"sample {i+1}"]-samples_df[f"sterr {i+1}"],color='#366da0',alpha=0.15)

# Axis labels set here replace the ylabel passed to plot() above.
ax.set_xlabel("# of Runs", fontsize=16)
ax.set_ylabel("Fbeta mean", fontsize=16)
ax.tick_params(labelsize=13)
plt.show()

### Assessing alpha/lamda/year value performance

In [None]:
# NOTE(review): no context name is passed here. seaborn may apply font_scale
# only when a named context (e.g. "notebook") is given — confirm this call has
# the intended effect. The plots below set their font sizes explicitly.
sns.set_context(font_scale=5)

In [None]:
# Mean fbeta of every sample, grouped by alpha and coloured by start year.
ax = sns.stripplot(
    data=agg_df,
    x='alpha',
    y='fbeta',
    hue='start',
    palette='mako',
    linewidth=0.2,
    jitter=0.4,
)

# Reverse the legend entries and place the legend to the right of the axes.
handles, labels = ax.get_legend_handles_labels()
legend = ax.legend(
    handles[::-1],
    labels[::-1],
    bbox_to_anchor=(1.25, 1),
    loc='upper right',
    borderaxespad=0,
    title="start year",
)

ax.set_ylim((0, 1))
ax.set_title("Mean Sample Fbeta, by Alpha Value\n (Color = Year)", fontsize=16)
ax.set_xlabel("Alpha", fontsize=16)
ax.set_ylabel("Fbeta mean", fontsize=16)
ax.tick_params(labelsize=13)

plt.setp(legend.get_texts(), fontsize='15')  # legend entries
plt.setp(legend.get_title(), fontsize='15')  # legend title
plt.show()


In [None]:
# Mean fbeta of every sample against lamda, coloured by start year.
ax = sns.scatterplot(
    data=agg_df,
    x='lamda',
    y='fbeta',
    hue='start',
    palette='mako',
    edgecolor="black",
    linewidth=0.2,
    legend='full',
)
ax.set_ylim((0, 1))

# Reverse the legend entries and place the legend to the right of the axes.
handles, labels = ax.get_legend_handles_labels()
legend = ax.legend(
    handles[::-1],
    labels[::-1],
    bbox_to_anchor=(1.25, 1),
    loc='upper right',
    borderaxespad=0,
    title="start year",
)

ax.set_title("Mean Sample Fbeta, by Lambda Value\n (Color = Year)", fontsize=16)
ax.set_xlabel("Lambda", fontsize=16)
ax.set_ylabel("Fbeta mean", fontsize=16)
ax.tick_params(labelsize=13)

plt.setp(legend.get_texts(), fontsize='15')  # legend entries
plt.setp(legend.get_title(), fontsize='15')  # legend title
plt.show()


In [None]:
# Mean fbeta of every sample, grouped by start year and coloured by alpha.
ax = sns.stripplot(x='start', y='fbeta', hue='alpha', palette='mako', linewidth=0.2, data=agg_df, jitter=0.3)

# Reverse the legend entries and place the legend to the right of the axes.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.25, 1), loc='upper right', borderaxespad=0, title="alpha")
ax.set(ylim=(0, 1))
ax.axes.set_title("Mean Sample Fbeta, by Start Year\n (Color = Alpha)", fontsize=16)
ax.set_xlabel("Start year", fontsize=16)
ax.set_ylabel("Fbeta mean", fontsize=16)
# Same tick label size as the alpha and lamda plots, which set it explicitly.
ax.tick_params(labelsize=13)
plt.setp(ax.get_legend().get_texts(), fontsize='15')  # for legend text
plt.setp(ax.get_legend().get_title(), fontsize='15')  # for legend title
plt.show()

### Animated visual of individual sample (multiple runs)

In [None]:
# Getting individual run introductions to make the animated visual
import geopandas

# Both paths are relative and therefore resolved against the current working
# directory.
results_dir = "outputs/slf_start_year/year2005_alpha0.2_lamda3.15_6801-6804"
countries_path = "inputs/noTWN/countries_slf_hiiMask16.gpkg"

# Country name and longitude/latitude columns, used for the joins below.
countries_geo = geopandas.read_file(countries_path)
lat_lon = countries_geo[["NAME","LON","LAT"]]

# Accumulator for the origin/destination records of all runs.
org_dest_all = pd.DataFrame()

# glob() returns a lazy generator, so it is created once (the earlier duplicate
# assignment was dead code) and can only be iterated a single time.
paths = Path(results_dir).glob('**/origin_destination.csv')

In [None]:
# Read the origin/destination file of every run and stack them into one frame.
run_frames = []
for path in paths:
    # Keep columns 1-3 of each file. copy() makes the TS conversion below write
    # to an independent frame rather than to a slice of the parsed CSV.
    org_dest = pd.read_csv(path).iloc[:, 1:4].copy()
    org_dest["TS"] = org_dest["TS"].astype(str)
    run_frames.append(org_dest)

# DataFrame.append was removed in pandas 2.0 and copies the whole accumulator
# on every call; collect the frames and concatenate once instead. Row indices
# are kept as read, as before.
if run_frames:
    previous = [] if org_dest_all.empty else [org_dest_all]
    org_dest_all = pd.concat(previous + run_frames)

In [None]:
# Copy of the coordinate table with "NAME" renamed to "Origin", so it can be
# joined on the "Origin" column of the origin/destination records.
origin_lat_lon = lat_lon.rename(columns={'NAME':'Origin'})
# org_dest_all.merge(lat_lon, how='left',on='Origin')

In [None]:
# Left-join the origin coordinates onto every record and label them as the
# origin's latitude/longitude.
with_origins = (
    org_dest_all
    .merge(origin_lat_lon, how='left', on='Origin')
    .rename(columns={'LAT': 'LatOrigin', 'LON': 'LonOrigin'})
)

In [None]:
# Same join for the destination side: coordinates keyed by "Destination",
# left-joined and labelled as the destination's latitude/longitude.
destination_lat_lon = lat_lon.rename(columns={'NAME': 'Destination'})
with_orig_dest = (
    with_origins
    .merge(destination_lat_lon, how='left', on='Destination')
    .rename(columns={'LAT': 'LatDest', 'LON': 'LonDest'})
)

In [None]:
# Parse the "TS" strings with the year+month format "%Y%m" into datetimes.
with_orig_dest['date_time'] = pd.to_datetime(with_orig_dest['TS'],format="%Y%m")

In [None]:
# Running row counter within each destination, following the current row order.
# NOTE(review): cumcount starts at 0 and the frame is not sorted by date_time
# in the visible cells; records from several runs are stacked together —
# confirm this is the intended definition of "intros".
with_orig_dest['intros'] = with_orig_dest.groupby('Destination').cumcount()

In [None]:
# Write the joined table (including its index) to or_dest.csv in the current
# working directory.
with_orig_dest.to_csv('or_dest.csv')