In [None]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import torch

import sys
sys.path.append('../prepare_dataset')
sys.path.append('../models')
sys.path.append('../train_model')
import data_aug
import prepare_binary_dataset
import data_aug
import models_classification
import utils
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.nn as nn
import train_model_binary
import model_eval

import seaborn as sns

## Visualize results for Counterfactual Experiment I

### Load the model performance metrics dataframe for each experiment

In [None]:
# Root directory holding one sub-folder per experiment; each listed folder is
# expected to contain a `metrics.csv` file.
exp_main_dir='../experiment_gen/'
exp_dirs=['resnet50_False_0.0_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'resnet50_True_1.0_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'resnet50_True_0.9_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'vit_b_16_384_False_0.0_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'vit_b_16_384_True_1.0_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'vit_b_16_384_True_0.9_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'vit_b_16_False_0.0_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'vit_b_16_True_1.0_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'vit_b_16_True_0.9_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'densenet161_True_0.9_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'densenet161_False_0.0_False_32_1234_100_True_False_0.05_0.01_0_0.9',
          'densenet161_True_1.0_False_32_1234_100_True_False_0.05_0.01_0_0.9'
          ]

# Read every experiment's metrics table first and concatenate once at the end,
# instead of re-copying the accumulated frame on every loop iteration.
metric_frames=[]
for folder_name in exp_dirs:
    experiment_main_folder=os.path.join(exp_main_dir,folder_name)
    metric_frames.append(pd.read_csv(os.path.join(experiment_main_folder,'metrics.csv')))
# Row labels are kept as read from each CSV (no re-indexing), so they repeat
# across experiments in `df_combined`.
df_combined=pd.concat(metric_frames)

# Wide -> long: one row per (experiment row, metric) pair.
df_combined_long=pd.melt(
    df_combined,
    id_vars=['model_name','train_origin','eclipse','eclipse_extent'],
    value_vars=['val_acc','val_auc','test0_acc','test0_auc','test1_acc','test1_auc'],
    var_name='metrics',
    value_name='value',
)
# Keep only the AUC rows. The '_auc' pattern selects val_auc/test0_auc/test1_auc
# and would not match a '*_prauc' column if one were added to `value_vars`.
df_combined_long_auc=df_combined_long[df_combined_long['metrics'].str.contains('_auc')].reset_index(drop=True)

### Reproduce Figure 2

In [None]:
sns.set(font_scale=1.5)

# Bar plot of AUC per evaluation split for the runs trained on Chile,
# one bar colour per eclipse extent; error bars show the standard deviation.
g = sns.catplot(
    data=df_combined_long_auc[df_combined_long_auc.train_origin=='Chile'], kind="bar",
    x="metrics", y="value", hue="eclipse_extent",errorbar='sd',palette="Paired", alpha=.6, height=4.5
)
g.set_axis_labels("", "AUC")
# Tick labels are positional replacements for the `metrics` categories.
# NOTE(review): assumes the category order is val_auc, test0_auc, test1_auc and
# that test0/test1 are the Ohio/Turkey sets for Chile-trained runs — confirm.
g.set_xticklabels(["Internal", "Ohio", "Turkey"])
g.set(ylim=(0, 1))
g.figure.suptitle('Trained on Chile')
# Drop the legend for this panel via the public `legend` property; setting a
# legend title beforehand would have no visible effect.
g.legend.remove()

In [None]:
sns.set(font_scale=1.5)
# Bar plot of AUC per evaluation split for the runs trained on Ohio,
# one bar colour per eclipse extent; error bars show the standard deviation.
g = sns.catplot(
    data=df_combined_long_auc[df_combined_long_auc.train_origin=='Ohio'], kind="bar",
    x="metrics", y="value", hue="eclipse_extent",errorbar='sd',palette="Paired", alpha=.6, height=4.5
)
g.set_axis_labels("", "AUC")
# Tick labels are positional replacements for the `metrics` categories.
# NOTE(review): assumes the category order is val_auc, test0_auc, test1_auc and
# that test0/test1 are the Chile/Turkey sets for Ohio-trained runs — confirm.
g.set_xticklabels(["Internal", "Chile", "Turkey"])
g.set(ylim=(0, 1))
g.figure.suptitle('Trained on Ohio')
# Drop the legend for this panel via the public `legend` property; setting a
# legend title beforehand would have no visible effect.
g.legend.remove()


In [None]:
sns.set(font_scale=1.5)

# Bar plot of AUC per evaluation split for the runs trained on Turkey,
# one bar colour per eclipse extent; error bars show the standard deviation.
g = sns.catplot(
    data=df_combined_long_auc[df_combined_long_auc.train_origin=='Turkey'], kind="bar",
    x="metrics", y="value", hue="eclipse_extent",errorbar='sd',palette="Paired", alpha=.6, height=4.5
)
g.set_axis_labels("", "AUC")
# The legend is kept on this panel, so give it a readable title.
g.legend.set_title("Eclipse Extent")
# Tick labels are positional replacements for the `metrics` categories.
# NOTE(review): assumes the category order is val_auc, test0_auc, test1_auc and
# that test0/test1 are the Chile/Ohio sets for Turkey-trained runs — confirm.
g.set_xticklabels(["Internal", "Chile", "Ohio"])
g.set(ylim=(0, 1))
# `figure` is the preferred accessor (`fig` is a deprecated alias); the
# trailing semicolon keeps the returned Text object out of the cell output.
g.figure.suptitle('Trained on Turkey');



### Additional visualization: Model performance comparison
All models were trained on images with Eclipse Extent = 0

In [None]:
# Restrict to the rows recorded with eclipse_extent == 0.
df_combined_eclipse0=df_combined.loc[df_combined['eclipse_extent']==0.0]
# Wide -> long: one row per (experiment row, metric) pair.
df_combined_eclipse0_long=pd.melt(
    df_combined_eclipse0,
    id_vars=['model_name','train_origin','eclipse','eclipse_extent'],
    value_vars=['val_acc','val_auc','val_prauc','test0_acc','test0_auc','test0_prauc','test1_acc','test1_auc','test1_prauc'],
    var_name='metrics',
    value_name='value',
)
# '_auc' selects val_auc/test0_auc/test1_auc; the '*_prauc' names do not
# contain that substring, so PR-AUC rows are excluded.
df_combined_eclipse0_long_auc=df_combined_eclipse0_long.loc[df_combined_eclipse0_long['metrics'].str.contains('_auc')]
sns.set(font_scale=1.5)

# One facet per training origin, one bar colour per model.
g = sns.catplot(
    data=df_combined_eclipse0_long_auc, kind="bar",
    x="metrics", y="value", hue="model_name",palette="Paired", 
    col="train_origin",  alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels("", "AUC")
# The hue variable here is `model_name`, so the legend lists models
# (not eclipse extents) and is titled accordingly.
g.legend.set_title("Model")
g.set_titles("Trained on {col_name}")
# Tick labels are positional replacements for the `metrics` categories
# (val, test0, test1); the trailing semicolon hides the FacetGrid repr.
g.set_xticklabels(["Internal val", "External test I", "External test II"]);





## Model evaluation for a new experiment
- Because of GitHub's file size limit, we cannot include all the trained models. For the following experiments in `experiment_gen`, we include `metrics.csv` but not the trained models:
    - densenet161_True_0.9_False_32_1234_100_True_False_0.05_0.01_0_0.9
    - densenet161_False_0.0_False_32_1234_100_True_False_0.05_0.01_0_0.9
    - densenet161_True_1.0_False_32_1234_100_True_False_0.05_0.01_0_0.9
    - vit_b_16_384_False_0.0_False_32_1234_100_True_False_0.05_0.01_0_0.9
    - vit_b_16_384_True_1.0_False_32_1234_100_True_False_0.05_0.01_0_0.9
    - vit_b_16_384_True_0.9_False_32_1234_100_True_False_0.05_0.01_0_0.9
- If you want to evaluate your model on a new experiment, you can use the following code to load the model and evaluate it.

In [None]:
# Choose the experiment folder to evaluate.
exp_main_dir = '../experiment_gen'
folder_name = 'vit_b_16_False_0.0_False_32_1234_100_True_False_0.05_0.01_0_0.9'
experiment_main_folder = os.path.join(exp_main_dir, folder_name)

# Build the metrics table for that experiment.
# NOTE(review): presumably loads `model.pt` from the experiment folder and
# evaluates it on CUDA device 0 — confirm against model_eval.summarize_experiment.
metrics_df = model_eval.summarize_experiment(
    experiment_main_folder,
    cudaID=0,
    model_state_name='model.pt',
)

# Write the table to `metrics.csv` inside the experiment folder; an existing
# file at that path is overwritten.
metrics_df.to_csv(
    os.path.join(experiment_main_folder, 'metrics.csv'),
    index=False,
)