## Data Import

In [None]:
import os
from itertools import combinations, permutations
from sqlite3 import connect

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

np.random.seed(707260)

In [None]:
# Collect the names of every object in the database that can be SELECTed from.
# sqlite_master also lists indexes and triggers; those cannot be read as tables
# and would make the per-table load fail, so they are filtered out here.
con = connect('../results.db')
try:
    tables = pd.read_sql(
        "SELECT name FROM sqlite_master WHERE type IN ('table', 'view')",
        con=con
    ).loc[:, 'name']
finally:
    # Release the database handle even if the query raises
    con.close()

In [None]:
# Maps "<dataset>_<model>_<ori>_<weight>_<prep>" to the study's DataFrame
df_map = {}

# Number of tables skipped because they hold fewer rows than a completed study
bad_vals = 0

# Tables skipped because their name does not follow a recognised scheme
unparsed_tables = []

# Columns that identify the experimental configuration of a study
df_idx = ['dataset', 'model', 'weight', 'ori', 'prep']

# A table with fewer rows than this is treated as a study that did not finish
MIN_COMPLETED_ROWS = 1000

con = connect('../results.db')
try:
    for t in tables:
        # Pull the dataframe from the database. The identifier is quoted so
        # names containing characters that are special to SQL still resolve.
        quoted_name = '"' + t.replace('"', '""') + '"'
        df = pd.read_sql(
            f"SELECT * FROM {quoted_name}",
            con=con
        )

        # If the table represents a study which wasn't run to completion, skip it and count it
        if df.shape[0] < MIN_COMPLETED_ROWS:
            bad_vals += 1
            continue

        # Split the table's label into its components
        label_comps = t.split('__')
        if len(label_comps) < 2:
            # No model component to read; nothing sensible can be labelled
            unparsed_tables.append(t)
            continue

        # Pull the model label from it
        model = label_comps[1]

        # The rest of the components are in the final tag
        final_comps = label_comps[-1].split('_')
        if final_comps[0] == 'full':
            dataset = "full"
            ori = final_comps[1]
            weight = final_comps[2]
            prep = '_'.join(final_comps[3:])
        elif final_comps[0] == 'img':
            # final_comps[1] is not used for this layout
            dataset = 'img'
            ori = final_comps[2]
            weight = final_comps[3]
            prep = '_'.join(final_comps[4:])
        elif final_comps[0] == 'clinical':
            # final_comps[1] is not used for this layout; no orientation/weight is read
            dataset = 'clinical'
            ori = 'none'
            weight = 'none'
            prep = '_'.join(final_comps[2:])
        else:
            # Unknown layout: without this branch the values parsed for the
            # previous table would be silently attached to this one.
            unparsed_tables.append(t)
            continue

        df_key = "_".join([dataset, model, ori, weight, prep])

        # Store the components in the dataframe itself
        df['dataset'] = dataset
        df['model'] = model
        df['weight'] = weight
        df['ori'] = ori
        df['prep'] = prep
        # Track the resulting dataframe via its key
        df_map[df_key] = df
finally:
    # Release the database handle even if a table fails to load
    con.close()

print(f"\nTotal No. bad values: {bad_vals}")
if unparsed_tables:
    print(f"Skipped {len(unparsed_tables)} table(s) with unrecognised names: {unparsed_tables}")

In [None]:
# Number of studies that were kept by the loading loop
len(df_map)

# Best By Replicate

## Re-usable Functions

In [None]:
# Column layout of the summary tables: the configuration columns followed by the two statistics
result_df_columns = [*df_idx, 'Mean', 'STD']

In [None]:
def get_peak1_of_value(target_value):
    """Summarise, for every study, the peak `target_value` of each replicate.

    For each DataFrame in `df_map` the rows are sorted by `target_value`, the
    last entry of every 'replicate' group is taken, and the mean and standard
    deviation (`np.std`) of those per-replicate peaks are computed.

    Returns a DataFrame indexed by the `df_map` keys with the columns listed
    in `result_df_columns`, sorted ascending by 'Mean'.
    """
    def summarise(study_df):
        # After an ascending sort, the last entry of a replicate is its peak
        ranked = study_df.sort_values(by=target_value)
        replicate_peaks = ranked.groupby('replicate').last()[target_value]
        # The configuration labels are read from the row labelled 0
        return [
            *study_df.loc[0, df_idx],
            np.mean(replicate_peaks),
            np.std(replicate_peaks, axis=0),
        ]

    rows = [summarise(study_df) for study_df in df_map.values()]
    summary = pd.DataFrame(data=rows, index=df_map.keys(), columns=result_df_columns)
    return summary.sort_values(by='Mean')

In [None]:
def get_peak5_of_value(target_value):
    """Summarise, for every study, the five highest-sorted `target_value` rows of each replicate.

    For each DataFrame in `df_map` the rows are sorted by `target_value` and
    the last five rows of every 'replicate' group are pooled; the mean and
    standard deviation (`np.std`) are taken over that pool.

    Returns a DataFrame indexed by the `df_map` keys with the columns listed
    in `result_df_columns`, sorted ascending by 'Mean'.
    """
    def summarise(study_df):
        # After an ascending sort, the tail of each replicate holds its top rows
        ranked = study_df.sort_values(by=target_value)
        top_values = ranked.groupby('replicate').tail(5)[target_value]
        # The configuration labels are read from the row labelled 0
        return [
            *study_df.loc[0, df_idx],
            np.mean(top_values),
            np.std(top_values, axis=0),
        ]

    rows = [summarise(study_df) for study_df in df_map.values()]
    summary = pd.DataFrame(data=rows, index=df_map.keys(), columns=result_df_columns)
    return summary.sort_values(by='Mean')

## Balanced Accuracy

In [None]:
# The helper sorts ascending by 'Mean', so tail(10) shows the ten highest-scoring studies
get_peak1_of_value('balanced_accuracy (test)').set_index(df_idx).tail(10)

In [None]:
# Same ranking, based on the top five rows of each replicate instead of the single peak
get_peak5_of_value('balanced_accuracy (test)').set_index(df_idx).tail(10)

# Performance Across Trials

## Utility Functions

In [None]:
def stack_all_df_metrics(cols: list):
    """Stack the requested metric columns of every study into one DataFrame.

    Each study in `df_map` contributes its configuration columns (`df_idx`),
    'replicate', 'trial' and the columns named in `cols`; the pieces are
    concatenated row-wise in `df_map` order.
    """
    wanted = [*df_idx, 'replicate', 'trial', *cols]
    return pd.concat([study_df.loc[:, wanted] for study_df in df_map.values()])

In [None]:
def plot_average_performance_across_trials(df, metric, grouping, fname):
    """Plot `metric` against 'trial', one line per value of `grouping`, and save it.

    Args:
        df: Long-format DataFrame containing 'trial', `metric` and `grouping` columns.
        metric: Name of the column plotted on the y axis.
        grouping: Name of the column used to colour (hue) the lines.
        fname: File stem; the figure is written to 'figures/<fname>.png'.
    """
    # Rows sharing a trial are aggregated by seaborn's lineplot defaults; no
    # estimator or error-bar option is passed, so the shaded band is whatever
    # the installed seaborn draws by default (not an explicit standard deviation).
    sns.lineplot(data=df, x='trial', y=metric, hue=grouping)

    # Add details
    plt.title(f'By {grouping} (Average)')
    plt.tight_layout()

    # Save and show the plot; create the output directory first so a fresh
    # checkout without a 'figures' folder does not fail inside savefig
    out_path = f'figures/{fname}.png'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path)
    plt.show()

In [None]:
def plot_max_performance_across_trials(df, metric, grouping, fname):
    """Plot the per-trial maximum of `metric` for each `grouping` value and save it.

    Args:
        df: Long-format DataFrame containing 'replicate', 'trial', `metric`
            and `grouping` columns.
        metric: Name of the column plotted on the y axis.
        grouping: Name of the column used to colour (hue) the lines.
        fname: File stem; the figure is written to 'figures/<fname>.png'.
    """
    # Reduce to the best value per (replicate, trial, group) so each replicate
    # contributes a single point per trial
    tmp_df = df.groupby(['replicate', 'trial', grouping])[metric].max().reset_index()

    # The remaining spread across replicates is aggregated by seaborn's
    # lineplot defaults; no estimator or error-bar option is passed, so the
    # band is not an explicit standard deviation
    sns.lineplot(data=tmp_df, x='trial', y=metric, hue=grouping)

    # Add details
    plt.title(f'By {grouping} (Max)')
    plt.tight_layout()

    # Save and show the plot; create the output directory first so a fresh
    # checkout without a 'figures' folder does not fail inside savefig
    out_path = f'figures/{fname}.png'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path)
    plt.show()

## Balanced Accuracy

In [None]:
# One row per (study, replicate, trial) with its test balanced accuracy.
# Despite the name, nothing is averaged here; aggregation happens when plotting.
bacc_avg_df = stack_all_df_metrics(['balanced_accuracy (test)'])

### Dataset

In [None]:
# "avg" plots pass every row to the line plot
plot_average_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'dataset', 'bacc_avg_by_dataset')

In [None]:
# "max" plots first keep the best value per replicate/trial/group
plot_max_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'dataset', 'bacc_max_by_dataset')

### Model

In [None]:
plot_average_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'model', 'bacc_avg_by_model')

In [None]:
plot_max_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'model', 'bacc_max_by_model')

### Image Contrast (Weight)

In [None]:
plot_average_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'weight', 'bacc_avg_by_weight')

In [None]:
plot_max_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'weight', 'bacc_max_by_weight')

### Image Orientation

In [None]:
plot_average_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'ori', 'bacc_avg_by_ori')

In [None]:
plot_max_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'ori', 'bacc_max_by_ori')

### Pre-Processing

In [None]:
plot_average_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'prep', 'bacc_avg_by_prep')

In [None]:
plot_max_performance_across_trials(bacc_avg_df, 'balanced_accuracy (test)', 'prep', 'bacc_max_by_prep')

# Pairwise Rank-Sum Tests

## Setup

In [None]:
from itertools import combinations, permutations

from scipy.stats import normaltest, ranksums

The function below gathers, for every study, the best trial of each replicate for the target metric.

In [None]:
def get_best_per_replicate(target_value):
    """Collect the best-scoring trial of every replicate across all studies.

    For each DataFrame in `df_map`, the row with the highest `target_value`
    in each 'replicate' is selected. The selected rows are reduced to the
    configuration columns (`df_idx`), 'trial' and `target_value`, stacked,
    and returned with 'replicate' as a regular column.

    Rows whose `target_value` is missing are ignored. Selecting whole rows
    (rather than `GroupBy.last()`, which takes the last non-null entry of each
    column independently) guarantees that the reported 'trial' is the trial
    that actually produced the reported score.
    """
    component_dfs = []
    for df in df_map.values():
        # A row without a score can never be the best one
        scored = df.dropna(subset=[target_value])
        # After an ascending sort the last row of each replicate is its best trial
        best_rows = (
            scored.sort_values(by=target_value)
            .groupby('replicate')
            .tail(1)
            .set_index('replicate')
            .sort_index()
        )
        component_dfs.append(best_rows.loc[:, [*df_idx, 'trial', target_value]])
    result_df = pd.concat(component_dfs).reset_index()
    return result_df

In [None]:
def evaluate_normality(df, query_key, target):
    """Run `scipy.stats.normaltest` on `target` within each group of `query_key`.

    Args:
        df: DataFrame holding both the `query_key` and `target` columns.
        query_key: Name of the column whose distinct values define the groups.
        target: Name of the numeric column tested for normality.

    Returns:
        DataFrame indexed by group label with a single 'p-value' column.
    """
    pvalues = {}

    # Group labels are taken from the frame that was passed in, not from a
    # notebook-level variable, so the function works on any DataFrame
    for k in df[query_key].drop_duplicates():
        # A boolean mask avoids building a query string, which would break on
        # labels that contain quotes
        x = df.loc[df[query_key] == k, target]
        pvalues[k] = normaltest(x).pvalue

    # Save the results as a dataframe
    return_df = pd.DataFrame({'p-value': pd.Series(pvalues, dtype=float)})
    return return_df

In [None]:
# Symbol used in the comparison label for each supported alternative hypothesis
alt_keys = {
    'two-sided': '!=',
    'greater':   '>',
    'less':      '<'
}

def paired_rankedsum(df, query, target, alternative='two-sided'):
    """Run a Wilcoxon rank-sum test between every ordered pair of groups.

    Args:
        df: DataFrame holding both the `query` and `target` columns.
        query: Name of the column whose distinct values define the groups.
        target: Name of the numeric column being compared.
        alternative: Alternative hypothesis passed to `scipy.stats.ranksums`;
            must be one of the keys of `alt_keys`.

    Returns:
        DataFrame with one row per ordered pair, indexed by a label of the
        form "<v1> <symbol> <v2>" (index name 'Comparison') and a single 'p'
        column. The frame is empty when fewer than two groups are present.
    """
    # First-appearance order keeps the row order reproducible between runs
    groups = df[query].drop_duplicates().tolist()

    # Select each group's values once; a boolean mask (rather than a query
    # string) keeps labels that contain quotes from breaking the selection
    samples = {g: df.loc[df[query] == g, target] for g in groups}

    # Calculate the rank-sum p-value for each ordered pair of groups
    pvals = {}
    for v1, v2 in permutations(groups, 2):
        p = ranksums(samples[v1], samples[v2], alternative=alternative).pvalue
        pvals[f"{v1} {alt_keys[alternative]} {v2}"] = p

    # Save the results as a dataframe
    return_df = pd.DataFrame({'p': pd.Series(pvals, dtype=float)})
    return_df.index.name = 'Comparison'
    return return_df

## Testing Balanced Accuracy

In [None]:
# Metric used for every test in this section
target = 'balanced_accuracy (test)'
# Best row of each replicate in every study, for that metric
replicate_best_bacc_df = get_best_per_replicate(target)
replicate_best_bacc_df

In [None]:
# For every configuration factor, test each ordered pair of its levels for
# whether the first reaches greater balanced accuracy than the second
sub_dfs = [
    paired_rankedsum(replicate_best_bacc_df, k, target, alternative='greater')
    for k in df_idx
]

sig_df = pd.concat(sub_dfs).sort_values('p')

# Flag significance after scaling each p-value by the number of comparisons
n_samples = len(sig_df)
corrected_p = sig_df['p'] * n_samples
sig_df['significance'] = ''
# Thresholds tighten in order, so a stricter level overwrites a looser one
for stars, threshold in enumerate((0.05, 0.01, 0.001), start=1):
    sig_df.loc[corrected_p < threshold, 'significance'] = '*' * stars

sig_df.reset_index()

## Feature Importance

In [None]:
def format_feature_imp(val):
    """Parse a serialised feature-importance mapping into a dictionary.

    The first character and the last two characters of `val` are discarded,
    the remainder is split on ', ' into entries, and each entry is split at
    its final ': ' into a feature name and a float importance.

    NOTE(review): two trailing characters are dropped but only one leading
    character. Presumably the stored string ends with a bracket plus one
    extra character; confirm against the stored data, otherwise the final
    importance loses its last digit.
    """
    body = val[1:-2]

    parsed = {}
    for entry in body.split(', '):
        # Split on the last ': ' so feature names may themselves contain ': '
        name, _, number = entry.rpartition(': ')
        parsed[name] = float(number)

    return parsed

In [None]:
def feature_importance_report(df: pd.DataFrame, weight_col, feature_col):
    """Summarise per-feature importances across the rows of `df`.

    Args:
        df: DataFrame in which `feature_col` holds a dict mapping feature
            name to importance and `weight_col` holds a numeric weight.
        weight_col: Name of the column each row's importances are scaled by.
        feature_col: Name of the column containing the importance dicts.

    Returns:
        DataFrame indexed by feature name with the mean and standard
        deviation of the raw importances, of their absolute values, and of
        the weight-scaled importances. A feature missing from a row counts
        as 0 for that row. An empty report (same columns, no rows) is
        returned when `df` contributes no importances.
    """
    report_columns = [
        "Mean (Raw)",
        "STD (Raw)",
        "Mean (Magnitude)",
        "STD (Magnitude)",
        "Mean (Performance Weighted)",
        "STD (Performance Weighted)",
    ]

    # Rows whose importance dict is empty contribute nothing to the statistics
    pairs = [
        (importances, weight)
        for importances, weight in zip(df[feature_col], df[weight_col])
        if importances
    ]
    if not pairs:
        return pd.DataFrame(columns=report_columns, dtype=float)

    # One row per kept input row, one column per feature; NaN marks a feature
    # that the row did not report
    sparse_imps = pd.DataFrame([importances for importances, _ in pairs])
    weights = pd.Series([weight for _, weight in pairs], index=sparse_imps.index)

    raw_feature_imps = sparse_imps.fillna(0)
    # Scale before filling so a missing weight also ends up as 0
    weighted_feature_imps = sparse_imps.mul(weights, axis=0).fillna(0)

    # Interpret the results into a clean report
    magnitude_imps = np.abs(raw_feature_imps)
    feature_imp_report = {
        "Mean (Raw)": raw_feature_imps.mean(),
        "STD (Raw)": raw_feature_imps.std(),
        "Mean (Magnitude)": magnitude_imps.mean(),
        "STD (Magnitude)": magnitude_imps.std(),
        "Mean (Performance Weighted)": weighted_feature_imps.mean(),
        "STD (Performance Weighted)": weighted_feature_imps.std(),
    }
    result_df = pd.DataFrame.from_dict(feature_imp_report)

    return result_df

In [None]:
# Columns needed for the feature-importance analysis
feature_cols = ['replicate', 'trial', *df_idx, 'balanced_accuracy (test)', 'importance_by_permutation (test)']

# Take those columns from every study and stack them
sub_dfs = []
for df in df_map.values():
    tmp_df = df.loc[:, feature_cols]
    sub_dfs.append(tmp_df)
feature_imp_df = pd.concat(sub_dfs)

# After an ascending sort, the last row of each study/replicate group is its best trial
best_trials = (
    feature_imp_df
    .sort_values('balanced_accuracy (test)')
    .groupby([*df_idx, 'replicate'])
    .tail(1)
)
feature_imp_df = best_trials.set_index(df_idx)

# Replace the stored importance strings with parsed dictionaries
parsed_importances = feature_imp_df['importance_by_permutation (test)'].apply(format_feature_imp)
feature_imp_df['importance_by_permutation (test)'] = parsed_importances
feature_imp_df

### PCA

In [None]:
def is_pca(val):
    """Return True when the pre-processing label `val` contains 'pca'."""
    return 'pca' in val

# Flatten the index once so 'prep' can be tested as a regular column
flat_feature_imp_df = feature_imp_df.reset_index()
pca_rows = flat_feature_imp_df['prep'].apply(is_pca)

# Keep only the studies with a 'pca' pre-processing label and restore the configuration index
pca_feature_imp_df = flat_feature_imp_df.loc[pca_rows, :].set_index([*df_idx])
pca_feature_imp_df

### Raw Features

In [None]:
# Everything that is not in the PCA subset: rows are removed by their configuration index labels
nonpca_feature_imp_df = feature_imp_df.drop(pca_feature_imp_df.index)
nonpca_feature_imp_df

In [None]:
# Report for the 'full' dataset, weighting each row's importances by its balanced accuracy;
# the ten features with the highest weighted mean are shown
full_feature_imp_df = nonpca_feature_imp_df.query("dataset == 'full'")
full_feature_report = feature_importance_report(full_feature_imp_df, 'balanced_accuracy (test)', 'importance_by_permutation (test)')
full_feature_report.sort_values("Mean (Performance Weighted)", ascending=False).head(10)

In [None]:
# Same report restricted to the 'img' dataset
img_feature_imp_df = nonpca_feature_imp_df.query("dataset == 'img'")
img_feature_report = feature_importance_report(img_feature_imp_df, 'balanced_accuracy (test)', 'importance_by_permutation (test)')
img_feature_report.sort_values("Mean (Performance Weighted)", ascending=False).head(10)

In [None]:
# Same report restricted to the 'clinical' dataset
clin_feature_imp_df = nonpca_feature_imp_df.query("dataset == 'clinical'")
clin_feature_report = feature_importance_report(clin_feature_imp_df, 'balanced_accuracy (test)', 'importance_by_permutation (test)')
clin_feature_report.sort_values("Mean (Performance Weighted)", ascending=False).head(10)