## Set up the Environment

In [None]:
from time import time
import os
import pathlib
import pickle
import importlib

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import scipy.stats as st
from plotnine import *
import pprint
pp = pprint.PrettyPrinter(indent=4)

from ectrl.control import split_the_data
from ectrl.control import ClassificationTest, Umbrella, Typicality, DirectNP, TBC, EnsembleTBC
from ectrl.control import ForcedInductiveConformal
from ectrl.ratio import KernelDensityRatio
from ectrl.augment import Interpolator
from ectrl.evaluate import evaluate_once
from ectrl.analyze import plot_3, analyze_numerically, select, style, plot_time

In [None]:
# Output location for every plot and result table produced below
_results_path = pathlib.Path('hmeq', 'results')
res_dir = str(_results_path)

# Create the directory (and any missing parents); harmless if it already exists
_results_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Echo the output directory as the cell result (quick sanity check)
res_dir

In [None]:
# Data-set tag prepended to the names of the plot/pickle files saved below
set_prefix = 'hmeq_'

## Prepare the Data

In [None]:
# Read the raw HMEQ table
df = pd.read_csv('hmeq/data/hmeq.csv')

# Replace missing entries column by column:
#   JOB, REASON -> an explicit 'Unknown' category
#   MORTDUE     -> 0
for _column, _filler in (('JOB', 'Unknown'), ('REASON', 'Unknown'), ('MORTDUE', 0)):
    df.loc[df[_column].isna(), _column] = _filler

In [None]:
# Separate the label column 'BAD' (ground truth) from the feature columns,
# keeping the features in their original order
_feature_columns = [column for column in df.columns if column != 'BAD']
X = df.loc[:, _feature_columns]
y = df['BAD'].values

In [None]:
def create_preprocessor():
    """Build a fresh, unfitted column-wise preprocessor for the feature table.

    Columns are addressed by position: positions 3 and 4 are one-hot encoded
    into a dense array, the other ten positions are mean-imputed and then
    standardised. (Positions 3 and 4 are presumably REASON and JOB -- confirm
    against the column order of the CSV.)

    Returns:
        A new ``ColumnTransformer``; every call returns an independent
        instance, so each controller can own its own preprocessor.
    """
    categorical = [3, 4]
    numerical = [0, 1, 2, 5, 6, 7, 8, 9, 10, 11]

    # Numerical columns: fill gaps with the column mean, then standardise
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler()),
    ])

    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to
    # `sparse_output` and 1.4 removed the old spelling; try the new name first
    # and fall back so the notebook runs on both old and new releases.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)

    return ColumnTransformer([
        ('categorical', encoder, categorical),
        ('numerical', numerical_pipeline, numerical),
    ])

In [None]:
# Label passed as `target_class` to every controller and to evaluate_once
target_class = 1

In [None]:
# Class balance: (rows labelled target_class, rows labelled 1 - target_class);
# the second count assumes the labels are coded 0/1
(y == target_class).sum(), (y == 1 - target_class).sum()

## Create Controllers

In [None]:
# Registry of the controllers to evaluate, keyed by a descriptive name
controllers = {}

### SCT

In [None]:
# Classifier behind SCT: the shared preprocessing followed by an RBF-kernel SVM
_sct_preprocessor = create_preprocessor()
_sct_svm = SVC(kernel='rbf')
clf = Pipeline(steps=[('preprocessor', _sct_preprocessor),
                      ('classifier', _sct_svm)])

def svm_statistic(svm_classifier, X):
    """Score raw feature rows with a fitted preprocessor + SVM pipeline.

    Args:
        svm_classifier: object indexable by step name, exposing a
            'preprocessor' step (with ``transform``) and a 'classifier' step
            (with ``decision_function``).
        X: raw (not yet preprocessed) feature rows.

    Returns:
        Whatever the classifier's ``decision_function`` returns for the
        transformed rows.
    """
    transformed = svm_classifier['preprocessor'].transform(X)
    classifier = svm_classifier['classifier']
    return classifier.decision_function(transformed)

# SCT controller. Only `reserve` and `target_class` matter here; the remaining
# keywords are passed as None because they are not used.
_sct_options = dict(
    reserve=0.5,        # reserve 50% target objects for classification
    ci=None,            # not used
    augmentor=None,     # not used
    sample_size=None,   # not used
    target_class=target_class,
)
svm_test = ClassificationTest(clf, svm_statistic, **_sct_options)
controllers['SCT'] = svm_test

### UA

In [None]:
# Register one UA controller per (delta, ensemble size) combination
_ua_grid = [(delta, ensemble_size)
            for delta in [0.01, 0.05]
            for ensemble_size in [1, 5]]

for delta, ensemble_size in _ua_grid:
    # every controller gets its own unfitted preprocessor + SVM
    base_clf = Pipeline(steps=[('preprocessor', create_preprocessor()),
                               ('classifier', SVC(kernel='rbf'))])
    ua = Umbrella(
        base_clf,
        svm_statistic,
        target_class=target_class,
        delta=delta,
        thresholds_size=0.5,  # reserve 50% of target objects for classification
        ensemble_size=ensemble_size,
    )
    name = f'UA(delta={delta},m={ensemble_size})'
    controllers[name] = ua

### TBC and WTBC

In [None]:
# Register the TBC / WTBC grid. The unweighted variant (weights == 'none') is
# filed under 'TBC', the weighted one under 'WTBC'.
_tbc_grid = [(k, test, distance, weights)
             for k in [3, 5, 7, 10]
             for test in ['mwu', 'ttest_ind']
             for distance in ['cityblock', 'cosine']
             for weights in ['none', 'reciprocal']]

for k, test, distance, weights in _tbc_grid:
    tbc = TBC(k=k, test=test, distance=distance,
              weights=weights, target_class=target_class)
    preprocessor = create_preprocessor()
    method = 'TBC' if weights == 'none' else 'WTBC'
    name = f'{method}(k={k},test={test},distance={distance})'
    controllers[name] = Pipeline(steps=[('preprocessor', preprocessor),
                                        (method, tbc)])

### CPF

In [None]:
# Add CPF
def svm_statistic_for_cpf(svm_classifier, X):
    """Return ``svm_classifier.decision_function(X)``.

    Unlike ``svm_statistic``, no preprocessing is applied here: ``X`` is
    handed to the classifier as is.
    """
    decision_values = svm_classifier.decision_function(X)
    return decision_values

# One CPF controller per nonconformity measure
for nonconformity in ['score', 'avgdev', 'nearest_neighbor']:
    clf = SVC(kernel='rbf')
    _cpf_options = dict(
        target_class=target_class,
        alpha=0.05,
        reserve=0.5,  # use 50% as D_proper and the remaining 50% for calibration
        nonconformity=nonconformity,
        random_state=None,
    )
    cpf = ForcedInductiveConformal(clf, svm_statistic_for_cpf, **_cpf_options)
    # the conformal predictor sits behind the shared preprocessing
    cpf = Pipeline(steps=[('preprocessor', create_preprocessor()),
                          ('CPF', cpf)])
    name = f'CPF(nonconformity={nonconformity})'
    controllers[name] = cpf

### Typicality

In [None]:
# Add the Typicality Index (TI) controller: a kernel density estimator fed by
# the shared preprocessing
_ti_steps = [('preprocessor', create_preprocessor()),
             ('kde', KernelDensity())]
density_pipeline = Pipeline(_ti_steps)

typicality = Typicality(density_pipeline, target_class=target_class)
controllers['TI'] = typicality

### Direct Neyman-Pearson Classification

In [None]:
# Add DNP (direct Neyman-Pearson classification)

# Kernel density-ratio estimator with a polynomial kernel of degree 2
kdr = KernelDensityRatio(kernel='polynomial', kernel_parameters={'degree': 2})

# 50% of the data (threshold_subset_size) are used to estimate the thresholds
direct_np = DirectNP(kdr, target_class=target_class, threshold_subset_size=0.5)

_dnp_steps = [('preprocessor', create_preprocessor()), ('NPD', direct_np)]
dnp = Pipeline(steps=_dnp_steps)
controllers['DNP'] = dnp

In [None]:
# Sanity check: the cells above register 1 SCT + 4 UA + 32 TBC/WTBC + 3 CPF
# + 1 TI + 1 DNP = 42 controllers
print(len(controllers))

## Evaluate the Controllers

In [None]:
# Settings (all of them are handed to evaluate_once below)
seed = 11  # random seed
eval_size = 0.3  # presumably the fraction of the data held out for evaluation -- confirm in ectrl.evaluate
nominal_rates =  np.arange(0.01, 1, 0.01)  # grid of nominal rates with step 0.01, starting at 0.01; the stop value 1 is excluded
confidence_level = 0.99  # passed as the confidence_level keyword

In [None]:
# Fit the SCT controller on the full data set and look at the in-sample
# distribution of its decision values, one density curve per class.
# NOTE: `df` is rebound here -- from now on it holds the scores ('d') and the
# labels ('c'), no longer the raw HMEQ table.
svm_test.fit(X, y)

_decision_values = svm_test.decision_function(X)
df = pd.DataFrame({'d': _decision_values, 'c': y})

(
    ggplot(df, aes(x='d', color='factor(c)', linetype='factor(c)'))
    + geom_density(aes(y='..density..'), size=2)
    + theme_classic()
)

In [None]:
# Summary statistics of the score/label frame built in the previous cell
df.describe()

In [None]:
# Run the experiment: evaluate every registered controller in a single call
_eval_args = (controllers, X, y, target_class, eval_size, seed, nominal_rates)
eval_results = evaluate_once(*_eval_args, confidence_level=confidence_level)


In [None]:
# Unpack the results: evaluate_once returned three objects, bound here as the
# rate estimates, the classification times and the fit times (in this order)
df_results, df_clf_times, df_fit_times = eval_results

In [None]:
# Write the three result tables to <res_dir>/<name>.csv, without the index
_tables_to_save = (
    ('df_results', df_results),
    ('df_fit_times', df_fit_times),
    ('df_clf_times', df_clf_times),
)
for _stem, _table in _tables_to_save:
    filepath = os.path.join(res_dir, f'{_stem}.csv')
    _table.to_csv(filepath, index=False)

## Analyze

### Load the Results

In [None]:
# Read the three result tables back from <res_dir>, so the analysis below can
# run without repeating the evaluation
_loaded = {}
for _stem in ('df_results', 'df_fit_times', 'df_clf_times'):
    filepath = os.path.join(res_dir, f'{_stem}.csv')
    _loaded[_stem] = pd.read_csv(filepath)

df_results = _loaded['df_results']
df_fit_times = _loaded['df_fit_times']
df_clf_times = _loaded['df_clf_times']

### Choose the Best UA

In [None]:
# Per-method hyper-parameter selections, filled in section by section below
choices = {}

In [None]:
# UA rows only: 'nominal' vs. 'target_estimate', keyed by 'delta', one facet
# per 'ensemble_size', with the target bounds as ribbon
_is_ua = df_results['method'] == 'UA'
plot_3(df_results[_is_ua], 'nominal', 'target_estimate', 'delta',
       ribbon=('target_lower', 'target_upper'),
       ab=(1, 0),
       facet_parameter='ensemble_size',
       width=7, height=5,
       display_plot=True,
       save_plot=True, location=res_dir, prefix=f'{set_prefix}UA')

In [None]:
# UA rows only: 'target_estimate' vs. 'nontarget_estimate', keyed by 'delta',
# one facet per 'ensemble_size', with the non-target bounds as ribbon
_is_ua = df_results['method'] == 'UA'
plot_3(df_results[_is_ua], 'target_estimate', 'nontarget_estimate', 'delta',
       ribbon=('nontarget_lower', 'nontarget_upper'),
       facet_parameter='ensemble_size',
       width=7, height=5,
       display_plot=True,
       save_plot=True, location=res_dir, prefix=f'{set_prefix}UA')

In [None]:
# Numerical analysis of the UA results over its hyper-parameters (delta,
# ensemble_size); displayed in ascending order of the 'D(A, OA)' column
r = analyze_numerically(df_results, ['delta', 'ensemble_size'], 'UA')
r.sort_values('D(A, OA)')

In [None]:
# Same table, now in ascending order of the 'D(A, B, OA)' column
r.sort_values('D(A, B, OA)')

In [None]:
# UA configuration used for each focus; both foci use the same one
choices['UA'] = {
    'exact': dict(ensemble_size=1, delta=0.05),
    'valid': dict(ensemble_size=1, delta=0.05),
}

### Choose the Best TBC

In [None]:
# TBC rows only: 'nominal' vs. 'target_estimate', keyed by 'k', faceted by
# 'test+distance', with the target bounds as ribbon
_is_tbc = df_results['method'] == 'TBC'
plot_3(df_results[_is_tbc], 'nominal', 'target_estimate', 'k',
       ribbon=('target_lower', 'target_upper'),
       ab=(1, 0),
       facet_parameter='test+distance',
       width=7, height=5,
       display_plot=True,
       save_plot=True, location=res_dir, prefix=f'{set_prefix}TBC')

In [None]:
# TBC rows only: 'target_estimate' vs. 'nontarget_estimate', keyed by 'k',
# faceted by 'test+distance', with the non-target bounds as ribbon
_is_tbc = df_results['method'] == 'TBC'
plot_3(df_results[_is_tbc], 'target_estimate', 'nontarget_estimate', 'k',
       ribbon=('nontarget_lower', 'nontarget_upper'),
       facet_parameter='test+distance',
       width=7, height=5,
       display_plot=True,
       save_plot=True, location=res_dir, prefix=f'{set_prefix}TBC')

In [None]:
# Numerical analysis of the TBC results over its hyper-parameters (k, test,
# distance); displayed in ascending order of the 'D(A, OA)' column
r = analyze_numerically(df_results, ['k', 'test', 'distance'], 'TBC')
r.sort_values('D(A, OA)')

In [None]:
# Same table, now in ascending order of the 'D(A, B, OA)' column
r.sort_values('D(A, B, OA)')

In [None]:
# TBC configuration used for each focus (the two differ only in k)
choices['TBC'] = {
    'exact': dict(k=5, test='ttest_ind', distance='cityblock'),
    'valid': dict(k=3, test='ttest_ind', distance='cityblock'),
}

### Choose the Best WTBC

In [None]:
# WTBC rows only: 'nominal' vs. 'target_estimate', keyed by 'k', faceted by
# 'test+distance'. No ribbon is passed for this plot.
_is_wtbc = df_results['method'] == 'WTBC'
plot_3(df_results[_is_wtbc], 'nominal', 'target_estimate', 'k',
       ab=(1, 0),
       facet_parameter='test+distance',
       width=7, height=5,
       display_plot=True,
       save_plot=True, location=res_dir, prefix=f'{set_prefix}WTBC')

In [None]:
# WTBC rows only: 'target_estimate' vs. 'nontarget_estimate', keyed by 'k',
# faceted by 'test+distance', with the non-target bounds as ribbon
_is_wtbc = df_results['method'] == 'WTBC'
plot_3(df_results[_is_wtbc], 'target_estimate', 'nontarget_estimate', 'k',
       ribbon=('nontarget_lower', 'nontarget_upper'),
       facet_parameter='test+distance',
       width=7, height=5,
       display_plot=True,
       save_plot=True, location=res_dir, prefix=f'{set_prefix}WTBC')

In [None]:
# WTBC rows only: 'nominal' vs. 'nontarget_estimate', keyed by 'k', faceted by
# 'test+distance', with the non-target bounds as ribbon
_is_wtbc = df_results['method'] == 'WTBC'
plot_3(df_results[_is_wtbc], 'nominal', 'nontarget_estimate', 'k',
       ribbon=('nontarget_lower', 'nontarget_upper'),
       facet_parameter='test+distance',
       width=7, height=5,
       display_plot=True,
       save_plot=True, location=res_dir, prefix=f'{set_prefix}WTBC')

In [None]:
# Numerical analysis of the WTBC results over its hyper-parameters (k, test,
# distance); displayed in ascending order of the 'D(A, OA)' column
r = analyze_numerically(df_results, ['k', 'test', 'distance'], 'WTBC')
r.sort_values('D(A, OA)')

In [None]:
# Same table, now in ascending order of the 'D(A, B, OA)' column
r.sort_values('D(A, B, OA)')

In [None]:
# WTBC configuration used for each focus
choices['WTBC'] = {
    'exact': dict(k=7, test='mwu', distance='cityblock'),
    'valid': dict(k=5, test='mwu', distance='cosine'),
}

### Choose the Best CPF

In [None]:
# CPF rows only: 'nominal' vs. 'target_estimate', keyed by 'nonconformity',
# with the target bounds as ribbon (no faceting)
_is_cpf = df_results['method'] == 'CPF'
plot_3(df_results[_is_cpf], 'nominal', 'target_estimate', 'nonconformity',
       ribbon=('target_lower', 'target_upper'),
       ab=(1, 0),
       width=7, height=5,
       display_plot=True,
       save_plot=True, location=res_dir, prefix=f'{set_prefix}CPF')

In [None]:
# CPF rows only: 'target_estimate' vs. 'nontarget_estimate', keyed by
# 'nonconformity', with the non-target bounds as ribbon.
# The file prefix is 'CPF': the previous 'WTBC' prefix was a copy-paste slip
# that reused the prefix of the WTBC plot of the same two columns.
plot_3(
    df_results[df_results['method'] == 'CPF'],
    'target_estimate',
    'nontarget_estimate',
    'nonconformity',
    ribbon=('nontarget_lower', 'nontarget_upper'),
    display_plot=True,
    save_plot=True, location=res_dir, prefix=set_prefix + 'CPF',
    width=7, height=5
)

In [None]:
# Numerical analysis of the CPF results over its single hyper-parameter
# (nonconformity); displayed in ascending order of the 'D(A, OA)' column
r = analyze_numerically(df_results, ['nonconformity'], 'CPF')
r.sort_values('D(A, OA)')

In [None]:
# Same table, now in ascending order of the 'D(A, B, OA)' column
r.sort_values('D(A, B, OA)')

In [None]:
# CPF nonconformity measure used for each focus
choices['CPF'] = {
    'exact': dict(nonconformity='avgdev'),
    'valid': dict(nonconformity='score'),
}

### Filter

In [None]:
# SCT, TI and DNP were registered with a single configuration each, so their
# selection is empty for both foci
for _method in ('SCT', 'TI', 'DNP'):
    choices[_method] = {'exact': {}, 'valid': {}}

pp.pprint(choices)

In [None]:
# For each focus, narrow the three tables down to the chosen configurations
results_dfs, clf_times_dfs, fit_times_dfs = {}, {}, {}

for focus in ['exact', 'valid']:
    # {method: chosen hyper-parameters for this focus}
    focus_choices = {method: per_focus[focus]
                     for method, per_focus in choices.items()}
    for _store, _table in ((results_dfs, df_results),
                           (clf_times_dfs, df_clf_times),
                           (fit_times_dfs, df_fit_times)):
        _store[focus] = select(_table, focus_choices)

In [None]:
# Persist the selections next to the other results
f = os.path.join(res_dir, f'{set_prefix}choices.p')
# the context manager closes (and thereby flushes) the file even if
# pickling fails
with open(f, 'wb') as _choices_file:
    pickle.dump(choices, _choices_file)

### Analyze Time

#### Average Classification Time

In [None]:
# Per focus: classification times by method, fastest first
for focus, _times in clf_times_dfs.items():
    print(focus)
    _method_times = _times[['method', 'time']]
    display(_method_times.sort_values('time'))

In [None]:
# Per focus: plot the classification time of each method
for focus, _times in clf_times_dfs.items():
    plot_time(_times, 'method', 'time', 'method',
              location=res_dir,
              name=f'{set_prefix}{focus}_average_classification_time.jpg')

#### Fit Times

In [None]:
# Per focus: fit times by method, fastest first
for focus, _times in fit_times_dfs.items():
    print(focus)
    _method_times = _times[['method', 'time']]
    display(_method_times.sort_values('time'))

In [None]:
# Per focus: plot the fit time of each method
for focus, _times in fit_times_dfs.items():
    plot_time(_times, 'method', 'time', 'method',
              location=res_dir,
              name=f'{set_prefix}{focus}_fit_times.jpg')

### Check the Rates

#### Nominal vs. Target Estimate

In [None]:
# Per focus: 'nominal' vs. 'target_estimate' for the selected configuration of
# every method, with the target bounds as ribbon
for focus, _rows in results_dfs.items():
    print(focus)
    _legend = dict(legend_position=(0.35, 0.8), legend_name='', legend_ncol=2,
                   legend_text_size=15, legend_key_width=35)
    g = plot_3(_rows, 'nominal', 'target_estimate', 'method',
               ribbon=('target_lower', 'target_upper'),
               ab=(1, 0),
               display_plot=True,
               save_plot=True, location=res_dir, width=5.35, height=4.35,
               prefix=f'{set_prefix}_{focus}_',
               **_legend)

#### Target vs. Other (Estimates)

In [None]:
# Per focus: 'target_estimate' vs. 'nontarget_estimate' for the selected
# configuration of every method, with the non-target bounds as ribbon
for focus, _rows in results_dfs.items():
    print(focus)
    plot_3(_rows, 'target_estimate', 'nontarget_estimate', 'method',
           ribbon=('nontarget_lower', 'nontarget_upper'),
           save_plot=True, location=res_dir, width=5, height=4,
           prefix=f'{set_prefix}_{focus}_')

#### Nominal vs. Other

In [None]:
import ectrl.analyze
importlib.reload(ectrl.analyze)
from ectrl.analyze import style, plot_3

In [None]:
# Per focus: 'nominal' vs. 'nontarget_estimate' for the selected configuration
# of every method, with the non-target bounds as ribbon
for focus, _rows in results_dfs.items():
    print(focus)
    _legend = dict(legend_position=(0.5, 0.9), legend_name='', legend_ncol=4,
                   legend_title=False, legend_text_size=13,
                   legend_key_width=35, legend_key_height=10)
    g = plot_3(_rows, 'nominal', 'nontarget_estimate', 'method',
               ribbon=('nontarget_lower', 'nontarget_upper'),
               display_plot=True,
               save_plot=True, location=res_dir, width=5.35, height=4.35,
               prefix=f'{set_prefix}_{focus}_',
               **_legend)

#### Nominal vs. Accuracy

In [None]:
# Per focus: 'nominal' vs. 'accuracy_estimate' for the selected configuration
# of every method, with the accuracy bounds as ribbon
for focus, _rows in results_dfs.items():
    print(focus)
    plot_3(_rows, 'nominal', 'accuracy_estimate', 'method',
           ribbon=('accuracy_lower', 'accuracy_upper'),
           save_plot=True, location=res_dir, width=5, height=4,
           prefix=f'{set_prefix}_{focus}_')

#### Target Estimate vs. Accuracy

In [None]:
# Per focus: 'target_estimate' vs. 'accuracy_estimate' for the selected
# configuration of every method, with the accuracy bounds as ribbon.
# The per-focus table is plotted, like in the other per-focus loops; plotting
# the unfiltered df_results here produced the same figure for both foci.
for focus in results_dfs:
    print(focus)
    plot_3(
        results_dfs[focus],
        'target_estimate',
        'accuracy_estimate',
        'method',
        ribbon=('accuracy_lower', 'accuracy_upper'),
        save_plot=True, location=res_dir, width=5, height=4,
        prefix=set_prefix + f'_{focus}_'
    )

## Nondeterministic Test

In [None]:
# Classifier for the nondeterministic test: the shared preprocessing followed
# by an RBF-kernel SVM
_nondet_steps = [('preprocessor', create_preprocessor()),
                 ('classifier', SVC(kernel='rbf'))]
clf = Pipeline(steps=_nondet_steps)

### Intervals

In [None]:
from datetime import datetime
from statsmodels.stats.proportion import proportion_confint
import math

# Outputs of the nondeterministic test go to a 'nondet' subdirectory.
# The guard keeps this cell idempotent: without it, every re-run would append
# another level (.../nondet/nondet/...).
if os.path.basename(os.path.normpath(res_dir)) != 'nondet':
    res_dir = os.path.join(res_dir, 'nondet')
pathlib.Path(res_dir).mkdir(parents=True, exist_ok=True)

In [None]:
def evaluate(target, nominal, epsilon, runs, m, n, X, y, seeds, clf):
    """Estimate, over repeated random splits, how often a rank-based test
    rejects objects of class `target` at the level `nominal`.

    Each of the `runs` runs makes a stratified 70/30 split, refits `clf` on
    the 70% part and scores the held-out rows of class `target`. It then
    repeats `m` times: draw n + 1 of those scores without replacement, treat
    the first as the tested object and the other n as the reference sample,
    compute a rank-based p-value and count the repetition if the (possibly
    randomly corrected) p-value does not exceed the threshold.

    Args:
        target: class label whose held-out rows are scored; it also selects
            the tail used for the p-value (see the branch on `target == 1`).
        nominal: the rate the p-values are compared against.
        epsilon: if > 0, a uniform random correction is added to every
            p-value; otherwise a fixed threshold is used.
        runs: number of random splits.
        m: number of repetitions per split.
        n: size of the reference sample in each repetition.
        X: feature table; must support `.loc` (a pandas DataFrame).
        y: labels aligned with the rows of `X`; must support `.tolist()` and
            element-wise comparison (a numpy array).
        seeds: indexable with at least `runs` entries; entry k seeds the
            split, the classifier and the sampling of run k.
        clf: pipeline with a 'classifier' step that accepts `random_state`,
            and with `fit` and `decision_function`. It is modified and refit
            in place.

    Returns:
        Tuple `(lower_estimates, estimates, upper_estimates, error)`: three
        lists with one entry per run (Jeffreys interval bounds at alpha=0.01
        and the observed proportion), and the summed (not averaged) distance
        of the estimates from the interval [nominal - epsilon, nominal].

    Raises:
        ValueError: from `rng.choice` when a split holds fewer than n + 1
            rows of class `target`.
    """
    support = 0  # NOTE(review): never read in this function
    estimates = []
    lower_estimates = []
    upper_estimates = []
    errors = []  # NOTE(review): never read in this function

    error = 0
    # Range of the random correction used when epsilon > 0;
    # lower_bound <= upper_bound requires epsilon >= 1 / (n + 1)
    lower_bound = (1 - nominal) / n
    upper_bound = (epsilon * (n + 1) - nominal) / n
        
    for k in range(runs):
        # progress indicator: overwrite one console line every 10 runs
        if k % 10 == 9:
            print(k + 1, datetime.now().strftime('%H:%M:%S'), end='\r', flush=True)
        rng = np.random.default_rng(seeds[k])
        X_cv, X_eval, y_cv, y_eval = train_test_split(X, y, 
                                                  test_size=0.3, 
                                                  stratify = y.tolist(),
                                                 random_state=seeds[k])
    
        clf.set_params(**{'classifier__random_state' : seeds[k]})
        clf = clf.fit(X_cv, y_cv)
    
        # held-out rows of the target class only
        Z = X_eval.loc[y_eval == target, :].reset_index(drop=True)
        scores = clf.decision_function(Z)
    
        false = 0  # number of repetitions counted in this run
        for j in range(m):
            sample = rng.choice(scores, n + 1, replace=False)
            score = sample[0]  # the first draw is the tested object (alternative once considered: rng.choice(scores, 1))
            sample = np.sort(sample[1:])
        
            if target == 1:
                # number of reference scores <= score
                more_extreme = np.searchsorted(sample, score, 'right')
            else:
                # i = number of reference scores < score, so n - i is the
                # number of reference scores >= score.
                # NOTE(review): the extra + 1 has no counterpart in the
                # target == 1 branch -- confirm it is intended.
                i = np.searchsorted(sample, score, 'left')
                more_extreme = n - i + 1
        
            p_value = more_extreme / n
        
            if epsilon > 0:
                # randomised decision: shift the p-value by a uniform draw
                correction = rng.uniform(lower_bound, upper_bound)
                if p_value + correction <= nominal:
                    false = false + 1
            else:
                # deterministic decision with a fixed threshold
                if p_value <= (nominal * (n + 1) - 1) / n:
                    false = false + 1
    
        # Jeffreys interval for the proportion false / m at alpha = 0.01
        lower, upper = proportion_confint(false, m, 0.01, 'jeffreys')

        estimate = false / m
        estimates.append(estimate)
    
        lower_estimates.append(lower)
        upper_estimates.append(upper)
    
        # accumulate how far the estimate lies outside [nominal - epsilon, nominal]
        if nominal < estimate:
            error = error + (estimate - nominal)
        elif estimate < nominal - epsilon:
            error = error + (nominal - epsilon - estimate)
            
    return (lower_estimates, estimates, upper_estimates, error)

In [None]:
# Experiment settings
runs = 100
target = 1
seeds = [19 + 11 * k for k in range(runs)]  # one seed per run
m = 10000
epsilon = 0.02  # the same for every (nominal, n) setting; also read by the plotting cell

results = {}
errors = []    # rows [nominal, n, mean error per run]
supports = []  # rows [nominal, n, lower, estimate, upper]
for nominal in [0.05, 0.1, 0.2]:
    for n in [50, 100]:
        # skip the settings for which evaluate() is not meant to be run
        if nominal <= epsilon or epsilon < 1 / (n + 1):
            continue
        print(nominal, n, epsilon, '\n')

        results[(nominal, n)] = {}
        lower, estimates, upper, error = evaluate(
            target, nominal, epsilon, runs, m, n, X, y, seeds, clf
        )

        results[(nominal, n)]['df'] = pd.DataFrame(
            {'lower': lower, 'estimates': estimates, 'upper': upper}
        )
        errors.append([nominal, n, error / runs])

        # number of runs whose estimate falls into [nominal - epsilon, nominal]
        support = sum(1 for estimate in estimates
                      if nominal - epsilon <= estimate <= nominal)

        e = support / runs
        l, u = proportion_confint(support, runs, 0.01, 'jeffreys')
        supports.append([nominal, n, l, e, u])

In [None]:
# Persist the per-setting results; the context manager closes (and thereby
# flushes) the file even if pickling fails
with open(os.path.join(res_dir, 'interval_df.p'), 'wb') as _results_file:
    pickle.dump(results, _results_file)

In [None]:
# Reload the results written by the previous cell (a file this notebook
# produced itself) and close the file handle right after reading
with open(os.path.join(res_dir, 'interval_df.p'), 'rb') as _results_file:
    res_nd = pickle.load(_results_file)
res_nd

In [None]:
# One figure per (nominal rate, n) setting: the per-run estimates (dashed),
# their confidence band (shaded) and two horizontal lines at alpha and
# alpha - epsilon.
# NOTE: `epsilon` is not stored in the pickle; the value assigned by the
# experiment cell is used, so that cell must have been run in this session.
for key in res_nd:
    alpha, n = key
    print(key)
    df = res_nd[key]['df']
    runs = df.shape[0]
    fig = plt.figure(figsize=(4, 2), dpi=500)
    plt.rc('font', size=10) 
    x = list(range(runs))
    plt.fill_between(x, df['lower'], df['upper'], color='red', alpha=0.2)
    plt.plot(x, df['estimates'], color='blue', linestyle='--')

    plt.plot(x, np.repeat(alpha, runs), color='black', linewidth=3)
    plt.plot(x, np.repeat(alpha - epsilon, runs), color='black', linewidth=3)
    
    plt.xlabel('')
    plt.ylabel('')
    plt.xticks([])

    filename = os.path.join(res_dir, f'{set_prefix}_interval_{key}.jpg')
    plt.tight_layout()
    # The size is a property of the figure (set when it was created above);
    # `figsize` is not a savefig() option and current matplotlib rejects it
    # with a TypeError.
    plt.savefig(filename, dpi=500)
    