In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [2]:
# Root folder holding the test outputs.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
test_dir = Path(r"C:\Users\mvand\Documents\Master EE\Year 4\Thesis\data\test_output")
# Sub-folder with the PCA analysis CSV files that the cells below glob for.
pca_dir = test_dir / "PCA_analysis_1"

# Selection switches.
# NOTE(review): not referenced by any cell visible in this notebook - the later
# cells pass the set / type names as string literals instead.
select_set = 'set1'      # 'set1' 'set2'
select_type = 'grid'     # 'grid' 'single'


def get_pca_files(search_dir, dataset, datatype, fmt = '*.csv'):
    """Collect the performance files of one dataset / data type.

    A file in ``search_dir`` matching the glob pattern ``fmt`` is selected when its
    stem contains both ``dataset`` and ``datatype`` and does not contain ``'class'``.

    Parameters
    ----------
    search_dir : pathlib.Path
        Directory to search (non-recursive unless ``fmt`` says otherwise).
    dataset : str
        Substring that must occur in the file stem, e.g. ``'set1'``.
    datatype : str
        Substring that must occur in the file stem, e.g. ``'grid'``.
    fmt : str
        Glob pattern handed to ``search_dir.glob``.

    Returns
    -------
    (list of pathlib.Path, list of pathlib.Path)
        ``pca_set``: selected files whose stem does not contain ``'noPCA'``;
        ``pca_bench``: selected files whose stem contains ``'noPCA'``.
        Both lists are sorted by path.
    """
    # glob() gives no ordering guarantee, but callers index the result by position
    # and pair it with positional label lists - sort to make the order deterministic.
    files = sorted(
        file for file in search_dir.glob(fmt)
        if 'class' not in file.stem and datatype in file.stem and dataset in file.stem
    )

    pca_set = [file for file in files if 'noPCA' not in file.stem]
    pca_bench = [file for file in files if 'noPCA' in file.stem]

    return pca_set, pca_bench

In [3]:
## TO DO
## - add std bars to bar plots 
## - check confusion matrix value plots 

In [4]:
# Benchmark file: the first CSV in pca_dir whose stem contains 'benchmark'.
fn_bench = list(filter(lambda path: 'benchmark' in path.stem, pca_dir.glob("*.csv")))[0]

# Load it and expose the 'hit_ratio' column under the name 'hit_rate' as well.
df_bench = pd.read_csv(fn_bench)
df_bench = df_bench.assign(hit_rate=df_bench['hit_ratio'])

# Score columns followed by the count columns.
stat_cols = (['accuracy', 'precision', 'recall', 'f1', 'balanced_acc', 'hit_rate']
             + ['TP', 'TN', 'FP', 'FN', 'n', 'N'])

# Not the last expression of the cell, so this summary is computed but not displayed.
df_bench.groupby(by='model')[stat_cols].mean()

labels = ['60%', '70%', '80%', '90%', '95%', '99%']

In [5]:
## base 

def display_performance(benchmark_0, benchmark_1, list_performance_fn, grouper = 'model',
                        stat_cols=['accuracy', 'precision', 'recall', 'f1', 'balanced_acc'], performance_labels = [],
                       out_dir = None, fn_stem = None):
    """Draw one grouped bar chart per statistic, comparing benchmark and evaluated runs.

    For every column in ``stat_cols`` a figure is created that contains
      * one grey bar per group of ``benchmark_0``;
      * for every group of ``benchmark_1``: one dark-cyan bar for ``benchmark_1``
        followed by one cadet-blue bar per file in ``list_performance_fn``.
    Every bar height is the mean of the column within the group.

    Parameters
    ----------
    benchmark_0 : pandas.DataFrame
        First reference; needs the ``grouper`` column and all ``stat_cols``.
    benchmark_1 : pandas.DataFrame
        Second reference; its groups are the ones looked up in every performance file.
    list_performance_fn : iterable of path-like
        CSV files, each read with ``pd.read_csv`` and summarised like the benchmarks.
    grouper : str
        Column to group by.
    stat_cols : list of str
        Columns to plot (one figure each). The default list is never mutated.
    performance_labels : sequence of str
        Optional text written on the i-th performance bar of each group. Bars
        without a corresponding label are left unlabelled.
    out_dir : path-like or None
        When given, each figure is saved there as ``plot_<col>.png`` or
        ``plot_<fn_stem>_<col>.png``.
    fn_stem : str or None
        Optional infix for the saved file names.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a group of ``benchmark_1`` is missing from one of the performance files,
        or a requested column is missing from any input.
    """
    # Layout constants, in x-axis units.
    bar_width = 3
    bar_step = 3.5     # distance between the centres of neighbouring bars
    group_gap = 2      # extra space inserted after each group

    # Per-group means of both benchmarks.
    sum_bench0 = benchmark_0.groupby(by=grouper)[stat_cols].mean()
    sum_bench1 = benchmark_1.groupby(by=grouper)[stat_cols].mean()
    bench0_algs = sum_bench0.index.values
    eval_algs = sum_bench1.index.values

    # Per-group means of every performance file, in the order given.
    list_performance_df = [
        pd.read_csv(fn).groupby(by=grouper)[stat_cols].mean()
        for fn in list_performance_fn
    ]
    n_performance = len(list_performance_df)

    for col in stat_cols:

        fig, ax = plt.subplots(figsize=(16, 2))
        ax.set_title(col)

        x_plot = 0.
        x_tick_loc = []
        x_tick_label = []

        # Block of reference bars.
        for alg in bench0_algs:
            ax.bar(x_plot, sum_bench0.loc[alg, col], width=bar_width, color='grey')
            x_tick_loc.append(x_plot)
            x_tick_label.append(alg)
            x_plot += bar_step

        x_plot += group_gap

        # One group per evaluated algorithm: benchmark_1 bar, then the performance bars.
        for alg in eval_algs:
            ax.bar(x_plot, sum_bench1.loc[alg, col], width=bar_width, color='darkcyan')

            # Tick is shifted right so that it sits roughly under the middle of the group.
            x_tick_loc.append(x_plot + 2 * n_performance)
            x_tick_label.append(alg)
            x_plot += bar_step

            for i, df_alg in enumerate(list_performance_df):
                ax.bar(x_plot, df_alg.loc[alg, col], width=bar_width, color='cadetblue')

                # Guarding on the index (not just on "any labels given") avoids an
                # IndexError when fewer labels than performance files are passed.
                if i < len(performance_labels):
                    ax.text(x_plot - 1, 0.1, performance_labels[i],
                            color='black', size=8, rotation=90)

                x_plot += bar_step

            x_plot += group_gap

        # The y range is deliberately left automatic: count columns exceed [0, 1].
        ax.set_xticks(x_tick_loc)
        ax.set_xticklabels(x_tick_label, rotation=40)
        ax.grid()
        ax.set_xlim(-2, x_plot - 1)

        if out_dir is not None:
            if fn_stem is None:
                save_name = 'plot_{}.png'.format(col)
            else:
                save_name = 'plot_{}_{}.png'.format(fn_stem, col)
            fig.savefig(Path(out_dir) / save_name)

    return

In [6]:
### set 1 - single pixel 
# Texts written on the performance bars by display_performance, one per PCA file.
# NOTE(review): presumably the PCA levels encoded in the file names (e.g. '80PCA') - confirm.
labels = ['60%', '70%', '80%', '90%', '95%', '99%' ]
# Columns handed to display_performance as stat_cols (one figure per column).
display_cols = ['accuracy', 'precision', 'recall', 'f1', 'balanced_acc', 'hit_rate', 'TP', 'TN', 'FP']
# Figures are written straight into the test output folder.
fig_dir = test_dir 


# pca_set: files without 'noPCA' in the stem; pca_bench: the 'noPCA' file(s).
pca_set, pca_bench = get_pca_files(pca_dir, 'set1', 'single', fmt = '*.csv')

# Only the first 'noPCA' file is used; raises IndexError when none was found.
df_pca_bench = pd.read_csv(pca_bench[0])

# Disabled: bar plots of benchmark vs. noPCA vs. PCA runs.
# display_performance( df_bench, df_pca_bench, pca_set, performance_labels = labels,
#                    stat_cols = display_cols, 
#                     out_dir = fig_dir, fn_stem='set1_pixel'
#                    )

# Disabled: ad-hoc inspection of individual summaries.
# df_pca_bench.groupby(by='model')[stat_cols].mean()

# print(pca_set[3].stem)
# df = pd.read_csv(pca_set[3])
# df.groupby(by='model')[stat_cols].mean()

# print(pca_set[2].stem)
# df = pd.read_csv(pca_set[2])
# df.groupby(by='model')[stat_cols].mean()


In [7]:
### set 2 - single pixel 
# Texts written on the performance bars by display_performance, one per PCA file.
labels = ['60%', '70%', '80%', '90%', '95%', '99%' ]

# Overwrites pca_set / pca_bench / df_pca_bench of the previous cell with the set-2 files.
pca_set, pca_bench = get_pca_files(pca_dir, 'set2', 'single', fmt = '*.csv')

# Only the first 'noPCA' file is used; raises IndexError when none was found.
df_pca_bench = pd.read_csv(pca_bench[0])

# Disabled: bar plots of benchmark vs. noPCA vs. PCA runs (display_cols and fig_dir
# come from the set-1 single-pixel cell).
# display_performance( df_bench, df_pca_bench, pca_set, performance_labels = labels,  
#                     stat_cols = display_cols, 
#                     out_dir = fig_dir, fn_stem='set2_pixel' 
#                    )
    

# Disabled: ad-hoc inspection of individual summaries.
# df_pca_bench.groupby(by='model')[stat_cols].mean()

# print(pca_set[3].stem)
# df = pd.read_csv(pca_set[3])
# df.groupby(by='model')[stat_cols].mean()

# print(pca_set[2].stem)
# df = pd.read_csv(pca_set[2])
# df.groupby(by='model')[stat_cols].mean()

In [30]:
### set 1- grid

# Overwrites pca_set / pca_bench / df_pca_bench with the set-1 grid files.
pca_set, pca_bench = get_pca_files(pca_dir, 'set1', 'grid', fmt = '*.csv')

# Only the first 'noPCA' file is used; raises IndexError when none was found.
df_pca_bench = pd.read_csv(pca_bench[0])

# Disabled: bar plots of benchmark vs. noPCA vs. PCA runs.
# NOTE(review): fn_stem below is copy-pasted ('set2_pixel'). With the same out_dir this
# would overwrite the set-2 single-pixel figures - change it before re-enabling.
# display_performance( df_bench, df_pca_bench, pca_set, performance_labels = labels,  
#                     stat_cols = display_cols, 
#                     out_dir = fig_dir, fn_stem='set2_pixel' 
#                    )

# df_pca_bench.groupby(by='model')[stat_cols].mean()

# Show the per-model mean of every stat column for one PCA file. ix is a position in
# pca_set; the printed stem tells which file was actually picked.
ix=2
print(pca_set[ix].stem)
df = pd.read_csv(pca_set[ix])
# Restrict (and order) the table to the four '-2' model variants.
df.groupby(by='model')[stat_cols].mean().loc[['RF-2', 'SVM-2', 'k-nn-2', 'LR-2' ]]


alg_pf_grid_80PCA_set1_6_diff


Unnamed: 0_level_0,accuracy,precision,recall,f1,balanced_acc,hit_rate,TP,TN,FP,FN,n,N
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
RF-2,0.988243,0.999999,0.010335,0.010229,0.505167,0.010335,1.0,8051.8,0.0,95.8,96.8,8148.6
SVM-2,0.989716,0.559859,0.161089,0.123515,0.579837,0.142514,13.8,8051.0,11.4,72.4,96.8,8148.6
k-nn-2,0.988194,0.366666,0.006296,0.006167,0.503036,0.006231,0.6,8051.8,1.8,94.4,96.8,8148.6
LR-2,0.990255,0.493918,0.237437,0.158885,0.61749,0.191852,18.6,8050.6,19.8,59.6,96.8,8148.6


In [33]:
### set 2- grid

# Overwrites pca_set / pca_bench / df_pca_bench with the set-2 grid files.
pca_set, pca_bench = get_pca_files(pca_dir, 'set2', 'grid', fmt = '*.csv')

# Only the first 'noPCA' file is used; raises IndexError when none was found.
df_pca_bench = pd.read_csv(pca_bench[0])

# Disabled: bar plots of benchmark vs. noPCA vs. PCA runs.
# NOTE(review): fn_stem below is copy-pasted ('set2_pixel'). With the same out_dir this
# would overwrite the set-2 single-pixel figures - change it before re-enabling.
# display_performance( df_bench, df_pca_bench, pca_set, performance_labels = labels,  
#                     stat_cols = display_cols, 
#                     out_dir = fig_dir, fn_stem='set2_pixel' 
#                    )

# df_pca_bench.groupby(by='model')[stat_cols].mean().loc[['RF-2', 'SVM-2', 'k-nn-2', 'LR-2' ]]

# Show the per-model mean of every stat column for one PCA file. ix is a position in
# pca_set; the printed stem tells which file was actually picked.
ix=3
print(pca_set[ix].stem)
df = pd.read_csv(pca_set[ix])
# Restrict (and order) the table to the four '-2' model variants.
df.groupby(by='model')[stat_cols].mean().loc[['RF-2', 'SVM-2', 'k-nn-2', 'LR-2' ]]

alg_pf_grid_90PCA_set2_6_diff


Unnamed: 0_level_0,accuracy,precision,recall,f1,balanced_acc,hit_rate,TP,TN,FP,FN,n,N
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
RF-2,0.988269,0.799999,0.009015,0.008914,0.504494,0.009015,0.8,7395.8,0.2,87.6,88.6,7484.4
SVM-2,0.989523,0.60014,0.134204,0.107769,0.566616,0.123785,11.0,7395.0,7.2,71.2,88.6,7484.4
k-nn-2,0.988216,0.166667,0.004598,0.004469,0.502164,0.00452,0.4,7395.8,2.0,86.2,88.6,7484.4
LR-2,0.990326,0.417648,0.293774,0.169385,0.645083,0.209793,18.6,7393.4,26.8,45.6,88.6,7484.4


In [None]:
## load classification results - analyse p0 and p1 for TP and FP

# Classification result files: every CSV in pca_dir whose stem contains 'class'.
# Sorted because glob() gives no ordering guarantee, while the plotting cells pair
# these files by position with their `label_files` lists.
class_files = sorted(file for file in pca_dir.glob('*.csv') if 'class' in file.stem)
# NOTE(review): these filters look for 'set-1' / 'set-2' (with hyphen) whereas
# get_pca_files is called with 'set1' / 'set2' - confirm the classification files
# really use the hyphenated form, otherwise both lists are empty.
set1_class = [file for file in class_files if 'set-1' in file.stem]
set2_class = [file for file in class_files if 'set-2' in file.stem]

In [None]:
def _confusion_rows(df, range_col, hat_col, label):
    """Return the rows of ``df`` belonging to the confusion-matrix cell named in ``label``."""
    # (code, required value in range_col, required value in hat_col)
    cells = (('TP', 1, 1), ('FP', 0, 1), ('TN', 0, 0), ('FN', 1, 0))

    selected = None
    for code, truth, predicted in cells:
        # Substring match; when several codes occur in the label the last one in
        # the order TP, FP, TN, FN wins.
        if code in label:
            selected = df[(df[range_col] == truth) & (df[hat_col] == predicted)]

    if selected is None:
        raise ValueError(
            "label {!r} does not contain any of 'TP', 'FP', 'TN', 'FN'".format(label))
    return selected


def plot_confusion(df, range_col, target_col, model, title, save_dir = None,
                  label_1 = 'TP', label_2 = 'FP', id_col = 'tag', bin_edges = None):
    """Overlay the class-1 probability histograms of two confusion-matrix cells.

    The rows of ``df`` are split on the truth column ``range_col`` and the model
    prediction column ``'<model>_target'``:

        TP: truth 1, prediction 1      FP: truth 0, prediction 1
        TN: truth 0, prediction 0      FN: truth 1, prediction 0

    The ``'<model>_p1'`` values of the ``label_1`` rows are drawn in blue on the left
    y axis and those of the ``label_2`` rows in translucent red on a twin y axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``range_col``, ``'<model>_target'`` and ``'<model>_p1'``.
    range_col : str
        Column holding the truth value (0 / 1) used for the split.
    target_col : str
        Unused; kept so existing calls keep working.
    model : str
        Model name used to build the prediction / probability column names.
    title : str
        Figure title, also the first part of the saved file name.
    save_dir : pathlib.Path or None
        When given, the figure is saved as ``<title>_<label_1>_<label_2>.png``.
    label_1, label_2 : str
        Must each contain one of ``'TP'``, ``'FP'``, ``'TN'``, ``'FN'``.
    id_col : str
        Unused; kept so existing calls keep working.
    bin_edges : array-like or None
        Histogram bin edges; defaults to ``np.arange(0, 1.01, 0.02)``.

    Returns
    -------
    pathlib.Path or None
        Path of the saved figure, or ``None`` when nothing was saved.

    Raises
    ------
    ValueError
        If a label names none of the four confusion-matrix cells.
    """
    # Column names follow from the model name instead of being read from
    # notebook-level variables that happen to exist.
    hat_col = '{}_target'.format(model)
    p1_col = '{}_p1'.format(model)
    if bin_edges is None:
        bin_edges = np.arange(0, 1.01, 0.02)

    # The unfinished per-id TP search that used to sit here did not parse and its
    # result was never used; TP is simply "truth 1 and prediction 1".
    plot_1 = _confusion_rows(df, range_col, hat_col, label_1)
    plot_2 = _confusion_rows(df, range_col, hat_col, label_2)

    ## plot
    fig, ax1 = plt.subplots()
    ax1.hist(plot_1[p1_col], color = 'b', label = label_1, bins = bin_edges )
    # Separate y axis: the two cells can differ by orders of magnitude in size.
    ax2 = ax1.twinx()
    ax2.hist(plot_2[p1_col], color = 'r', alpha=0.3, label = label_2, bins = bin_edges )

    ax1.tick_params(axis='y', labelcolor='b')
    ax2.tick_params(axis='y', labelcolor='r')

    ax1.set_xlim(0,1.05)
    ax1.set_xlabel('Class 1 probability')
    ax1.set_ylabel('n {}'.format(label_1), color='b')
    ax2.set_ylabel('n {}'.format(label_2), color='r')

    fig.legend()
    fig.suptitle(title)

    if save_dir is not None:
        fn = save_dir / '{}_{}_{}.png'.format(title, label_1, label_2)
        print('Save: ', fn.stem)
        fig.savefig(fn)
        return fn

    return

In [None]:
## SET 1 - TP and FP (the call below passes label_1 = 'TP', label_2 = 'FP')
# Truth columns in the classification files.
range_col = 'range_target'
target_col = 'target'

# Model to analyse and the per-model columns derived from its name.
model = 'RF-2'
p0_col = '{}_p0'.format(model)   # not used by any code visible in this notebook
p1_col = '{}_p1'.format(model)
hat_col = '{}_target'.format(model)

# Histogram bin edges: 0.00, 0.02, ..., 1.00
bin_edges = np.arange(0,1.01, 0.02)

# Figure titles, matched to set1_class by position - this relies on the files
# being in the order 60, 70, 80, 90, 95, 99, noPCA.
label_files = ['set_1_60PCA_{}'.format(model),
              'set_1_70PCA_{}'.format(model),
              'set_1_80PCA_{}'.format(model),
              'set_1_90PCA_{}'.format(model),
              'set_1_95PCA_{}'.format(model),
              'set_1_99PCA_{}'.format(model),
              'set_1_noPCA_{}'.format(model)]


# One figure per classification file; raises IndexError when there are more
# files than labels.
for n, fn in enumerate(set1_class):
    
    # First CSV column is used as the index.
    df = pd.read_csv(fn, index_col=0)
    
    plot_confusion(df, range_col, target_col, model, label_files[n], save_dir = None,
                  label_1 = 'TP', label_2 = 'FP')


In [None]:
# SET 2 - TP and FP. Reuses range_col, target_col and model from the set-1 cell,
# so that cell has to be run first.
# Figure titles, matched to set2_class by position.
# NOTE(review): unlike the set-1 titles these do not include the model name.
label_files = ['set_2_60PCA',
              'set_2_70PCA',
              'set_2_80PCA',
              'set_2_90PCA',
              'set_2_95PCA',
              'set_2_99PCA',
              'set_2_noPCA',]


# One figure per classification file; raises IndexError when there are more
# files than labels.
for n, fn in enumerate(set2_class):
    
    # First CSV column is used as the index.
    df = pd.read_csv(fn, index_col=0)
    
    plot_confusion(df, range_col, target_col, model, label_files[n], save_dir = None,
                  label_1 = 'TP', label_2 = 'FP')
