# Statistical analysis - experiment 2

### Import the libraries 

In [1]:
import os
import numpy as np
import pandas as pd
from csv import reader
from statsmodels.stats.anova import AnovaRM
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline 

### Function to collect all csv-files in the folder

In [2]:
def list_csv_files(resultsdir):
    csv_files = []

    if os.path.isdir(resultsdir):
        for root, dirs, files in list(os.walk(resultsdir)):
            for name in files:
                subdir = root.split(resultsdir)
                all('' == s for s in subdir)
                
                if subdir[1].startswith('/'):
                    subdirname = subdir[1][1:]
                else:
                    subdirname = subdir[1]

                if name.lower().endswith('.csv'):
                    if all('' == s for s in subdir):
                        csv_files.append(name)
                    else:
                        csv_files.append(os.path.join(subdirname, name))
    
        csv_files.sort()

    return csv_files

### Function to get all unique combinations

In [3]:
def get_unique_combinations(csv_files):
    unique_combinations = []
    for c in range(len(csv_files)):
        csv_file = csv_files[c]
        splits = csv_file.split("/")
        run_no = splits[0].split("run")[-1]
        sam_mt = splits[1].split("_")[0]
        unc_mt = splits[1].split("_")[1]
        prob = splits[1].split("_")[2]
        fwp = splits[1].split("_")[3]
        sam_sz = splits[1].split("_")[4]
        
        if unc_mt == 'mean':
            unc_mt = 'average'
        
        unique_combination = sam_mt + '_' + unc_mt + '_' + prob + '_' + fwp + '_' + sam_sz
        unique_combinations.append(unique_combination)
    return list(set(unique_combinations))

### Load the csv-files and store the data in a pandas-dataframe 

In [4]:
resultsdir = "results/exp2"
two_up = os.path.abspath(os.path.join(os.getcwd(),"../.."))
resultsdir = os.path.join(two_up, resultsdir)
csv_files = list_csv_files(resultsdir)

clmns = ["sampling_method", "certainty_method", "dropout_probability", "number_forward_passes", "sampling_size", "sampling_frequency", "unique_combination", "run", "number_images", "mAP"]
df = pd.DataFrame(columns=clmns)
unique_combinations = get_unique_combinations(csv_files)
ucs = [None] * len(unique_combinations)

for c in range(len(csv_files)):
    csv_file = csv_files[c]
    splits = csv_file.split("/")
    run_no = splits[0].split("run")[-1]
    sam_mt = splits[1].split("_")[0]
    unc_mt = splits[1].split("_")[1]
    prob = splits[1].split("_")[2]
    fwp = splits[1].split("_")[3]
    sam_sz = splits[1].split("_")[4]
    sam_freq = 2400 / int(sam_sz)
    
    if unc_mt == 'mean':
        unc_mt = 'average'
        
    unique_combination = sam_mt + '_' + unc_mt + '_' + prob + '_' + fwp + '_' + sam_sz
    uc_id = unique_combinations.index(unique_combination) 
    ucs[uc_id] = unique_combination

    with open(os.path.join(resultsdir, csv_file), 'r') as read_obj:
        csv_reader = reader(read_obj)
        next(csv_reader)
        for row in csv_reader:
            data = [sam_mt, unc_mt, float(prob), int(fwp), int(sam_sz), int(sam_freq), int(uc_id), int(run_no), int(row[0]), float(row[1])]
            df.loc[len(df)] = data
            
df

Unnamed: 0,sampling_method,certainty_method,dropout_probability,number_forward_passes,sampling_size,sampling_frequency,unique_combination,run,number_images,mAP
0,uncertainty,average,0.25,20,100,24,0,1,100,24.9
1,uncertainty,average,0.25,20,100,24,0,1,200,29.5
2,uncertainty,average,0.25,20,100,24,0,1,300,35.5
3,uncertainty,average,0.25,20,100,24,0,1,400,39.2
4,uncertainty,average,0.25,20,100,24,0,1,500,41.5
...,...,...,...,...,...,...,...,...,...,...
465,uncertainty,average,0.25,20,50,48,2,5,2300,52.9
466,uncertainty,average,0.25,20,50,48,2,5,2350,58.7
467,uncertainty,average,0.25,20,50,48,2,5,2400,58.7
468,uncertainty,average,0.25,20,50,48,2,5,2450,58.7


In [5]:
numimg = [100, 500, 900, 1300, 1700, 2100, 2500]
df1 = df.loc[df['number_images'].isin(numimg)]
num_img = df1["number_images"].values.ravel()
num_img = pd.unique(num_img)

for ni in range(len(num_img)):
    cur_num_img = num_img[ni]
    sel = df1[df1["number_images"] == cur_num_img]
    print("Number of sampled images: {:d}\n".format(cur_num_img))
    mean = sel.groupby(['sampling_frequency']).mean().drop(columns=['dropout_probability'])
    print(mean)
    
    aovrm = AnovaRM(data=sel, depvar='mAP', subject='run', within=['sampling_frequency'], aggregate_func='mean')
    res = aovrm.fit()
    print(res)

Number of sampled images: 100

                      mAP
sampling_frequency       
6                   22.44
12                  22.44
24                  22.44
48                  22.44
                     Anova
                   F Value Num DF  Den DF Pr > F
------------------------------------------------
sampling_frequency -1.3333 3.0000 12.0000 1.0000

Number of sampled images: 500

                      mAP
sampling_frequency       
6                   36.74
12                  42.24
24                  41.62
48                  40.86
                     Anova
                   F Value Num DF  Den DF Pr > F
------------------------------------------------
sampling_frequency  5.5559 3.0000 12.0000 0.0126

Number of sampled images: 900

                      mAP
sampling_frequency       
6                   48.42
12                  50.50
24                  48.84
48                  47.50
                     Anova
                   F Value Num DF  Den DF Pr > F
-------------

In [6]:
df1.to_csv('/mnt/nvme2n1p2/PieterBlok/PhD/Paper 04 - Active Learning/Results/exp2/exp2.csv', index=False)