In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Location of the results workbook. Set the PSO_RESULTS_XLSX environment
# variable to load the file from somewhere else; when it is not set, the
# original machine-specific path is used so existing runs keep working.
PSO_RESULTS_XLSX = os.environ.get(
    "PSO_RESULTS_XLSX",
    "C:\\Users\\Nefeli\\Desktop\\opti2_stat_scripts\\pso_results\\pso_full.xlsx",
)
params_df = pd.read_excel(PSO_RESULTS_XLSX)

In [3]:
def excelListCleanup(inputDfRow):
    """Parse a list that was stored as text in a spreadsheet cell into floats.

    The text is expected to look like a printed array, e.g.
    ``"[1. 2.5\\n 3.]"``: square brackets are discarded and the remaining
    whitespace-separated tokens are converted with ``float``.

    Parameters
    ----------
    inputDfRow : str or list/tuple/numpy.ndarray of numbers
        Raw cell value. A value that is already a sequence (for example
        because the cleanup cell was executed a second time) is converted
        element by element instead of being parsed again.

    Returns
    -------
    list of float
        The parsed numbers, in their original order. Empty for ``"[]"``.

    Raises
    ------
    ValueError
        If a token cannot be converted to ``float``.
    """
    # Already parsed: keep the function safe to re-apply on the same column.
    if isinstance(inputDfRow, (list, tuple, np.ndarray)):
        return [float(value) for value in inputDfRow]

    without_brackets = inputDfRow.replace('[', '').replace(']', '')
    # split() without an argument separates on any run of whitespace, so a
    # line break between two numbers acts as a separator instead of being
    # deleted and gluing the neighbouring numbers together.
    return [float(token) for token in without_brackets.split()]
#params_df.iat[65, params_df.columns.get_loc('costVal_list')]
#print(excelListCleanup(params_df.iat[65, params_df.columns.get_loc('costVal_list')]))

In [4]:
# Convert the text stored in each list-valued column back into lists of floats.
for list_column in ('costVal_list', 'feasibility_list', 'lastHit_list'):
    params_df[list_column] = params_df[list_column].apply(excelListCleanup)

Helper Functions

In [5]:
from scipy import stats

def kolmogorovSmirnovTailed(F,G,alternative, a):
    """Run a two-sample Kolmogorov-Smirnov test and report whether H0 is rejected.

    F, G        : the two samples, passed to scipy.stats.ks_2samp in this order
    alternative : alternative hypothesis, forwarded unchanged to ks_2samp
    a           : significance level the p-value is compared against

    Returns 1 when the p-value is strictly below `a` (null hypothesis
    rejected) and 0 otherwise (null hypothesis retained). With
    alternative='greater', a rejection supports cdf(F) > cdf(G).
    """
    outcome = stats.ks_2samp(F, G, alternative=alternative)
    rejected = outcome.pvalue < a
    return 1 if rejected else 0

In [6]:
def kolmogorovSmirnovComparison(params_df,n,columnName,alternative,a):
    """Count, for each of the first n rows, how many pairwise KS tests reject H0.

    params_df   : DataFrame whose `columnName` column holds one sample per row
    n           : number of leading rows to compare with each other
    columnName  : name of the column containing the samples
    alternative : alternative hypothesis passed on to kolmogorovSmirnovTailed
    a           : significance level passed on to kolmogorovSmirnovTailed

    Every ordered pair (i, j) of the first n rows is tested, including i == j.
    Returns a length-n array whose i-th entry is the number of rows j for
    which the test of row i's sample against row j's sample rejected H0.
    """
    rejections = np.zeros((n, n))
    frame = params_df.copy()

    def sample_at(row):
        # Positional lookup of the sample stored in `columnName` for `row`.
        return frame.iat[row, frame.columns.get_loc(columnName)]

    # np.ndindex walks the (n, n) grid row by row, like two nested loops.
    for first, second in np.ndindex(n, n):
        rejections[first, second] = kolmogorovSmirnovTailed(
            sample_at(first), sample_at(second), alternative, a)

    # Row totals: one rejection count per row.
    return rejections.sum(axis=1)

In [7]:
# Keep only the two parameter columns that label each row; row positions are
# unchanged, so a score index can be looked up here with .iloc.
paramOrder_df = params_df.loc[:, ['nb_r', 'N']]
paramOrder_df

Unnamed: 0,nb_r,N
0,0,50
1,0,100
2,0,200
3,5,50
4,5,100
5,5,200


In [8]:
# allScores collects one score list per ranking criterion;
# n is the number of parametrizations (rows) that are compared.
allScores, n = [], 6

### Kolmogorov Smirnov Ranking Based on costVals

In [9]:
# Rank the parametrizations on costVal_list: column 0 receives, for every row,
# the number of pairwise KS tests (alternative 'greater', significance 0.05)
# in which that row rejected the null hypothesis.
# The row count comes from n everywhere, so the matrix shape, the comparison
# and the score list cannot drift apart if n is changed.
performace_per_file_matrix = np.zeros((n, 1))
performace_per_file_matrix[:, 0] = kolmogorovSmirnovComparison(params_df, n, 'costVal_list', 'greater', 0.05)

# Total score of a parametrization = sum over the columns of its row.
costValScores = [np.sum(performace_per_file_matrix[row, :]) for row in range(n)]
# NOTE: running this cell again appends a second copy to allScores.
allScores.append(costValScores)
costValScores

[0.0, 1.0, 1.0, 0.0, 1.0, 5.0]

In [10]:
# argmax returns the first position holding the maximum, so a tie is
# resolved in favour of the earliest row.
max_index = np.asarray(costValScores).argmax()
print(f"max idx (winner params) = {max_index}")
paramOrder_df.iloc[max_index]

max idx (winner params) = 5


nb_r      5
N       200
Name: 5, dtype: int64

### Kolmogorov Smirnov Ranking Based on lastHit

In [11]:
# Rank the parametrizations on lastHit_list: column 0 receives, for every row,
# the number of pairwise KS tests (alternative 'greater', significance 0.05)
# in which that row rejected the null hypothesis.
# The row count comes from n everywhere, so the matrix shape, the comparison
# and the score list cannot drift apart if n is changed.
performace_per_file_matrix = np.zeros((n, 1))
performace_per_file_matrix[:, 0] = kolmogorovSmirnovComparison(params_df, n, 'lastHit_list', 'greater', 0.05)

# Total score of a parametrization = sum over the columns of its row.
lastHitScores = [np.sum(performace_per_file_matrix[row, :]) for row in range(n)]
# NOTE: running this cell again appends a second copy to allScores.
allScores.append(lastHitScores)
lastHitScores

[2.0, 1.0, 4.0, 1.0, 1.0, 0.0]

In [12]:
# argmax returns the first position holding the maximum, so a tie is
# resolved in favour of the earliest row.
max_index = np.asarray(lastHitScores).argmax()
print(f"max idx (winner params) = {max_index}")
paramOrder_df.iloc[max_index]

max idx (winner params) = 2


nb_r      0
N       200
Name: 2, dtype: int64

In [13]:
# Show the score list of every ranking criterion, one list per line.
for criterion_scores in allScores:
    print(criterion_scores)

[0.0, 1.0, 1.0, 0.0, 1.0, 5.0]
[2.0, 1.0, 4.0, 1.0, 1.0, 0.0]


### Notes on KS Test:

<br>The desired comparison result is to find the parametrizations that achieve a smaller costVal and lastHit. </br> 

<br>Python is being used to achieve this, so we have the option of choosing the 'alternative' parameter, which defines the alternative hypothesis H1.</br>

<br>Since we want to find the parametrizations with smaller values, we need to choose the option 'greater', which defines the alternative hypothesis H1 as: parametrization P_a minimizes better than parametrization P_b. Because smaller values push a cdf upwards, the cdf of parametrization P_a is expected to be larger than the cdf of parametrization P_b (cdf(P_a) > cdf(P_b)).</br>
<br>The null hypothesis is H0: the cdf of parametrization P_a is smaller than or equal to the cdf of parametrization P_b (cdf(P_a) <= cdf(P_b)).</br>

<br>MATLAB Documentation Explanation : "If the data values in x1 tend to be larger than those in x2, the empirical distribution function of x1 tends to be smaller than that of x2, and vice versa."</br>
<br>Python Documentation Explanation: "Suppose, however, that the first sample were drawn from a normal distribution shifted toward greater values. In this case, the cumulative density function (CDF) of the underlying distribution tends to be less than the CDF underlying the second sample. Therefore, we would expect the null hypothesis to be rejected with alternative='less' "</br>
<br>According to the above, it is my understanding that if I have Sample1 with values greater than values of Sample2, and want to prove that relationship, if I want an H0 rejection I need to test for H1 = 'less' (or 'smaller' in MATLAB). The desired outcome is the rejection of H0 : the cdf of Sample1 is >= the cdf of Sample2. </br>
<br>If I am examining the opposite case, where I want to prove that the values of Sample1<Sample2, I would need to choose H1 = 'greater' (or 'larger' in MATLAB). In this scenario I expect the cdf of Sample1 to be greater than the cdf of Sample2, so I need to reject a null hypothesis where the cdf of Sample1<= the cdf of Sample2.</br>
<br>For each case, all 6 parametrizations are compared to each other, which results in a final score list that contains, for each parametrization, the sum of ones (null hypothesis rejections) it has amassed.</br>
<br>Adding up the score each parametrization achieves for each case results in a total score.</br>
<br>The one with the highest score is the best one. Of course, ties may occur.</br>