In [1]:
import MySegments
import MyVisualiser
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.externals import joblib

import os
from os import path
from os import mkdir
from os import listdir
from os.path import isfile, join

from scipy import stats

import statsmodels.stats.weightstats as smws

In [2]:
def my_ttost(x, y, thresh):
    """Return the p-value of the lower one-sided test of an unequal-variance TOST.

    Both samples are flattened to 1-D arrays and handed to
    ``statsmodels.stats.weightstats.ttost_ind`` with the symmetric
    equivalence bounds ``(-thresh, +thresh)`` and ``usevar='unequal'``.

    Parameters
    ----------
    x, y : array-like
        The two samples to compare (any shape; flattened before testing).
    thresh : float
        Half-width of the equivalence interval; the lower bound is ``-thresh``.

    Returns
    -------
    float
        Element ``[1][1]`` of the ``ttost_ind`` result, i.e. the p-value
        belonging to the lower-threshold one-sided test.

    Background:
    http://jpktd.blogspot.se/2012/10/tost-statistically-significant.html
    """
    sample_x = np.array(x).flatten()
    sample_y = np.array(y).flatten()
    tost_result = smws.ttost_ind(sample_x, sample_y, -thresh, thresh, usevar='unequal')
    # tost_result[0] is the combined p-value; [1] holds the lower-bound test,
    # whose second entry is its p-value.
    lower_test = tost_result[1]
    return lower_test[1]
    

In [20]:

models = ['MI', 'F-Score']
clfs = []
cv_results = []

# Load, for every feature-selection model, the pickled best estimator (*.pkl)
# and the tab-separated cross-validation table (*.csv) found in its sub-folder.
# The three lists are later paired by position, so each folder must contribute
# exactly one estimator and exactly one table.
file_path = './New_Paper_Results/Beijing/'
for model in models:
    model_dir = file_path + model
    n_clfs_before = len(clfs)
    n_cv_before = len(cv_results)

    for file in os.listdir(model_dir):
        full_name = os.path.join(model_dir, file)
        if file.endswith('.pkl'):
            # NOTE(review): joblib.load unpickles the file, which can execute
            # arbitrary code -- only load result files from a trusted source.
            clfs.append(joblib.load(full_name))
        elif file.endswith('.csv'):
            cv_results.append(pd.read_csv(full_name, sep='\t'))

    n_pkl = len(clfs) - n_clfs_before
    n_csv = len(cv_results) - n_cv_before
    if n_pkl != 1 or n_csv != 1:
        # Fail loudly instead of silently mis-pairing models, estimators and
        # CV tables in the positional zip() used further down.
        raise ValueError(
            'expected exactly one .pkl and one .csv in ' + model_dir
            + ', found ' + str(n_pkl) + ' .pkl and ' + str(n_csv) + ' .csv'
        )

#Load Data
#mypath = 'Test'
#onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    
#Get labelled data
#sc = MySegments.SegmentCollection(folder = mypath, segments_path=onlyfiles, labels_path = 'labels.csv', classes_path='class_names.csv')
#idx,X,y = sc.get_labelled()

#split train and test data, maintaining class distributions
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify = y)

print('----Non-inferiority test----')
print('----optimize accuracy----')
# p_val is the significance threshold the test p-value is compared against;
# ninf_margin is the tolerated accuracy loss, in the same units as the CV scores.
p_val = 0.25
ninf_margin = 0.05
print('p-Value: ', p_val, '\tnon-inferiority margin: ', ninf_margin*100, '%')

# Columns shown whenever a candidate row is reported.
report_cols = ['mean_score_time','mean_fit_time', 'n_features', 'mean_test_Accuracy', 'std_test_Accuracy']

for model, clf, cv_result in zip(models, clfs, cv_results):
    # ------ Non-inferiority test ------
    # Test whether the per-split accuracies of a candidate are NOT INFERIOR
    # to those of the highest-ranking candidate:
    #   H_0: mu_1 <  mu_2 + delta_lower -> mu_1 may be inferior to mu_2
    #   H_A: mu_1 >= mu_2 + delta_lower -> mu_1 is NOT inferior to mu_2

    # Columns holding the individual cross-validation split scores.
    filter_col = [col for col in cv_result
                  if col.startswith('split') and col.endswith('_test_Accuracy')]

    # Highest-ranked candidate and its per-split scores (the reference sample).
    top_score = cv_result.sort_values('rank_test_Accuracy', ascending=True).head(1)
    top_scores = top_score[filter_col]

    # p-value of the lower one-sided test of every candidate against the
    # reference; stored on the table so it can be inspected afterwards.
    cv_result['p_val'] = cv_result.apply(
        lambda row: my_ttost(row[filter_col], top_scores, ninf_margin), axis=1)

    # Among the candidates that pass the test, keep the one using the fewest
    # features. Rows with a NaN p-value never satisfy the comparison.
    filtered = cv_result[cv_result['p_val'] <= p_val]
    filtered = filtered.sort_values('n_features', ascending=True).head(1)

    # Select the column first, then the row: indexing a whole row of a
    # mixed-dtype frame may upcast the integer feature count.
    top_n = int(top_score['n_features'].iloc[0])
    print('--------',model,'-------')

    # An empty selection (no candidate passed the test) keeps the top-ranked
    # classifier instead of failing on filtered.iloc[0].
    if filtered.empty or top_n <= int(filtered['n_features'].iloc[0]):
        print('kept classifier with', top_n, 'features')
        print(top_score[report_cols])
    else:
        filt_n = int(filtered['n_features'].iloc[0])
        print('better model found with', filt_n, 'features (initially ', top_n,')')
        filt_C = filtered['param_classify__estimator__C'].iloc[0]
        filt_gamma = filtered['param_classify__estimator__gamma'].iloc[0]
        # NOTE(review): set_params only changes hyper-parameters, it does not
        # refit. If clf was stored as a fitted estimator, the dumped object
        # must be re-fitted on the training data before it is used -- confirm
        # that downstream code does this.
        clf.set_params(reduce_dim__k = filt_n, classify__estimator__C = filt_C, classify__estimator__gamma = filt_gamma)
        # Store the reduced estimator next to the model folders.
        best_filename = file_path + model +'_acc_' + 'p_' + str(p_val) + '_ninf_' + str(ninf_margin*100) + '_'+ str(filt_n) + '.pkl'
        joblib.dump(clf, best_filename)
        print('best estimating model saved at:', best_filename)
        print('previous top-score')
        print(top_score[report_cols])
        print('reduced')
        print(filtered[report_cols])
        
        


----Non-inferiority test----
----optimize accuracy----
p-Value:  0.25 	non-inferiority margin:  5.0 %
-------- MI -------
better model found with 10 features (initially  226 )
best estimating model saved at: ./New_Paper_Results/Beijing/MI_acc_p_0.25_ninf_5.0_10.pkl
previous top-score
     mean_score_time  mean_fit_time  n_features  mean_test_Accuracy  \
580         0.035607       0.087131         226            0.914875   

     std_test_Accuracy  
580           0.026438  
reduced
     mean_score_time  mean_fit_time  n_features  mean_test_Accuracy  \
819         0.009333       0.374519          10             0.87724   

     std_test_Accuracy  
819           0.025467  
-------- F-Score -------
better model found with 21 features (initially  226 )
best estimating model saved at: ./New_Paper_Results/Beijing/F-Score_acc_p_0.25_ninf_5.0_21.pkl
previous top-score
     mean_score_time  mean_fit_time  n_features  mean_test_Accuracy  \
664         0.040608       0.083152         226          