In [1]:
%run "Feature_Selection.ipynb" 

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:26<00:00, 20.64it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 196/196 [00:06<00:00, 31.42it/s]


In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
#for some reason it doesnt like this at the bottom of the previous notebook
selected_params = {}

for i in range(0, no_channels):
    keep_keys = forward_selection(params, scoring_function, i)
    
    # convert keys to a list to ensure order
    keys = list(params.keys())
    
    #keeping desired dict entries only
    selected_params[i] = {key:params[key][i] for key, keep in zip(keys, keep_keys) if keep}

In [5]:
health_state = np.array(allowed_patients.get_diagnoses())

encoded_health_state = [1 if label == 'Unhealthy' else -1 for label in health_state]

In [6]:
def get_av_confusion_matrix(y_test, y_pred):
    av_confusion_mat = np.zeros(shape = (len(y_test), 2, 2))
    for i in range(0, len(y_test)):
        av_confusion_mat[i] = confusion_matrix(y_test[i], y_pred[i])
    return np.mean(av_confusion_mat, axis=0)

def objective_score(y_test, y_pred):
    """
    change to incorporate balanced accuracy 
    """
    balanced_acc = get_balanced_accuracy(y_test, y_pred)
    specificity = get_specificity(y_test, y_pred)
    f1 = get_f1_score(y_test, y_pred)
    
    
    return f1*0.7 + balanced_acc*0.3

def convert_dict_to_array1(params_dict):
    no_features = len(params_dict[0])
    params_array = np.zeros((no_channels, no_patients, no_features))
    
    for j in range(0, no_channels):
        for i, values in enumerate(params_dict[j].values()):
            params_array[j][:, i] = values
    return params_array, no_features
    




First convert the dictionary containing the parameters to use for each channel into an array of suitable shape for svm models. 

In [7]:
selected_params_array, no_features = convert_dict_to_array1(selected_params)

In [8]:
def tune_hyperparams(params, health_state, param_grid, scorer='balanced_accuracy'):
    
    X_train, X_test, y_train, y_test = train_test_split(params, health_state, test_size=0.3, stratify=health_state)
    
    #initialise classifier
    svc = SVC(class_weight='balanced', probability = True)

    # perform grid search
    grid_search = GridSearchCV(svc, param_grid, cv=3, scoring=scorer)
    
    grid_search.fit(X_train, y_train)
    
    return grid_search.best_estimator_

Then we are optimizing the hyperparameters that are used to train the model by perfroma grid search over the ``param_grid`` dictionary. The hyperparameters are chosen based on their optimisation of the manually defined ``scoring_function``. This takes a while as it has to perform a search over all the possible hyperparameters for each channel.

In [32]:
# define hyperparameter grid to test
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale']#including 'auto' aswell takes forever
}


best_estimators = []
for i in range(0, no_channels):
    best_estimators.append(tune_hyperparams(selected_params_array[i], health_state, param_grid, scoring_function))
    

In [33]:
print(best_estimators)

[SVC(C=10, class_weight='balanced', probability=True), SVC(C=0.1, class_weight='balanced', kernel='linear', probability=True), SVC(C=0.01, class_weight='balanced', probability=True), SVC(C=0.01, class_weight='balanced', probability=True), SVC(C=0.01, class_weight='balanced', probability=True), SVC(C=0.01, class_weight='balanced', kernel='poly', probability=True)]


Now using the best set of hyperparameters we perform a 3 fold stratisfied split to investigate the accuracy. Firstly we can investigate the average accuracy of each channel.

In [34]:
def skfold(params, health_state, n_splits, best_estimator, scorer):
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) #can do repeated skf for better validation
    
    balanced_accuracy = []
    score_func = []
    for train_index, test_index in skf.split(params, health_state):
        #getting test and train data sets
        X_train, X_test = params[train_index], params[test_index]
        y_train, y_test = health_state[train_index], health_state[test_index]
        
        #fitting data on previously calculated best estimator
        best_estimator.fit(X_train, y_train)
        
        #evaluating model
        y_pred = best_estimator.predict(X_test)
        score_func.append(scorer(best_estimator, X_train, y_test))
        balanced_accuracy.append(get_balanced_accuracy(y_test, y_pred))
        
    return np.mean(np.array(score_func)), np.mean(np.array(balanced_accuracy))

In [35]:
score_accuracy = []
balanced_accuracy = []
n_splits=3
for i in range(0, no_channels):
    score_acc, bal_acc = skfold(selected_params_array[i], health_state, n_splits, best_estimators[i], scoring_function)
    score_accuracy.append(score_acc)
    balanced_accuracy.append(bal_acc)
    

In [36]:
import pandas as pd

#presenting results as pandas df
data = {
    'Success Metric': ['Objective Score', 'Balanced Accuracy'],
    'Channel 1': [f'{score_accuracy[0]}', f'{balanced_accuracy[0]}'],
    'Channel 2': [f'{score_accuracy[1]}', f'{balanced_accuracy[1]}'],
    'Channel 3': [f'{score_accuracy[2]}', f'{balanced_accuracy[2]}'],
    'Channel 4': [f'{score_accuracy[3]}', f'{balanced_accuracy[3]}'],
    'Channel 5': [f'{score_accuracy[4]}', f'{balanced_accuracy[4]}'],
    'Channel 6': [f'{score_accuracy[5]}', f'{balanced_accuracy[5]}'],
    
}

df = pd.DataFrame(data)

df

Unnamed: 0,Success Metric,Channel 1,Channel 2,Channel 3,Channel 4,Channel 5,Channel 6
0,Objective Score,0.6924482800849917,0.7387637364627594,0.5585912172119069,0.5585912172119069,0.5585912172119069,0.7619493863212963
1,Balanced Accuracy,0.5836601307189543,0.5315126050420168,0.5,0.5,0.5,0.5202614379084968


Then can average the probabilities for each channel to get an overall accuracy for the method.

In [37]:
#make so does repeated skfold for even more averages, bool??

#should have cv gridsearch in here aswell???

def skfold_probabilities(params, health_state, n_splits, best_estimator):
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) #can do repeated skf for better validation
    
    probabilities = []
    sample_percentages = []
    y_tests = []
    for train_index, test_index in skf.split(params, health_state):
        #getting test and train data sets
        X_train, X_test = params[train_index], params[test_index]
        y_train, y_test = health_state[train_index], health_state[test_index]
        
        #calculating percentage of healthy in training data
        sample_percentages.append(np.mean(y_train == 'Healthy'))
        
        #fitting data on previously calculated best estimator
        best_estimator.fit(X_train, y_train)
        
        #evaluating model
        probabilities.append(best_estimator.predict_proba(X_test))
        y_tests.append(y_test)
        
    return probabilities, sample_percentages, y_tests
    
        
        

In [38]:
n_splits = 3 #3 fold validation

#list of probabilities each entry (determined by number of channels) containing n_split list of probabilities
probs = []
thresholds = []
y_tests = []
for i in range(0, no_channels):
    prob, threshold, y_test  = skfold_probabilities(selected_params_array[i], health_state, n_splits, best_estimators[i])
    probs.append(prob)
    thresholds.append(threshold)
    y_tests.append(y_test)

In [53]:
print(probs)

[[array([[0.22163112, 0.77836888],
       [0.2175475 , 0.7824525 ],
       [0.22155929, 0.77844071],
       [0.23316508, 0.76683492],
       [0.21555793, 0.78444207],
       [0.22248594, 0.77751406],
       [0.21884684, 0.78115316],
       [0.21548196, 0.78451804],
       [0.21037662, 0.78962338],
       [0.22105536, 0.77894464],
       [0.21777234, 0.78222766],
       [0.21280018, 0.78719982],
       [0.2209186 , 0.7790814 ],
       [0.22137687, 0.77862313],
       [0.21841413, 0.78158587],
       [0.22680899, 0.77319101],
       [0.22099738, 0.77900262],
       [0.23208382, 0.76791618],
       [0.22007213, 0.77992787],
       [0.21644467, 0.78355533],
       [0.23302152, 0.76697848],
       [0.22163027, 0.77836973],
       [0.2136263 , 0.7863737 ],
       [0.21823362, 0.78176638],
       [0.21949316, 0.78050684],
       [0.2194932 , 0.7805068 ],
       [0.23270936, 0.76729064],
       [0.22811324, 0.77188676],
       [0.21645359, 0.78354641],
       [0.22013875, 0.77986125],
       [

In [92]:
def average_probabilities(probs, n_splits, channels):
    average_probs = []
    for i in range(0, n_splits):
        probas = []
        for channel in channels:
            probas.append(probs[channel][i])
        average_probs.append(np.mean(probas, axis=0)[:, 0])
    return average_probs

def manual_y_pred(average_probs, n_splits, thresholds):
    manual_y_pred = []
    for i in range(0, n_splits):
        manual_predict = []
        for j in range(0, len(average_probs[i])):
            if average_probs[i][j] > thresholds[0][i]:
                manual_predict.append('Healthy')
            else:
                manual_predict.append('Unhealthy')
        manual_y_pred.append(manual_predict)
    return manual_y_pred


channel_indices_list = [0, 1, 2, 3, 4, 5]
all_combinations = []
for r in range(1, len(probs) + 1):
    combination = list(itertools.combinations(channel_indices_list, r))
    all_combinations.extend(combination)
    
best_score = 0
best_channel_indices = []
for combo in all_combinations:
        
        # Use combo to get channel indices
        selected_channel_indices = [channel_indices_list[i] for i in combo]
        print(selected_channel_indices)
        combo_probs = average_probabilities(probs, n_splits, selected_channel_indices)
        manual_pred = manual_y_pred(combo_probs, n_splits, thresholds)
        
        score = []
        for i in range(0, n_splits):
            score.append(objective_score(y_tests[0][i], manual_pred[i]))
            
        print(np.average(score))
        if np.average(score) > best_score:
                best_score = np.average(score)
                best_channel_indices = selected_channel_indices


[0]
0.6676349924585218
[1]
0.39091961360550737
[2]
0.35289855072463766
[3]
0.35289855072463766
[4]
0.35289855072463766
[5]
0.41326795952220463
[0, 1]
0.47129191780086915
[0, 2]
0.46519686104344854
[0, 3]
0.3495847953216374
[0, 4]
0.44159291736018075
[0, 5]
0.5005849248790425
[1, 2]
0.34956521739130436
[1, 3]
0.3741730605285592
[1, 4]
0.34956521739130436
[1, 5]
0.23974799099967856
[2, 3]
0.35289855072463766
[2, 4]
0.35289855072463766
[2, 5]
0.3829019906971179
[3, 4]
0.35289855072463766
[3, 5]
0.3873485341078215
[4, 5]
0.3829019906971179
[0, 1, 2]
0.46468129967680416
[0, 1, 3]
0.35849390311233104
[0, 1, 4]
0.46908902691511384
[0, 1, 5]
0.44369251144351507
[0, 2, 3]
0.35289855072463766
[0, 2, 4]
0.35289855072463766
[0, 2, 5]
0.40247283790392513
[0, 3, 4]
0.35289855072463766
[0, 3, 5]
0.3840347787048212
[0, 4, 5]
0.3928567418283396
[1, 2, 3]
0.35289855072463766
[1, 2, 4]
0.35289855072463766
[1, 2, 5]
0.3829019906971179
[1, 3, 4]
0.34956521739130436
[1, 3, 5]
0.3873485341078215
[1, 4, 5]
0.

In [93]:
print(best_score)
print(best_channel_indices)

0.6676349924585218
[0]


In [51]:
average_probs = []
for i in range(0, n_splits):
    probas = []
    for j in range(0, no_channels):
        probas.append(probs[j][i])
    average_probs.append(np.mean(probas, axis=0)[:, 0])

In [52]:
print(average_probs)

[array([0.26458355, 0.26094373, 0.26491372, 0.25195   , 0.26221964,
       0.2627782 , 0.2634132 , 0.26297215, 0.26334847, 0.26288328,
       0.26190067, 0.26227905, 0.26362449, 0.26271695, 0.26303101,
       0.26079984, 0.26244707, 0.25319153, 0.26430161, 0.26118665,
       0.21277362, 0.26418146, 0.26323883, 0.26350317, 0.21103034,
       0.20960775, 0.2435847 , 0.26292805, 0.26102334, 0.26448739,
       0.26215168, 0.26389641, 0.26293423, 0.24084187, 0.26476359,
       0.26204023, 0.26246152, 0.22126013, 0.26240142, 0.25983493,
       0.26301511, 0.26419327, 0.26373377, 0.2606268 , 0.26368101,
       0.26377183, 0.26249969, 0.26197041, 0.26243624, 0.26252417,
       0.26262936, 0.26156961, 0.26163424, 0.26178491, 0.26367796,
       0.19875458, 0.26391996, 0.26151323, 0.26224057, 0.26384257,
       0.26302199, 0.26335107, 0.26319759, 0.20564418, 0.2639271 ,
       0.26252151]), array([0.24676355, 0.27713333, 0.24748039, 0.26837856, 0.25258571,
       0.24765505, 0.25546612, 0.2530669

In [41]:
manual_y_pred = []
for i in range(0, n_splits):
    manual_predict = []
    for j in range(0, len(average_probs[i])):
        if average_probs[i][j] > thresholds[0][i]:
            manual_predict.append('Healthy')
        else:
            manual_predict.append('Unhealthy')
    manual_y_pred.append(manual_predict)

In [42]:
balanced_accuracy_score = []
score_function_score = []
for i in range(0, n_splits):
    score_function_score.append(objective_score(y_tests[0][i], manual_y_pred[i]))
    balanced_accuracy_score.append(get_balanced_accuracy(y_tests[0][i], manual_y_pred[i]))
    
    

In [44]:
print(balanced_accuracy_score)#will average these
print(score_function_score)

[0.4725490196078431, 0.5, 0.5]
[0.24001031991744065, 0.15, 0.758695652173913]


In [22]:
for i in range(0, no_channels):
    print(selected_params[i].keys())

dict_keys(['mean', 'power_ratio', 'sd_ratio'])
dict_keys(['mean', 'std', 'power_ratio'])
dict_keys(['rr_mean', 'RMSSD', 'pNN50'])
dict_keys(['rr_std', 'RMSSD', 'pNN50'])
dict_keys(['pNN50', 'skews', 'power_ratio'])
dict_keys(['mean', 'std', 'power_ratio'])
