In [1]:
%run "Feature_Selection.ipynb" 

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:12<00:00, 42.84it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 196/196 [00:06<00:00, 29.18it/s]


In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import itertools

In [3]:
health_state = np.array(allowed_patients.get_diagnoses())

encoded_health_state = [1 if label == 'Unhealthy' else -1 for label in health_state]

In [17]:
def get_av_confusion_matrix(y_test, y_pred):
    av_confusion_mat = np.zeros(shape = (len(y_test), 2, 2))
    for i in range(0, len(y_test)):
        av_confusion_mat[i] = confusion_matrix(y_test[i], y_pred[i])
    return np.mean(av_confusion_mat, axis=0)

def objective_score(y_test, y_pred):
    """
    change to incorporate balanced accuracy 
    """
    balanced_acc = get_balanced_accuracy(y_test, y_pred)
    specificity = get_specificity(y_test, y_pred)
    f1 = get_f1_score(y_test, y_pred)
    
    
    return f1*0.7 + balanced_acc*0.3

def convert_dict_to_array1(params_dict):
    no_features = len(params_dict[0])
    params_array = np.zeros((no_channels, no_patients, no_features))
    for j in range(0, no_channels):
        for i, values in enumerate(params_dict[j].values()):
            params_array[j][:, i] = values
    return params_array, no_features




First convert the dictionary containing the parameters to use for each channel into an array of suitable shape for svm models. 

In [31]:
selected_params_array, no_features = convert_dict_to_array1(selected_params)

In [32]:
def tune_hyperparams(params, health_state, param_grid, scorer='balanced_accuracy'):
    
    X_train, X_test, y_train, y_test = train_test_split(params, health_state, test_size=0.3, stratify=health_state)
    
    #initialise classifier
    svc = SVC(class_weight='balanced', probability = True)

    # perform grid search
    grid_search = GridSearchCV(svc, param_grid, cv=3, scoring=scorer)
    
    grid_search.fit(X_train, y_train)
    
    return grid_search.best_estimator_

Then we are optimizing the hyperparameters that are used to train the model by perfroma grid search over the ``param_grid`` dictionary. The hyperparameters are chosen based on their optimisation of the manually defined ``scoring_function``. This takes a while as it has to perform a search over all the possible hyperparameters for each channel.

In [33]:
# define hyperparameter grid to test
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale']#including 'auto' aswell takes forever
}


best_estimators = []
for i in range(0, no_channels):
    best_estimators.append(tune_hyperparams(selected_params_array[i], health_state, param_grid, scoring_function))
    

In [34]:
print(best_estimators)

[SVC(C=0.1, class_weight='balanced', kernel='poly', probability=True), SVC(C=0.01, class_weight='balanced', probability=True), SVC(C=0.1, class_weight='balanced', kernel='poly', probability=True), SVC(C=0.01, class_weight='balanced', probability=True), SVC(C=1, class_weight='balanced', kernel='poly', probability=True), SVC(C=0.01, class_weight='balanced', probability=True)]


Now using the best set of hyperparameters we perform a 3 fold stratisfied split to investigate the accuracy. Firstly we can investigate the average accuracy of each channel.

In [35]:
def skfold_with_probabilities(params, health_state, n_splits, best_estimator, scorer):
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) #can do repeated skf for better validation
    
    probabilities = []
    sample_percentages = []
    y_tests = []
    balanced_accuracy = []
    score_func = []
    for train_index, test_index in skf.split(params, health_state):
        #getting test and train data sets
        X_train, X_test = params[train_index], params[test_index]
        y_train, y_test = health_state[train_index], health_state[test_index]
        
        
        #calculating percentage of healthy in training data
        sample_percentages.append(np.mean(y_train == 'Healthy')) #how is threshold measured??? health_state[train_index]??
        
        #fitting data on previously calculated best estimator
        best_estimator.fit(X_train, y_train)
        
        #evaluating model
        y_pred = best_estimator.predict(X_test)
        score_func.append(scorer(best_estimator, X_train, y_test))
        balanced_accuracy.append(get_balanced_accuracy(y_test, y_pred))
        
        #for evaluation of model later
        probabilities.append(best_estimator.predict_proba(X_test))
        y_tests.append(y_test)

    return np.mean(np.array(score_func)), np.mean(np.array(balanced_accuracy)), probabilities, sample_percentages, y_tests

In [36]:
score_accuracy = []
balanced_accuracy = []
n_splits=3

probs = []
thresholds = []
y_tests = []

for i in range(0, no_channels):
    score_acc, bal_acc, prob, threshold, y_test = skfold_with_probabilities(selected_params_array[i], health_state, n_splits, best_estimators[i], scoring_function)
    score_accuracy.append(score_acc)
    balanced_accuracy.append(bal_acc)
    probs.append(prob)
    thresholds.append(threshold)
    y_tests.append(y_test)
    

In [37]:
import pandas as pd

#presenting results as pandas df
data = {
    'Success Metric': ['Objective Score', 'Balanced Accuracy'],
    'Channel 1': [f'{score_accuracy[0]}', f'{balanced_accuracy[0]}'],
    'Channel 2': [f'{score_accuracy[1]}', f'{balanced_accuracy[1]}'],
    'Channel 3': [f'{score_accuracy[2]}', f'{balanced_accuracy[2]}'],
    'Channel 4': [f'{score_accuracy[3]}', f'{balanced_accuracy[3]}'],
    'Channel 5': [f'{score_accuracy[4]}', f'{balanced_accuracy[4]}'],
    'Channel 6': [f'{score_accuracy[5]}', f'{balanced_accuracy[5]}'],
    
}

df = pd.DataFrame(data)

df

Unnamed: 0,Success Metric,Channel 1,Channel 2,Channel 3,Channel 4,Channel 5,Channel 6
0,Objective Score,0.69516838340291,0.5585912172119069,0.6273051199861387,0.5585912172119069,0.5879495602485815,0.5585912172119069
1,Balanced Accuracy,0.6972175536881419,0.5,0.656610644257703,0.5,0.6078057889822596,0.5


Then can average the probabilities for each channel to get an overall accuracy for the method.

In [38]:
def average_probabilities(probs, n_splits, channels):
    average_probs = []
    for i in range(0, n_splits):
        probas = []
        for channel in channels:
            probas.append(probs[channel][i])
        average_probs.append(np.mean(probas, axis=0)[:, 0])
    return average_probs

def manual_y_pred(average_probs, n_splits, thresholds):
    manual_y_pred = []
    for i in range(0, n_splits):
        manual_predict = []
        for j in range(0, len(average_probs[i])):
            if average_probs[i][j] > thresholds[0][i]:
                manual_predict.append('Healthy')
            else:
                manual_predict.append('Unhealthy')
        manual_y_pred.append(manual_predict)
    return manual_y_pred


channel_indices_list = [0, 1, 2, 3, 4, 5]
all_combinations = []
for r in range(1, len(probs) + 1):
    combination = list(itertools.combinations(channel_indices_list, r))
    all_combinations.extend(combination)
    
print(thresholds)
best_score = 0
best_channel_indices = []
for combo in all_combinations:
        
        # Use combo to get channel indices
        selected_channel_indices = [channel_indices_list[i] for i in combo]
        print(selected_channel_indices)
        combo_probs = average_probabilities(probs, n_splits, selected_channel_indices)
        manual_pred = manual_y_pred(combo_probs, n_splits, thresholds)
        
        score = []
        for i in range(0, n_splits):
            score.append(objective_score(y_tests[0][i], manual_pred[i]))
            
        print(np.average(score))
        if np.average(score) > best_score:
                best_score = np.average(score)
                best_channel_indices = selected_channel_indices


[[0.2230769230769231, 0.22900763358778625, 0.22137404580152673], [0.2230769230769231, 0.22900763358778625, 0.22137404580152673], [0.2230769230769231, 0.22900763358778625, 0.22137404580152673], [0.2230769230769231, 0.22900763358778625, 0.22137404580152673], [0.2230769230769231, 0.22900763358778625, 0.22137404580152673], [0.2230769230769231, 0.22900763358778625, 0.22137404580152673]]
[0]
0.7698095238095237
[1]
0.35289855072463766
[2]
0.635310477814314
[3]
0.35289855072463766
[4]
0.6687421759527021
[5]
0.35289855072463766
[0, 1]
0.5862664096423077
[0, 2]
0.7785317715359423
[0, 3]
0.41415878186872107
[0, 4]
0.733318211342508
[0, 5]
0.6332009276829011
[1, 2]
0.4573069684013957
[1, 3]
0.35289855072463766
[1, 4]
0.5649557506170785
[1, 5]
0.35289855072463766
[2, 3]
0.3588044920666203
[2, 4]
0.6759542520565538
[2, 5]
0.49941201041820243
[3, 4]
0.44544979646118166
[3, 5]
0.35289855072463766
[4, 5]
0.5356494389770713
[0, 1, 2]
0.6744592486970459
[0, 1, 3]
0.3633527508090615
[0, 1, 4]
0.6789650968

In [39]:
print(best_score)
print(best_channel_indices)

0.7785317715359423
[0, 2]


In [40]:
for i in range(0, no_channels):
    print(selected_params[i].keys())

dict_keys(['kurtosis', 'shannon_en', 'sd_ratio'])
dict_keys(['lf', 'shannon_en', 'sd_ratio'])
dict_keys(['skews', 'shannon_en', 'sd_ratio'])
dict_keys(['skews', 'lf', 'sd_ratio'])
dict_keys(['std', 'skews', 'sd_ratio'])
dict_keys(['rr_std', 'RMSSD', 'std'])
