In [1]:
%run "Feature_Selection.ipynb" 

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [01:22<00:00,  6.62it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 221/221 [00:01<00:00, 147.27it/s]


(202, 16)
dict_keys(['rr_amps', 'lf', 'sd_ratio_outliers_removed'])
dict_keys(['shannon_en', 'sd_ratio', 'sd_ratio_outliers_removed'])
dict_keys(['skews', 'shannon_en', 'sd_ratio_outliers_removed'])
dict_keys(['lf', 'shannon_en', 'sd_ratio'])
dict_keys(['rr_std', 'rr_amps', 'skews'])
dict_keys(['rr_std', 'pNN50', 'hf'])


In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import itertools

In [3]:
health_state = np.array(allowed_patients.get_diagnoses())

encoded_health_state = [1 if label == 'Unhealthy' else -1 for label in health_state]

In [20]:
def get_av_confusion_matrix(y_test, y_pred):
    av_confusion_mat = np.zeros(shape = (len(y_test), 2, 2))
    for i in range(0, len(y_test)):
        av_confusion_mat[i] = confusion_matrix(y_test[i], y_pred[i])
    return np.mean(av_confusion_mat, axis=0)

def objective_score(y_test, y_pred):
    """
    change to incorporate balanced accuracy 
    """
    balanced_acc = get_balanced_accuracy(y_test, y_pred)
    specificity = get_specificity(y_test, y_pred)
    f1 = get_f1_score(y_test, y_pred)
    
    
    return f1*0.7 + balanced_acc*0.3

def convert_multi_dict_to_array1(params_dict, nan_indices):
    no_features = len(params_dict[0])
    params_list = []
    for j in range(0, no_channels):
        params_array = np.zeros((len(health_state[nan_indices[j]]), no_features))
        for i, values in enumerate(params_dict[j].values()):
            params_array[:, i] = values
        params_list.append(params_array)
    return params_list, no_features





First convert the dictionary containing the parameters to use for each channel into an array of suitable shape for svm models. 

In [21]:
print(len(selected_params[0]))

3


In [22]:
selected_params_array, no_features = convert_multi_dict_to_array1(selected_params, nan_indices)

In [28]:
def tune_hyperparams(params, health_state, param_grid, scorer='balanced_accuracy'):
    
    X_train, X_test, y_train, y_test = train_test_split(params, health_state, test_size=0.3, stratify=health_state)
    
    #initialise classifier
    svc = SVC(class_weight='balanced', probability = True)

    # perform grid search
    grid_search = GridSearchCV(svc, param_grid, cv=3, scoring=scorer)
    
    grid_search.fit(X_train, y_train)
    
    return grid_search.best_estimator_

Then we are optimizing the hyperparameters that are used to train the model by perfroma grid search over the ``param_grid`` dictionary. The hyperparameters are chosen based on their optimisation of the manually defined ``scoring_function``. This takes a while as it has to perform a search over all the possible hyperparameters for each channel.

In [30]:
# define hyperparameter grid to test
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale']#including 'auto' aswell takes forever
}


best_estimators = []
for i in range(0, no_channels):
    best_estimators.append(tune_hyperparams(selected_params_array[i], health_state[nan_indices[i]], param_grid, scoring_function))
    

In [31]:
print(best_estimators)

[SVC(C=0.01, class_weight='balanced', kernel='poly', probability=True), SVC(C=10, class_weight='balanced', kernel='poly', probability=True), SVC(C=10, class_weight='balanced', kernel='poly', probability=True), SVC(C=10, class_weight='balanced', probability=True), SVC(C=0.1, class_weight='balanced', probability=True), SVC(C=1, class_weight='balanced', probability=True)]


Now using the best set of hyperparameters we perform a 3 fold stratisfied split to investigate the accuracy. Firstly we can investigate the average accuracy of each channel.

In [32]:
def skfold_with_probabilities(params, health_state, n_splits, best_estimator, scorer):
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) #can do repeated skf for better validation
    
    probabilities = []
    sample_percentages = []
    y_tests = []
    balanced_accuracy = []
    score_func = []
    test_indices = test_index
    for train_index, test_index in skf.split(params, health_state):
        #getting test and train data sets
        X_train, X_test = params[train_index], params[test_index]
        y_train, y_test = health_state[train_index], health_state[test_index]
        
        
        #calculating percentage of healthy in training data
        sample_percentages.append(np.mean(y_train == 'Healthy')) #how is threshold measured??? health_state[train_index]??
        
        #fitting data on previously calculated best estimator
        best_estimator.fit(X_train, y_train)
        
        #evaluating model
        y_pred = best_estimator.predict(X_test)
        score_func.append(scorer(best_estimator, X_train, y_test))
        balanced_accuracy.append(get_balanced_accuracy(y_test, y_pred))
        
        #for evaluation of model later
        probabilities.append(best_estimator.predict_proba(X_test))
        y_tests.append(y_test)
        
        #for reconstruction of full patient data
        test_indices.append(test_index)

    return np.mean(np.array(score_func)), np.mean(np.array(balanced_accuracy)), probabilities, sample_percentages, y_tests

In [33]:
score_accuracy = []
balanced_accuracy = []
n_splits=3

probs = []
thresholds = []
y_tests = []
test_indices = []

for i in range(0, no_channels):
    score_acc, bal_acc, prob, threshold, y_test, test_indice = skfold_with_probabilities(selected_params_array[i], health_state[nan_indices[i]], n_splits, best_estimators[i], scoring_function)
    score_accuracy.append(score_acc)
    balanced_accuracy.append(bal_acc)
    probs.append(prob)
    thresholds.append(threshold)
    y_tests.append(y_test)
    test_indices.append(test_indice)

In [39]:
print(y_tests[1])

[array(['Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Healthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Healthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy',
       'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy',
       'Unhealthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Healthy', 'Healthy', 'Unhealthy', 'Unhealthy'], dtype='<U9'), array(['Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Unhealt

In [34]:
import pandas as pd

#presenting results as pandas df
data = {
    'Success Metric': ['Objective Score', 'Balanced Accuracy'],
    'Channel 1': [f'{score_accuracy[0]}', f'{balanced_accuracy[0]}'],
    'Channel 2': [f'{score_accuracy[1]}', f'{balanced_accuracy[1]}'],
    'Channel 3': [f'{score_accuracy[2]}', f'{balanced_accuracy[2]}'],
    'Channel 4': [f'{score_accuracy[3]}', f'{balanced_accuracy[3]}'],
    'Channel 5': [f'{score_accuracy[4]}', f'{balanced_accuracy[4]}'],
    'Channel 6': [f'{score_accuracy[5]}', f'{balanced_accuracy[5]}'],
    
}

df = pd.DataFrame(data)

df

Unnamed: 0,Success Metric,Channel 1,Channel 2,Channel 3,Channel 4,Channel 5,Channel 6
0,Objective Score,0.7312046996132976,0.7116955603154919,0.6709219192354419,0.7353605079832205,0.3953263458495926,0.7430517838772692
1,Balanced Accuracy,0.6392063492063492,0.7220364176885917,0.7689393939393939,0.7156224903680964,0.551058201058201,0.6977883751468656


Then can average the probabilities for each channel to get an overall accuracy for the method.

In [41]:
print(probs[0])

[array([[0.17247379, 0.82752621],
       [0.20476708, 0.79523292],
       [0.17887641, 0.82112359],
       [0.35500185, 0.64499815],
       [0.16264308, 0.83735692],
       [0.17051051, 0.82948949],
       [0.17100177, 0.82899823],
       [0.16394754, 0.83605246],
       [0.17751075, 0.82248925],
       [0.16590905, 0.83409095],
       [0.16594226, 0.83405774],
       [0.66720964, 0.33279036],
       [0.395821  , 0.604179  ],
       [0.18531225, 0.81468775],
       [0.45044345, 0.54955655],
       [0.23884888, 0.76115112],
       [0.19013166, 0.80986834],
       [0.16454678, 0.83545322],
       [0.17201113, 0.82798887],
       [0.35983107, 0.64016893],
       [0.19887984, 0.80112016],
       [0.21036031, 0.78963969],
       [0.16497163, 0.83502837],
       [0.17159683, 0.82840317],
       [0.48393919, 0.51606081],
       [0.16805078, 0.83194922],
       [0.16492111, 0.83507889],
       [0.21973127, 0.78026873],
       [0.16441082, 0.83558918],
       [0.16737797, 0.83262203],
       [0

As the arrays are now all different length with different patients' date having been discarded the below no longer works. will try to recreate each patients data from the array indices and test indices given by sklearn so that probabilities can be compared.

In [35]:
def average_probabilities(probs, n_splits, channels):
    average_probs = []
    for i in range(0, n_splits):
        probas = []
        for channel in channels:
            probas.append(probs[channel][i])
        average_probs.append(np.mean(probas, axis=0)[:, 0])
    return average_probs

def manual_y_pred(average_probs, n_splits, thresholds):
    manual_y_pred = []
    for i in range(0, n_splits):
        manual_predict = []
        for j in range(0, len(average_probs[i])):
            if average_probs[i][j] > thresholds[0][i]:
                manual_predict.append('Healthy')
            else:
                manual_predict.append('Unhealthy')
        manual_y_pred.append(manual_predict)
    return manual_y_pred


channel_indices_list = [0, 1, 2, 3, 4, 5]
all_combinations = []
for r in range(1, len(probs) + 1):
    combination = list(itertools.combinations(channel_indices_list, r))
    all_combinations.extend(combination)
    
#print(thresholds)
best_score = 0
best_channel_indices = []
for combo in all_combinations:
        
        # Use combo to get channel indices
        selected_channel_indices = [channel_indices_list[i] for i in combo]
        #print(selected_channel_indices)
        combo_probs = average_probabilities(probs, n_splits, selected_channel_indices)
        manual_pred = manual_y_pred(combo_probs, n_splits, thresholds)
        
        score = []
        for i in range(0, n_splits):
            score.append(objective_score(y_tests[0][i], manual_pred[i]))
            
        print(np.average(score))
        if np.average(score) > best_score:
                best_score = np.average(score)
                best_channel_indices = selected_channel_indices


0.7613507516717578


IndexError: list index out of range

In [None]:
print(best_score)
print(best_channel_indices)

In [None]:
for i in range(0, no_channels):
    print(selected_params[i].keys())