In [1]:
%run "Feature_Selection.ipynb" 

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:32<00:00, 16.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 196/196 [00:07<00:00, 26.09it/s]


In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
# Diagnosis label of every patient, held as an array so it can be indexed
# with the index arrays produced by the cross-validation splitters below.
health_state = np.array(allowed_patients.get_diagnoses())

# Numeric encoding of the same labels: +1 for 'Unhealthy', -1 for anything else.
encoded_health_state = [-1 if label != 'Unhealthy' else 1 for label in health_state]

In [4]:
def get_balanced_accuracy(y_test, y_pred):
    """
    Balanced accuracy: the mean of the per-class recalls.

    The 'Healthy' recall is the share of true 'Healthy' samples predicted as
    'Healthy'. Every sample whose true label is not 'Healthy' counts towards the
    'Unhealthy' denominator, and it is a hit only when both the true label and
    the prediction are exactly 'Unhealthy'.

    Parameters
    ----------
    y_test : sequence of str
        True labels.
    y_pred : sequence of str
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    float
        Mean recall over the classes that actually occur in ``y_test``.
        A class with no true samples is left out of the average instead of
        turning the whole score into NaN. NaN is returned only for empty input.
    """
    healthy_total = 0
    healthy_hits = 0
    unhealthy_total = 0
    unhealthy_hits = 0
    for idx, truth in enumerate(y_test):
        guess = y_pred[idx]
        if truth == 'Healthy':
            healthy_total += 1
            if guess == 'Healthy':
                healthy_hits += 1
        else:
            unhealthy_total += 1
            if truth == 'Unhealthy' and guess == 'Unhealthy':
                unhealthy_hits += 1

    # Only classes that are present contribute a recall; 0/0 is undefined.
    recalls = []
    if healthy_total:
        recalls.append(healthy_hits / healthy_total)
    if unhealthy_total:
        recalls.append(unhealthy_hits / unhealthy_total)

    if not recalls:
        return float('nan')
    return sum(recalls) / len(recalls)

def get_specificity(y_test, y_pred):
    """
    Specificity (true negative rate), with 'Healthy' as the negative class.

    Parameters
    ----------
    y_test : sequence of str
        True labels.
    y_pred : sequence of str
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    float
        TN / (TN + FP). When ``y_test`` contains no 'Healthy' sample the rate
        is undefined and 0.0 is returned instead of raising ZeroDivisionError.
    """
    true_negative = 0
    false_positive = 0
    for idx, truth in enumerate(y_test):
        if truth != 'Healthy':
            continue
        if y_pred[idx] == 'Healthy':
            true_negative += 1
        else:
            # A truly healthy sample predicted as anything else.
            false_positive += 1

    negatives = true_negative + false_positive
    if negatives == 0:
        return 0.0
    return true_negative / negatives
            
def get_f1_score(y_test, y_pred):
    """
    F1 score (harmonic mean of precision and recall), with 'Unhealthy' as the
    positive class.

    Parameters
    ----------
    y_test : sequence of str
        True labels.
    y_pred : sequence of str
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    float
        2*TP / (2*TP + FP + FN). When there are no true positives, false
        positives or false negatives the score is undefined and 0.0 is returned
        instead of raising ZeroDivisionError.
    """
    true_positive = 0
    false_positive = 0
    false_negative = 0
    for idx, truth in enumerate(y_test):
        guess = y_pred[idx]
        if guess == truth:
            if truth == 'Unhealthy':
                true_positive += 1
        elif truth == 'Healthy':
            # Healthy sample that was not predicted as healthy.
            false_positive += 1
        elif truth == 'Unhealthy':
            # Unhealthy sample that was missed.
            false_negative += 1

    denominator = 2 * true_positive + false_positive + false_negative
    if denominator == 0:
        return 0.0
    return (2 * true_positive) / denominator


def get_av_confusion_matrix(y_test, y_pred):
    """
    Element-wise mean of the confusion matrices of several folds.

    ``y_test[k]`` and ``y_pred[k]`` are the true and predicted labels of fold
    ``k``. Each fold's confusion matrix is stored in a 2x2 slot, so every fold
    is expected to yield a 2x2 matrix.
    """
    n_folds = len(y_test)
    per_fold = np.zeros(shape=(n_folds, 2, 2))
    for fold, truth in enumerate(y_test):
        per_fold[fold] = confusion_matrix(truth, y_pred[fold])
    return np.mean(per_fold, axis=0)

def scoring_function(model, X, y):
    """
    Custom scorer with the ``(estimator, X, y)`` signature accepted by
    GridSearchCV's ``scoring`` argument.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``.
    X : array-like
        Samples to predict.
    y : sequence of str
        True labels for ``X``.

    Returns
    -------
    float
        ``0.7 * F1 + 0.3 * balanced accuracy`` of the model's predictions.
    """
    y_pred = model.predict(X)
    # Specificity used to be computed here as well but never entered the score;
    # the dead call was dropped so it cannot fail the scorer for no benefit.
    balanced_acc = get_balanced_accuracy(y, y_pred)
    f1 = get_f1_score(y, y_pred)

    return f1*0.7 + balanced_acc*0.3

def convert_dict_to_array(selected_params):
    """
    Stack a feature dictionary into a single array.

    Each value of ``selected_params`` is indexed by channel, and entry ``j`` is
    written into ``array[j, :, feature_index]``. The array shape is
    ``(no_channels, no_patients, no_features)``, where ``no_channels`` and
    ``no_patients`` are read from the notebook's global namespace.

    Returns the filled array and the number of features (dictionary entries).
    """
    no_features = len(selected_params)
    selected_params_array = np.zeros((no_channels, no_patients, no_features))

    for feature_idx, per_channel_values in enumerate(selected_params.values()):
        for channel in range(no_channels):
            selected_params_array[channel, :, feature_idx] = per_channel_values[channel]

    return selected_params_array, no_features

First, convert the dictionary to an array of a suitable shape for the SVM.

In [5]:
# Stack the feature dictionary into one array indexed as [channel, patient, feature].
selected_params_array, no_features = convert_dict_to_array(selected_params)

In [6]:
def tune_hyperparams(params, health_state, param_grid, scorer='balanced_accuracy', random_state=42):
    """
    Grid-search SVC hyperparameters on a stratified 70% training split.

    Parameters
    ----------
    params : array-like
        Feature matrix, one row per sample.
    health_state : array-like
        Class label of every sample; also used to stratify the split.
    param_grid : dict
        Hyperparameter grid handed to GridSearchCV.
    scorer : str or callable, default 'balanced_accuracy'
        Passed to GridSearchCV as ``scoring``.
    random_state : int or None, default 42
        Seed for the train/test split and for the SVC probability calibration,
        so repeated runs select the same estimator. Pass ``None`` for the
        previous unseeded behaviour.

    Returns
    -------
    The ``best_estimator_`` found by the 5-fold grid search.
    """
    # Only the training part is used for tuning; the held-out 30% is discarded here.
    X_train, _, y_train, _ = train_test_split(
        params,
        health_state,
        test_size=0.3,
        stratify=health_state,
        random_state=random_state,
    )

    # probability=True is kept so the returned estimator offers predict_proba.
    svc = SVC(class_weight='balanced', probability=True, random_state=random_state)

    grid_search = GridSearchCV(svc, param_grid, cv=5, scoring=scorer)
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_

Find the best set of hyperparameters for each channel.

In [7]:
# Hyperparameter grid explored by the grid search for every channel.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

# One tuned estimator per channel, scored with the custom scoring_function.
best_estimators = [
    tune_hyperparams(selected_params_array[channel], health_state, param_grid, scoring_function)
    for channel in range(no_channels)
]
    

Now, using the best set of hyperparameters, we perform a 3-fold stratified split to find the accuracy. This is done in two ways: one calculates the accuracy each time, and the other averages over the channels to find the accuracy.

In [None]:
def skfold(params, health_state, n_splits, best_estimator, scorer):
    """
    Evaluate an estimator with shuffled, stratified k-fold cross-validation.

    Parameters
    ----------
    params : ndarray
        Feature matrix, one row per sample.
    health_state : ndarray
        Class label of every sample.
    n_splits : int
        Number of folds.
    best_estimator : estimator
        Refitted in place on every fold's training data.
    scorer : callable
        Called as ``scorer(estimator, X, y)`` on every fold's test data.

    Returns
    -------
    tuple
        ``(mean scorer value, mean balanced accuracy, y_pred)``, where
        ``y_pred`` holds the predictions of the last fold only.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) #can do repeated skf for better validation

    balanced_accuracy = []
    score_func = []
    for train_index, test_index in skf.split(params, health_state):
        X_train, X_test = params[train_index], params[test_index]
        y_train, y_test = health_state[train_index], health_state[test_index]

        best_estimator.fit(X_train, y_train)

        y_pred = best_estimator.predict(X_test)
        # Score on the held-out fold: X_test has to be paired with y_test.
        # Passing X_train here would compare training predictions with test labels.
        score_func.append(scorer(best_estimator, X_test, y_test))
        balanced_accuracy.append(get_balanced_accuracy(y_test, y_pred))

    return np.mean(np.array(score_func)), np.mean(np.array(balanced_accuracy)), y_pred

In [17]:
# TODO: optionally run a repeated stratified k-fold for even more averaging (boolean flag?)

# TODO: do an averaged skfold and one that combines predictions somehow; use the same random state? (needed because we compare probabilities or true/false predictions)

# TODO: should the CV grid search be in here as well?


def skfold_probabilities(params, health_state, n_splits, best_estimator):
    """
    Collect class probabilities from shuffled, stratified k-fold cross-validation.

    For every fold the estimator is refitted in place on the training part and
    asked for ``predict_proba`` on the test part.

    Returns three lists with one entry per fold:
    the ``predict_proba`` output on the test set, the fraction of 'Healthy'
    labels in the training set, and the true labels of the test set.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    probabilities, sample_percentages, y_tests = [], [], []
    for train_idx, test_idx in splitter.split(params, health_state):
        X_train, y_train = params[train_idx], health_state[train_idx]
        X_test, y_test = params[test_idx], health_state[test_idx]

        # Share of healthy samples in this fold's training data.
        healthy_in_train = np.sum([label == 'Healthy' for label in y_train])
        sample_percentages.append(healthy_in_train / len(y_train))

        best_estimator.fit(X_train, y_train)

        probabilities.append(best_estimator.predict_proba(X_test))
        y_tests.append(y_test)

    return probabilities, sample_percentages, y_tests
    
        
        

In [19]:
n_splits = 3 # number of folds for the stratified cross-validation

# One entry per channel; every entry is itself a list with one item per fold:
#   probs      -> predict_proba output on the fold's test set
#   thresholds -> fraction of 'Healthy' labels in the fold's training set
#   y_tests    -> true labels of the fold's test set
probs = []
thresholds = []
y_tests = []
for i in range(0, no_channels):
    # NOTE: `y_test` remains defined after the loop (it then holds the last
    # channel's per-fold labels); check later cells before renaming it.
    prob, threshold, y_test  = skfold_probabilities(selected_params_array[i], health_state, n_splits, best_estimators[i])
    probs.append(prob)
    thresholds.append(threshold)
    y_tests.append(y_test)

In [85]:
# For every fold, average the predict_proba outputs over all channels and keep
# column 0 of the averaged matrix.
average_probs = []
for fold in range(n_splits):
    per_channel = [probs[channel][fold] for channel in range(no_channels)]
    average_probs.append(np.mean(per_channel, axis=0)[:, 0])

In [97]:
# Turn the channel-averaged probabilities into labels, fold by fold: a sample is
# 'Healthy' when its averaged value exceeds the fold's threshold taken from
# channel 0 (the 'Healthy' share of that fold's training labels).
manual_y_pred = []
for fold in range(n_splits):
    cutoff = thresholds[0][fold]
    manual_y_pred.append(
        ['Healthy' if averaged > cutoff else 'Unhealthy' for averaged in average_probs[fold]]
    )

In [99]:
# Balanced accuracy of the channel-averaged predictions, one value per fold.
# NOTE(review): this list reuses the name of sklearn's `balanced_accuracy_score`
# imported at the top of the notebook, so that function is no longer reachable
# once this cell has run. The name is kept because the next cell prints it.
balanced_accuracy_score = []
score_function_score = []
for i in range(0, n_splits):
    # TODO: scoring_function needs a fitted model and its inputs, so it cannot be
    # applied to these manually thresholded predictions as it stands.
    # The true labels come from the collected `y_tests` rather than the leftover
    # loop variable `y_test`. Every channel is split with the same seeded
    # StratifiedKFold on the same labels, so channel 0's folds are used, matching
    # the thresholds[0][i] used to build manual_y_pred.
    balanced_accuracy_score.append(get_balanced_accuracy(y_tests[0][i], manual_y_pred[i]))
    
    

In [100]:
# Per-fold balanced accuracy of the channel-averaged predictions.
print(balanced_accuracy_score)

[0.6686274509803922, 0.5980392156862745, 0.6666666666666667]
