This notebook experiments with different classification methods.

In [2]:
%run "Feature_Selection.ipynb" 

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:40<00:00, 13.66it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 196/196 [00:03<00:00, 55.17it/s]


In [3]:
# Names of the features that are excluded from the classifier input.
removed_params = ['rr_std', 'RMSSD', 'pNN50', 'mean', 'lf', 'hf', 'skews', 'rr_mean', 'power_ratio', 'std', 'kurtosis', 'shannon_en'] #convert to a set?

# Keep every entry of `params` whose name is not excluded (insertion order is preserved).
# `params` is not defined in this notebook - presumably it comes from the %run'd Feature_Selection notebook.
selected_params = {
    name: values
    for name, values in params.items()
    if name not in removed_params
}
print(selected_params.keys())

dict_keys(['sd_ratio'])


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [5]:
# Diagnosis labels as a numpy array so they can be fancy-indexed by the CV splitters later on.
# `allowed_patients` is not defined in this notebook - presumably it comes from the %run'd notebook.
health_state = np.array(allowed_patients.get_diagnoses())

# Numeric encoding of the labels: +1 for 'Unhealthy', -1 for every other label.
encoded_health_state = [
    -1 if diagnosis != 'Unhealthy' else 1
    for diagnosis in health_state
]

# Number of labelled patients.
print(len(health_state))

196


In [6]:
def get_balanced_accuracy(y_test, y_pred):
    """
    Balanced accuracy: the mean of the recalls of the 'Healthy' and 'Unhealthy' classes.

    Parameters
    ----------
    y_test : sequence of str
        True labels. Every label other than 'Healthy' counts towards the
        unhealthy total; only the exact label 'Unhealthy' can be a correct
        unhealthy prediction.
    y_pred : sequence of str
        Predicted labels, position-aligned with y_test.

    Returns
    -------
    float
        Mean recall over the classes that actually occur in y_test. If only
        one class is present, its recall is returned on its own (averaging in
        a 0/0 term would give nan). nan is returned for empty input.
    """
    num_healthy_true = 0
    num_unhealthy_true = 0
    count_healthy_accurate = 0
    count_unhealthy_accurate = 0
    for i in range(len(y_test)):
        truth = y_test[i]
        if truth == 'Healthy':
            num_healthy_true += 1
            if y_pred[i] == 'Healthy':
                count_healthy_accurate += 1
        else:
            num_unhealthy_true += 1
            if truth == 'Unhealthy' and y_pred[i] == 'Unhealthy':
                count_unhealthy_accurate += 1

    # Only classes with at least one true sample have a defined recall.
    recalls = []
    if num_healthy_true:
        recalls.append(count_healthy_accurate / num_healthy_true)
    if num_unhealthy_true:
        recalls.append(count_unhealthy_accurate / num_unhealthy_true)

    if not recalls:
        return float('nan')
    return sum(recalls) / len(recalls)

def get_specificity(y_test, y_pred):
    """
    Specificity (true negative rate), with 'Healthy' as the negative class.

    Parameters
    ----------
    y_test : sequence of str
        True labels.
    y_pred : sequence of str
        Predicted labels, position-aligned with y_test.

    Returns
    -------
    float
        TN / (TN + FP), i.e. the fraction of truly healthy samples that were
        predicted 'Healthy'. When y_test contains no 'Healthy' sample the
        ratio is undefined and 0.0 is returned instead of raising
        ZeroDivisionError.
    """
    true_negative = 0
    false_positive = 0
    for i in range(len(y_test)):
        # Only truly healthy samples contribute to specificity.
        if y_test[i] != 'Healthy':
            continue
        if y_pred[i] == 'Healthy':
            true_negative += 1
        else:
            false_positive += 1

    healthy_total = true_negative + false_positive
    if healthy_total == 0:
        return 0.0
    return true_negative / healthy_total
            
def get_f1_score(y_test, y_pred):
    """
    F1 score (balance between precision and recall), with 'Unhealthy' as the positive class.

    Parameters
    ----------
    y_test : sequence of str
        True labels.
    y_pred : sequence of str
        Predicted labels, position-aligned with y_test.

    Returns
    -------
    float
        2*TP / (2*TP + FP + FN). When there are no true positives, false
        positives or false negatives at all, the score is undefined and 0.0
        is returned instead of raising ZeroDivisionError.
    """
    true_positive = 0
    false_positive = 0
    false_negative = 0
    for i in range(len(y_test)):
        truth = y_test[i]
        predicted = y_pred[i]
        if predicted == truth:
            # Correct 'Healthy' predictions (true negatives) do not enter F1.
            if truth == 'Unhealthy':
                true_positive += 1
        elif truth == 'Healthy':
            false_positive += 1
        elif truth == 'Unhealthy':
            false_negative += 1

    denominator = 2 * true_positive + false_positive + false_negative
    if denominator == 0:
        return 0.0
    return (2 * true_positive) / denominator


def get_av_confusion_matrix(y_test, y_pred, labels=None):
    """
    Element-wise mean of the per-fold confusion matrices.

    Parameters
    ----------
    y_test : sequence of array-like
        True labels, one array per fold.
    y_pred : sequence of array-like
        Predicted labels, one array per fold, aligned with y_test.
    labels : sequence, optional
        Label order for the rows/columns of every matrix. Defaults to the
        sorted set of all labels seen in any fold of y_test or y_pred.

    Returns
    -------
    numpy.ndarray
        (n_labels, n_labels) array holding the mean confusion matrix.

    Raises
    ------
    ValueError
        If no folds are given.
    """
    if len(y_test) == 0:
        raise ValueError("at least one fold is required to average confusion matrices")

    if labels is None:
        # Use one label set for every fold. Without it, a fold that happens to
        # contain a single class produces a 1x1 matrix that numpy would
        # silently broadcast into the larger slot.
        labels = sorted({label for fold in list(y_test) + list(y_pred) for label in fold})

    n_labels = len(labels)
    av_confusion_mat = np.zeros(shape=(len(y_test), n_labels, n_labels))
    for i in range(len(y_test)):
        av_confusion_mat[i] = confusion_matrix(y_test[i], y_pred[i], labels=labels)
    return np.mean(av_confusion_mat, axis=0)

def scoring_function(model, X, y):
    """
    Custom scorer with the (estimator, X, y) signature, passed as `scoring` to GridSearchCV.

    The score is a weighted blend of the F1 score (70%) and the balanced
    accuracy (30%) of the model's predictions on X.

    Parameters
    ----------
    model : fitted estimator exposing `predict`
    X : features to predict on
    y : true labels for X ('Healthy' / 'Unhealthy' strings, as expected by the metric helpers)

    Returns
    -------
    float
        0.7 * F1 + 0.3 * balanced accuracy (higher is better).
    """
    f1_weight = 0.7
    balanced_acc_weight = 0.3

    y_pred = model.predict(X)
    # Specificity is deliberately not computed here: it never entered the score.
    f1 = get_f1_score(y, y_pred)
    balanced_acc = get_balanced_accuracy(y, y_pred)

    return f1*f1_weight + balanced_acc*balanced_acc_weight

## SVM

In [7]:
def convert_dict_to_array(selected_params):
    """
    Pack a dict of per-feature values into one array indexed [channel, patient, feature].

    Relies on the notebook-level globals `no_channels` and `no_patients`
    (not defined in this notebook - presumably set by the %run'd notebook).

    Parameters
    ----------
    selected_params : dict
        Maps feature name -> per-channel values; entry `values[j]` is written
        along the patient axis of channel j.
        (assumes values[j] holds one value per patient - TODO confirm)

    Returns
    -------
    (numpy.ndarray, int)
        The (no_channels, no_patients, no_features) array and the number of
        features. Features appear in the dict's iteration order.
    """
    no_features = len(selected_params)
    selected_params_array = np.zeros((no_channels, no_patients, no_features))

    for feature_idx, per_channel_values in enumerate(selected_params.values()):
        for channel_idx in range(no_channels):
            selected_params_array[channel_idx, :, feature_idx] = per_channel_values[channel_idx]

    return selected_params_array, no_features

        
#pack the selected features into a (channel, patient, feature) array and record how many features there are
selected_params_array, no_features = convert_dict_to_array(selected_params)


### Basic SVM

In [20]:
#splitting the data into train and test sets (features of channel 0 only)
#random_state is fixed so that re-running the cell reproduces the same split
X_train, X_test, y_train, y_test = train_test_split(
    selected_params_array[0], health_state, test_size=0.3, stratify=health_state, random_state=42
)

#init and train model, using radial basis functions
#gamma='scale' derives the kernel coefficient from the number of features and the variance of X
#probability=True enables predict_proba; the probabilities are calibrated with an internal 5-fold CV,
#which is randomised, hence the fixed random_state
svm_model = SVC(kernel='rbf', gamma='scale', probability=True, class_weight='balanced', random_state=42)

svm_model.fit(X_train, y_train)

#predicting probabilities (one column per class, in the order of svm_model.classes_)
probabilities = svm_model.predict_proba(X_test)
print(probabilities)


#predictions
y_pred = svm_model.predict(X_test)
print(y_pred)

#fraction of healthy patients in the training set, used as the decision threshold below
percentage_healthy = np.sum([x=='Healthy' for x in y_train])/len(y_train)
print(percentage_healthy)

#manual thresholding: call a patient 'Healthy' when P(Healthy) exceeds the training-set prevalence
#the column is looked up in classes_ rather than assuming 'Healthy' is column 0
healthy_column = list(svm_model.classes_).index('Healthy')
manual_predict = [
    'Healthy' if p_healthy > percentage_healthy else 'Unhealthy'
    for p_healthy in probabilities[:, healthy_column]
]

print(manual_predict)

#evaluating accuracy of the model (on the model's own predictions, not manual_predict)
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = get_balanced_accuracy(y_test, y_pred)
print("Accuracy:", accuracy)
print('balanced accuracy:', balanced_accuracy)

#number of healthy patients in the test set, then the raw labels next to the predictions
print(np.sum([x=='Healthy' for x in y_test]))
print(y_test, y_pred)

#view a classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

[[0.1776776  0.8223224 ]
 [0.32762719 0.67237281]
 [0.36315422 0.63684578]
 [0.19114033 0.80885967]
 [0.13890857 0.86109143]
 [0.13595551 0.86404449]
 [0.15702067 0.84297933]
 [0.35192144 0.64807856]
 [0.20533057 0.79466943]
 [0.18178267 0.81821733]
 [0.36193017 0.63806983]
 [0.12757749 0.87242251]
 [0.15992855 0.84007145]
 [0.14491477 0.85508523]
 [0.25726742 0.74273258]
 [0.22060341 0.77939659]
 [0.264638   0.735362  ]
 [0.10923593 0.89076407]
 [0.23614693 0.76385307]
 [0.10808522 0.89191478]
 [0.21012091 0.78987909]
 [0.1359941  0.8640059 ]
 [0.30467727 0.69532273]
 [0.14831865 0.85168135]
 [0.18625672 0.81374328]
 [0.31558503 0.68441497]
 [0.09878978 0.90121022]
 [0.35577334 0.64422666]
 [0.12622218 0.87377782]
 [0.24710084 0.75289916]
 [0.18569333 0.81430667]
 [0.36547052 0.63452948]
 [0.35267362 0.64732638]
 [0.13817142 0.86182858]
 [0.15748661 0.84251339]
 [0.13609207 0.86390793]
 [0.13699357 0.86300643]
 [0.13971907 0.86028093]
 [0.18450037 0.81549963]
 [0.24258428 0.75741572]


### Hyper Parameter Tuning
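- `C` is inversely proportional to the strength of the L2 regularisation (a larger `C` means weaker regularisation)
- `gamma` is the kernel coefficient of the `rbf` and `poly` kernels; it controls how far the influence of a single training sample reaches (it is ignored by the `linear` kernel)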


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score

#recall needs encoded_health_state
#splitting the data into train and test sets
#selected_params_array is (channel, patient, feature); SVC needs a 2-D (patient, feature) matrix whose
#first axis matches health_state, so take channel 0 as the basic SVM cell does
#random_state is fixed so that the split (and therefore the selected hyperparameters) is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    selected_params_array[0], health_state, test_size=0.3, stratify=health_state, random_state=42
)

# define hyperparameter grid to test
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

#initialise
svc = SVC(class_weight='balanced')

# perform grid search, ranking candidates with the custom F1 / balanced-accuracy scorer
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring=scoring_function)
grid_search.fit(X_train, y_train)


print(grid_search.best_params_)

# model with the best parameters; GridSearchCV (refit=True by default) has already
# refitted best_estimator_ on the whole of X_train, so no further fit is needed
best_svc = grid_search.best_estimator_

In [None]:
#predictions of the tuned model on the held-out test set
y_pred = best_svc.predict(X_test)

#evaluating accuracy of the model: plain accuracy and the class-balanced variant
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = get_balanced_accuracy(y_test, y_pred)

print("Accuracy:", accuracy)
print('balanced accuracy:', balanced_accuracy)

### Stratified k-fold validation

The healthy and unhealthy classes are imbalanced, so the train and test sets need to keep the same class proportions as the full data set; stratified k-fold validation does this for every fold.

In [None]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

n_splits = 3 #3 fold validation
#random_state is fixed so that the shuffled folds are reproducible
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) #can do repeated skf for better validation

#selected_params_array is (channel, patient, feature); the fold indices refer to patients,
#so split the 2-D (patient, feature) matrix of channel 0, as the basic SVM cell does
X_all = selected_params_array[0]

accuracies=[]
balanced_accuracies=[]
y_pred_list = []
y_test_list=[]
for train_index, test_index in skf.split(X_all, health_state):
    #getting test and train data sets
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = health_state[train_index], health_state[test_index]

    y_test_list.append(y_test)
    #training model with tuned hyperparameters
    #clone() gives a fresh, unfitted copy so grid_search.best_estimator_ is not refitted in place
    #NOTE: the hyperparameters were tuned on data that overlaps these folds, so the scores are optimistic
    tuned_svc = clone(grid_search.best_estimator_)
    tuned_svc.fit(X_train, y_train)

    #evaluating model
    y_pred = tuned_svc.predict(X_test)
    y_pred_list.append(y_pred)
    accuracies.append(accuracy_score(y_test, y_pred))
    balanced_accuracies.append((get_balanced_accuracy(y_test, y_pred)))

In [None]:
#per-fold accuracy and balanced accuracy collected by the stratified k-fold loop
print(accuracies)
print(balanced_accuracies)

In [None]:
#mean balanced accuracy over the folds
print(np.mean(balanced_accuracies))

In [None]:
#predictions and true labels of the fold at index 2 (the last fold when n_splits = 3)
print(y_pred_list[2], y_test_list[2])
#confusion_matrix expects (y_true, y_pred): rows are the true labels, columns the predictions
print(confusion_matrix(y_test_list[2], y_pred_list[2]))

In [None]:
#y_pred and y_test hold whatever the most recently executed cell assigned to them
#(the last fold, when run straight after the k-fold loop)
print(y_pred, y_test)

In [None]:
#per-class precision / recall / F1 for the fold at index 1
print("Classification Report:")
print(classification_report(y_test_list[1], y_pred_list[1]))
#specificity (true negative rate, 'Healthy' as the negative class) for the same fold
print(get_specificity(y_test_list[1], y_pred_list[1]))
#confusion matrix averaged over all folds
print(get_av_confusion_matrix(y_test_list, y_pred_list))