This notebook exists to experiment with different methods for classification

In [1]:
%run "Feature_Selection.ipynb" 

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:08<00:00, 64.48it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 202/202 [00:00<00:00, 237.05it/s]


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
health_state = allowed_patients.get_diagnoses()

encoded_health_state = [True if label == 'Unhealthy' else False for label in health_state]

print(len(health_state))

202


## SVM

In [4]:
def convert_dict_to_array(selected_params):
    no_features = len(selected_params)
    selected_params_array = np.zeros((no_patients, no_features))
    
    for i, values in enumerate(selected_params.values()):
        selected_params_array[:, i] = values
    return selected_params_array, no_features
        
selected_params_array, no_features = convert_dict_to_array(selected_params)
print(selected_params_array)

[[ 3.59814926e+00]
 [ 3.40976803e+00]
 [ 2.09016303e+00]
 [ 8.59284492e+00]
 [ 6.84643085e+00]
 [ 1.41721106e+01]
 [ 1.83739075e+01]
 [ 3.30304554e+00]
 [ 6.34594992e+00]
 [ 1.79516388e+01]
 [ 1.65761481e+01]
 [ 1.80630309e+01]
 [ 2.51633679e+01]
 [ 9.42859279e+00]
 [ 1.57471829e+01]
 [ 9.63279317e+00]
 [ 5.52476317e+00]
 [ 1.38431217e+01]
 [ 1.72186198e+01]
 [ 1.23860823e+01]
 [ 1.68937720e+01]
 [ 1.42008653e+01]
 [ 1.26775316e+01]
 [ 1.98349474e+01]
 [ 9.44022765e+00]
 [ 1.22941937e+01]
 [ 7.47302836e+00]
 [ 1.12364574e+01]
 [ 1.31506514e+01]
 [ 9.95356105e+00]
 [ 1.45779778e+01]
 [ 1.90225555e+01]
 [ 2.47351608e+01]
 [ 2.24477170e+01]
 [ 1.26269813e+01]
 [ 1.53567284e+01]
 [ 1.12440616e+01]
 [ 1.23078099e+01]
 [ 9.28140205e+00]
 [ 4.20085398e+00]
 [ 1.44510943e+01]
 [ 5.40146750e+00]
 [ 9.72382279e+00]
 [ 1.81259084e+01]
 [ 2.06150498e+01]
 [ 1.39247441e+01]
 [ 2.96907204e+00]
 [ 1.55597593e+01]
 [ 1.08377665e+01]
 [ 1.90860085e+01]
 [ 1.56418283e+01]
 [ 1.40067642e+01]
 [ 1.1693295

need to weight the train and test sets to cope with difference with numbers of healthy and unhealthy, stratified k-fold validation

In [6]:
#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(selected_params_array, health_state, test_size=0.3, stratify=health_state)

#init and train model, using radial basis functions
svm_model = SVC(kernel='rbf', gamma='scale', probability=True)  #'scale' normalises data, prevents overfitting
#probability = True allows calculation of probabilities through 5-fold CV

svm_model.fit(X_train, y_train)

#predicting probabilities
probabilities = svm_model.predict_proba(X_test)
print(probabilities)


#predictions
y_pred = svm_model.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(y_test, y_pred)

#view a classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

[[0.30086271 0.69913729]
 [0.23360425 0.76639575]
 [0.21455691 0.78544309]
 [0.21472765 0.78527235]
 [0.31071249 0.68928751]
 [0.28279619 0.71720381]
 [0.22607311 0.77392689]
 [0.21466043 0.78533957]
 [0.21469077 0.78530923]
 [0.21426438 0.78573562]
 [0.21409692 0.78590308]
 [0.21466313 0.78533687]
 [0.26472496 0.73527504]
 [0.2145474  0.7854526 ]
 [0.21463023 0.78536977]
 [0.25555711 0.74444289]
 [0.2146893  0.7853107 ]
 [0.21474601 0.78525399]
 [0.21442512 0.78557488]
 [0.21467715 0.78532285]
 [0.22137009 0.77862991]
 [0.22311064 0.77688936]
 [0.30165406 0.69834594]
 [0.21475888 0.78524112]
 [0.21474331 0.78525669]
 [0.21470243 0.78529757]
 [0.214671   0.785329  ]
 [0.21475462 0.78524538]
 [0.21473822 0.78526178]
 [0.2156903  0.7843097 ]
 [0.2147006  0.7852994 ]
 [0.26904475 0.73095525]
 [0.21393304 0.78606696]
 [0.21470966 0.78529034]
 [0.21462155 0.78537845]
 [0.2487424  0.7512576 ]
 [0.21475826 0.78524174]
 [0.21475282 0.78524718]
 [0.22304525 0.77695475]
 [0.21399066 0.78600934]


### Hyper Parameter Tuning
- c is inversely proportional to l2 regularisation parameter
- gamma determines scaling of features


In [7]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Initialize SVM
svc = SVC()

# Perform grid search
grid_search = GridSearchCV(svc, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters
print(grid_search.best_params_)

# Train with best parameters
best_svc = grid_search.best_estimator_
best_svc.fit(X_train, y_train)

{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}


In [8]:
#predictions
y_pred = best_svc.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7868852459016393


In [11]:
print(y_pred, y_test)

['Unhealthy' 'Healthy' 'Unhealthy' 'Unhealthy' 'Healthy' 'Unhealthy'
 'Healthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Healthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Healthy'] ['Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'U

In [10]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

     Healthy       0.60      0.21      0.32        14
   Unhealthy       0.80      0.96      0.87        47

    accuracy                           0.79        61
   macro avg       0.70      0.59      0.59        61
weighted avg       0.76      0.79      0.75        61



## Comparing Accuracy
- MSE - average between true and predicted values
- R2 score - measure of proportion of variance in dependent variable (target) that is explained by independent variables (features), indicates how well the model captures the variations in the data

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

#calculate MSE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

#calculate R2
r2_lasso = r2_score(y_test, y_pred_lasso)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("LASSO Regression:")
print("MSE:", mse_lasso)
print("R2 Score:", r2_lasso)
print()
print("Ridge Regression:")
print("MSE:", mse_ridge)
print("R2 Score:", r2_ridge)

NameError: name 'y_pred_lasso' is not defined