This notebook exists to experiment with different methods for classification

In [1]:
%run "Feature_Selection.ipynb" #change to feature selection when is fixed

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:13<00:00, 40.50it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 229/229 [00:01<00:00, 169.64it/s]


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [6]:
health_state = allowed_patients.get_diagnoses()

encoded_health_state = [True if label == 'Unhealthy' else False for label in health_state]

print(len(health_state))

229


## SVM

In [11]:
def convert_dict_to_array(selected_params):
    no_features = len(selected_params)
    selected_params_array = np.zeros((no_patients, no_features))
    
    for i, values in enumerate(selected_params.values()):
        selected_params_array[:, i] = values
    return selected_params_array, no_features
        
selected_params_array, no_features = convert_dict_to_array(selected_params)
#print(selected_params_array)

need to weight the train and test sets to cope with difference with numbers of healthy and unhealthy, stratified k-fold validation

In [27]:
#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(selected_params_array, health_state, test_size=0.3, stratify=health_state)

#init and train model, using radial basis functions
svm_model = SVC(kernel='rbf', gamma='scale', probability=True)  #'scale' normalises data, prevents overfitting
#probability = True allows calculation of probabilities through 5-fold CV

svm_model.fit(X_train, y_train)

#predicting probabilities
probabilities = svm_model.predict_proba(X_test)
print(probabilities)


#predictions
y_pred = svm_model.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(sum(i == 'Healthy' for i in y_train))

print(y_test, y_pred)

#view a classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

[[0.1861225  0.8138775 ]
 [0.2048213  0.7951787 ]
 [0.20437488 0.79562512]
 [0.21601423 0.78398577]
 [0.1834946  0.8165054 ]
 [0.19495662 0.80504338]
 [0.26476777 0.73523223]
 [0.19625426 0.80374574]
 [0.20010932 0.79989068]
 [0.17620194 0.82379806]
 [0.20058548 0.79941452]
 [0.18599563 0.81400437]
 [0.18125408 0.81874592]
 [0.16828853 0.83171147]
 [0.17752866 0.82247134]
 [0.15744232 0.84255768]
 [0.16081926 0.83918074]
 [0.16267738 0.83732262]
 [0.24933296 0.75066704]
 [0.17595998 0.82404002]
 [0.2217895  0.7782105 ]
 [0.17320528 0.82679472]
 [0.17259614 0.82740386]
 [0.22318556 0.77681444]
 [0.18186906 0.81813094]
 [0.16950756 0.83049244]
 [0.18265531 0.81734469]
 [0.1928695  0.8071305 ]
 [0.17578016 0.82421984]
 [0.25252243 0.74747757]
 [0.18225092 0.81774908]
 [0.18594278 0.81405722]
 [0.18301404 0.81698596]
 [0.24465629 0.75534371]
 [0.18808112 0.81191888]
 [0.17039958 0.82960042]
 [0.15925887 0.84074113]
 [0.25758213 0.74241787]
 [0.21963818 0.78036182]
 [0.18266295 0.81733705]


### Hyper Parameter Tuning
- c is inversely proportional to l2 regularisation parameter
- gamma determines scaling of features


In [12]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Initialize SVM
svc = SVC()

# Perform grid search
grid_search = GridSearchCV(svc, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters
print(grid_search.best_params_)

# Train with best parameters
best_svc = grid_search.best_estimator_
best_svc.fit(X_train, y_train)

{'C': 0.01, 'gamma': 'scale', 'kernel': 'linear'}


In [14]:
#predictions
y_pred = best_svc.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8260869565217391


In [15]:
print(y_pred)

['Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy']


## Comparing Accuracy
- MSE - average between true and predicted values
- R2 score - measure of proportion of variance in dependent variable (target) that is explained by independent variables (features), indicates how well the model captures the variations in the data

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

#calculate MSE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

#calculate R2
r2_lasso = r2_score(y_test, y_pred_lasso)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("LASSO Regression:")
print("MSE:", mse_lasso)
print("R2 Score:", r2_lasso)
print()
print("Ridge Regression:")
print("MSE:", mse_ridge)
print("R2 Score:", r2_ridge)