This notebook experiments with different classification methods.

In [1]:
# Execute the feature-selection notebook before anything else in this notebook runs.
# NOTE(review): later cells use allowed_patients, selected_params, no_patients and np,
# none of which are defined in this notebook - presumably they are provided by this run; confirm.
%run "Feature_Selection.ipynb" #change to feature selection when is fixed

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:13<00:00, 40.50it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 229/229 [00:01<00:00, 169.64it/s]


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [6]:
# One diagnosis label per patient, as returned by allowed_patients
health_state = allowed_patients.get_diagnoses()

# Boolean form of the labels: True for 'Unhealthy', False for anything else
encoded_health_state = [bool(label == 'Unhealthy') for label in health_state]

print(len(health_state))

229


## SVM

In [11]:
def convert_dict_to_array(selected_params):
    """Pack a dict of per-feature value sequences into a 2-D float array.

    The number of rows is inferred from the data itself rather than read from
    a notebook-level global, so the function works on any dict and in a fresh
    kernel.

    Parameters
    ----------
    selected_params : dict
        Maps each feature name to its values (one entry per row). Columns
        follow the dict's iteration order. A scalar value is broadcast down
        its whole column.

    Returns
    -------
    selected_params_array : numpy.ndarray
        Float array of shape (n_rows, no_features); column i holds the values
        of the i-th dict entry.
    no_features : int
        Number of entries in ``selected_params``.

    Raises
    ------
    ValueError
        Raised by NumPy when a feature's length does not match the row count.
    """
    no_features = len(selected_params)

    # Row count = length of the first non-scalar feature; 0 if there is none.
    n_rows = next(
        (np.shape(values)[0] for values in selected_params.values() if np.ndim(values) >= 1),
        0,
    )
    selected_params_array = np.zeros((n_rows, no_features))

    for i, values in enumerate(selected_params.values()):
        # NumPy rejects a column whose length differs from n_rows.
        selected_params_array[:, i] = values
    return selected_params_array, no_features
        
# Build the feature matrix (one column per entry of selected_params) used by the models below.
# NOTE(review): selected_params is not defined in this notebook - presumably it comes from
# the %run of Feature_Selection.ipynb; confirm.
selected_params_array, no_features = convert_dict_to_array(selected_params)
#print(selected_params_array)

**TODO:** account for the imbalance between the numbers of healthy and unhealthy patients, e.g. by weighting the classes when training and by using stratified train/test splits and stratified k-fold cross-validation.

In [16]:
#splitting the data into train and test sets
#random_state makes the split repeatable across kernel restarts;
#stratify keeps the healthy/unhealthy ratio the same in the train and test sets
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    selected_params_array,
    health_state,
    test_size=0.3,
    random_state=SPLIT_SEED,
    stratify=health_state,
)

#init and train model, using radial basis functions
#gamma='scale' sets gamma = 1 / (n_features * X.var()); it does NOT rescale the features themselves
#probability=True enables predict_proba via an internal 5-fold CV, which is random -> seed it too
svm_model = SVC(kernel='rbf', gamma='scale', probability=True, random_state=SPLIT_SEED)

svm_model.fit(X_train, y_train)

#predicting probabilities (one column per class, ordered as svm_model.classes_)
probabilities = svm_model.predict_proba(X_test)
print(probabilities)


#predictions
y_pred = svm_model.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


print(y_test, y_pred)

#view a classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

[[0.21281222 0.78718778]
 [0.16947771 0.83052229]
 [0.18725597 0.81274403]
 [0.16618926 0.83381074]
 [0.17210202 0.82789798]
 [0.17766335 0.82233665]
 [0.17291664 0.82708336]
 [0.18506021 0.81493979]
 [0.18514797 0.81485203]
 [0.18229647 0.81770353]
 [0.23208504 0.76791496]
 [0.17008524 0.82991476]
 [0.1763109  0.8236891 ]
 [0.16180646 0.83819354]
 [0.17817129 0.82182871]
 [0.18272526 0.81727474]
 [0.16492514 0.83507486]
 [0.20260482 0.79739518]
 [0.22162559 0.77837441]
 [0.16021679 0.83978321]
 [0.15955258 0.84044742]
 [0.16238397 0.83761603]
 [0.16822557 0.83177443]
 [0.16349276 0.83650724]
 [0.16896994 0.83103006]
 [0.16595206 0.83404794]
 [0.16283572 0.83716428]
 [0.16333142 0.83666858]
 [0.16511066 0.83488934]
 [0.17198039 0.82801961]
 [0.1634592  0.8365408 ]
 [0.21105298 0.78894702]
 [0.16968541 0.83031459]
 [0.16512638 0.83487362]
 [0.19301179 0.80698821]
 [0.17867836 0.82132164]
 [0.17461467 0.82538533]
 [0.15079375 0.84920625]
 [0.15825729 0.84174271]
 [0.16384032 0.83615968]


### Hyper Parameter Tuning
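- `C` is inversely proportional to the strength of the L2 regularisation: a smaller `C` means stronger regularisation
- `gamma` is the RBF kernel coefficient: it controls how far the influence of a single training sample reaches (it is ignored by the linear kernel)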


In [12]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
# (gamma only affects the rbf kernel; the linear kernel ignores it)
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Initialize SVM
svc = SVC()

# Perform grid search with 5-fold cross-validation on the training set
# NOTE(review): with no `scoring` argument the candidates are ranked by plain accuracy,
# which a majority-class predictor can win on imbalanced labels - consider
# scoring='balanced_accuracy'.
grid_search = GridSearchCV(svc, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters
print(grid_search.best_params_)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been retrained
# on all of X_train with the best parameters; a second fit would only repeat that work.
best_svc = grid_search.best_estimator_

{'C': 0.01, 'gamma': 'scale', 'kernel': 'linear'}


In [14]:
#predict the test-set labels with the tuned model
y_pred = best_svc.predict(X_test)

#fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8260869565217391


In [15]:
# Print every predicted test-set label so the spread of predicted classes can be inspected
print(y_pred)

['Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy']


## Comparing Accuracy
- MSE - average of the squared differences between the true and predicted values
- R2 score - measure of proportion of variance in dependent variable (target) that is explained by independent variables (features), indicates how well the model captures the variations in the data

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# NOTE(review): y_pred_lasso and y_pred_ridge are not defined anywhere in this notebook and
# this cell has no execution count - it looks carried over from a regression experiment;
# confirm where these predictions should come from before running.
# NOTE(review): y_test comes from splitting health_state, whose labels are compared against
# the string 'Unhealthy' earlier on, so MSE / R2 would presumably need a numeric encoding
# of the labels first - verify.

#calculate MSE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

#calculate R2
r2_lasso = r2_score(y_test, y_pred_lasso)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Report both metrics for each model, separated by a blank line
print("LASSO Regression:")
print("MSE:", mse_lasso)
print("R2 Score:", r2_lasso)
print()
print("Ridge Regression:")
print("MSE:", mse_ridge)
print("R2 Score:", r2_ridge)