This notebook exists to experiment with different methods for classification

In [1]:
%run "Parameter_Estimation.ipynb" #change to feature selection when is fixed

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:15<00:00, 36.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 229/229 [00:02<00:00, 91.90it/s]


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
health_state = allowed_patients.get_diagnoses()

encoded_health_state = [True if label == 'Unhealthy' else False for label in health_state]

print(len(health_state))
# unhealthy_peak_to_peak_averages = peak_to_peak_averages[encoded_health_state]
# healthy_peak_to_peak_averages = peak_to_peak_averages[~np.array(encoded_health_state)]

# unhealthy_peak_amp = peak_amp_average[encoded_health_state]
# healthy_peak_amp = peak_amp_average[~np.array(encoded_health_state)]


229


In [46]:
from sklearn.feature_selection import RFE

# initializing parameter array
X = np.zeros((no_patients, 3))#need no. samples as rows, no. features as columns for machine learning analysis
X[:, 0] = params['rr_mean']
X[:, 1] = params['rr_std']
X[:, 2] = params['RMSSD']
#X[:, 3] = params['pNN50']

#splitting data into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, health_state, test_size=0.3)

#initialise SVM -- have to use a linear kernel??
svm = SVC(kernel="linear")

#initialize RFE with the SVM model and desired number of feauters
rfe = RFE(estimator=svm, n_features_to_select=1)

rfe.fit(X_train, y_train)

In [47]:
print("Selected features:", rfe.support_)
print("Feature ranking:", rfe.ranking_)

Selected features: [False False  True]
Feature ranking: [3 2 1]


In [38]:
# transform the dataset to include only the selected features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# train the SVM on the selected features
svm.fit(X_train_rfe, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test_rfe)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with selected features:", accuracy)

Accuracy with selected features: 0.8985507246376812


### Forward/Backward Elimination

May be slower than RFE but does not need to have coefficients i.e. a linear kernel.

In [41]:
from sklearn.feature_selection import SequentialFeatureSelector 

svm_rbf = SVC(kernel='rbf')

SFS_forward = SequentialFeatureSelector(estimator=svm_rbf, tol = 5)

SFS_forward.fit(X_train, y_train)

SFS_forward.get_support()

array([ True, False, False])

In [45]:
SFS_backward = SequentialFeatureSelector(estimator=svm_rbf, tol=-5, direction='backward')

SFS_backward.fit(X_train, y_train)

SFS_forward.get_support()

array([ True, False, False])

In [None]:
#these agree only first one should be kept but disagrees with RFE???

## Embedded Methods:

Also done whilst using the model

- Regularization: Techniques like LASSO (L1 regularization) and Ridge (L2 regularization) penalize the magnitude of feature coefficients, forcing less important features to have coefficients close to zero.

In [25]:
#LASSO/Ridge Regression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

#changing health_state to binary for use in regression
binary_health_state = [1 if label == 'Unhealthy' else 0 for label in health_state]

#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, binary_health_state, test_size=0.3)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

#create and fit regression models
lasso_alpha = 0.1  # Regularization strength (hyperparameter)
lasso = Lasso(alpha=lasso_alpha)
lasso.fit(X_train_scaled, y_train)

ridge_alpha = 0.1 
ridge = Ridge(alpha=ridge_alpha)
ridge.fit(X_train_scaled, y_train)

#use the trained models for prediction
X_test_scaled = scaler.fit_transform(X_test)

y_pred_ridge = ridge.predict(X_test_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)



## Comparing Accuracy
- MSE - average between true and predicted values
- R2 score - measure of proportion of variance in dependent variable (target) that is explained by independent variables (features), indicates how well the model captures the variations in the data

In [26]:
from sklearn.metrics import mean_squared_error, r2_score

#calculate MSE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

#calculate R2
r2_lasso = r2_score(y_test, y_pred_lasso)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("LASSO Regression:")
print("MSE:", mse_lasso)
print("R2 Score:", r2_lasso)
print()
print("Ridge Regression:")
print("MSE:", mse_ridge)
print("R2 Score:", r2_ridge)

LASSO Regression:
MSE: 0.12358525815217393
R2 Score: -0.08961002604166657

Ridge Regression:
MSE: 0.11076380627276888
R2 Score: 0.023432441361754486


## SVM

In [27]:
#will have feature selection to decide this number
no_features = 1

# Initialize the array
selected_params = np.zeros((no_patients, no_features))#need no. samples as rows, no. features as columns for machine learning analysis

# Populate the array with values from the dictionary
selected_params[:, 0] = params['rr_mean']
#selected_params[:, 1] = params['rr_std']
#selected_params[:, 2] = params['RMSSD']
#selected_params[:, 3] = params['pNN50']#fix this

print(selected_params)

[[ 696.15662651]
 [ 760.53246753]
 [ 831.97101449]
 [ 727.95      ]
 [ 685.23809524]
 [ 666.72413793]
 [ 808.54166667]
 [1012.52631579]
 [ 672.81395349]
 [ 670.12643678]
 [1150.8       ]
 [ 703.37804878]
 [ 703.        ]
 [ 917.68253968]
 [1010.63157895]
 [ 712.07407407]
 [ 787.28767123]
 [ 671.91954023]
 [ 912.21875   ]
 [ 811.73239437]
 [ 912.15873016]
 [ 833.71014493]
 [ 884.16923077]
 [ 817.74285714]
 [ 799.34722222]
 [1009.64912281]
 [1013.92982456]
 [ 699.31325301]
 [ 729.58227848]
 [ 635.76086957]
 [ 926.33333333]
 [ 763.59210526]
 [ 905.84375   ]
 [ 666.72413793]
 [ 816.16901408]
 [ 844.63235294]
 [ 820.70422535]
 [ 686.64285714]
 [ 659.21590909]
 [ 739.38461538]
 [ 632.17391304]
 [ 693.89285714]
 [ 725.9875    ]
 [1018.875     ]
 [ 855.67647059]
 [ 728.125     ]
 [ 612.12631579]
 [ 778.89189189]
 [ 753.5       ]
 [ 748.76623377]
 [ 772.54666667]
 [ 747.2987013 ]
 [ 739.73417722]
 [1029.55357143]
 [ 900.15625   ]
 [ 817.70422535]
 [ 742.34615385]
 [ 680.22093023]
 [ 704.5243902

In [51]:
#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(selected_params, health_state, test_size=0.3) 

#init and train model, using radial basis functions
svm_model = SVC(kernel='rbf', gamma='scale', probability=True)  #'scale' normalises data, prevents overfitting
#probability = True allows calculation of probabilities through 5-fold CV

svm_model.fit(X_train, y_train)

#predicting probabilities
probabilities = svm_model.predict_proba(X_test)
print(probabilities)


#predictions
y_pred = svm_model.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


print(y_pred)

#view a classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

[[0.212339   0.787661  ]
 [0.21339044 0.78660956]
 [0.233723   0.766277  ]
 [0.21577211 0.78422789]
 [0.19768837 0.80231163]
 [0.1974286  0.8025714 ]
 [0.19824469 0.80175531]
 [0.2247068  0.7752932 ]
 [0.19495832 0.80504168]
 [0.19506528 0.80493472]
 [0.20666914 0.79333086]
 [0.19734292 0.80265708]
 [0.23010159 0.76989841]
 [0.19508187 0.80491813]
 [0.21371596 0.78628404]
 [0.22712963 0.77287037]
 [0.21766809 0.78233191]
 [0.22302439 0.77697561]
 [0.19614697 0.80385303]
 [0.21029395 0.78970605]
 [0.19768468 0.80231532]
 [0.19706453 0.80293547]
 [0.21683004 0.78316996]
 [0.2023203  0.7976797 ]
 [0.19484458 0.80515542]
 [0.23001499 0.76998501]
 [0.19847034 0.80152966]
 [0.19784247 0.80215753]
 [0.22403739 0.77596261]
 [0.19575272 0.80424728]
 [0.21446551 0.78553449]
 [0.20487415 0.79512585]
 [0.21261513 0.78738487]
 [0.21525335 0.78474665]
 [0.20706215 0.79293785]
 [0.21627061 0.78372939]
 [0.21118088 0.78881912]
 [0.19505176 0.80494824]
 [0.2173406  0.7826594 ]
 [0.19773797 0.80226203]


### Hyper Parameter Tuning
- c is inversely proportional to l2 regularisation parameter
- gamma determines scaling of features


In [53]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Initialize SVM
svc = SVC()

# Perform grid search
grid_search = GridSearchCV(svc, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters
print(grid_search.best_params_)

# Train with best parameters
best_svc = grid_search.best_estimator_
best_svc.fit(X_train, y_train)

{'C': 0.01, 'gamma': 'scale', 'kernel': 'linear'}


In [54]:
#predictions
y_pred = best_svc.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8260869565217391


In [55]:
print(y_pred)

['Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy']
