This notebook exists to experiment with different methods for classification

In [1]:
%run "Parameter_Estimation.ipynb" #allowing access to parameters

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:15<00:00, 35.00it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 229/229 [00:02<00:00, 81.60it/s]


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
health_state = allowed_patients.get_diagnoses()

encoded_health_state = [True if label == 'Unhealthy' else False for label in health_state]

print(len(health_state))
# unhealthy_peak_to_peak_averages = peak_to_peak_averages[encoded_health_state]
# healthy_peak_to_peak_averages = peak_to_peak_averages[~np.array(encoded_health_state)]

# unhealthy_peak_amp = peak_amp_average[encoded_health_state]
# healthy_peak_amp = peak_amp_average[~np.array(encoded_health_state)]


229


## Parameter Selection

factor analysis

### investigating correlation 
- currently using peason r (pmcc)

In [5]:
from scipy.stats import pearsonr


corr, p_value = pearsonr(params['rr_mean'], params['rr_std'])


In [6]:
print(corr, p_value)

0.13617674637503555 0.039489188328994965


## PCA


- loses the knowledge of features, less intuitive
- will experiment with it anyway

In [19]:


# Initialize the array
X = np.zeros((no_patients, 4))#need no. samples as rows, no. features as columns for machine learning analysis

# Populate the array with values from the dictionary
X[:, 0] = params['rr_mean']
X[:, 1] = params['rr_std']
X[:, 2] = params['RMSSD']
X[:, 3] = params['pNN50']
    
#standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#set desired number of principle components
num_components = 2

#using sklearn PCA
pca = PCA(n_components=num_components)
X_pca = pca.fit_transform(X_scaled)

In [21]:
#using principle components to do ML
#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, health_state, test_size=0.3)#reshape needs to be fixed 

#init and train model, using radial basis functions
svm_classifier = SVC(kernel='rbf', gamma='scale')  #'scale' normalises data, prevents overfitting
svm_classifier.fit(X_train, y_train)

#predictions
y_pred = svm_classifier.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(y_test, y_pred)
print("Accuracy:", accuracy)

['Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy'] ['Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy

## Wrapper Methods:

These methods do feature selection whilst using the model

 - Forward Selection: Features are sequentially added to the model, starting with an empty set and adding the feature that improves model performance the most at each step.
 - Backward Elimination: Features are sequentially removed from the model, starting with the full set of features and removing the feature that decreases model performance the least at each step.
 - Recursive Feature Elimination (RFE): Features are recursively pruned based on the importance assigned to them by the model. Less important features are eliminated iteratively until the desired number of features is reached.

## Embedded Methods:

Also done whilst using the model

- Regularization: Techniques like LASSO (L1 regularization) and Ridge (L2 regularization) penalize the magnitude of feature coefficients, forcing less important features to have coefficients close to zero.

In [None]:
#LASSO/Ridge Regression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

#changing health_state to binary for use in regression
binary_health_state = [1 if label == 'Unhealthy' else 0 for label in health_state]

#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, binary_health_state, test_size=0.3)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

#create and fit regression models
lasso_alpha = 0.1  # Regularization strength (hyperparameter)
lasso = Lasso(alpha=lasso_alpha)
lasso.fit(X_train_scaled, y_train)

ridge_alpha = 0.1 
ridge = Ridge(alpha=ridge_alpha)
ridge.fit(X_train_scaled, y_train)

#use the trained models for prediction
X_test_scaled = scaler.fit_transform(X_test)

y_pred_ridge = ridge.predict(X_test_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)



## Comparing Accuracy
- MSE - average between true and predicted values
- R2 score - measure of proportion of variance in dependent variable (target) that is explained by independent variables (features), indicates how well the model captures the variations in the data

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

#calculate MSE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

#calculate R2
r2_lasso = r2_score(y_test, y_pred_lasso)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("LASSO Regression:")
print("MSE:", mse_lasso)
print("R2 Score:", r2_lasso)
print()
print("Ridge Regression:")
print("MSE:", mse_ridge)
print("R2 Score:", r2_ridge)

## SVM

In [23]:
#will have feature selection to decide this number
no_features = 1

# Initialize the array
selected_params = np.zeros((no_patients, no_features))#need no. samples as rows, no. features as columns for machine learning analysis

# Populate the array with values from the dictionary
selected_params[:, 0] = params['rr_mean']
#selected_params[:, 1] = params['rr_std']
#selected_params[:, 2] = params['RMSSD']
#selected_params[:, 3] = params['pNN50']#fix this

print(selected_params)

[[ 696.15662651]
 [ 760.53246753]
 [ 831.97101449]
 [ 727.95      ]
 [ 685.23809524]
 [ 666.72413793]
 [ 808.54166667]
 [1012.52631579]
 [ 672.81395349]
 [ 670.12643678]
 [1150.8       ]
 [ 703.37804878]
 [ 703.        ]
 [ 917.68253968]
 [1010.63157895]
 [ 712.07407407]
 [ 787.28767123]
 [ 671.91954023]
 [ 912.21875   ]
 [ 811.73239437]
 [ 912.15873016]
 [ 833.71014493]
 [ 884.16923077]
 [ 817.74285714]
 [ 799.34722222]
 [1009.64912281]
 [1013.92982456]
 [ 699.31325301]
 [ 729.58227848]
 [ 635.76086957]
 [ 926.33333333]
 [ 763.59210526]
 [ 905.84375   ]
 [ 666.72413793]
 [ 816.16901408]
 [ 844.63235294]
 [ 820.70422535]
 [ 686.64285714]
 [ 659.21590909]
 [ 739.38461538]
 [ 632.17391304]
 [ 693.89285714]
 [ 725.9875    ]
 [1018.875     ]
 [ 855.67647059]
 [ 728.125     ]
 [ 612.12631579]
 [ 778.89189189]
 [ 753.5       ]
 [ 748.76623377]
 [ 772.54666667]
 [ 747.2987013 ]
 [ 739.73417722]
 [1029.55357143]
 [ 900.15625   ]
 [ 817.70422535]
 [ 742.34615385]
 [ 680.22093023]
 [ 704.5243902

In [26]:
#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(selected_params.reshape(229, no_features), health_state, test_size=0.3)#reshape needs to be fixed 

#init and train model, using radial basis functions
svm_model = SVC(kernel='rbf', gamma='scale', probability=True)  #'scale' normalises data, prevents overfitting
svm_model.fit(X_train, y_train)

#predicting probabilities
probabilities = svm_model.predict_proba(X_test)
print(probabilities)


#predictions
y_pred = svm_model.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(y_train)
print(y_pred)

#view a classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

[[0.19758798 0.80241202]
 [0.19884666 0.80115334]
 [0.19204283 0.80795717]
 [0.19934955 0.80065045]
 [0.19977362 0.80022638]
 [0.20014535 0.79985465]
 [0.19932222 0.80067778]
 [0.19738977 0.80261023]
 [0.20098918 0.79901082]
 [0.20121703 0.79878297]
 [0.19483109 0.80516891]
 [0.20188335 0.79811665]
 [0.19566098 0.80433902]
 [0.20099974 0.79900026]
 [0.19576888 0.80423112]
 [0.20095919 0.79904081]
 [0.19701857 0.80298143]
 [0.20097035 0.79902965]
 [0.19736104 0.80263896]
 [0.19831199 0.80168801]
 [0.2001466  0.7998534 ]
 [0.20163578 0.79836422]
 [0.20111364 0.79888636]
 [0.19736122 0.80263878]
 [0.20204918 0.79795082]
 [0.19827909 0.80172091]
 [0.1993153  0.8006847 ]
 [0.20100158 0.79899842]
 [0.19897968 0.80102032]
 [0.1986154  0.8013846 ]
 [0.19874393 0.80125607]
 [0.20099883 0.79900117]
 [0.19855938 0.80144062]
 [0.20200128 0.79799872]
 [0.1974113  0.8025887 ]
 [0.20108163 0.79891837]
 [0.20259157 0.79740843]
 [0.20109608 0.79890392]
 [0.19903041 0.80096959]
 [0.20090345 0.79909655]


In [22]:
print(y_train[1])
print(y_pred[1])

Unhealthy
Unhealthy
