This notebook exists to experiment with different methods for classification

In [1]:
%run "Parameter_Estimation.ipynb" #allowing access to parameters

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:15<00:00, 35.22it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 229/229 [00:01<00:00, 134.07it/s]


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
health_state = allowed_patients.get_diagnoses()

encoded_health_state = [True if label == 'Unhealthy' else False for label in health_state]

print(len(health_state))
# unhealthy_peak_to_peak_averages = peak_to_peak_averages[encoded_health_state]
# healthy_peak_to_peak_averages = peak_to_peak_averages[~np.array(encoded_health_state)]

# unhealthy_peak_amp = peak_amp_average[encoded_health_state]
# healthy_peak_amp = peak_amp_average[~np.array(encoded_health_state)]


229


## Parameter Selection

factor analysis

### investigating correlation 
- currently using peason r (pmcc)

In [4]:
from scipy.stats import pearsonr


corr, p_value = pearsonr(params['peak_amp_av'], params['peak_to_peak_av'])


In [5]:
print(corr, p_value)

0.11633836490440197 0.07894247890368668


## PCA


- loses the knowledge of features, less intuitive
- will experiment with it anyway

In [6]:
# Initialize the array
X = np.zeros((2, no_patients))

# Populate the array with values from the dictionary
for i in range(2):
    X[i] = params['peak_to_peak_av']
    X[i] = params['peak_amp_av']

    
#standardize data
X = X.reshape(no_patients, 2)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#set desired number of principle components
num_components = 2

#using sklearn PCA
pca = PCA(n_components=num_components)
X_pca = pca.fit_transform(X_scaled)

In [7]:
#using principle components to do ML
#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, health_state, test_size=0.3)#reshape needs to be fixed 

#init and train model, using radial basis functions
svm_classifier = SVC(kernel='rbf', gamma='scale')  #'scale' normalises data, prevents overfitting
svm_classifier.fit(X_train, y_train)

#predictions
y_pred = svm_classifier.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8840579710144928


## Wrapper Methods:

These methods do feature selection whilst using the model

 - Forward Selection: Features are sequentially added to the model, starting with an empty set and adding the feature that improves model performance the most at each step.
 - Backward Elimination: Features are sequentially removed from the model, starting with the full set of features and removing the feature that decreases model performance the least at each step.
 - Recursive Feature Elimination (RFE): Features are recursively pruned based on the importance assigned to them by the model. Less important features are eliminated iteratively until the desired number of features is reached.

## Embedded Methods:

Also done whilst using the model

- Regularization: Techniques like LASSO (L1 regularization) and Ridge (L2 regularization) penalize the magnitude of feature coefficients, forcing less important features to have coefficients close to zero.

In [8]:
#LASSO/Ridge Regression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

#changing health_state to binary for use in regression
binary_health_state = [1 if label == 'Unhealthy' else 0 for label in health_state]

#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, binary_health_state, test_size=0.3)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

#create and fit regression models
lasso_alpha = 0.1  # Regularization strength (hyperparameter)
lasso = Lasso(alpha=lasso_alpha)
lasso.fit(X_train_scaled, y_train)

ridge_alpha = 0.1 
ridge = Ridge(alpha=ridge_alpha)
ridge.fit(X_train_scaled, y_train)

#use the trained models for prediction
X_test_scaled = scaler.fit_transform(X_test)

y_pred_ridge = ridge.predict(X_test_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)



## Comparing Accuracy
- MSE - average between true and predicted values
- R2 score - measure of proportion of variance in dependent variable (target) that is explained by independent variables (features), indicates how well the model captures the variations in the data

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

#calculate MSE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

#calculate R2
r2_lasso = r2_score(y_test, y_pred_lasso)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("LASSO Regression:")
print("MSE:", mse_lasso)
print("R2 Score:", r2_lasso)
print()
print("Ridge Regression:")
print("MSE:", mse_ridge)
print("R2 Score:", r2_ridge)

LASSO Regression:
MSE: 0.17069123641304348
R2 Score: -0.003285156250000254

Ridge Regression:
MSE: 0.1710401380589816
R2 Score: -0.005335922591125364


## SVM

In [10]:
#will have feature selection to decide this number
no_features = 2

# Initialize the array
selected_params = np.zeros((no_features, no_patients))

# Populate the array with values from the dictionary
for i in range(no_features):
    selected_params[i] = params['peak_to_peak_av']
    selected_params[i] = params['peak_amp_av']

selected_params = selected_params.reshape(no_patients, no_features) #need no. samples as rows, no. features as columns for machine learning analysis

In [11]:
#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(selected_params.reshape(229, 2), health_state, test_size=0.3)#reshape needs to be fixed 

#init and train model, using radial basis functions
svm_model = SVC(kernel='rbf', gamma='scale', probability=True)  #'scale' normalises data, prevents overfitting
svm_model.fit(X_train, y_train)

#predicting probabilities
probabilities = svm_model.predict_proba(X_test)
print(probabilities)


#predictions
y_pred = svm_model.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(y_train)
print(y_pred)

#view a classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

[[0.19697559 0.80302441]
 [0.27634842 0.72365158]
 [0.19787724 0.80212276]
 [0.19745675 0.80254325]
 [0.19790871 0.80209129]
 [0.19776824 0.80223176]
 [0.19774501 0.80225499]
 [0.33456858 0.66543142]
 [0.19778161 0.80221839]
 [0.19784968 0.80215032]
 [0.19823762 0.80176238]
 [0.19775053 0.80224947]
 [0.19750696 0.80249304]
 [0.19646375 0.80353625]
 [0.1979192  0.8020808 ]
 [0.19661062 0.80338938]
 [0.19781604 0.80218396]
 [0.19753836 0.80246164]
 [0.21504864 0.78495136]
 [0.19813152 0.80186848]
 [0.39820264 0.60179736]
 [0.19806824 0.80193176]
 [0.19707195 0.80292805]
 [0.19787428 0.80212572]
 [0.19770601 0.80229399]
 [0.19812785 0.80187215]
 [0.19142569 0.80857431]
 [0.3037562  0.6962438 ]
 [0.19763257 0.80236743]
 [0.19628933 0.80371067]
 [0.19695922 0.80304078]
 [0.19827544 0.80172456]
 [0.19798317 0.80201683]
 [0.19014539 0.80985461]
 [0.19551898 0.80448102]
 [0.194195   0.805805  ]
 [0.19756381 0.80243619]
 [0.19399014 0.80600986]
 [0.19692973 0.80307027]
 [0.19778988 0.80221012]


In [16]:
print(y_train[1])
print(y_pred[1])

Unhealthy
Unhealthy
