This notebook exists to experiment with different methods for classification

In [1]:
%run "Feature_Selection.ipynb" 

100%|████████████████████████████████████████████████████████████████████████████████| 549/549 [00:08<00:00, 64.48it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 202/202 [00:00<00:00, 237.05it/s]


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [19]:
health_state = np.array(allowed_patients.get_diagnoses())

encoded_health_state = [True if label == 'Unhealthy' else False for label in health_state]

print(len(health_state))

202


## SVM

In [20]:
def convert_dict_to_array(selected_params):
    no_features = len(selected_params)
    selected_params_array = np.zeros((no_patients, no_features))
    
    for i, values in enumerate(selected_params.values()):
        selected_params_array[:, i] = values
    return selected_params_array, no_features
        
selected_params_array, no_features = convert_dict_to_array(selected_params)

### Basic SVM

In [31]:
#splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(selected_params_array, health_state, test_size=0.3, stratify=health_state)

#init and train model, using radial basis functions
svm_model = SVC(kernel='rbf', gamma='scale', probability=True)  #'scale' normalises data, prevents overfitting
#probability = True allows calculation of probabilities through 5-fold CV

svm_model.fit(X_train, y_train)

#predicting probabilities
probabilities = svm_model.predict_proba(X_test)
print(probabilities)


#predictions
y_pred = svm_model.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(y_test, y_pred)

#view a classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

[[0.22710332 0.77289668]
 [0.22709736 0.77290264]
 [0.22709912 0.77290088]
 [0.22710748 0.77289252]
 [0.22709708 0.77290292]
 [0.22709761 0.77290239]
 [0.22709682 0.77290318]
 [0.22709943 0.77290057]
 [0.2270982  0.7729018 ]
 [0.22709843 0.77290157]
 [0.22709973 0.77290027]
 [0.22709911 0.77290089]
 [0.22709787 0.77290213]
 [0.22709743 0.77290257]
 [0.22709837 0.77290163]
 [0.22709937 0.77290063]
 [0.22710221 0.77289779]
 [0.22709737 0.77290263]
 [0.22709938 0.77290062]
 [0.22709731 0.77290269]
 [0.22711013 0.77288987]
 [0.22709739 0.77290261]
 [0.22709824 0.77290176]
 [0.22709899 0.77290101]
 [0.22709906 0.77290094]
 [0.22709684 0.77290316]
 [0.22709841 0.77290159]
 [0.22709941 0.77290059]
 [0.22709837 0.77290163]
 [0.2270991  0.7729009 ]
 [0.22709881 0.77290119]
 [0.22709774 0.77290226]
 [0.2270994  0.7729006 ]
 [0.22711023 0.77288977]
 [0.2271064  0.7728936 ]
 [0.22709795 0.77290205]
 [0.22709909 0.77290091]
 [0.2270973  0.7729027 ]
 [0.22711079 0.77288921]
 [0.2270973  0.7729027 ]


### Hyper Parameter Tuning
- c is inversely proportional to l2 regularisation parameter
- gamma determines scaling of features


In [32]:
from sklearn.model_selection import GridSearchCV

# define hyperparameter grid to test
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

#initialise
svc = SVC()

# perform grid search
grid_search = GridSearchCV(svc, param_grid, cv=5)
grid_search.fit(X_train, y_train)


print(grid_search.best_params_)

# train with best parameters
best_svc = grid_search.best_estimator_
best_svc.fit(X_train, y_train)

{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}


In [33]:
#predictions
y_pred = best_svc.predict(X_test)

#evaluating accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7868852459016393


### Stratified k-fold validation

need to weight the train and test sets to cope with difference with numbers of healthy and unhealthy, stratified k-fold validation

In [53]:
from sklearn.model_selection import StratifiedKFold

n_splits = 5 #5 fold validation
skf = StratifiedKFold(n_splits=n_splits) #can do repeated skf for better validation

accuracies=[]
y_pred_list = []
y_test_list=[]
for train_index, test_index in skf.split(selected_params_array, health_state):
    #getting test and train data sets
    X_train, X_test = selected_params_array[train_index], selected_params_array[test_index]
    y_train, y_test = health_state[train_index], health_state[test_index]
    
    y_test_list.append(y_test)
    #training model with tuned hyperparameters
    tuned_svc = grid_search.best_estimator_
    tuned_svc.fit(X_train, y_train)
    
    #evaluating model
    y_pred = tuned_svc.predict(X_test)
    y_pred_list.append(y_pred)
    accuracies.append(accuracy_score(y_test, y_pred))

In [54]:
print(accuracies)

[0.8292682926829268, 0.7317073170731707, 0.775, 0.775, 0.75]


In [57]:
print(np.mean(accuracies))

0.7721951219512195


In [56]:
print(y_pred_list[1], y_test_list[1])

['Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Healthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'] ['Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Healthy' 'Healthy' 'Healthy' 'Healthy' 'Healthy' 'Healthy'
 'Healthy' 'Healthy' 'Healthy' 'Healthy']


In [36]:
print(y_pred, y_test)

['Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Healthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Healthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Healthy' 'Unhealthy' 'Unhealthy'
 'Healthy' 'Unhealthy' 'Healthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'] ['Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Healthy' 'Healthy' 'Unhealthy' 'Healthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Healthy' 'Healthy' 'Unhealthy' 'Healthy' 'Healthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy'
 'Healthy' 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Healthy' 'Unhealthy'
 'Unhealthy' 'Unhealthy' 'Unhealthy' 'Unhealthy']


In [67]:
print("Classification Report:")
print(classification_report(y_test_list[0], y_pred_list[0]))

Classification Report:
              precision    recall  f1-score   support

     Healthy       1.00      0.22      0.36         9
   Unhealthy       0.82      1.00      0.90        32

    accuracy                           0.83        41
   macro avg       0.91      0.61      0.63        41
weighted avg       0.86      0.83      0.78        41



## Comparing Accuracy
- MSE - average between true and predicted values
- R2 score - measure of proportion of variance in dependent variable (target) that is explained by independent variables (features), indicates how well the model captures the variations in the data

In [65]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer, f1_score, roc_auc_score



# Assuming selected_params_array and health_state are defined
X = selected_params_array
y = health_state

# Get the tuned model (best estimator from grid search)
tuned_svc = grid_search.best_estimator_

# Define accuracy as the scoring metric
accuracy_scorer = make_scorer(accuracy_score)
#dont work??
#f1_scorer = make_scorer(f1_score)
#roc_auc_scorer = make_scorer(roc_auc_score)

# Perform cross-validation
n_splits = 5  # 5-fold validation
accuracies = cross_val_score(tuned_svc, X, y, cv=n_splits, scoring=roc_auc_scorer)

# Calculate the mean accuracy
mean_accuracy = accuracies.mean()
print(f'Mean Accuracy: {mean_accuracy}')


Mean Accuracy: nan


Traceback (most recent call last):
  File "C:\Users\court\Documents\anaconda\lib\site-packages\sklearn\metrics\_scorer.py", line 136, in __call__
    score = scorer._score(
  File "C:\Users\court\Documents\anaconda\lib\site-packages\sklearn\metrics\_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "C:\Users\court\Documents\anaconda\lib\site-packages\sklearn\utils\_param_validation.py", line 214, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\court\Documents\anaconda\lib\site-packages\sklearn\metrics\_ranking.py", line 606, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "C:\Users\court\Documents\anaconda\lib\site-packages\sklearn\utils\validation.py", line 946, in check_array
    raise ValueError(
ValueError: dtype='numeric' is not compatible with arrays of bytes/strings.Convert your data to numeric values explicitly instead.

Traceback (most recent call last):
  File "C:\Users\co

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

#calculate MSE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

#calculate R2
r2_lasso = r2_score(y_test, y_pred_lasso)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("LASSO Regression:")
print("MSE:", mse_lasso)
print("R2 Score:", r2_lasso)
print()
print("Ridge Regression:")
print("MSE:", mse_ridge)
print("R2 Score:", r2_ridge)

NameError: name 'y_pred_lasso' is not defined