In [None]:
#load necessary packages
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from mlxtend.evaluate import bootstrap_point632_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, plot_roc_curve

In [None]:
# Load the data table.
# Expected layout: one sample per row, one feature per column; the values
# were already normalized in MetaboAnalyst before export.
df = pd.read_csv("path_to_file")

# Split the table into predictors and target: the first column is skipped,
# the last column is taken as the class label, and every column in between
# goes into the feature matrix.
data = df.values
X = data[:, 1:-1]
class_names = data[:, -1].astype(str)

# Replace the class names by integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(class_names)

# Class-imbalance check.
# NOTE(review): the number printed is sum(y)/len(y). With two classes coded
# 0/1 that is the share of samples coded 1, not a ratio between the two
# classes -- confirm the label text matches what is intended.
print('\nratio of Healthy and Infected = ', y.sum()/len(y))

In [None]:
# Base estimator: support vector classifier with a linear kernel; all other
# settings are left at the library defaults.
SVM_Linear = SVC(kernel="linear")

In [None]:
# Hyperparameter optimization for the linear SVM.
#
# Two successive grid searches over the regularization strength C are run,
# each scored by accuracy under 5-fold stratified CV repeated 10 times:
#   1. a broad sweep over powers of ten (1e-4 ... 1e3)
#   2. a narrow sweep starting at 0.001 and moving towards 1.1 in steps of 0.001
# Every pass rebinds C_range, param_dict, cv and grid_search, so once the
# loop has finished they describe the narrow sweep only.
# NOTE(review): the splitter is created without a random_state, so the fold
# assignment (and possibly the selected C) can change from run to run.
broad_C_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
narrow_C_values = np.arange(0.001, 1.1, 0.001)

for C_range in (broad_C_values, narrow_C_values):
    param_dict = {"C": C_range}
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
    grid_search = GridSearchCV(
        SVM_Linear,
        param_grid=param_dict,
        cv=cv,
        scoring='accuracy',
        verbose=1,
    )
    grid_search.fit(X, y)
    print('best parameter', grid_search.best_params_, 'accuracy', grid_search.best_score_)

# Keep the winning setting of the narrow sweep. best_params_ is a dict keyed
# by parameter name, i.e. of the form {"C": value}.
best_parameter = grid_search.best_params_

In [None]:
# Redefine the model with the best hyperparameter.
# best_parameter holds grid_search.best_params_, which is a dict such as
# {"C": 0.05}. SVC needs the numeric value itself, so it is looked up by key;
# passing the whole dict as C makes the subsequent fit() call fail.
SVM_Linear_best = SVC(kernel = 'linear', C=best_parameter["C"])

# Fit the tuned model on the complete data set.
SVM_Linear_best.fit(X, y)

# Plot the ROC curve of SVM_Linear_best.
# NOTE(review): the curve is drawn from the same samples the model was just
# fitted on, so it shows training-set performance, not held-out performance.
# NOTE(review): plot_roc_curve is deprecated in newer scikit-learn releases in
# favour of RocCurveDisplay.from_estimator -- confirm the installed version
# before upgrading.
plot_roc_curve(SVM_Linear_best, X, y)

In [None]:
# .632+ bootstrap scoring metrics
#
# Every metric is estimated with mlxtend's bootstrap_point632_score on the
# tuned model. For each metric the mean over the bootstrap rounds and a
# percentile-based 95% interval (2.5th / 97.5th percentile) are printed.
# Accuracy is reported in percent; all other metrics are reported on their
# native scale, and the interval is printed on the same scale as the mean.


def report_bootstrap_metric(estimator, X, y, label, scoring_func=None,
                            as_percent=False, n_splits=200, random_seed=None):
    """Estimate one metric with the .632+ bootstrap and print a summary.

    Parameters
    ----------
    estimator : classifier
        Model handed to bootstrap_point632_score (clone_estimator=True).
    X, y : array-like
        Feature matrix and encoded class labels.
    label : str
        Text printed in front of the mean score.
    scoring_func : callable or None
        Function called as scoring_func(y_true, y_pred). None leaves the
        choice to mlxtend (documented as accuracy for classifiers).
    as_percent : bool
        When True, the mean and both interval bounds are multiplied by 100
        and the mean is printed with a trailing '%'.
    n_splits : int
        Number of bootstrap rounds.
    random_seed : int or None
        Seed forwarded to mlxtend; None keeps the runs unseeded.

    Returns
    -------
    The per-round scores returned by bootstrap_point632_score.
    """
    scores = bootstrap_point632_score(estimator, X, y, n_splits=n_splits, method='.632+',
                                      scoring_func=scoring_func, predict_proba=False,
                                      random_seed=random_seed, clone_estimator=True)

    # One common scale for the mean and the interval bounds.
    scale = 100 if as_percent else 1
    mean_score = scale * np.mean(scores)
    lower = scale * np.percentile(scores, 2.5)
    upper = scale * np.percentile(scores, 97.5)

    if as_percent:
        print('%s: %.2f%%' % (label, mean_score))
    else:
        print('%s: %.2f' % (label, mean_score))
    print('95%% Confidence interval: [%.2f, %.2f]' % (lower, upper))
    return scores


# (label, scoring function, report in percent?)
# NOTE(review): ROC AUC is computed from hard class predictions here because
# predict_proba=False; mlxtend's documentation suggests predict_proba=True for
# roc_auc_score, which would need a model exposing predict_proba -- confirm
# which variant is intended.
bootstrap_metrics = [
    ('Accuracy', None, True),
    ('F1', f1_score, False),
    ('recall_score', recall_score, False),
    ('precision_score', precision_score, False),
    ('matthews correlation coffecient', matthews_corrcoef, False),
    ('ROC AUC', roc_auc_score, False),
]

for position, (label, scoring_func, as_percent) in enumerate(bootstrap_metrics):
    if position:
        # blank lines between consecutive metric reports
        print('\n')
    scores = report_bootstrap_metric(SVM_Linear_best, X, y, label,
                                     scoring_func=scoring_func, as_percent=as_percent)