In [None]:
#load necessary packages
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from mlxtend.evaluate import bootstrap_point632_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, plot_roc_curve

In [None]:
#Read file
#samples should be in rows and features in columns
#data already normalized from metaboanalyst
df = pd.read_csv("path_to_file")

#prepare and encode data
data = df.values
# Column 0 is skipped and the last column is taken as the class label
# (assumes column 0 holds sample identifiers -- TODO confirm against the input file).
# Cast explicitly: a string label column makes `df.values` an object array, and the
# cast fails right here if any feature column is not numeric.
X = data[:, 1:-1].astype(float)
y = data[:, -1].astype(str)

# Encode the string labels as integers 0..n_classes-1. LabelEncoder sorts the
# classes, so label_encoder.classes_[i] is the original label behind code i.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

#check for class imbalance
# Report every class under its original label. The mean of the encoded labels is
# only the share of the class encoded as 1, not a ratio between the two classes.
class_counts = np.bincount(y)
print('\nclass balance:')
for class_label, class_count in zip(label_encoder.classes_, class_counts):
    print('%s: %d samples (%.2f of total)' % (class_label, class_count, class_count / len(y)))

In [None]:
# Base classifier: support vector machine with an RBF kernel.
# Every other SVC setting is left at its library default here.
SVM_RBF = SVC(kernel="rbf")

In [None]:
#optimize hyperparameters

# Fixed seed so the repeated CV splits -- and therefore the selected
# hyperparameters -- are identical on every Restart & Run All.
CV_RANDOM_STATE = 42


def run_grid_search(estimator, features, labels, C_values, gamma_values,
                    random_state=CV_RANDOM_STATE):
    """Grid-search C and gamma for `estimator` with repeated stratified 5-fold CV.

    Parameters
    ----------
    estimator : scikit-learn estimator accepting `C` and `gamma` parameters.
    features, labels : training data passed straight to `GridSearchCV.fit`.
    C_values, gamma_values : candidate values for the two hyperparameters.
    random_state : seed for the 10x repeated 5-fold splitter.

    Returns
    -------
    The fitted GridSearchCV object (scored by accuracy). The best parameters
    and their mean CV accuracy are also printed.
    """
    param_dict = {"C": C_values, 'gamma': gamma_values}
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=random_state)
    # n_jobs=-1 spreads the fits over all cores; it does not change the results.
    search = GridSearchCV(estimator, param_grid=param_dict, cv=cv, scoring='accuracy',
                          verbose=1, n_jobs=-1)
    search.fit(features, labels)
    print('best parameter', search.best_params_, 'accuracy', search.best_score_)
    return search


#optimizing in a broader range (log-spaced grid)
C_range = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
gamma_range = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]
grid_search = run_grid_search(SVM_RBF, X, y, C_range, gamma_range)

#optimizing in a narrower range (linear grid)
# NOTE(review): these bounds are hard-coded, not derived from the broad search
# above -- confirm they still bracket its best C / gamma for the current data.
C_range = np.arange(1, 100, 1)
# linspace instead of arange: with a float step the number of points returned
# by arange depends on rounding. This gives exactly 99 values, 0.0001 .. 0.0099.
gamma_range = np.linspace(0.0001, 0.0099, 99)
grid_search = run_grid_search(SVM_RBF, X, y, C_range, gamma_range)

#capture best hyperparameter (from the narrower search)
best_C = grid_search.best_params_['C']
best_gamma = grid_search.best_params_['gamma']

In [None]:
#redefine model with best hyperparameter
SVM_RBF_best = SVC(kernel='rbf', C=best_C, gamma=best_gamma)

#fit the model
SVM_RBF_best.fit(X, y)

#plot ROC curve
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `RocCurveDisplay.from_estimator` is its replacement (available since 1.0).
# Imported locally so this cell does not depend on the removed function.
from sklearn.metrics import RocCurveDisplay

# NOTE: the curve is drawn on the same X, y the model was just fitted on, so it
# shows the training fit and is an optimistic picture of generalization.
RocCurveDisplay.from_estimator(SVM_RBF_best, X, y)

In [None]:
# 632+bootstrap scoring metrics
#
# For each metric, run 200 bootstrap rounds of the .632+ estimator on
# SVM_RBF_best and report the mean score with a 95% percentile interval.
# NOTE(review): if the hyperparameters of SVM_RBF_best were tuned on this same
# X, y, these estimates are optimistic -- confirm against the tuning step.

# Fixed seed so the bootstrap resamples, and hence the reported numbers, are
# reproducible. The same seed is passed for every metric.
BOOTSTRAP_SEED = 42

metrics = {'Accuracy': accuracy_score, 'Precision': precision_score, 'Recall': recall_score,
           'F1': f1_score, 'matthews correlation coffecient': matthews_corrcoef, 'ROC AUC': roc_auc_score}

for name, metric in metrics.items():
    # NOTE(review): with predict_proba=False the scoring function presumably
    # receives hard class predictions, so 'ROC AUC' would be computed from
    # labels rather than continuous scores -- confirm this is intended.
    scores = bootstrap_point632_score(SVM_RBF_best, X, y, n_splits=200, method='.632+', scoring_func=metric,
                                      predict_proba=False, random_seed=BOOTSTRAP_SEED, clone_estimator=True)
    mean_score = np.mean(scores)
    print(name, '%.2f' % mean_score)
    # Confidence interval: 2.5th and 97.5th percentiles of the per-round scores,
    # printed on the same scale as the mean above (no x100 scaling).
    lower, upper = np.percentile(scores, [2.5, 97.5])
    print('95%% Confidence interval: [%.2f, %.2f]' % (lower, upper))
    print('\n')