In [None]:
#load necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from mlxtend.evaluate import bootstrap_point632_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, plot_roc_curve

In [None]:
#Read file
#samples should be in rows and features in columns
#data already normalized from metaboanalyst
#layout used below: the first column is skipped (presumably a sample identifier - confirm),
#the last column holds the class label, and every column in between is a feature
df = pd.read_csv("path_to_file")

#prepare and encode data
data = df.values
#df.values is an object array when the frame mixes text and numeric columns; casting the
#feature slice to float makes a stray non-numeric feature fail here instead of inside a model fit
X = data[:, 1:-1].astype(float)
y = data[:, -1].astype(str)

#replace the class names by integer codes 0..n_classes-1 (classes_ keeps the names, in code order)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

#check for class imbalance: count and share of every class, reported under its own name
#(the previous printout, sum(y)/len(y), was only the share of the class encoded as 1)
class_counts = np.bincount(y)
print('\nclass balance:')
for class_name, class_count in zip(label_encoder.classes_, class_counts):
    print('%s: %d samples (%.1f%% of data)' % (class_name, class_count, 100 * class_count / len(y)))

In [None]:
#generate Principal Components
#PCA with default settings, fitted on the feature matrix and then used to project it
pca = PCA()
pca.fit(X)
pc_x = pca.transform(X)

#Scree Plot: share of variance explained by each component, components numbered from 1
PC_values = np.arange(1, pca.n_components_ + 1)
fig, ax = plt.subplots()
ax.plot(PC_values, pca.explained_variance_ratio_, 'ro-', linewidth=2)
ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Proportion of Variance Explained')
plt.show()

In [None]:
#optimize hyperparameters
#
#Two grid searches over a PCA -> logistic regression pipeline, scored by accuracy under
#repeated stratified k-fold CV: first a broad, log-spaced sweep of C, then a fine sweep of
#C between 0.01 and 0.99. The result of the LAST (narrow) search is the one kept below.
#NOTE: the narrow grid is fixed; if the broad search picks a C outside 0.01-0.99, the narrow
#grid should be re-centred by hand.

CV_SEED = 42             #fixed so the CV splits, and hence the selected hyperparameters, are reproducible
N_SPLITS = 5
N_REPEATS = 10
MAX_PCA_COMPONENTS = 63  #upper end of the component search

#PCA cannot extract more components than min(n_samples, n_features) of the data it is fitted on.
#With N_SPLITS folds the smallest training split holds at most n_samples*(N_SPLITS-1)//N_SPLITS rows,
#so any larger n_components would fail in at least one split and can never be selected.
n_samples, n_features = X.shape
smallest_train_split = n_samples * (N_SPLITS - 1) // N_SPLITS
pca__n_components = range(1, min(MAX_PCA_COMPONENTS, n_features, smallest_train_split) + 1)

C_GRIDS = {
    'broad': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'narrow': np.arange(0.01, 1, 0.01),
}

for stage, LogisticRegression__C in C_GRIDS.items():
    param_dict = {"LogisticRegression__C":LogisticRegression__C, 'pca__n_components':pca__n_components}
    #same seed for both stages, so broad and narrow searches are scored on identical splits
    cv = RepeatedStratifiedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=CV_SEED)
    #the grid search sets n_components and C on every candidate, so the template uses plain defaults
    pipe = Pipeline([('pca', PCA()),
                     ('LogisticRegression', LogisticRegression(solver='liblinear'))])
    grid_search = GridSearchCV(pipe, param_grid=param_dict, cv=cv, scoring='accuracy', refit=True, verbose=1)
    grid_search.fit(X, y)
    print('best parameter', grid_search.best_params_, 'accuracy', grid_search.best_score_)

#capture best hyperparameter (from the narrow search, the last one run)
best_LogisticRegression__C = grid_search.best_params_['LogisticRegression__C']
best_pca__n_components = grid_search.best_params_['pca__n_components']

In [None]:
#redefine model with best hyperparameter
best_pca_step = PCA(n_components=best_pca__n_components)
best_lr_step = LogisticRegression(C=best_LogisticRegression__C, solver='liblinear')
PCLR_pipe_best = Pipeline([('pca', best_pca_step),
                           ('LogisticRegression', best_lr_step)])

#fit the model on the full data set
PCLR_pipe_best.fit(X, y)

#plot ROC curve from PCLR_pipe_best, drawn on the same samples the model was just fitted on
#NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 in favour of
#RocCurveDisplay.from_estimator - confirm the installed version still provides it
plot_roc_curve(PCLR_pipe_best, X, y)

In [None]:
# 632+bootstrap scoring metrics
#
# For every metric, the pipeline is re-fitted on 200 bootstrap resamples and scored with the
# .632+ estimator; the mean and a percentile-based 95% interval of those scores are printed.

BOOTSTRAP_SEED = 42  # fixed so every metric sees the same resamples and the numbers are reproducible

metrics = {'Accuracy': accuracy_score, 'Precision': precision_score, 'Recall': recall_score,
           'F1': f1_score, 'matthews correlation coffecient': matthews_corrcoef, 'ROC AUC': roc_auc_score}

# Metrics that rank samples need predicted probabilities, not hard class labels:
# ROC AUC computed on 0/1 predictions collapses to a single threshold and is not a real AUC.
PROBA_METRICS = {'ROC AUC'}

for name, metric in metrics.items():
    scores = bootstrap_point632_score(PCLR_pipe_best, X, y, n_splits=200, method='.632+', scoring_func=metric,
                                      predict_proba=name in PROBA_METRICS, random_seed=BOOTSTRAP_SEED,
                                      clone_estimator=True)
    mean_score = np.mean(scores)
    print(name, '%.2f' % mean_score)
    # Confidence interval: 2.5th and 97.5th percentile of the bootstrap scores,
    # printed on the same scale as the mean above (no x100 scaling)
    lower, upper = np.percentile(scores, [2.5, 97.5])
    print('95%% Confidence interval: [%.2f, %.2f]' % (lower, upper))
    print('\n')