# ALD Exploration

Max's analysis of different classifiers.

> Henry: I adapted the data loading to run it myself. Autoformatting was applied.

## Loading data

In [None]:
import pandas as pd
import os
import numpy as np
import sklearn
import sklearn.linear_model as skllm
import sklearn.ensemble as skle
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
FOLDER_DATA_RAW = 'data/raw'

# --- Sample annotation: keep the plasma rows and give them integer labels.
annotation_file = pd.read_csv(
    os.path.join(FOLDER_DATA_RAW, 'Experiment annotation file.csv'),
    index_col=[0])
is_plasma = annotation_file['Sample type'] == 'Plasma'
annotation_file_plasma = annotation_file.loc[is_plasma]
annotation_file_plasma.index = pd.Index(
    annotation_file_plasma.index, dtype=int)

# --- Protein report: the string 'Filtered' is read as a missing value.
report_plasma = pd.read_csv(
    os.path.join(
        FOLDER_DATA_RAW,
        '20190620_210717_20190620_P0000005_Lili2Klibrary_Report.csv'),
    na_values='Filtered')
report_plasma = report_plasma.rename(
    columns={'PG.Genes': 'Gene names',
             'PG.ProteinAccessions': 'Protein ID'})

columns_ = ['Protein ID', 'Gene names']
# Full ';'-separated identifier lists, kept for reference ...
ids_ = report_plasma[columns_].apply(lambda series_: series_.str.split(';'))
experimental_columns = annotation_file_plasma['Sample ID']
# ... while the report itself only keeps the first identifier of each list.
for column_ in columns_:
    report_plasma[column_] = report_plasma[column_].str.split(';').str[0]

# Report columns are named after raw files; map them to sample IDs.
map_filenames_ids = dict(
    zip(annotation_file['File name'], annotation_file['Sample ID']))

stripped_sequence_columns = report_plasma.filter(
    regex='StrippedSequences').columns
data_plasma_raw = (report_plasma
                   .drop(columns=stripped_sequence_columns)
                   .rename(columns=map_filenames_ids))

IDmapping_UniprotID_to_Genename = dict(
    zip(data_plasma_raw['Protein ID'], data_plasma_raw['Gene names']))
data_plasma_raw = data_plasma_raw.set_index(
    'Protein ID').drop(columns='Gene names')

key_ProteinID = pd.read_csv(
    os.path.join(FOLDER_DATA_RAW, 'ID_matching_key.csv'),
    index_col="Protein ID")
key_ProteinID = key_ProteinID.drop(columns="Unnamed: 0")

### Clinical Data

In [None]:
DATAFOLDER = 'data/processed'
PROTEOM = 'data_ml_proteomics.csv'
CLINICAL = 'df_cli_164.csv'
COL_ID = 'Sample ID'
f_data_clinic = os.path.join(FOLDER_DATA_RAW, CLINICAL)
data_cli = pd.read_csv(f_data_clinic, index_col=COL_ID)
# Exclude rows whose 'kleiner' score equals 0.5.
data_cli = data_cli[data_cli['kleiner'] != 0.5]

FEATURES_ML = ['nas_steatosis_ordinal', 'nas_inflam', 'kleiner',
               'fib4', 'elf', 'ft', 'te', 'swe', 'aar', 'ast',
               'apri', 'forns', 'm30', 'm65', 'meld', 'p3np', 'timp1', 'cap']

# Build the path from FOLDER_DATA_RAW like every other raw input, so that
# relocating the raw-data folder only requires changing that constant.
file_cutoff_clinic = os.path.join(
    FOLDER_DATA_RAW, 'clinical_marker_test_cut-offs.xlsx')
cutoffs_clinic = pd.read_excel(
    file_cutoff_clinic, sheet_name="cutoffs", index_col='marker')

# Cut-offs are only usable for markers that are present in the clinical table.
markers_to_drop = [marker for marker in cutoffs_clinic.index
                   if marker not in data_cli.columns]
for marker in markers_to_drop:
    print(f"{marker}: Missing in clinics data.")

cutoffs_clinic = cutoffs_clinic.drop(labels=markers_to_drop)
cutoffs_clinic

### Preprocessing

In [None]:
from helper import log2, imputation_normal_distribution

DATA_COMPLETENESS = 0.6
MIN_N_PROTEIN_GROUPS = 200
CV_COEFFICIENT = 0.3

# Rows need non-missing values in at least DATA_COMPLETENESS of the columns;
# afterwards columns need at least MIN_N_PROTEIN_GROUPS non-missing rows.
min_values_per_row = data_plasma_raw.shape[1] * DATA_COMPLETENESS
data_plasma_filtered = (data_plasma_raw
                        .dropna(axis=0, thresh=min_values_per_row)
                        .dropna(axis=1, thresh=MIN_N_PROTEIN_GROUPS))
data_plasma_filtered_log = data_plasma_filtered.apply(log2)
data_plasma_filtered_log_imputed = data_plasma_filtered_log.apply(
    imputation_normal_distribution)

# Columns of the samples annotated as quality controls (Group2 == 'QC').
is_qc = annotation_file_plasma['Group2'] == 'QC'
qc_plasma = annotation_file_plasma.loc[is_qc, 'Sample ID']
df_qc = data_plasma_filtered[qc_plasma].copy()
def coef_of_variation(x):
    """Return the coefficient of variation of *x*: ``np.std(x) / np.mean(x)``."""
    spread = np.std(x)
    centre = np.mean(x)
    return spread / centre


# Row-wise coefficient of variation across the QC columns.
proteins_cv = df_qc.apply(coef_of_variation, axis=1)
cv_selected = proteins_cv < CV_COEFFICIENT

df_qc = df_qc.assign(cv=proteins_cv)
qc_30 = df_qc.loc[cv_selected].index

# Transpose so that samples become rows and proteins become columns, then
# keep only the proteins whose QC coefficient of variation is below the
# CV_COEFFICIENT cut-off.
df = data_plasma_filtered_log_imputed.rename_axis('Sample ID', axis=1).T
df_30 = df.loc[:, qc_30]
data_proteomics = df_30

### Targets

In [None]:
from helper import create_dichotome

target_columns = ['kleiner', 'nas_steatosis_ordinal', 'nas_inflam']
Y = data_cli.loc[:, target_columns]

# Binary targets from helper.create_dichotome(score, cut-off); presumably
# "score >= cut-off" judging by the variable names -- see the helper module.
kleiner_ge_2, kleiner_ge_3 = (
    create_dichotome(Y['kleiner'], cutoff) for cutoff in (2, 3))
steatosis_ge_1 = create_dichotome(Y['nas_steatosis_ordinal'], 1)
inflamation_ge_2 = create_dichotome(Y['nas_inflam'], 2)

## Feature Selection

In [None]:
from helper import FeatureSelector

feature_selected = FeatureSelector(k=10, protein_gene_data=key_ProteinID)

# Fit the selector once per endpoint, in the order F2, F3, S1, I2.
(proteins_selected_f2,
 proteins_selected_f3,
 proteins_selected_s1,
 proteins_selected_I2) = [
    feature_selected.fit(data_proteomics, target, label)
    for label, target in (('F2', kleiner_ge_2),
                          ('F3', kleiner_ge_3),
                          ('S1', steatosis_ge_1),
                          ('I2', inflamation_ge_2))
]

## Cross Validation of Model

In [None]:
# Endpoint label -> selected proteins and the matching binary target.
test_cases = {
    'F2': {'proteins': proteins_selected_f2, 'y': kleiner_ge_2},
    'F3': {'proteins': proteins_selected_f3, 'y': kleiner_ge_3},
    'S1': {'proteins': proteins_selected_s1, 'y': steatosis_ge_1},
    'I2': {'proteins': proteins_selected_I2, 'y': inflamation_ge_2},
}

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold

scoring = ['precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc']
CV_FOLDS = 5
CV_REPEATS = 10

_clf = skllm.LogisticRegression(solver='liblinear', random_state=0)

proteins_selected, y = proteins_selected_f2, kleiner_ge_2

# Use the selected proteins and only the samples that have both proteomics
# data and a target value.
_X = data_proteomics.loc[:, proteins_selected.index]
in_both = y.index.intersection(_X.index)
_X, _y = _X.loc[in_both], y.loc[in_both]

_cv_splitter = RepeatedStratifiedKFold(
    n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0)
result = cross_validate(
    _clf, X=_X, y=_y, groups=_y, cv=_cv_splitter, scoring=scoring)

In [None]:
# Regression check: the mean cross-validation scores, rounded to three
# decimals, must equal the previously recorded reference values.
refvals = {
    'test_precision': 0.832388,
    'test_recall': 0.788949,
    'test_f1': 0.808201,
    'test_balanced_accuracy': 0.793597,
    'test_roc_auc': 0.884170,
}

mean_vals = np.round(pd.DataFrame(result).mean().to_frame().T, 3)

for key, reference in refvals.items():
    assert np.round(reference, 3) == mean_vals.loc[0, key]

## Check Model
Is there a model that outperforms logistic regression out of the box?

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress scikit-learn ConvergenceWarning messages from here on.
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [None]:
# Compare a broad set of classifiers, all with default settings, on every
# test case. The aggregated scores are cached as a pickle, so the sweep only
# runs when the cache file is missing.
fpath_results_classifier_comparison = os.path.join(
    DATAFOLDER, 'results_classifier_comparison.pkl')

try:
    grouped = pd.read_pickle(fpath_results_classifier_comparison)
except FileNotFoundError:

    # NOTE: these imports only execute on a cache miss, so names such as
    # `tqdm` are not bound by this cell when the pickle already exists.
    from tqdm.notebook import tqdm as tqdm

    from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
    from xgboost import XGBClassifier

    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),


        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]

    # One DataFrame of per-split scores for each (model, test case) pair.
    summary = []

    for model in tqdm(MLA):

        for test_case in test_cases.keys():

            proteins_selected = test_cases[test_case]['proteins']
            y = test_cases[test_case]['y']

            # Selected proteins, restricted to samples that also have a target.
            _X = data_proteomics[proteins_selected.index]
            in_both = y.index.intersection(_X.index)
            _X = _X.loc[in_both]
            _y = y.loc[in_both]

            # Same splitter settings and seed for every model and test case.
            result = cross_validate(model, X=_X, y=_y, groups=_y, cv=RepeatedStratifiedKFold(
                n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0), scoring=scoring)
            result['name'] = model.__class__.__name__
            result['test_case'] = test_case

            summary.append(pd.DataFrame(result))

    print('Complete')
    # Average over all splits, then sort by test case and F1, both descending.
    summary_df = pd.concat(summary)
    grouped = summary_df.groupby(['name', 'test_case']).mean(
    ).sort_values(by=['test_case', 'test_f1'], ascending=False)
    grouped = grouped.reset_index()

    grouped.to_pickle(fpath_results_classifier_comparison)
grouped

### Define a baseline: Logistic Regression

In [None]:
# Baseline per test case: the first `grouped` row of LogisticRegressionCV.
_is_baseline_model = grouped['name'] == 'LogisticRegressionCV'
baseline = {}
for test_case in test_cases:
    _is_case = grouped['test_case'] == test_case
    baseline[test_case] = grouped[_is_baseline_model & _is_case].iloc[0]

# Numeric columns that are expressed relative to the baseline.
cols = ['fit_time', 'score_time', 'test_precision', 'test_recall',
        'test_f1', 'test_balanced_accuracy', 'test_roc_auc']

In [None]:
# Express every score relative to the baseline row of the same test case.
grouped_norm = grouped.copy()

# Walk the actual row labels instead of range(len(grouped)), so the
# subtraction does not silently depend on `grouped` having a 0..n-1 index.
for row_label, case_name in grouped['test_case'].items():
    grouped_norm.loc[row_label, cols] -= baseline[case_name][cols]

### Best performing algorithms: F1, AUC

In [None]:
# One horizontal bar chart of the mean F1 score per classifier, for each of
# the plotted test cases. The labels live in their own variable: rebinding
# the global `test_cases` dict to a list would break every cell that calls
# `test_cases.keys()` when it is re-run after this one.
plot_cases = ['F2', 'I2', 'S1']
colors = ['darkblue', 'darkred', 'gray']
metric = 'test_f1'
for plot_case, color in zip(plot_cases, colors):
    data = grouped[grouped['test_case'] == plot_case].sort_values(by=metric)
    plt.subplots(figsize=(3, 6), squeeze=False)
    plt.barh(y=data['name'], width=data[metric], color=color)
    plt.xlabel('F1 score', fontsize=14)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=12)
    plt.xlim(0.5, 1)
    plt.title(plot_case, fontsize=14)
    plt.savefig('figures/classifiers_{}_{}.png'.format(metric, plot_case),
                dpi=120, bbox_inches='tight')

In [None]:
# Export the aggregated classifier comparison as CSV.
classifier_comparison_csv = 'tables/classifier_comparison.csv'
grouped.to_csv(classifier_comparison_csv)

In [None]:
# Five largest F1 differences to the baseline within each test case.
_ranked_by_f1 = grouped_norm.sort_values(by='test_f1', ascending=False)
_ranked_by_f1.groupby('test_case').head(5)

In [None]:
# Four largest ROC-AUC differences to the baseline within each test case.
_ranked_by_auc = grouped_norm.sort_values(by='test_roc_auc', ascending=False)
_ranked_by_auc.groupby('test_case').head(4)

In [None]:
# Mean F1 per classifier, one bar colour per test case.
_fig = plt.figure(figsize=(20, 15))
sns.barplot(data=grouped, y='name', x='test_f1', hue='test_case')
plt.show()

In [None]:
# F1 difference to the baseline per classifier and test case.
_fig = plt.figure(figsize=(20, 15))
sns.barplot(data=grouped_norm, y='name', x='test_f1', hue='test_case')
plt.show()

In [None]:
# ROC-AUC difference to the baseline per classifier and test case.
_fig = plt.figure(figsize=(20, 15))
sns.barplot(data=grouped_norm, y='name', x='test_roc_auc', hue='test_case')
plt.show()

## Can we optimize on the feature selection part?

Let's try a couple of things here. First: is the number of selected features optimal?

In [None]:
# Sweep the number of selected features (1..49) and cross-validate a
# logistic regression for every test case. Per-split scores are cached as a
# pickle, so the sweep only runs when the cache file is missing.
feature_selection_comparison_cv = os.path.join(
    DATAFOLDER, 'feature_selection_comparison_cv.pkl')
try:
    summary = pd.read_pickle(feature_selection_comparison_cv)
except FileNotFoundError:
    # Import tqdm here: its only earlier import sits inside another cell's
    # cache-miss branch, so the name may be unbound when this branch runs.
    from tqdm.notebook import tqdm

    summary = []

    for n_features in tqdm(range(1, 50)):

        feature_selected = FeatureSelector(
            k=n_features, protein_gene_data=key_ProteinID)
        proteins_selected_f2 = feature_selected.fit(
            data_proteomics, kleiner_ge_2, 'F2')
        proteins_selected_f3 = feature_selected.fit(
            data_proteomics, kleiner_ge_3, 'F3')
        proteins_selected_s1 = feature_selected.fit(
            data_proteomics, steatosis_ge_1, 'S1')
        proteins_selected_I2 = feature_selected.fit(
            data_proteomics, inflamation_ge_2, 'I2')

        test_cases = {
            'F2': {'proteins': proteins_selected_f2, 'y': kleiner_ge_2},
            'F3': {'proteins': proteins_selected_f3, 'y': kleiner_ge_3},
            'S1': {'proteins': proteins_selected_s1, 'y': steatosis_ge_1},
            'I2': {'proteins': proteins_selected_I2, 'y': inflamation_ge_2},
        }

        for test_case, case in test_cases.items():
            _clf = skllm.LogisticRegression(random_state=0, solver='liblinear')

            proteins_selected = case['proteins']
            y = case['y']

            # Selected proteins, restricted to samples that also have a target.
            _X = data_proteomics[proteins_selected.index]
            in_both = y.index.intersection(_X.index)
            _X = _X.loc[in_both]
            _y = y.loc[in_both]

            result = cross_validate(_clf, X=_X, y=_y, groups=_y, cv=RepeatedStratifiedKFold(
                n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0), scoring=scoring)

            result['name'] = _clf.__class__.__name__
            result['n_features'] = n_features
            result['test_case'] = test_case

            summary.append(pd.DataFrame(result))

    summary = pd.concat(summary)
    summary.to_pickle(feature_selection_comparison_cv)

In [None]:
# Display the per-split scores of the feature-count sweep.
summary

In [None]:
# Mean score per (test case, number of features). `summary` also holds the
# string column 'name'; numeric_only=True keeps the aggregation working on
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
combined = summary.groupby(
    ['test_case', 'n_features']).mean(numeric_only=True)

# Best feature count per test case according to F1.
combined.sort_values(by='test_f1', ascending=False).groupby(
    'test_case').head(1)

In [None]:
# Best feature count per test case according to ROC AUC.
_combined_by_auc = combined.sort_values(by='test_roc_auc', ascending=False)
_combined_by_auc.groupby('test_case').head(1)

In [None]:
# F1 score as a function of the number of selected features.
_fig = plt.figure(figsize=(20, 15))
sns.lineplot(data=summary, x='n_features', y='test_f1', hue='test_case')
plt.ylim(0.5, 0.9)
plt.legend(bbox_to_anchor=(1, 0.5), loc='center left')
plt.title('Number of Features vs F1-score')
plt.show()

In [None]:
# ROC AUC as a function of the number of selected features.
_fig = plt.figure(figsize=(20, 15))
sns.lineplot(data=summary, x='n_features', y='test_roc_auc', hue='test_case')
plt.ylim(0.5, 1)
plt.legend(bbox_to_anchor=(1, 0.5), loc='center left')
plt.title('Number of Features vs roc auc')
plt.show()

## Decision tree based feature selection

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel


def get_features_dt(X, y, dummy, n_estimators=50, random_state=None):
    """Rank the columns of *X* by ExtraTrees feature importance for *y*.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; only rows whose label also occurs in ``y.index`` are
        used for fitting.
    y : pandas.Series
        Target values indexed like the rows of *X*.
    dummy : object
        Unused. Presumably kept so calls mirror ``FeatureSelector.fit``,
        which takes a label as third argument -- confirm before removing.
    n_estimators : int, default 50
        Number of trees in the forest.
    random_state : int or None, default None
        Seed for the forest. With the default ``None`` the ranking can
        differ between runs; pass an int for a reproducible ranking.

    Returns
    -------
    pandas.Index
        The columns of *X*, ordered from most to least important.
    """
    samples_in_both = X.index.intersection(y.index)
    clf = ExtraTreesClassifier(
        n_estimators=n_estimators, random_state=random_state)
    clf.fit(X.loc[samples_in_both], y.loc[samples_in_both])

    # argsort is ascending; reverse it to get the most important first.
    indices = np.argsort(clf.feature_importances_)[::-1]

    return X.columns[indices]


# Full protein rankings from the tree-based importance, one per endpoint.
proteins_selected_f2 = get_features_dt(data_proteomics, kleiner_ge_2, 'F2')
proteins_selected_f3 = get_features_dt(data_proteomics, kleiner_ge_3, 'F3')
proteins_selected_s1 = get_features_dt(data_proteomics, steatosis_ge_1, 'S1')
proteins_selected_I2 = get_features_dt(data_proteomics, inflamation_ge_2, 'I2')

test_cases = {
    'F2': {'proteins': proteins_selected_f2, 'y': kleiner_ge_2},
    'F3': {'proteins': proteins_selected_f3, 'y': kleiner_ge_3},
    'S1': {'proteins': proteins_selected_s1, 'y': steatosis_ge_1},
    'I2': {'proteins': proteins_selected_I2, 'y': inflamation_ge_2},
}

decision_tree_feature_selection = os.path.join(
    DATAFOLDER, 'decision_tree_feature_selection.pkl')

# Cross-validate a logistic regression on the top-n ranked proteins for
# n = 1..49. Results are cached; the sweep only runs on a cache miss.
try:
    combind_dt = pd.read_pickle(decision_tree_feature_selection)
except FileNotFoundError:
    # Import tqdm here: no cell above imports it unconditionally, so the
    # name may be unbound when this branch runs.
    from tqdm.notebook import tqdm

    summary = []

    for n_features in tqdm(range(1, 50)):

        for test_case, case in test_cases.items():
            _clf = skllm.LogisticRegression(random_state=0, solver='liblinear')

            proteins_selected = case['proteins']
            y = case['y']

            # Top-n proteins, restricted to samples that also have a target.
            _X = data_proteomics[proteins_selected[:n_features]]
            in_both = y.index.intersection(_X.index)
            _X = _X.loc[in_both]
            _y = y.loc[in_both]

            result = cross_validate(_clf, X=_X, y=_y, groups=_y, cv=RepeatedStratifiedKFold(
                n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0), scoring=scoring)

            result['name'] = _clf.__class__.__name__
            result['n_features'] = n_features
            result['test_case'] = test_case

            summary.append(pd.DataFrame(result))
    combind_dt = pd.concat(summary)
    combind_dt.to_pickle(decision_tree_feature_selection)

In [None]:
# F1 score versus number of tree-ranked features.
_fig = plt.figure(figsize=(20, 15))
sns.lineplot(data=combind_dt, x='n_features', y='test_f1', hue='test_case')
plt.ylim(0.5, 0.9)
plt.legend(bbox_to_anchor=(1, 0.5), loc='center left')
plt.title('Number of Features vs F1-score')
plt.show()

In [None]:
# ROC AUC versus number of tree-ranked features.
_fig = plt.figure(figsize=(20, 15))
sns.lineplot(data=combind_dt, x='n_features', y='test_roc_auc',
             hue='test_case')
plt.ylim(0.5, 1)
plt.legend(bbox_to_anchor=(1, 0.5), loc='center left')
plt.title('Number of Features vs roc auc')
plt.show()

In [None]:
# Re-run the feature selection with an individual feature count per endpoint:
# (label, k, target), processed in the order F2, F3, S1, I2.
_setups = (
    ('F2', 5, kleiner_ge_2),
    ('F3', 20, kleiner_ge_3),
    ('S1', 32, steatosis_ge_1),
    ('I2', 7, inflamation_ge_2),
)

test_cases = {}
for _label, _k, _target in _setups:
    feature_selected = FeatureSelector(k=_k, protein_gene_data=key_ProteinID)
    test_cases[_label] = {
        'proteins': feature_selected.fit(data_proteomics, _target, _label),
        'y': _target,
    }

proteins_selected_f2 = test_cases['F2']['proteins']
proteins_selected_f3 = test_cases['F3']['proteins']
proteins_selected_s1 = test_cases['S1']['proteins']
proteins_selected_I2 = test_cases['I2']['proteins']

In [None]:
# Try multiple classifiers (default settings) and see what happens

from tqdm.notebook import tqdm as tqdm

from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

# Candidate classifiers, all with default settings, grouped by family.
_ensemble_models = [
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),
]
_glm_models = [
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
]
_naive_bayes_models = [
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
]
_svm_models = [
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
]
_tree_models = [
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
]
_discriminant_models = [
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),
]

MLA = [
    *_ensemble_models,
    gaussian_process.GaussianProcessClassifier(),
    *_glm_models,
    *_naive_bayes_models,
    neighbors.KNeighborsClassifier(),
    *_svm_models,
    *_tree_models,
    *_discriminant_models,
    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier(),
]

# One DataFrame of per-split scores for each (model, test case) pair.
summary = []

for model in tqdm(MLA):
    for test_case, _case in test_cases.items():
        proteins_selected, y = _case['proteins'], _case['y']

        # Selected proteins, restricted to samples that also have a target.
        _X = data_proteomics.loc[:, proteins_selected.index]
        in_both = y.index.intersection(_X.index)
        _X, _y = _X.loc[in_both], y.loc[in_both]

        result = cross_validate(
            model, X=_X, y=_y, groups=_y, scoring=scoring,
            cv=RepeatedStratifiedKFold(
                n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0))
        result.update(name=type(model).__name__, test_case=test_case)

        summary.append(pd.DataFrame(result))

print('Complete')

In [None]:
# Average over all CV splits, then sort by test case and F1 (both descending).
summary_df = pd.concat(summary)
grouped = (summary_df
           .groupby(['name', 'test_case']).mean()
           .sort_values(by=['test_case', 'test_f1'], ascending=False)
           .reset_index())

# Express every score relative to the baseline row of the same test case.
# Walk the actual row labels instead of range(len(grouped)), so the
# subtraction does not silently depend on a 0..n-1 index.
grouped_norm = grouped.copy()
for row_label, case_name in grouped['test_case'].items():
    grouped_norm.loc[row_label, cols] -= baseline[case_name][cols]

In [None]:
# Largest F1 difference to the baseline within each test case.
_ranked_by_f1 = grouped_norm.sort_values(by='test_f1', ascending=False)
_ranked_by_f1.groupby('test_case').head(1)

In [None]:
# Largest ROC-AUC difference to the baseline within each test case.
_ranked_by_auc = grouped_norm.sort_values(by='test_roc_auc', ascending=False)
_ranked_by_auc.groupby('test_case').head(1)

In [None]:
# F1 difference to the baseline per classifier and test case.
_fig = plt.figure(figsize=(10, 10))
sns.barplot(data=grouped_norm, y='name', x='test_f1', hue='test_case')
plt.show()

In [None]:
# ROC-AUC difference to the baseline per classifier and test case.
_fig = plt.figure(figsize=(10, 10))
sns.barplot(data=grouped_norm, y='name', x='test_roc_auc', hue='test_case')
plt.show()

## Missing value imputation? / Preprocessing

### CV (coefficient of variation) cut-off

In [None]:
from helper import log2, imputation_normal_distribution

# Sweep the QC coefficient-of-variation cut-off over 10 values in [0.2, 0.5].
# Every iteration re-runs filtering, imputation, feature selection (k=10) and
# the repeated cross-validation, and appends one score table per test case.
# NOTE: the loop rebinds notebook globals (CV_COEFFICIENT, data_proteomics,
# test_cases, ...); after it they hold the values of the last iteration.
summary = []

for CV_COEFFICIENT in tqdm(np.linspace(0.2, 0.5, 10)):

    DATA_COMPLETENESS = 0.6
    MIN_N_PROTEIN_GROUPS = 200
    # CV_COEFFICIENT is the swept loop variable (0.3 in the default pipeline).

    # Rows need values in DATA_COMPLETENESS of the columns; columns need at
    # least MIN_N_PROTEIN_GROUPS non-missing rows.
    data_plasma_filtered = data_plasma_raw.dropna(
        axis=0, thresh=data_plasma_raw.shape[1] * DATA_COMPLETENESS)
    data_plasma_filtered = data_plasma_filtered.dropna(
        axis=1, thresh=MIN_N_PROTEIN_GROUPS)
    data_plasma_filtered_log = data_plasma_filtered.apply(log2)
    data_plasma_filtered_log_imputed = data_plasma_filtered_log.apply(
        imputation_normal_distribution)

    # Row-wise coefficient of variation across the QC sample columns.
    qc_plasma = annotation_file_plasma[annotation_file_plasma['Group2']
                                       == 'QC']['Sample ID']
    df_qc = data_plasma_filtered.copy()[qc_plasma]
    def coef_of_variation(x): return np.std(x) / np.mean(x)
    proteins_cv = df_qc.apply(coef_of_variation, axis=1)

    cv_selected = proteins_cv < CV_COEFFICIENT

    df_qc = df_qc.assign(cv=proteins_cv)
    qc_30 = df_qc[cv_selected].index

    df = data_plasma_filtered_log_imputed.copy()
    df = df.rename_axis('Sample ID', axis=1).T
    # keep proteins whose QC coefficient of variation is below the current
    # CV_COEFFICIENT (the *_30 names stem from the default 30% cut-off)
    df_30 = df[qc_30]
    data_proteomics = df_30

    feature_selected = FeatureSelector(k=10, protein_gene_data=key_ProteinID)
    proteins_selected_f2 = feature_selected.fit(
        data_proteomics, kleiner_ge_2, 'F2')
    proteins_selected_f3 = feature_selected.fit(
        data_proteomics, kleiner_ge_3, 'F3')
    proteins_selected_s1 = feature_selected.fit(
        data_proteomics, steatosis_ge_1, 'S1')
    proteins_selected_I2 = feature_selected.fit(
        data_proteomics, inflamation_ge_2, 'I2')

    test_cases = {}

    test_cases['F2'] = {'proteins': proteins_selected_f2, 'y': kleiner_ge_2}
    test_cases['F3'] = {'proteins': proteins_selected_f3, 'y': kleiner_ge_3}
    test_cases['S1'] = {'proteins': proteins_selected_s1, 'y': steatosis_ge_1}
    test_cases['I2'] = {
        'proteins': proteins_selected_I2, 'y': inflamation_ge_2}

    for test_case in test_cases.keys():
        _clf = skllm.LogisticRegression(random_state=0, solver='liblinear')

        proteins_selected = test_cases[test_case]['proteins']
        y = test_cases[test_case]['y']

        # Selected proteins, restricted to samples that also have a target.
        _X = data_proteomics[proteins_selected.index]
        in_both = y.index.intersection(_X.index)
        _X = _X.loc[in_both]
        _y = y.loc[in_both]

        result = cross_validate(_clf, X=_X, y=_y, groups=_y, cv=RepeatedStratifiedKFold(
            n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0), scoring=scoring)

        result['name'] = _clf.__class__.__name__
        result['CV'] = CV_COEFFICIENT
        result['test_case'] = test_case

        summary.append(pd.DataFrame(result))

# Mean over the CV splits of every (cut-off, test case) run. Each table also
# holds the string column 'name'; numeric_only=True keeps the aggregation
# working on pandas >= 2.0, where such columns are no longer dropped silently.
combined = pd.concat([
    run_scores.groupby('test_case').mean(numeric_only=True)
    for run_scores in summary
])
combined = combined.reset_index()

# Three best runs per test case by F1.
combined.sort_values(by=['test_case', 'test_f1'],
                     ascending=False).groupby('test_case').head(3)

* No big difference across the tested CV cut-offs.

In [None]:
from helper import log2, imputation_normal_distribution

# Sweep the row completeness threshold over 20 values in [0.4, 0.8].
# Every iteration re-runs filtering, imputation, feature selection (k=10) and
# the repeated cross-validation, and appends one score table per test case.
# NOTE: the loop rebinds notebook globals (DATA_COMPLETENESS, data_proteomics,
# test_cases, ...); after it they hold the values of the last iteration.
summary = []

for DATA_COMPLETENESS in tqdm(np.linspace(0.4, 0.8, 20)):

    # DATA_COMPLETENESS is the swept loop variable (0.6 in the default pipeline).
    MIN_N_PROTEIN_GROUPS = 200
    CV_COEFFICIENT = 0.3

    # Rows need values in DATA_COMPLETENESS of the columns; columns need at
    # least MIN_N_PROTEIN_GROUPS non-missing rows.
    data_plasma_filtered = data_plasma_raw.dropna(
        axis=0, thresh=data_plasma_raw.shape[1] * DATA_COMPLETENESS)
    data_plasma_filtered = data_plasma_filtered.dropna(
        axis=1, thresh=MIN_N_PROTEIN_GROUPS)
    data_plasma_filtered_log = data_plasma_filtered.apply(log2)
    data_plasma_filtered_log_imputed = data_plasma_filtered_log.apply(
        imputation_normal_distribution)

    # Row-wise coefficient of variation across the QC sample columns.
    qc_plasma = annotation_file_plasma[annotation_file_plasma['Group2']
                                       == 'QC']['Sample ID']
    df_qc = data_plasma_filtered.copy()[qc_plasma]
    def coef_of_variation(x): return np.std(x) / np.mean(x)
    proteins_cv = df_qc.apply(coef_of_variation, axis=1)

    cv_selected = proteins_cv < CV_COEFFICIENT

    df_qc = df_qc.assign(cv=proteins_cv)
    qc_30 = df_qc[cv_selected].index

    df = data_plasma_filtered_log_imputed.copy()
    df = df.rename_axis('Sample ID', axis=1).T
    # filter proteins for CV < 30% of the inter-day/plate quality assessment
    df_30 = df[qc_30]
    data_proteomics = df_30

    feature_selected = FeatureSelector(k=10, protein_gene_data=key_ProteinID)
    proteins_selected_f2 = feature_selected.fit(
        data_proteomics, kleiner_ge_2, 'F2')
    proteins_selected_f3 = feature_selected.fit(
        data_proteomics, kleiner_ge_3, 'F3')
    proteins_selected_s1 = feature_selected.fit(
        data_proteomics, steatosis_ge_1, 'S1')
    proteins_selected_I2 = feature_selected.fit(
        data_proteomics, inflamation_ge_2, 'I2')

    test_cases = {}

    test_cases['F2'] = {'proteins': proteins_selected_f2, 'y': kleiner_ge_2}
    test_cases['F3'] = {'proteins': proteins_selected_f3, 'y': kleiner_ge_3}
    test_cases['S1'] = {'proteins': proteins_selected_s1, 'y': steatosis_ge_1}
    test_cases['I2'] = {
        'proteins': proteins_selected_I2, 'y': inflamation_ge_2}

    for test_case in test_cases.keys():
        _clf = skllm.LogisticRegression(random_state=0, solver='liblinear')

        proteins_selected = test_cases[test_case]['proteins']
        y = test_cases[test_case]['y']

        # Selected proteins, restricted to samples that also have a target.
        _X = data_proteomics[proteins_selected.index]
        in_both = y.index.intersection(_X.index)
        _X = _X.loc[in_both]
        _y = y.loc[in_both]

        result = cross_validate(_clf, X=_X, y=_y, groups=_y, cv=RepeatedStratifiedKFold(
            n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0), scoring=scoring)

        result['name'] = _clf.__class__.__name__
        result['DATA_COMPLETENESS'] = DATA_COMPLETENESS
        result['test_case'] = test_case

        summary.append(pd.DataFrame(result))

# Mean over the CV splits of every (threshold, test case) run. Each table
# also holds the string column 'name'; numeric_only=True keeps the aggregation
# working on pandas >= 2.0, where such columns are no longer dropped silently.
combined = pd.concat([
    run_scores.groupby('test_case').mean(numeric_only=True)
    for run_scores in summary
])
combined = combined.reset_index()

# Three best runs per test case by F1.
combined.sort_values(by=['test_case', 'test_f1'],
                     ascending=False).groupby('test_case').head(3)

* Varying the data completeness threshold also doesn't change much.

## Scalers 

In [None]:
from sklearn.preprocessing import StandardScaler
from helper import log2, imputation_normal_distribution

# Compare three ways of handling missing values. Every iteration re-runs
# filtering, the chosen imputation, feature selection (k=10) and the repeated
# cross-validation, and stores one mean-score row per test case.
# NOTE: the loop rebinds notebook globals (data_proteomics, test_cases, ...);
# after it they hold the values of the last iteration ('mean').
summary = []


for imputation in tqdm(['impute_normal', 'z_scaling', 'mean']):

    DATA_COMPLETENESS = 0.6
    MIN_N_PROTEIN_GROUPS = 200
    CV_COEFFICIENT = 0.3

    data_plasma_filtered = data_plasma_raw.dropna(
        axis=0, thresh=data_plasma_raw.shape[1] * DATA_COMPLETENESS)
    data_plasma_filtered = data_plasma_filtered.dropna(
        axis=1, thresh=MIN_N_PROTEIN_GROUPS)
    data_plasma_filtered_log = data_plasma_filtered.apply(log2)

    if imputation == 'impute_normal':
        data_plasma_filtered_log_imputed = data_plasma_filtered_log.apply(
            imputation_normal_distribution)

    elif imputation == 'z_scaling':

        # Standardise every column, then set the remaining NaNs to 0.
        scaler = StandardScaler()

        data_plasma_filtered_log_imputed_np = scaler.fit_transform(
            data_plasma_filtered_log.values)
        data_plasma_filtered_log_imputed = data_plasma_filtered_log.copy()
        data_plasma_filtered_log_imputed.loc[:, :] = np.nan_to_num(
            data_plasma_filtered_log_imputed_np)

    elif imputation == 'mean':
        # Fill missing values with the mean of their column.
        # NOTE(review): the table is not transposed yet, so the columns here
        # are the sample columns and this is a per-sample mean. The original
        # comment said "mean of protein group" -- confirm which was intended.
        data_plasma_filtered_log_imputed = data_plasma_filtered_log.fillna(
            data_plasma_filtered_log.mean())

    # Row-wise coefficient of variation across the QC sample columns.
    qc_plasma = annotation_file_plasma[annotation_file_plasma['Group2']
                                       == 'QC']['Sample ID']
    df_qc = data_plasma_filtered.copy()[qc_plasma]
    def coef_of_variation(x): return np.std(x) / np.mean(x)
    proteins_cv = df_qc.apply(coef_of_variation, axis=1)

    cv_selected = proteins_cv < CV_COEFFICIENT

    df_qc = df_qc.assign(cv=proteins_cv)
    qc_30 = df_qc[cv_selected].index

    df = data_plasma_filtered_log_imputed.copy()
    df = df.rename_axis('Sample ID', axis=1).T
    # filter proteins for CV < 30% of the inter-day/plate quality assessment
    df_30 = df[qc_30]
    data_proteomics = df_30

    feature_selected = FeatureSelector(k=10, protein_gene_data=key_ProteinID)
    proteins_selected_f2 = feature_selected.fit(
        data_proteomics, kleiner_ge_2, 'F2')
    proteins_selected_f3 = feature_selected.fit(
        data_proteomics, kleiner_ge_3, 'F3')
    proteins_selected_s1 = feature_selected.fit(
        data_proteomics, steatosis_ge_1, 'S1')
    proteins_selected_I2 = feature_selected.fit(
        data_proteomics, inflamation_ge_2, 'I2')

    test_cases = {
        'F2': {'proteins': proteins_selected_f2, 'y': kleiner_ge_2},
        'F3': {'proteins': proteins_selected_f3, 'y': kleiner_ge_3},
        'S1': {'proteins': proteins_selected_s1, 'y': steatosis_ge_1},
        'I2': {'proteins': proteins_selected_I2, 'y': inflamation_ge_2},
    }

    for test_case in test_cases.keys():
        _clf = skllm.LogisticRegression(random_state=0, solver='liblinear')

        proteins_selected = test_cases[test_case]['proteins']
        y = test_cases[test_case]['y']

        # Selected proteins, restricted to samples that also have a target.
        _X = data_proteomics[proteins_selected.index]
        in_both = y.index.intersection(_X.index)
        _X = _X.loc[in_both]
        _y = y.loc[in_both]

        result = cross_validate(_clf, X=_X, y=_y, groups=_y, cv=RepeatedStratifiedKFold(
            n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0), scoring=scoring)

        result['name'] = _clf.__class__.__name__
        result['imputation'] = imputation
        result['test_case'] = test_case

        # 'name' and 'imputation' are string columns; numeric_only=True keeps
        # the mean working on pandas >= 2.0, where non-numeric columns are no
        # longer dropped silently. The label is re-attached afterwards.
        xx = pd.DataFrame(result).groupby(
            'test_case').mean(numeric_only=True)

        xx['imputation'] = imputation

        summary.append(xx)


# Ranking of the three strategies per test case by F1.
pd.concat(summary).sort_values(
    by=['test_case', 'test_f1'], ascending=False).groupby('test_case').head(3)

* Z-scaling seems to be always better, except for S1

## Hyperparameter Tuning

Check how we could improve the results with hyperparameter tuning, e.g. when using ExtraTrees.

In [None]:
DATA_COMPLETENESS = 0.6
MIN_N_PROTEIN_GROUPS = 200
CV_COEFFICIENT = 0.3

data_plasma_filtered = data_plasma_raw.dropna(
    axis=0, thresh=data_plasma_raw.shape[1] * DATA_COMPLETENESS)
data_plasma_filtered = data_plasma_filtered.dropna(
    axis=1, thresh=MIN_N_PROTEIN_GROUPS)
data_plasma_filtered_log = data_plasma_filtered.apply(log2)
data_plasma_filtered_log_imputed = data_plasma_filtered_log.apply(
    imputation_normal_distribution)

qc_plasma = annotation_file_plasma[annotation_file_plasma['Group2']
                                   == 'QC']['Sample ID']
df_qc = data_plasma_filtered.copy()[qc_plasma]
def coef_of_variation(x):
    """Return the coefficient of variation of ``x``: ``std(x) / mean(x)``.

    Both statistics are computed with ``np.std`` and ``np.mean`` using
    their default arguments.
    """
    spread = np.std(x)
    centre = np.mean(x)
    return spread / centre


# Coefficient of variation of every row (protein) across the QC columns,
# stored alongside the QC intensities in an extra `cv` column.
proteins_cv = df_qc.apply(coef_of_variation, axis=1)
df_qc = df_qc.assign(cv=proteins_cv)

# Protein IDs whose QC coefficient of variation is below CV_COEFFICIENT.
cv_selected = proteins_cv < CV_COEFFICIENT
qc_30 = df_qc.loc[cv_selected].index

# Name the column axis 'Sample ID' and transpose, so that samples become
# rows and proteins become columns.
df = data_plasma_filtered_log_imputed.copy().rename_axis(
    'Sample ID', axis=1).T

# Keep only the proteins that passed the inter-day/plate QC filter
# (CV below the threshold, i.e. < 30%).
df_30 = df[qc_30]
data_proteomics = df_30

# A single FeatureSelector instance (defined elsewhere in the notebook) is
# fitted once per endpoint; every call receives the feature matrix, the
# endpoint's target and the endpoint label.
feature_selected = FeatureSelector(k=10, protein_gene_data=key_ProteinID)

proteins_selected_f2 = feature_selected.fit(data_proteomics, kleiner_ge_2, 'F2')
proteins_selected_f3 = feature_selected.fit(data_proteomics, kleiner_ge_3, 'F3')
proteins_selected_s1 = feature_selected.fit(data_proteomics, steatosis_ge_1, 'S1')
proteins_selected_I2 = feature_selected.fit(data_proteomics, inflamation_ge_2, 'I2')

# Endpoint label -> selected proteins and the matching target.
test_cases = {
    'F2': {'proteins': proteins_selected_f2, 'y': kleiner_ge_2},
    'F3': {'proteins': proteins_selected_f3, 'y': kleiner_ge_3},
    'S1': {'proteins': proteins_selected_s1, 'y': steatosis_ge_1},
    'I2': {'proteins': proteins_selected_I2, 'y': inflamation_ge_2},
}

# Metrics collected by `cross_validate` and the layout of the repeated
# stratified cross-validation (CV_FOLDS folds, repeated CV_REPEATS times).
scoring = ['precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc']
CV_FOLDS = 5
CV_REPEATS = 10

# ExtraTrees with library defaults: the untuned baseline for this section.
_clf = ensemble.ExtraTreesClassifier()

# Only the F2 endpoint is evaluated here.
proteins_selected = proteins_selected_f2
y = kleiner_ge_2

# Keep the selected protein columns and only those samples that are present
# in both the feature matrix and the target.
_X = data_proteomics[proteins_selected.index]
in_both = y.index.intersection(_X.index)
_X = _X.loc[in_both]
_y = y.loc[in_both]

# `groups` is intentionally not passed: stratified splitters ignore it, so
# supplying the target there had no effect on the splits.
result = cross_validate(
    _clf, X=_X, y=_y,
    cv=RepeatedStratifiedKFold(
        n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0),
    scoring=scoring)

# One-row frame holding the mean of every timing/score column over all
# folds and repeats.
mean_vals = pd.DataFrame(result).mean().to_frame().T

mean_vals

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Search space sampled by the randomized search.
params = {
    'n_estimators': range(10, 200, 10),
    # Starts at 1: an integer `max_features` of 0 is rejected by the
    # estimator, so every candidate drawing it would fail to fit and waste
    # one of the `param_comb` search iterations.
    'max_features': range(1, 10, 1),
    'min_samples_leaf': range(20, 50, 5),
    'min_samples_split': range(15, 36, 5),
}

folds = 10        # number of stratified CV folds per candidate
param_comb = 10   # number of parameter combinations to sample

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# The splitter object is passed as `cv` instead of `skf.split(_X, _y)`:
# the splits are the same, but a generator can only be consumed once, so
# fitting the same search object a second time would find no splits.
random_search = RandomizedSearchCV(
    _clf, param_distributions=params, n_iter=param_comb,
    scoring='roc_auc', n_jobs=4, cv=skf, verbose=3, random_state=1001)

# Run the search on the feature matrix / target prepared in the cell above.
random_search.fit(_X, _y)

In [None]:
# Dump the full cross-validation results of the search, then the best
# estimator it found.
print(random_search.cv_results_)
print('\n Best estimator:', random_search.best_estimator_, sep='\n')

In [None]:
# NOTE(review): this cell repeats the preprocessing of the first
# hyperparameter-tuning cell; presumably so it can be run on its own.

# Thresholds used by the filtering steps of this cell.
CV_COEFFICIENT = 0.3
MIN_N_PROTEIN_GROUPS = 200
DATA_COMPLETENESS = 0.6

# Two-step missing-value filter on the raw table (indexed by Protein ID):
#   1. keep rows with non-missing values in at least DATA_COMPLETENESS of
#      all columns,
#   2. keep columns with at least MIN_N_PROTEIN_GROUPS non-missing values
#      among the remaining rows.
data_plasma_filtered = (
    data_plasma_raw
    .dropna(axis=0, thresh=data_plasma_raw.shape[1] * DATA_COMPLETENESS)
    .dropna(axis=1, thresh=MIN_N_PROTEIN_GROUPS)
)

# Column-wise log2 transform followed by column-wise imputation.
# `log2` and `imputation_normal_distribution` are defined elsewhere in the
# notebook (presumably the latter fills missing values with draws from a
# normal distribution -- TODO confirm).
data_plasma_filtered_log = data_plasma_filtered.apply(log2)
data_plasma_filtered_log_imputed = data_plasma_filtered_log.apply(
    imputation_normal_distribution)

# Sample IDs annotated with Group2 == 'QC', and the matching columns of the
# filtered (not log-transformed, not imputed) table.
qc_plasma = annotation_file_plasma.loc[
    annotation_file_plasma['Group2'] == 'QC', 'Sample ID']
df_qc = data_plasma_filtered[qc_plasma].copy()
def coef_of_variation(x):
    """Return the coefficient of variation of ``x``: ``std(x) / mean(x)``.

    Both statistics are computed with ``np.std`` and ``np.mean`` using
    their default arguments.
    """
    spread = np.std(x)
    centre = np.mean(x)
    return spread / centre


# Coefficient of variation of every row (protein) across the QC columns,
# stored alongside the QC intensities in an extra `cv` column.
proteins_cv = df_qc.apply(coef_of_variation, axis=1)
df_qc = df_qc.assign(cv=proteins_cv)

# Protein IDs whose QC coefficient of variation is below CV_COEFFICIENT.
cv_selected = proteins_cv < CV_COEFFICIENT
qc_30 = df_qc.loc[cv_selected].index

# Name the column axis 'Sample ID' and transpose, so that samples become
# rows and proteins become columns.
df = data_plasma_filtered_log_imputed.copy().rename_axis(
    'Sample ID', axis=1).T

# Keep only the proteins that passed the inter-day/plate QC filter
# (CV below the threshold, i.e. < 30%).
df_30 = df[qc_30]
data_proteomics = df_30

# A single FeatureSelector instance (defined elsewhere in the notebook) is
# fitted once per endpoint; every call receives the feature matrix, the
# endpoint's target and the endpoint label.
feature_selected = FeatureSelector(k=10, protein_gene_data=key_ProteinID)

proteins_selected_f2 = feature_selected.fit(data_proteomics, kleiner_ge_2, 'F2')
proteins_selected_f3 = feature_selected.fit(data_proteomics, kleiner_ge_3, 'F3')
proteins_selected_s1 = feature_selected.fit(data_proteomics, steatosis_ge_1, 'S1')
proteins_selected_I2 = feature_selected.fit(data_proteomics, inflamation_ge_2, 'I2')

# Endpoint label -> selected proteins and the matching target.
test_cases = {
    'F2': {'proteins': proteins_selected_f2, 'y': kleiner_ge_2},
    'F3': {'proteins': proteins_selected_f3, 'y': kleiner_ge_3},
    'S1': {'proteins': proteins_selected_s1, 'y': steatosis_ge_1},
    'I2': {'proteins': proteins_selected_I2, 'y': inflamation_ge_2},
}

# Metrics collected by `cross_validate` and the layout of the repeated
# stratified cross-validation (CV_FOLDS folds, repeated CV_REPEATS times).
scoring = ['precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc']
CV_FOLDS = 5
CV_REPEATS = 10

# ExtraTrees with tuned hyperparameters (presumably the best estimator
# reported by the randomized search above -- TODO confirm).
# Only the non-default arguments are passed. The previous call spelled out
# every constructor argument with its default value, including
# `min_impurity_split`, which was removed in scikit-learn 1.0 and makes the
# constructor raise a TypeError there.
_clf = ensemble.ExtraTreesClassifier(
    n_estimators=90,
    max_features=9,
    min_samples_leaf=25,
    min_samples_split=35,
)

# Only the F2 endpoint is evaluated here.
proteins_selected = proteins_selected_f2
y = kleiner_ge_2

# Keep the selected protein columns and only those samples that are present
# in both the feature matrix and the target.
_X = data_proteomics[proteins_selected.index]
in_both = y.index.intersection(_X.index)
_X = _X.loc[in_both]
_y = y.loc[in_both]

# `groups` is intentionally not passed: stratified splitters ignore it, so
# supplying the target there had no effect on the splits.
result = cross_validate(
    _clf, X=_X, y=_y,
    cv=RepeatedStratifiedKFold(
        n_splits=CV_FOLDS, n_repeats=CV_REPEATS, random_state=0),
    scoring=scoring)

# One-row frame holding the mean of every timing/score column over all
# folds and repeats.
mean_vals = pd.DataFrame(result).mean().to_frame().T

mean_vals