## Some information about this project and the problem we are trying to solve:
### Aim: 
    Investigate whether we could detect liver lesions at a relatively early stage based on plasma proteome profile.
    Liver lesions comprise fibrosis, inflammation and steatosis, listed in decreasing order of clinical importance.
### Study design: 
    1. Cross-sectional
    2. Alcoholic liver disease (ALD, N=459)
    3. Healthy cohort (HP, N=136)
    4. Matching for age, BMI and gender as much as possible 
### Available dataset:
    1. Plasma proteome
    2. Clinical variables (medication, blood measurements, potentially outcome data)
### Problem: 
    1. To distinguish significant fibrosis (F0-1 vs. F2-4) in at risk population (ALD cohort)
    2. To distinguish advanced fibrosis (F0-2 vs. F3-4) in at risk population (ALD cohort)
    3. To distinguish significant inflammation (0-1 vs. 2-5) in at risk population (ALD cohort)
    4. To distinguish presence of steatosis (>33% vs. <33%) in at risk population (ALD cohort) 
  

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from scipy.stats import zscore
import itertools 

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, mean_squared_error, roc_curve, auc, accuracy_score, f1_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from inspect import signature

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import matthews_corrcoef

In [3]:
def convert_to_numeric(data):
    """Return a copy of ``data`` with each column converted to a numeric dtype where possible.

    Columns that ``pd.to_numeric`` cannot parse (for example free text or
    list-valued cells) are returned unchanged. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Same index and column order as ``data``.
    """
    df_new = data.copy()
    for column in df_new.columns:
        try:
            df_new[column] = pd.to_numeric(df_new[column])
        except (ValueError, TypeError):
            # Not a numeric column: keep it as-is. This is the explicit
            # replacement for the deprecated ``errors='ignore'`` option.
            continue
    return df_new

def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is reported when the absolute value of its correlation (as
    computed by ``dataset.corr()``) with any column positioned before it
    exceeds ``threshold``.

    Parameters
    ----------
    dataset : pandas.DataFrame
    threshold : float
        Cut-off applied to the absolute correlation coefficient.

    Returns
    -------
    set
        Column names flagged as correlated.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    # Boolean mask of |r| > threshold, restricted to the strict lower triangle
    # so that each pair is inspected once and the diagonal is ignored.
    exceeds = corr_matrix.abs().to_numpy() > threshold
    below_diagonal = np.tril(exceeds, k=-1)
    flagged_rows = np.flatnonzero(below_diagonal.any(axis=1))
    return {names[row] for row in flagged_rows}

def model_performance_cv(X, y, features_selected, n_repeats, clf=None):
    """Evaluate a feature set with repeated stratified 5-fold cross-validation.

    Each fold is scored by ``model_performance``; the per-fold rows are stacked
    into one frame and rounded to two decimals.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (rows are selected positionally with ``.iloc``).
    y : pandas.Series
        Class labels aligned with ``X``.
    features_selected : list
        Column labels of ``X`` used as predictors.
    n_repeats : int
        Number of repetitions of the 5-fold split (``random_state=0``).
    clf : classifier, optional
        Estimator passed to ``model_performance``. Defaults to the notebook's
        global ``clf_lr`` so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per fold (``5 * n_repeats`` rows) with the columns produced by
        ``model_performance``.
    """
    if clf is None:
        clf = clf_lr

    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=n_repeats, random_state=0)
    fold_results = []
    for train_index, test_index in rskf.split(X, y):
        fold_results.append(
            model_performance(
                clf=clf,
                features=features_selected,
                X_train=X.iloc[train_index],
                X_test=X.iloc[test_index],
                y_train=y.iloc[train_index],
                y_test=y.iloc[test_index],
            )
        )

    # DataFrame.append was removed in pandas 2.0; concatenate all folds once.
    return pd.concat(fold_results).round(2)

def model_performance(clf, features, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the selected features and report train/test performance.

    Parameters
    ----------
    clf : classifier
        Must expose ``fit``, ``predict`` and ``predict_proba``; it is refitted
        in place.
    features : list
        Column labels of ``X_train`` / ``X_test`` used as predictors.
    X_train, X_test : pandas.DataFrame
    y_train, y_test : array-like
        Binary class labels (the confusion matrix is unpacked as 2x2).

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``num_feat``,
        ``train_roc_auc``, ``test_roc_auc``, ``features``, ``precision``,
        ``sensitivity``, ``specificity``, ``F1-score``, ``accuracy`` and
        ``MCC``. Threshold-based metrics are computed on the test set and are
        NaN when their denominator is zero.
    """
    clf.fit(X_train[features], y_train)
    proba_train = clf.predict_proba(X_train[features])
    proba_test = clf.predict_proba(X_test[features])
    y_pred_test = clf.predict(X_test[features])

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test).ravel()

    num_feat = len(features)
    train_roc_auc = roc_auc_score(y_train, proba_train[:, 1])
    test_roc_auc = roc_auc_score(y_test, proba_test[:, 1])

    precision = tp / (tp + fp) if (tp + fp) != 0 else np.nan
    sensitivity = tp / (tp + fn) if (tp + fn) != 0 else np.nan
    specificity = tn / (tn + fp) if (tn + fp) != 0 else np.nan
    # Named f1 (not f1_score) so the sklearn function imported at file level
    # is not shadowed inside this function.
    f1 = (2 * precision * sensitivity / (precision + sensitivity)
          if (precision + sensitivity) != 0 else np.nan)
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else np.nan

    # Matthews correlation coefficient. Use float arithmetic: the product of
    # four int64 marginals can overflow silently for large sample counts.
    tn_f, fp_f, fn_f, tp_f = float(tn), float(fp), float(fn), float(tp)
    mcc_numerator = tp_f * tn_f - fp_f * fn_f
    mcc_denominator = np.sqrt(
        (tp_f + fp_f) * (fn_f + tn_f) * (fp_f + tn_f) * (tp_f + fn_f)
    )
    mcc = mcc_numerator / mcc_denominator if mcc_denominator != 0 else np.nan

    values = [num_feat, train_roc_auc, test_roc_auc,
              features, precision, sensitivity, specificity, f1, accuracy, mcc]
    # Build a one-column frame and transpose it so the list-valued `features`
    # entry stays a single cell of the resulting row.
    result = pd.DataFrame(values).T
    result.columns = ['num_feat',
                      'train_roc_auc',
                      'test_roc_auc',
                      'features',
                      'precision',
                      'sensitivity',
                      'specificity',
                      'F1-score',
                      'accuracy',
                      'MCC']
    return result

def feature_selection_by_rocauc(X_train, y_train, X_test, y_test, features):
    """Rank features by the test-set ROC AUC of a single-feature model.

    The notebook's global ``clf_lr`` is refitted once per feature on the
    training data and scored on the test data.

    Returns
    -------
    pandas.DataFrame
        Indexed by gene name (looked up in the global
        ``IDmapping_UniprotID_to_Genename``), with the columns
        ``roc_auc_score`` and ``features``, sorted by descending AUC.
    """
    def _single_feature_auc(name):
        # Fit on one column only; to_frame() keeps the 2-D shape sklearn expects.
        clf_lr.fit(X_train[name].to_frame(), y_train)
        probabilities = clf_lr.predict_proba(X_test[name].to_frame())
        return roc_auc_score(y_test, probabilities[:, 1])

    auc_per_feature = [_single_feature_auc(name) for name in features]

    ranking = pd.DataFrame(auc_per_feature, columns=['roc_auc_score'])
    ranking['features'] = features
    ranking['Gene names'] = ranking['features'].map(IDmapping_UniprotID_to_Genename)
    ranking = ranking.set_index('Gene names')
    return ranking.sort_values(by='roc_auc_score', ascending=False)

def feature_selection(X, y, features, n_repeats):
    """Score every feature on its own with repeated cross-validation.

    For each feature, ``model_performance_cv`` is run with that single feature
    and the per-fold metrics are averaged.

    Parameters
    ----------
    X : pandas.DataFrame
    y : pandas.Series
    features : iterable
        Column labels of ``X`` to evaluate one at a time.
    n_repeats : int
        Forwarded to ``model_performance_cv``.

    Returns
    -------
    pandas.DataFrame
        One row per feature holding the mean of each numeric metric, plus the
        columns ``feature`` and ``Gene name`` (looked up in the global
        ``IDmapping_UniprotID_to_Genename``). Empty when ``features`` is empty.
    """
    features = list(features)
    rows = []
    for feature in features:
        cv_scores = model_performance_cv(
            X=X, y=y, features_selected=[feature], n_repeats=n_repeats
        )
        # The per-fold frame is object-typed and carries a list-valued
        # 'features' column; a plain .mean() raises on pandas >= 2.0.
        # Drop that column and coerce the metrics to numbers first.
        numeric_scores = (
            cv_scores.drop(columns=['features'], errors='ignore')
            .apply(pd.to_numeric, errors='coerce')
        )
        rows.append(numeric_scores.mean().to_dict())

    result = pd.DataFrame(rows)
    result['feature'] = features
    result['Gene name'] = result['feature'].map(IDmapping_UniprotID_to_Genename)
    return result

def feature_selection_bestcombo(features, X_train, y_train, n_repeats = 5, min_size = 6, max_size = 10):
    """Exhaustively cross-validate every feature combination in a size range.

    Every combination of ``features`` with between ``min_size`` and
    ``max_size`` members (inclusive) is scored with ``model_performance_cv``
    and the per-fold metrics are averaged.

    Parameters
    ----------
    features : iterable
        Candidate column labels of ``X_train``.
    X_train : pandas.DataFrame
    y_train : pandas.Series
    n_repeats : int, default 5
        Forwarded to ``model_performance_cv``.
    min_size, max_size : int, default 6 and 10
        Smallest and largest panel size to evaluate; the defaults reproduce
        the previously hard-coded ``range(6, 11)``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the mean of each numeric metric and a
        ``features`` column holding the combination as a list.

    Raises
    ------
    ValueError
        If no combination can be formed (fewer candidates than ``min_size``).
    """
    features = list(features)
    rows = []
    combo_features = []
    for k in range(min_size, max_size + 1):
        for combo in itertools.combinations(features, k):
            combo = list(combo)
            cv_scores = model_performance_cv(
                X=X_train, y=y_train, features_selected=combo, n_repeats=n_repeats
            )
            # The per-fold frame is object-typed and carries a list-valued
            # 'features' column; a plain .mean() raises on pandas >= 2.0.
            numeric_scores = (
                cv_scores.drop(columns=['features'], errors='ignore')
                .apply(pd.to_numeric, errors='coerce')
            )
            rows.append(numeric_scores.mean().to_dict())
            combo_features.append(combo)

    if not rows:
        raise ValueError(
            'No feature combination to evaluate: got {} candidate feature(s) '
            'for panel sizes {}-{}.'.format(len(features), min_size, max_size)
        )

    result = pd.DataFrame(rows)
    result['features'] = combo_features
    return result

def plot_roc(clf, items, names, colors, X0, y0, X, y, title_1, title_2):
    """Plot one ROC curve per feature set on a shared axis and save the figure.

    For every entry of ``items`` the classifier is refitted on ``(X0, y0)``
    restricted to those features and scored on ``(X, y)``. The legend shows the
    AUC rounded to two decimals. A bootstrap percentile interval of the AUC
    (1000 resamples, seed 0) is computed for each model as well.

    Parameters
    ----------
    clf : classifier
        Must expose ``fit`` and ``predict_proba``; refitted in place.
    items : list of list
        Feature sets, one per curve.
    names : list of str
        Legend label for each entry of ``items``.
    colors : sequence
        Line colour for each entry of ``items``.
    X0, y0 : training data used for fitting.
    X, y : data the curves are evaluated on.
    title_1, title_2 : str
        Shown as a two-line title and used to build the output file name
        ``figures/model/3C/<title_1>_<title_2>.png`` (the directory is not
        created here).

    Returns
    -------
    dict
        ``{name: {'auc': ..., 'ci_lower': ..., 'ci_upper': ...}}``. The interval
        bounds are NaN if no bootstrap resample contained both classes.
        Earlier versions returned ``None``; if the call is the last expression
        of a notebook cell the dict is displayed.
    """
    n_bootstraps = 1000
    rng_seed = 0
    lw = 1

    # Positional indexing below must not depend on the labels of y's index.
    y_true = np.asarray(y)

    fig, ax = plt.subplots(figsize = (4, 4))
    ax.plot([0, 1], [0, 1], color='gray', lw=lw, linestyle='--')

    summary = {}
    for k, selected_features in enumerate(items):
        name = names[k]
        clf.fit(X0[selected_features], y0)
        pos_score = clf.predict_proba(X[selected_features])[:, 1]
        roc_normal = round(float(roc_auc_score(y_true, pos_score)), 2)

        rng = np.random.RandomState(rng_seed)
        scores = []
        for _ in range(n_bootstraps):
            # bootstrap by sampling with replacement on the prediction indices;
            # randint's upper bound is exclusive, so use len() to reach the last sample
            indices = rng.randint(0, len(pos_score), len(pos_score))
            if len(np.unique(y_true[indices])) < 2:
                # we need at least one positive and one negative sample for ROC AUC
                # to be defined: reject the sample
                continue
            scores.append(roc_auc_score(y_true[indices], pos_score[indices]))

        if scores:
            ordered = sorted(scores)
            confidence_lower = ordered[int(0.025 * len(ordered))]
            confidence_upper = ordered[int(0.975 * len(ordered))]
        else:
            confidence_lower = confidence_upper = np.nan
        summary[name] = {'auc': roc_normal,
                         'ci_lower': confidence_lower,
                         'ci_upper': confidence_upper}

        fpr, tpr, _ = roc_curve(y_true, pos_score, pos_label=1)
        ax.plot(fpr, tpr, color=colors[k], lw=lw, label='{}: AUC = {}'.format(name, roc_normal))

    # Decorate and save once, after all curves have been drawn.
    ax.set_xlim([-0.02, 1.02])
    ax.set_ylim([-0.02, 1.02])
    ax.set_xlabel('False positive rate', fontsize=15)
    ax.set_ylabel('True positive rate', fontsize=15)
    ax.tick_params(axis='both', labelsize=15)
    ax.legend(loc="lower right")
    ax.set_title('{}\n{}'.format(title_1, title_2), fontsize = 15)
    fig.savefig('figures/model/3C/{}_{}.png'.format(title_1, title_2),dpi = 120, bbox_inches = 'tight')
    return summary

In [4]:
# Define classifiers
# NOTE: clf_lr is read as a global by model_performance_cv and
# feature_selection_by_rocauc and is passed explicitly to model_performance and
# plot_roc in the cells below. The other three estimators are not referenced in
# the visible part of this notebook.
clf_xgbc = XGBClassifier(n_jobs=8)
clf_rf = RandomForestClassifier(n_estimators=200, random_state=29)
clf_lr = LogisticRegression(random_state=0, solver='liblinear')
clf_lr_mc = LogisticRegression(multi_class='ovr', solver = 'lbfgs')

In [5]:
# Import datasets (all three tables are indexed by 'Sample ID')
# Proteomic dataset after filtering for 70% data completeness, log2 transformed, imputation at lower end of distribution, CV% < 30% (raw intensity before log transformation)
data_ml_proteomics = pd.read_csv('processed/ML/data_ml_proteomics.csv', index_col = 'Sample ID')
# Remove the 'class' and 'fibrosis' columns (presumably outcome labels - confirm) so that
# data_ml_proteomics.columns can be used as the list of candidate protein features below
data_ml_proteomics = data_ml_proteomics.drop(['class', 'fibrosis'], axis = 1)
# Clinical parameter dataset after filtering for 80% data completeness, multivariate imputation
data_cli_46 = pd.read_csv('processed/ML/data_cli_46.csv', index_col = 'Sample ID')
# Combined dataset integrating proteomic- and clinical- datasets 
# (loaded here but not referenced again in the visible part of this notebook)
data_ml_combined = pd.read_csv('processed/ML/data_ml_combined.csv', index_col = 'Sample ID')

In [6]:
# Import matching keys
key_ProteinID = pd.read_csv('processed/ML/ID_matching_key.csv')
# Clinical table; 'Sample ID' is a regular column here (no index_col given)
data_cli = pd.read_csv('processed/ML/df_cli_164.csv')

# Lookup dictionaries; when a key occurs more than once the last occurrence wins
# Protein ID -> gene name (used to label selected features)
IDmapping_UniprotID_to_Genename = dict(zip(key_ProteinID['Protein ID'], key_ProteinID['Gene names']))
# Sample ID -> value of the 'fibrosis' column of data_cli_46
IDmapping_sampleID_to_fibrosis_class = dict(zip(data_cli_46.index, data_cli_46['fibrosis']))
# Sample ID -> value of the 'type' column of data_cli
IDmapping_labtest_subcate = dict(zip(data_cli['Sample ID'], data_cli['type']))
# Sample ID -> value of the 'te' column of data_cli
IDmapping_sampleID_to_te = dict(zip(data_cli['Sample ID'], data_cli['te']))

In [7]:
# Overview of fibrosis score (kleiner) distribution
# Counts are taken over all rows of data_cli; value_counts() leaves out missing scores by default
data_cli['kleiner'].value_counts()

1.0    124
2.0    106
0.5     98
4.0     67
0.0     36
3.0     27
Name: kleiner, dtype: int64

## Ground truth biopsy derived scores: 
    1. Fibrosis:
        kleiner 
    2. Inflammation: 
        nas_inflam
    3. Steatosis:
        nas_steatosis_ordinal
## Diagnostic comparators (existing best-in-class biomarkers): 
    1. Fibrosis:
        transient elastography, 2-dimensional shear wave elastography, ELF test, FibroTest, FIB4 score, APRI score, Forns score, ProC3
    2. Inflammation: 
        M30=caspase-cleaved cytokeratin-18 fragments, M65=total CK18, AST:ALT ratio, ProC3
    3. Steatosis: 
        Controlled attenuation parameter (cap)


In [8]:
# Extract above-mentioned parameters and integrate with proteomics dataset
# col_ml = sample identifier, the three biopsy-derived scores, and the clinical comparator columns
col_ml = ['Sample ID', 'nas_steatosis_ordinal', 'nas_inflam', 'kleiner', 
          'fib4', 'elf', 'ft', 'te', 'swe', 'aar','ast',
          'apri','forns','m30', 'm65', 'meld', 'p3np', 'timp1', 'cap' ]

# Inspect data completeness grouped by cohorts
# (count() reports the number of non-missing values per column and cohort)
data_cli.groupby('group2')[col_ml].count()

Unnamed: 0_level_0,Sample ID,nas_steatosis_ordinal,nas_inflam,kleiner,fib4,elf,ft,te,swe,aar,ast,apri,forns,m30,m65,meld,p3np,timp1,cap
group2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ALD,459,352,352,458,443,380,283,440,362,443,444,444,453,282,280,458,350,350,300
HP,136,0,0,0,0,0,0,136,136,0,119,0,0,0,0,136,0,0,133


In [9]:
# Clinical comparator table indexed by sample
data_cli_sor = data_cli[col_ml].set_index(['Sample ID'])

# Restrict to ALD samples that are present in both the clinical and the proteomic table
ald = data_cli[data_cli['group2'] == 'ALD']['Sample ID']
# .loc no longer accepts a set as indexer (pandas >= 2.0), so materialise the
# intersection as a sorted list
use_rows = sorted(set(data_cli_sor.index) & set(data_ml_proteomics.index) & set(ald))
data_ml_combined_sor = data_ml_proteomics.loc[use_rows].join(other = data_cli_sor.loc[use_rows])
data_ml_combined_sor = data_ml_combined_sor.sort_index(axis = 0)

## Fibrosis

In [10]:
# state-of-the-art (sor) markers: transient elastography, 2-dimensional shear wave elastography, ELF test, FibroTest, FIB4 score, APRI score, Forns score, ProC3
# Only the blood-based markers are used here; the imaging-based 'te' and 'swe'
# are compared separately in the "Fibrosis model compared to imaging methods" section.
#sor_fibrosis_complete = ['te', 'swe', 'elf', 'ft', 'fib4', 'apri', 'forns', 'p3np']
sor_fibrosis = ['elf', 'ft', 'fib4', 'apri', 'forns', 'p3np']
# Number of non-missing values per marker and cohort
data_cli.groupby('group2')[sor_fibrosis].count()

Unnamed: 0_level_0,elf,ft,fib4,apri,forns,p3np
group2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ALD,380,283,443,444,453,350
HP,0,0,0,0,0,0


### Complete dataset across all blood based fibrosis markers

In [11]:
sor_fibrosis = ['elf', 'ft', 'fib4', 'apri', 'forns', 'p3np']
# Keep samples that have every blood-based comparator AND a biopsy score: a
# missing 'kleiner' would otherwise be silently labelled as class 0 below.
# .copy() makes df an independent frame, so adding the label column no longer
# triggers pandas' SettingWithCopyWarning.
df = data_ml_combined_sor.dropna(subset = sor_fibrosis + ['kleiner']).copy()
# Binary target: kleiner > 1 (F2-4) -> 1, otherwise (F0-1) -> 0
df['class_fibrosis'] = np.where(df['kleiner'] > 1, 1, 0)

data = df
X = data.drop(labels = ['kleiner', 'class_fibrosis'], axis = 1)
y = data['class_fibrosis']

# 70/30 split, stratified on the full Kleiner score rather than on the binary label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = data['kleiner'])

# Standardize features; the scaler is fitted on the training split only
scalar = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scalar.transform(X_train), columns = X_train.columns, index = X_train.index)
X_test_scaled = pd.DataFrame(scalar.transform(X_test), columns = X_test.columns, index = X_test.index)

X_train = X_train_scaled
X_test = X_test_scaled

print(X_train.shape, X_test.shape)

(197, 217) (85, 217)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# Class balance of the binary fibrosis label, shown as fractions
df['class_fibrosis'].value_counts(normalize=True)

1    0.503546
0    0.496454
Name: class_fibrosis, dtype: float64

In [13]:
# Score every protein on its own with repeated cross-validation on the training split
score = feature_selection(X = X_train, y = y_train, features = data_ml_proteomics.columns, n_repeats=2)

In [14]:
# Rank proteins by their mean cross-validated test ROC AUC and keep the 10 best as candidates
score = score.sort_values(by = 'test_roc_auc', ascending = False)
score_selected = list(score['feature'][:10])

In [None]:
# Cross-validate every combination of the candidate proteins (panel sizes as set in feature_selection_bestcombo)
result_fibrosis = feature_selection_bestcombo(features = score_selected, X_train = X_train, y_train = y_train)

In [None]:
# Order the evaluated combinations by mean cross-validated test ROC AUC, best first
result_fibrosis = result_fibrosis.sort_values(by='test_roc_auc', ascending=False)
# The top-ranked row holds the winning protein panel
fibrosis_best = list(result_fibrosis['features'].iloc[0])
# Display the panel as gene names
[IDmapping_UniprotID_to_Genename[protein_id] for protein_id in fibrosis_best]

In [None]:
sor_fibrosis = ['elf', 'ft', 'fib4', 'apri', 'forns', 'p3np']

# One single-marker logistic model per comparator, followed by the proteomic panel.
# The feature sets and their display names are built side by side so they cannot
# drift apart, and sor_fibrosis itself is left unmodified.
feature_sets = [[marker] for marker in sor_fibrosis] + [fibrosis_best]
model_names = sor_fibrosis + ['proteomic panel']

# Concatenate the result rows once instead of growing from an empty frame
per_fibrosis = pd.concat([
    model_performance(clf = clf_lr, features = feature_set,
                      X_train = X_train, y_train = y_train,
                      X_test = X_test, y_test = y_test)
    for feature_set in feature_sets
])
per_fibrosis['model'] = model_names
per_fibrosis = per_fibrosis.sort_values(by = 'MCC', ascending = False)
per_fibrosis = convert_to_numeric(per_fibrosis).round(2)
per_fibrosis

In [None]:
# ROC curves on the held-out test set: each blood-based marker alone vs. the proteomic panel
#colors = ['black','gray', 'pink', 'black', 'darkgray', 'gray', 'darkred']
# One colour per curve (7 models)
colors = sns.diverging_palette(220, 10, sep=80, n = 7, center = 'dark')
items = [['elf'], ['ft'], ['fib4'], ['apri'], ['forns'], ['p3np'], fibrosis_best]
names = ['elf', 'ft', 'fib4', 'apri', 'forns', 'p3np', 'proteomic panel']
clf = clf_lr
# Models are fitted on (X0, y0) and evaluated on (X, y)
X0, y0 = X_train, y_train
X, y = X_test, y_test
title_1, title_2 = 'F0-1 vs. F2-4', 'Test set'
plot_roc(clf = clf, items = items, names = names, colors = colors, X0 =X0, y0 = y0, X = X, y = y, title_1 = title_1, title_2 = title_2)

### Fibrosis model compared to imaging methods

In [None]:
# Imaging-based state-of-the-art (sor) markers: transient elastography (te) and 2-dimensional shear wave elastography (swe)
sor_fibrosis2 = ['te', 'swe']
# Number of non-missing values per marker and cohort
data_cli.groupby('group2')[sor_fibrosis2].count()

### Complete dataset across imaging-based comparators

In [None]:
sor_fibrosis2 = ['te', 'swe']
use_cols = data_ml_proteomics.columns.to_list() + sor_fibrosis2

# Keep samples that have both imaging comparators AND a biopsy score.
# Samples without a Kleiner score are dropped rather than filled with -1:
# a filled value would be labelled as class 0 (no significant fibrosis) below.
# .copy() avoids pandas' SettingWithCopyWarning when the label column is added.
df = data_ml_combined_sor.dropna(subset = sor_fibrosis2 + ['kleiner']).copy()
# Binary target: kleiner > 1 (F2-4) -> 1, otherwise (F0-1) -> 0
df['class_fibrosis'] = np.where(df['kleiner'] > 1, 1, 0)

X = df[use_cols]
y = df['class_fibrosis']

# 70/30 split, stratified on the full Kleiner score rather than on the binary label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = df['kleiner'])

# Standardize features; the scaler is fitted on the training split only
scalar = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scalar.transform(X_train), columns = X_train.columns, index = X_train.index)
X_test_scaled = pd.DataFrame(scalar.transform(X_test), columns = X_test.columns, index = X_test.index)

X_train = X_train_scaled
X_test = X_test_scaled

print(X_train.shape, X_test.shape)

In [None]:
# Score every protein on its own with repeated cross-validation on the training split
score2 = feature_selection(X = X_train, y = y_train, features = data_ml_proteomics.columns, n_repeats=2)

In [None]:
# Rank proteins by their mean cross-validated test ROC AUC and keep the 10 best as candidates
score2 = score2.sort_values(by = 'test_roc_auc', ascending = False)
score_selected2 = list(score2['feature'][:10])

In [None]:
# Cross-validate every combination of the candidate proteins (panel sizes as set in feature_selection_bestcombo)
result_fibrosis2 = feature_selection_bestcombo(features = score_selected2, X_train = X_train, y_train = y_train)

In [None]:
# Order the evaluated combinations by mean cross-validated test ROC AUC, best first
result_fibrosis2 = result_fibrosis2.sort_values(by='test_roc_auc', ascending=False)
# The top-ranked row holds the winning protein panel
fibrosis_best2 = list(result_fibrosis2['features'].iloc[0])
# Display the panel as gene names
[IDmapping_UniprotID_to_Genename[protein_id] for protein_id in fibrosis_best2]

In [None]:
sor_fibrosis2 = ['te', 'swe']

# One single-marker logistic model per imaging comparator, followed by the proteomic panel.
# The feature sets and their display names are built side by side so they cannot
# drift apart, and sor_fibrosis2 itself is left unmodified.
feature_sets = [[marker] for marker in sor_fibrosis2] + [fibrosis_best2]
model_names = sor_fibrosis2 + ['proteomic panel']

# Concatenate the result rows once instead of growing from an empty frame.
# NOTE: this reuses (and overwrites) the per_fibrosis table of the blood-marker comparison.
per_fibrosis = pd.concat([
    model_performance(clf = clf_lr, features = feature_set,
                      X_train = X_train, y_train = y_train,
                      X_test = X_test, y_test = y_test)
    for feature_set in feature_sets
])
per_fibrosis['model'] = model_names
per_fibrosis = per_fibrosis.sort_values(by = 'MCC', ascending = False)
per_fibrosis = convert_to_numeric(per_fibrosis).round(2)
per_fibrosis

In [None]:
# ROC curves on the held-out test set: each imaging marker alone vs. the proteomic panel
# Only the first three colours are used (one per entry of items)
colors = ['gray', 'pink', 'darkred', 'black', 'darkgray', 'gray']
#colors = sns.diverging_palette(220, 10, sep=80, n=7, center = 'dark')
items = [['swe'], ['te'], fibrosis_best2]
names = ['swe', 'te', 'proteomic panel']
clf = clf_lr
# Models are fitted on (X0, y0) and evaluated on (X, y)
X0, y0 = X_train, y_train
X, y = X_test, y_test
title_1, title_2 = 'Fibrosis F0-1 vs. F2-4_2', 'Test set'
plot_roc(clf = clf, items = items, names = names, colors = colors, X0 =X0, y0 = y0, X = X, y = y, title_1 = title_1, title_2 = title_2)

In [None]:
# Gene names of the proteins in the selected panel
[IDmapping_UniprotID_to_Genename[i] for i in fibrosis_best2]

# Inflammation

In [None]:
# Number of non-missing values per column of col_ml and cohort
data_cli.groupby('group2')[col_ml].count()

In [None]:
# state-of-the-art (sor) markers: M30=caspase-cleaved cytokeratin-18 fragments, M65=total CK18, AST:ALT ratio, ProC3
# 'nas_inflam' is the biopsy-derived score; it is listed here so its completeness is shown alongside the markers
sor_inflam = ['nas_inflam', 'm30', 'm65', 'aar', 'p3np', 'ast']
# Number of non-missing values per column and cohort
data_cli.groupby('group2')[sor_inflam].count()

### Complete dataset across all inflammation markers

In [None]:
sor_inflam = ['nas_inflam', 'm30', 'm65', 'aar', 'p3np', 'ast']
# Keep samples that have the biopsy score and every comparator marker.
# .copy() makes df an independent frame, so adding the label column does not
# trigger pandas' SettingWithCopyWarning.
df = data_ml_combined_sor.dropna(subset = sor_inflam).copy()
# Binary target: nas_inflam > 1 (2-5) -> 1, otherwise (0-1) -> 0
df['class_inflam'] = np.where(df['nas_inflam'] > 1, 1, 0)

data = df
X = data.drop(labels = ['nas_inflam', 'class_inflam'], axis = 1)
y = data['class_inflam']

# 70/30 split, stratified on the full inflammation score rather than on the binary label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = data['nas_inflam'])

# Standardize features; the scaler is fitted on the training split only
scalar = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scalar.transform(X_train), columns = X_train.columns, index = X_train.index)
X_test_scaled = pd.DataFrame(scalar.transform(X_test), columns = X_test.columns, index = X_test.index)

X_train = X_train_scaled
X_test = X_test_scaled

print(X_train.shape, X_test.shape)

In [None]:
#import warnings
#warnings.simplefilter('ignore')

In [None]:
# Score every protein on its own with repeated cross-validation on the training split,
# then rank by mean test ROC AUC and keep the 10 best as candidates
score = feature_selection(X = X_train, y = y_train, features = data_ml_proteomics.columns, n_repeats=2)
score = score.sort_values(by = 'test_roc_auc', ascending = False)
score_selected = list(score['feature'][:10])

In [None]:
# Cross-validate every combination of the candidate proteins (panel sizes as set in feature_selection_bestcombo)
result_inflammation = feature_selection_bestcombo(features = score_selected, X_train = X_train, y_train = y_train)

In [None]:
# Order the evaluated combinations by mean cross-validated test ROC AUC, best first
result_inflammation = result_inflammation.sort_values(by='test_roc_auc', ascending=False)
# The top-ranked row holds the winning protein panel
inflammation_best = list(result_inflammation['features'].iloc[0])
# Display the panel as gene names
[IDmapping_UniprotID_to_Genename[protein_id] for protein_id in inflammation_best]

In [None]:
sor_inflam = ['m30', 'm65', 'aar', 'p3np', 'ast']

# One single-marker logistic model per comparator, followed by the proteomic panel.
# The feature sets and their display names are built side by side so they cannot
# drift apart, and sor_inflam itself is left unmodified.
feature_sets = [[marker] for marker in sor_inflam] + [inflammation_best]
model_names = sor_inflam + ['proteomic panel']

# Concatenate the result rows once instead of growing from an empty frame
per_inflam = pd.concat([
    model_performance(clf = clf_lr, features = feature_set,
                      X_train = X_train, y_train = y_train,
                      X_test = X_test, y_test = y_test)
    for feature_set in feature_sets
])
per_inflam['model'] = model_names
per_inflam = per_inflam.sort_values(by = 'MCC', ascending = False)
per_inflam = convert_to_numeric(per_inflam).round(2)
per_inflam

In [None]:
# (rows, columns) of the scaled training matrix
X_train.shape

In [None]:
# ROC curves for each inflammation marker alone vs. the proteomic panel.
# NOTE: unlike the fibrosis and steatosis plots, this one is evaluated on the
# TRAINING split (X, y = X_train, y_train), as the 'Train set' title indicates.
# NOTE: the first colors assignment is immediately overwritten by the palette below.
colors = ['gray', 'pink', 'darkred', 'black', 'darkgray', 'gray']
colors = sns.diverging_palette(220, 10, sep=80, n=6, center = 'dark')
items = [['m30'], ['m65'], ['p3np'], ['aar'], ['ast'], inflammation_best]
names = ['m30', 'm65', 'p3np', 'aar', 'ast', 'proteomic panel']
clf = clf_lr
# Models are fitted on (X0, y0) and evaluated on (X, y)
X0, y0 = X_train, y_train
X, y = X_train, y_train
title_1, title_2 = 'NAS_inflam 0-1 vs. 2-5', 'Train set'
plot_roc(clf = clf, items = items, names = names, colors = colors, X0 =X0, y0 = y0, X = X, y = y, title_1 = title_1, title_2 = title_2)

# Steatosis

## state-of-the-art

In [None]:
# Number of non-missing values per column of col_ml and cohort
data_cli.groupby('group2')[col_ml].count()

In [None]:
# state-of-the-art (sor) markers: Controlled attenuation parameter (cap)
sor_steatosis = ['nas_steatosis_ordinal', 'cap']
data_cli.groupby('group2')[sor_steatosis].count()

### Complete dataset across all steatosis markers

In [None]:
sor_steatosis = ['nas_steatosis_ordinal', 'cap']
# Keep samples that have the biopsy score and the comparator marker.
# .copy() makes df an independent frame, so adding the label column does not
# trigger pandas' SettingWithCopyWarning.
df = data_ml_combined_sor.dropna(subset = sor_steatosis).copy()
# Binary target: any steatosis score above 0 -> 1, otherwise 0.
# NOTE(review): the problem statement at the top of the notebook describes a
# ">33% vs. <33%" split; confirm that "> 0" is the intended cut-off.
df['class_steatosis'] = np.where(df['nas_steatosis_ordinal'] > 0, 1, 0)

X = df.drop(labels = ['nas_steatosis_ordinal', 'class_steatosis'], axis = 1)
y = df['class_steatosis']

# 70/30 split, stratified on the ordinal steatosis score rather than on the binary label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = df['nas_steatosis_ordinal'])

# Standardize features; the scaler is fitted on the training split only
scalar = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scalar.transform(X_train), columns = X_train.columns, index = X_train.index)
X_test_scaled = pd.DataFrame(scalar.transform(X_test), columns = X_test.columns, index = X_test.index)

X_train = X_train_scaled
X_test = X_test_scaled

print(X_train.shape, X_test.shape)

In [None]:
# Score every protein on its own with repeated cross-validation on the training split
score = feature_selection(X = X_train, y = y_train, features = data_ml_proteomics.columns, n_repeats=2)

In [None]:
# Rank proteins by their mean cross-validated test ROC AUC and keep the 10 best as candidates
score = score.sort_values(by = 'test_roc_auc', ascending = False)
score_selected = list(score['feature'][:10])

In [None]:
# Cross-validate every combination of the candidate proteins (panel sizes as set in feature_selection_bestcombo)
result_steatosis = feature_selection_bestcombo(features = score_selected, X_train = X_train, y_train = y_train)

In [None]:
# Order the evaluated combinations by mean cross-validated test ROC AUC, best first
result_steatosis = result_steatosis.sort_values(by='test_roc_auc', ascending=False)
# The top-ranked row holds the winning protein panel
steatosis_best = list(result_steatosis['features'].iloc[0])
# Display the panel as gene names
[IDmapping_UniprotID_to_Genename[protein_id] for protein_id in steatosis_best]

In [None]:
sor_steatosis = ['cap']

# One single-marker logistic model per comparator, followed by the proteomic panel.
# The feature sets and their display names are built side by side so they cannot
# drift apart; sor_steatosis is no longer mutated through an alias.
feature_sets = [[marker] for marker in sor_steatosis] + [steatosis_best]
models = sor_steatosis + ['proteomic panel']

# Concatenate the result rows once instead of growing from an empty frame
per_steatosis = pd.concat([
    model_performance(clf = clf_lr, features = feature_set,
                      X_train = X_train, y_train = y_train,
                      X_test = X_test, y_test = y_test)
    for feature_set in feature_sets
])
per_steatosis['model'] = models
per_steatosis = per_steatosis.sort_values(by = 'MCC', ascending = False)
per_steatosis = convert_to_numeric(per_steatosis).round(2)
per_steatosis

In [None]:
# ROC curves on the held-out test set: CAP alone vs. the proteomic panel
colors = ['gray', 'darkred']
#colors = sns.diverging_palette(220, 10, sep=80, n=5, center = 'dark')
items = [['cap'], steatosis_best]
names = ['cap', 'proteomic panel']
clf = clf_lr
# Models are fitted on (X0, y0) and evaluated on (X, y)
X0, y0 = X_train, y_train
X, y = X_test, y_test
title_1, title_2 = 'NAS_steatosis', 'Test set'
plot_roc(clf = clf, items = items, names = names, colors = colors, X0 =X0, y0 = y0, X = X, y = y, title_1 = title_1, title_2 = title_2)