# Fatty Liver Disease (FLD) Study

- alcoholic vs non-alcoholic FLD, short: AFLD vs NAFLD


**Outline**
1. Study based only on clinical imaging (markers)
    1. Steatosis
    2. Inflammation

In [1]:
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.ensemble
import sklearn.linear_model
import sklearn.svm
import xgboost

In [3]:
import src.utils as utils

In [4]:
import os

# Notebook-wide configuration.
# os.cpu_count() returns None when the core count cannot be determined;
# fall back to 1 so arithmetic on CPUS never fails.
CPUS = os.cpu_count() or 1
RANDOMSTATE = 29  # shared seed for stochastic estimators
DATAFOLDER = 'processed/ML'  # folder holding the processed csv files

## Explore datasets

Diagnostic comparator biomarkers (existing best-in-class):
- Fibrosis markers: transient elastography, 2-dimensional shear wave elastography, ELF test, FibroTest, FIB4 score, APRI score, Forns score, ProC3
- Inflammation markers: M30=caspase-cleaved cytokeratin-18 fragments, M65=total CK18, AST:ALT ratio, ProC3
- Steatosis: Controlled attenuation parameter

In [5]:
# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous on newer pandas versions (it also matches a Styler option).
pd.set_option('display.max_columns', 9)

# Sorted so the dropdown order (and therefore the default `index`) does not
# depend on the arbitrary order returned by os.listdir.
files = sorted(file for file in os.listdir(DATAFOLDER) if file.endswith('.csv'))
w_data = widgets.Dropdown(options=files, index=5)


def show_data(file):
    """Load one csv file from DATAFOLDER into the global `data` and preview it.

    Parameters
    ----------
    file : str
        File name relative to DATAFOLDER (the value of the dropdown).

    Side effects
    ------------
    Rebinds the module-level `data`, refreshes the options of `w_cols` when
    that widget already exists, and displays the first rows of the table.
    """
    filename = os.path.join(DATAFOLDER, file)
    global data  # only here to show-case data for report
    try:
        data = pd.read_csv(filename, index_col='Sample ID')
    except ValueError:
        # The file has no 'Sample ID' column: keep the default integer index.
        data = pd.read_csv(filename)
    try:
        w_cols.options = list(data.columns)
    except NameError:
        # The column selector is created in a later cell; nothing to refresh yet.
        pass
    display(data.head())


out = widgets.interactive_output(show_data, controls={'file': w_data})

widgets.VBox([w_data, out])

VBox(children=(Dropdown(index=5, options=('data_cli_16.csv', 'data_cli_46.csv', 'data_cli_96_ml.csv', 'data_ml…

In [20]:
# Possible alternative to the Dropdown: a checkbox widget built from the
# column labels of the currently loaded `data` table.
from src.widgets import multi_checkbox_widget

descriptions = data.columns
multi_checkbox_widget(descriptions)

VBox(children=(Text(value=''), VBox(children=(Checkbox(value=False, description='P02768'), Checkbox(value=Fals…

In [6]:
w_cols = widgets.SelectMultiple(options=list(data.columns))


def show_selected_proteins(columns):
    """Display the columns of the global `data` table that were selected.

    Parameters
    ----------
    columns : sequence of str
        Column labels to show. `interactive_output` passes the current
        value of `w_cols` here. When empty, a hint is printed instead.
    """
    # Use the argument rather than reading the widget again, so the function
    # also works when called directly with an explicit selection.
    if len(columns) > 0:
        display(data[list(columns)])
    else:
        print('Select proteins')


out_sel = widgets.interactive_output(show_selected_proteins, {'columns': w_cols})
widgets.VBox([w_cols, out_sel])

VBox(children=(SelectMultiple(options=('P02768', 'A0A0G2JMB2', 'P01834', 'P02787', 'A0A286YEY1', 'A0A0A0MS08',…

### Load Complete clinical data

In [7]:
# File names (relative to DATAFOLDER) and the index column used for the tables.
PROTEOM = 'data_ml_proteomics.csv'
CLINICAL = 'df_cli_164.csv'
COL_ID = 'Sample ID'

# Read the clinical table, indexed by COL_ID, and preview the first rows.
f_data_clinic = os.path.join(DATAFOLDER, CLINICAL)
data_cli = pd.read_csv(
    f_data_clinic,
    index_col=COL_ID,
)
data_cli.head()

Unnamed: 0_level_0,age,kleiner,cpa,nas,...,group,group2,type,fibrosis_class
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Plate6_G11,64,3.0,9.216155,0.0,...,ALD,ALD,Compensated advanced fibrosis,F3-4
Plate1_F7,74,2.0,4.874057,3.0,...,ALD,ALD,Steatohepatitis,F2
Plate6_D2,71,1.0,3.199472,3.0,...,ALD,ALD,Steatohepatitis,F0-1
Plate6_C5,53,2.0,4.762666,5.0,...,ALD,ALD,Steatohepatitis,F2
Plate4_F8,63,4.0,13.702832,4.0,...,ALD,ALD,Compensated advanced fibrosis,F3-4


In [8]:
w_cols_cli = widgets.SelectMultiple(options=list(data_cli.columns))


def show_selected_markers(columns):
    """Display the selected clinical markers and their summary statistics.

    Parameters
    ----------
    columns : sequence of str
        Column labels of `data_cli` to show. `interactive_output` passes the
        current value of `w_cols_cli` here. When empty, a hint is printed.
    """
    # Use the argument rather than reading the widget again, and slice once.
    if len(columns) > 0:
        selected = data_cli[list(columns)]
        display(selected)
        display(selected.describe())
    else:
        print('Select clinical markers')


out_cli = widgets.interactive_output(show_selected_markers, {'columns': w_cols_cli})
widgets.VBox([w_cols_cli, out_cli])

VBox(children=(SelectMultiple(options=('age', 'kleiner', 'cpa', 'nas', 'nas_inflam', 'nas_portinflam', 'nas_lo…

### Selected Clinical markers

Diagnostic comparator biomarkers (existing best-in-class):
- Fibrosis markers: 
    - transient elastography
    - 2-dimensional shear wave elastography
    - ELF test
    - FibroTest
    - FIB4 score
    - APRI score
    - Forns score
    - ProC3
- Inflammation markers:
    - M30=caspase-cleaved cytokeratin-18 fragments
    - M65=total CK18
    - AST:ALT ratio
    - ProC3
- Steatosis: Controlled attenuation parameter

In [10]:
# Clinical columns selected for the machine-learning comparison.
FEATURES_ML = [
    'nas_steatosis_ordinal', 'nas_inflam', 'kleiner',
    'fib4', 'elf', 'ft', 'te', 'swe', 'aar', 'ast',
    'apri', 'forns', 'm30', 'm65', 'meld', 'p3np', 'timp1', 'cap',
]
data_cli[FEATURES_ML].head()

Unnamed: 0_level_0,nas_steatosis_ordinal,nas_inflam,kleiner,fib4,...,meld,p3np,timp1,cap
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Plate6_G11,0.0,0.0,3.0,1.38,...,7.0,4.8,230.6,
Plate1_F7,1.0,2.0,2.0,8.22,...,12.0,7.8,318.7,
Plate6_D2,1.0,2.0,1.0,2.22,...,8.0,8.8,276.4,
Plate6_C5,2.0,3.0,2.0,32.700001,...,10.0,20.9,549.2,
Plate4_F8,0.0,4.0,4.0,1.66,...,7.0,22.0,545.6,


In [11]:
# Number of non-missing values per selected feature, split by `group2`.
(
    data_cli
    .groupby('group2')[FEATURES_ML]
    .count()
)

Unnamed: 0_level_0,nas_steatosis_ordinal,nas_inflam,kleiner,fib4,...,meld,p3np,timp1,cap
group2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ALD,352,352,458,443,...,458,350,350,300
HP,0,0,0,0,...,136,0,0,133


### Load proteome data

In [12]:
# Read the proteomics table (indexed by COL_ID) and display it.
f_data_proteom = os.path.join(DATAFOLDER, PROTEOM)
data_ml_proteomics = pd.read_csv(
    f_data_proteom,
    index_col=COL_ID,
)
data_ml_proteomics

Unnamed: 0_level_0,P02768,A0A0G2JMB2,P01834,P02787,...,Q6YHK3,Q16270,class,fibrosis
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Plate1_A2,29.502126,25.961794,25.756107,25.064482,...,9.321622,12.540081,0,hp
Plate1_A3,29.606015,24.044912,25.428379,25.283375,...,11.971725,10.922941,0,hp
Plate1_A4,29.253610,25.271068,25.260238,24.806825,...,11.011772,10.988168,1,F2
Plate1_A5,29.488557,24.080138,24.880090,25.086008,...,9.748172,11.065879,1,F0-1
Plate1_A6,29.626314,24.419439,25.275982,24.991902,...,10.960807,11.271831,1,F0-1
...,...,...,...,...,...,...,...,...,...
Plate7_C10,29.560172,25.756191,25.474991,25.167589,...,11.923557,11.682579,1,F0-1
Plate7_C11,29.525346,25.263328,25.317142,24.966874,...,9.456483,12.132759,1,F0-1
Plate7_C12,29.570570,25.532301,25.410847,25.178165,...,10.700540,11.203767,1,F0-1
Plate7_D1,29.618001,25.240001,25.656683,24.951814,...,11.664310,12.026582,0,hp


In [13]:
# Cross-tabulate the fibrosis label stored with the proteomics data against
# the fibrosis class of the clinical data, including row/column totals.
pd.crosstab(
    index=data_ml_proteomics['fibrosis'],
    columns=data_cli['fibrosis_class'],
    margins=True,
    dropna=False,
)

fibrosis_class,F0-1,F2,F3-4,All
fibrosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F0-1,256,0,0,256
F2,0,106,0,106
F3-4,0,0,93,93
All,256,106,93,591


Four samples in the clinical data are not present in the proteome data (listed below).

##### 

In [14]:
# Sample IDs that occur in the clinical table but not in the proteomics table.
data_cli.index.difference(data_ml_proteomics.index)

Index(['Plate1_E1', 'Plate4_E3', 'Plate5_D7', 'Plate7_B6'], dtype='object', name='Sample ID')

In [15]:
# Clinical columns to preview.
FEATURES_CLINIC = [
    'ggt', 'alt', 'ast',
    'alk', 'mcv', 'iga',
    'igg', 'leu', 'glc',
]
data_cli[FEATURES_CLINIC].head()

Unnamed: 0_level_0,ggt,alt,ast,alk,mcv,iga,igg,leu,glc
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Plate6_G11,86.0,18,27.0,87,88.0,2.15,11.8,9.73,6.7
Plate1_F7,99.0,23,57.0,112,,,,,5.1
Plate6_D2,172.0,15,26.0,79,84.0,2.28,8.7,10.7,6.4
Plate6_C5,1816.0,72,267.0,290,109.5,9.26,18.0,7.6,6.1
Plate4_F8,166.0,25,37.0,127,80.0,6.1,9.7,16.9,6.1


### Load Protein GeneID Mapping

- UniProtID to Gene name mapping


In [16]:
# Mapping table indexed by "Protein ID"; the "Unnamed: 0" column read from
# the csv file is discarded.
key_ProteinID = (
    pd.read_csv(os.path.join(DATAFOLDER, 'ID_matching_key.csv'), index_col="Protein ID")
    .drop(columns="Unnamed: 0")
)
key_ProteinID.head()

Unnamed: 0_level_0,Gene names
Protein ID,Unnamed: 1_level_1
A0A024R6I7,SERPINA1
A0A075B6I0,IGLV8-61
A0A075B6J9,IGLV2-18
A0A075B6R9,IGKV2D-24
A0A075B6S2,IGKV2D-29


## Fibrosis

- state-of-the-art (sor) markers: 
  - transient elastography (te)
  - 2-dimensional shear wave elastography (swe)
  - ELF test (elf)
  - FibroTest (ft)
  - FIB4 score (fib4)
  - APRI score (apri)
  - Forns score (forns)
  - ProC3 (p3np)

In [17]:
# Full marker list, kept for reference (currently disabled):
# sor_fibrosis = ['te', 'swe', 'elf', 'ft', 'fib4', 'apri', 'forns', 'p3np']
# Active list, without 'te' and 'swe':
sor_fibrosis = [
    'elf', 'ft', 'fib4',
    'apri', 'forns', 'p3np',
]
# Non-missing values per marker, split by `group2`.
data_cli.groupby('group2')[sor_fibrosis].count()

Unnamed: 0_level_0,elf,ft,fib4,apri,forns,p3np
group2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ALD,380,283,443,444,453,350
HP,0,0,0,0,0,0


## Classifiers

- Select Classifier by cross-validation using [sklearn functionality](https://scikit-learn.org/stable/model_selection.html#model-selection)

In [108]:
# Define classifiers
# Every estimator that takes a seed uses the shared RANDOMSTATE so that a
# rerun of the notebook reproduces the same models.
# n_jobs: leave one core free, but never request fewer than one worker
# (CPUS - 1 would be 0 on a single-core machine and fail if CPUS is None).
clf_xgbc = xgboost.XGBClassifier(
    n_jobs=max(1, (CPUS or 1) - 1),
    random_state=RANDOMSTATE,
)
clf_rf = sklearn.ensemble.RandomForestClassifier(
    n_estimators=200,
    random_state=RANDOMSTATE,
)
clf_lr = sklearn.linear_model.LogisticRegression(
    random_state=RANDOMSTATE,
    solver='liblinear',
)
clf_svm = sklearn.svm.SVC(kernel='linear', C=1)

# Display name -> estimator; iterated by the cross-validation cell.
clf_dict = {
    'xgboost': clf_xgbc,
    'RF': clf_rf,
    'Logistic': clf_lr,
    'SVM': clf_svm,
}

In [79]:
# Binary label used as prediction target.
target = data_ml_proteomics['class']
# Feature matrix: every column except the two label columns. Dropping them by
# name (instead of slicing off the last two positions) keeps this correct
# even if the column order of the csv file changes.
X = data_ml_proteomics.drop(columns=['class', 'fibrosis'])

In [88]:
# Disabled sanity check: compares the `class` target with an indicator that is
# 1 where the clinical `group2` column equals 'ALD'.
# NOTE(review): `shared` intersects target's index with the index of the frame
# `target` was taken from, so it is just target.index; intersecting with
# `disease.index` was presumably intended - confirm before re-enabling.
# # sanity check
# disease = (data_cli['group2'] == 'ALD').astype('int64')
# shared = target.index.intersection(data_ml_proteomics.index)
# target.equals(disease.loc[shared])

True

In [95]:
from sklearn.model_selection import cross_validate

# Metrics computed for every cross-validation fold.
# TODO: how to customize cutoff?
scoring = [
    'precision',
    'recall',
    'f1',
    'balanced_accuracy',
    'roc_auc',
]

In [107]:
# 5-fold cross-validation of a single classifier; one row per fold.
clf = clf_xgbc
cv_result = pd.DataFrame(
    cross_validate(clf, X, y=target, cv=5, scoring=scoring)
)
cv_result

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_balanced_accuracy,test_roc_auc
0,0.310585,0.012983,0.884211,0.923077,0.903226,0.76511,0.890502
1,0.219516,0.025767,0.87,0.956044,0.910995,0.737281,0.893773
2,0.228512,0.010998,0.876289,0.934066,0.904255,0.744811,0.866504
3,0.253998,0.015044,0.834951,0.945055,0.886598,0.657713,0.855515
4,0.222514,0.010998,0.849462,0.868132,0.858696,0.674807,0.840863


In [124]:
# Run the same 5-fold cross-validation for every classifier in `clf_dict`,
# storing the raw per-fold results under the classifier's display name.
cv_results = dict()
for key, clf in clf_dict.items():
    fold_scores = cross_validate(clf, X, y=target, cv=5, scoring=scoring)
    cv_results[key] = fold_scores

In [125]:
# Average every metric over the folds: one row per classifier, one column per
# metric. The means are computed directly instead of via DataFrame.applymap,
# which is deprecated in recent pandas releases.
cv_means = pd.DataFrame.from_dict(
    {
        name: {metric: np.mean(values) for metric, values in result.items()}
        for name, result in cv_results.items()
    },
    orient='index',
)
cv_means

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_balanced_accuracy,test_roc_auc
xgboost,0.252407,0.011901,0.862983,0.925275,0.892754,0.715944,0.869431
RF,0.593981,0.039192,0.815132,0.971429,0.885801,0.613757,0.885517
Logistic,0.027993,0.006851,0.889631,0.874725,0.881588,0.753236,0.878449
SVM,0.036703,0.013399,0.880168,0.843956,0.861017,0.727269,0.850584


## Versions

In [None]:
# Report the versions of the libraries the analysis depends on, using plain
# Python so the cell works without a shell or `grep`.
for module in (pd, np, sklearn, xgboost, widgets):
    print(f'{module.__name__}=={module.__version__}')