# Fatty Liver Disease (FLD) Study

- alcoholic vs non-alcoholic FLD, short: AFLD vs NAFLD


**Outline**

1. Study on liver disease types:
    1. Fibrosis
    1. Steatosis
    2. Inflammation
    
2. Two data sets with 
    1. clinical markers
    2. proteome information

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model
import sklearn.ensemble
import xgboost
import ipywidgets as widgets

In [13]:
import src.utils as utils

In [14]:
import os
CPUS = os.cpu_count()
RANDOMSTATE = 29
DATAFOLDER = 'processed/ML'

# Explore datasets

Diagnostic comparators (existing best-in-class) biomarkers
- Fibrosis markers: transient elastography, 2-dimensional shear wave elastography, ELF test, FibroTest, FIB4 score, APRI score, Forns score, ProC3
- Inflammation markers: M30=caspase-cleaved cytokeratin-18 fragments, M65=total CK18, AST:ALT ratio, ProC3
- Steatosis: Controlled attenuation parameter

In [15]:
pd.set_option('max_columns', 9)

files = [file for file in os.listdir(DATAFOLDER) if '.csv' in file]
w_data = widgets.Dropdown(options=files, index=5)

def show_data(file):
    filename = os.path.join(DATAFOLDER, file)
    global data # only here to show-case data for report
    try:
        data = pd.read_csv(filename, index_col='Sample ID')
    except:
        data = pd.read_csv(filename)
    try:
        w_cols.options = list(data.columns)
    except:
        pass
    display(data.head())
out = widgets.interactive_output(show_data, controls={'file':w_data})

widgets.VBox([w_data, out])

VBox(children=(Dropdown(index=5, options=('data_cli_16.csv', 'data_cli_46.csv', 'data_cli_96_ml.csv', 'data_ml…

In [16]:
# # Possible Alternative for DropDown 
# from src.widgets import multi_checkbox_widget

# descriptions=data.columns
# w_cols = multi_checkbox_widget(descriptions)
# w_cols

In [17]:
w_cols = widgets.SelectMultiple(options=list(data.columns))

def show_selected_proteins(columns):
    if len(columns)> 0:
        display(data[list(w_cols.value)])
        print(data[list(w_cols.value)].describe())
    else:
        print('Select proteins')

out_sel = widgets.interactive_output(show_selected_proteins, {'columns': w_cols})
widgets.VBox([w_cols, out_sel])

VBox(children=(SelectMultiple(options=('P02768', 'A0A0G2JMB2', 'P01834', 'P02787', 'A0A286YEY1', 'A0A0A0MS08',…

## Proteomics data
### Load Complete proteomics data

In [18]:
def extract_datamatrix_DIA(Report):
    data = Report.copy()
    for i in ['PG.ProteinAccessions', 'PG.Genes']:
        data[i]=data[i].str.split(';').str[0]
    data=data.rename({'PG.Genes': 'Gene names', 'PG.ProteinAccessions': 'Protein ID'}, axis=1)
    data=data.replace({'Filtered': np.float64('NaN')})
    return data

def convert_to_numeric(data):
    df = data.copy()
    columns = df.columns
    df_new = pd.DataFrame(columns = columns)
    for i in columns:
        new_values = pd.to_numeric(df[i], errors = 'ignore')
        df_new[i] = new_values
    return df_new

def imputation_normal_distribution(df):
    data_imputed = df.copy()
    for i in data_imputed.loc[:, data_imputed.isnull().any()]:
        missing = data_imputed[i].isnull()
        std = data_imputed[i].std()
        mean = data_imputed[i].mean()
        sigma = std*0.3
        mu = mean - (std*1.8)
        data_imputed.loc[missing, i] = np.random.normal(mu, sigma, size=len(data_imputed[missing]))
        
    return data_imputed

In [19]:
annotation_file = pd.read_csv('annotation/Experiment annotation file.csv', index_col = [0])
annotation_file_plasma = annotation_file[annotation_file['Sample type'] == 'Plasma']

In [20]:
Report_plasma = pd.read_csv('raw/proteomics/plasma/20190620_210717_20190620_P0000005_Lili2Klibrary_Report.csv')
experimental_columns = annotation_file_plasma['Sample ID']
data_plasma_raw = extract_datamatrix_DIA(Report_plasma)
data_plasma_raw.drop(data_plasma_raw.filter(regex='StrippedSequences').columns, axis=1, inplace = True)
data_plasma_raw = data_plasma_raw.rename(mapper = dict(zip(annotation_file['File name'], annotation_file['Sample ID'])), axis=1)
IDmapping_UniprotID_to_Genename = dict(zip(data_plasma_raw['Protein ID'], data_plasma_raw['Gene names']))

data_plasma_raw = data_plasma_raw.set_index('Protein ID').drop('Gene names', axis = 1)
# Filter at protein level for 70% data completeness across all runs
data_plasma_filtered = data_plasma_raw.dropna(thresh = data_plasma_raw.shape[1] * 0.7)
# Filter at sample level for total quantified protein groups above 200
cols_sel = [i for i in data_plasma_filtered.columns if data_plasma_filtered[i].count() > 200]
data_plasma_filtered = data_plasma_filtered[cols_sel]
data_plasma_filtered = convert_to_numeric(data_plasma_filtered)
data_plasma_filtered_log = np.log2(data_plasma_filtered)
data_plasma_filtered_log_imputed = imputation_normal_distribution(data_plasma_filtered_log.T).T

In [21]:
qc_plasma = annotation_file_plasma[annotation_file_plasma['Group2'] == 'QC']['Sample ID']
df_qc = data_plasma_filtered.copy()[qc_plasma]
func = lambda x: np.std(x) / np.mean(x)
var = df_qc.apply(func, axis = 1)
df_qc = df_qc.assign(var = var)
qc_30 = df_qc[df_qc['var'] < 0.3].index

df = data_plasma_filtered_log_imputed.copy()
df = df.rename_axis('Sample ID', axis=1).T
# filter proteins for CV < 30% of the inter-day/plate quality assessment 
df_30 = df[qc_30]
data_proteomics = df_30.copy()

## Clinical data
### Load Complete clinical data

In [22]:
PROTEOM  = 'data_ml_proteomics.csv'
CLINICAL = 'df_cli_164.csv'
COL_ID = 'Sample ID'
f_data_clinic = os.path.join(DATAFOLDER, CLINICAL)
data_cli = pd.read_csv(f_data_clinic, index_col=COL_ID)
data_cli.head()

Unnamed: 0_level_0,age,kleiner,cpa,nas,...,group,group2,type,fibrosis_class
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Plate6_G11,64,3.0,9.216155,0.0,...,ALD,ALD,Compensated advanced fibrosis,F3-4
Plate1_F7,74,2.0,4.874057,3.0,...,ALD,ALD,Steatohepatitis,F2
Plate6_D2,71,1.0,3.199472,3.0,...,ALD,ALD,Steatohepatitis,F0-1
Plate6_C5,53,2.0,4.762666,5.0,...,ALD,ALD,Steatohepatitis,F2
Plate4_F8,63,4.0,13.702832,4.0,...,ALD,ALD,Compensated advanced fibrosis,F3-4


In [23]:
w_cols_cli = widgets.SelectMultiple(options=list(data_cli.columns))

def show_selected_markers(columns):
    if len(columns)> 0:
        display(data_cli[list(w_cols_cli.value)])
        display(data_cli[list(w_cols_cli.value)].describe())
    else:
        print('Select clinical markers')

out_cli = widgets.interactive_output(show_selected_markers, {'columns': w_cols_cli})
widgets.VBox([w_cols_cli, out_cli])

VBox(children=(SelectMultiple(options=('age', 'kleiner', 'cpa', 'nas', 'nas_inflam', 'nas_portinflam', 'nas_lo…

### Selected Clinical markers

Diagnostic comparators (existing best-in-class) biomarkers
- state-of-the-art (**sor**) Fibrosis markers: 
    - `te`: transient elastography (sona liver scan)
    - `swe`: 2-dimensional shear wave elastography
    - `elf`: ELF test
    - `ft`: FibroTest
    - `fib4`: FIB4 score
    - `apri`: APRI score
    - `forns`: Forns score
    - `p3np`: ProC3
- Inflammation markers:
    - M30=caspase-cleaved cytokeratin-18 fragments
    - M65=total CK18
    - AST:ALT ratio
    - ProC3
- Steatosis: Controlled attenuation parameter

In [21]:
#sor_fibrosis = ['te', 'swe', 'elf', 'ft', 'fib4', 'apri', 'forns', 'p3np']
sor_fibrosis = ['elf', 'ft', 'fib4', 'apri', 'forns', 'p3np']
data_cli.groupby('kleiner')[sor_fibrosis].count()

Unnamed: 0_level_0,elf,ft,fib4,apri,forns,p3np
kleiner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,35,33,36,36,36,35
0.5,30,15,90,90,96,30
1.0,120,93,120,121,123,113
2.0,104,76,104,104,103,95
3.0,27,17,26,26,27,22
4.0,63,49,66,66,67,54


In [22]:
pd.set_option('max_columns', 20)
FEATURES_ML = ['nas_steatosis_ordinal', 'nas_inflam', 'kleiner', 
          'fib4', 'elf', 'ft', 'te', 'swe', 'aar','ast',
          'apri','forns','m30', 'm65', 'meld', 'p3np', 'timp1', 'cap' ]
data_cli[FEATURES_ML].head()

Unnamed: 0_level_0,nas_steatosis_ordinal,nas_inflam,kleiner,fib4,elf,ft,te,swe,aar,ast,apri,forns,m30,m65,meld,p3np,timp1,cap
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Plate6_G11,0.0,0.0,3.0,1.38,8.7,0.29,9.3,,1.5,27.0,0.2,4.75,106.672,293.25601,7.0,4.8,230.6,
Plate1_F7,1.0,2.0,2.0,8.22,9.8,0.77,,,2.48,57.0,1.18,8.82,161.834,438.918,12.0,7.8,318.7,
Plate6_D2,1.0,2.0,1.0,2.22,9.6,0.34,8.7,13.1,1.73,26.0,0.27,6.65,84.621803,502.39999,8.0,8.8,276.4,
Plate6_C5,2.0,3.0,2.0,32.700001,11.3,0.83,19.1,,3.71,267.0,11.63,11.88,668.94098,1432.85,10.0,20.9,549.2,
Plate4_F8,0.0,4.0,4.0,1.66,11.3,0.52,75.0,28.0,1.48,37.0,0.29,5.7,491.866,857.375,7.0,22.0,545.6,


In [23]:
data_cli.groupby('group2')[FEATURES_ML].count()

Unnamed: 0_level_0,nas_steatosis_ordinal,nas_inflam,kleiner,fib4,elf,ft,te,swe,aar,ast,apri,forns,m30,m65,meld,p3np,timp1,cap
group2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ALD,352,352,458,443,380,283,440,362,443,444,444,453,282,280,458,350,350,300
HP,0,0,0,0,0,0,136,136,0,119,0,0,0,0,136,0,0,133


### Load proteome data

In [24]:
pd.set_option('max_column', 12)
f_data_proteom = os.path.join(DATAFOLDER, PROTEOM)
data_ml_proteomics = pd.read_csv(f_data_proteom, index_col=COL_ID )
data_ml_proteomics

Unnamed: 0_level_0,P02768,A0A0G2JMB2,P01834,P02787,A0A286YEY1,A0A0A0MS08,...,P11597,Q9Y5C1,Q6YHK3,Q16270,class,fibrosis
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Plate1_A2,29.502126,25.961794,25.756107,25.064482,25.675378,24.797298,...,10.768718,12.370819,9.321622,12.540081,0,hp
Plate1_A3,29.606015,24.044912,25.428379,25.283375,23.735519,24.876456,...,10.998392,12.929078,11.971725,10.922941,0,hp
Plate1_A4,29.253610,25.271068,25.260238,24.806825,24.940455,24.681722,...,11.243810,11.646316,11.011772,10.988168,1,F2
Plate1_A5,29.488557,24.080138,24.880090,25.086008,24.209744,24.460491,...,11.261748,12.144853,9.748172,11.065879,1,F0-1
Plate1_A6,29.626314,24.419439,25.275982,24.991902,24.315623,25.383452,...,10.971349,12.777709,10.960807,11.271831,1,F0-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate7_C10,29.560172,25.756191,25.474991,25.167589,25.417420,25.334500,...,12.255364,12.622469,11.923557,11.682579,1,F0-1
Plate7_C11,29.525346,25.263328,25.317142,24.966874,24.845308,25.999554,...,12.451619,12.772336,9.456483,12.132759,1,F0-1
Plate7_C12,29.570570,25.532301,25.410847,25.178165,24.698417,25.710095,...,10.542005,11.199029,10.700540,11.203767,1,F0-1
Plate7_D1,29.618001,25.240001,25.656683,24.951814,24.533916,26.078917,...,12.415602,12.453085,11.664310,12.026582,0,hp


All "healthy" patients have no fibrosis score, but the prevalence in the general population of fibrosis is between 6-7% this could be a source of confounding.

In [25]:
fibrosis =   data_ml_proteomics.fibrosis.fillna("NA")   # ML Data
fibrosis_class = data_cli.kleiner.fillna("NA")

pd.crosstab(
index=fibrosis,
columns=fibrosis_class,
margins=True
)

kleiner,0.0,0.5,1.0,2.0,3.0,4.0,NA,All
fibrosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F0-1,35,97,124,0,0,0,0,256
F2,0,0,0,106,0,0,0,106
F3-4,0,0,0,0,27,66,0,93
hp,0,0,0,0,0,0,136,136
All,35,97,124,106,27,66,136,591


5 plates of clinical data are not present in proteom data.

In [26]:
data_cli.fibrosis_class.index.difference(data_ml_proteomics.fibrosis.index)

Index(['Plate1_E1', 'Plate4_E3', 'Plate5_D7', 'Plate7_B6'], dtype='object', name='Sample ID')

## Proteome data

Questions:
- How to map a new sample to the protein-groups in the training data?

### Load Protein GeneID Mapping

- UniProtID to Gene name mapping


#### Comment:
- sorry, I didn't understand the question?

In [27]:
key_ProteinID = pd.read_csv(os.path.join(DATAFOLDER, 'ID_matching_key.csv'), 
                            index_col="Protein ID").drop("Unnamed: 0", axis=1)
key_ProteinID.head()

Unnamed: 0_level_0,Gene names
Protein ID,Unnamed: 1_level_1
A0A024R6I7,SERPINA1
A0A075B6I0,IGLV8-61
A0A075B6J9,IGLV2-18
A0A075B6R9,IGKV2D-24
A0A075B6S2,IGKV2D-29


In [28]:
key_ProteinID.loc['P35858']

Gene names    IGFALS
Name: P35858, dtype: object

### Impute missing features of clinical data:

Using [`sklearn.impute.simpleImputer`](https://scikit-learn.org/stable/modules/impute.html)'s default `'mean'` strategy. 
Alternatively one could replace missing values with zeros on the standardised data to zero mean and standard deviation of one.

In [29]:
FEATURES_CLINIC = ['ggt', 'alt', 'ast', 'alk', 'mcv', 'iga', 'igg', 'leu', 'glc']
data_cli[FEATURES_CLINIC].head()

Unnamed: 0_level_0,ggt,alt,ast,alk,mcv,iga,igg,leu,glc
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Plate6_G11,86.0,18,27.0,87,88.0,2.15,11.8,9.73,6.7
Plate1_F7,99.0,23,57.0,112,,,,,5.1
Plate6_D2,172.0,15,26.0,79,84.0,2.28,8.7,10.7,6.4
Plate6_C5,1816.0,72,267.0,290,109.5,9.26,18.0,7.6,6.1
Plate4_F8,166.0,25,37.0,127,80.0,6.1,9.7,16.9,6.1


In [24]:
#ToDo

## Classifiers

- Select Classifier by cross-validation using [sklearn functionality](https://scikit-learn.org/stable/model_selection.html#model-selection)

In [30]:
# Define classifiers
clf_xgbc  = xgboost.XGBClassifier(n_jobs=CPUS-1)
clf_rf    = sklearn.ensemble.RandomForestClassifier(n_estimators=200, random_state=RANDOMSTATE)
clf_lr    = sklearn.linear_model.LogisticRegression(random_state=0, solver='liblinear')
clf_svm   = sklearn.svm.SVC(kernel='linear', C=1)
clf_dict = {'xgboost': clf_xgbc,
           'RF': clf_rf,
           'Logistic': clf_lr,
           'SVM': clf_svm,
           }

In [26]:
clf_rf.fit?

[0;31mSignature:[0m [0mclf_rf[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mX[0m[0;34m,[0m [0my[0m[0;34m,[0m [0msample_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Build a forest of trees from the training set (X, y).

Parameters
----------
X : array-like or sparse matrix of shape (n_samples, n_features)
    The training input samples. Internally, its dtype will be converted
    to ``dtype=np.float32``. If a sparse matrix is provided, it will be
    converted into a sparse ``csc_matrix``.

y : array-like of shape (n_samples,) or (n_samples, n_outputs)
    The target values (class labels in classification, real numbers in
    regression).

sample_weight : array-like of shape (n_samples,), default=None
    Sample weights. If None, then samples are equally weighted. Splits
    that would create child nodes with net zero or negative weight are
    ignored while searching for a split in each node. In the case of
    classification, splits are

## Visualization of data

Look at UMAPs with labels from disease categories.
  - Does the assigned disease correspond to certain groups
 
For clinical data, on could look at a selection of scatter plots in order to see if it is feasible to separate some groups based on two features.

# Models

Different _experimental_ setups for prediction models will be compared. First, for the target **fibrosis**. Fibrosis is reported on a five-point scale from stage F0 to F4.

ML setup binary    | HP  | F0  | F1  | F2  | F3  | F4
--- | --- | ---    | --- | --- | --- | ---
HP-F0-F2 vs F3-F4  | c   | c   | c   | c   | t   | t    
F0-F2 vs F3-F4 (advanced)    |     | c   | c   | c   | t   | t
F0-F1 vs F2-F4 (significant)    |     | c   | c   | t   | t   | t

In the table, c stands for control  and t for target. The clinical relevance is to distinguish different 
stages of disease. The question is wheater one should include a healthy, untested patient cohort can help building a 
classification model, as e.g. for fibrosis the general prevalence in the population is between 6 to 7 percent. Alternatively a _multi-task model_ with having 5 classes/end-points can be fit.


In addition to fibrosis, the endpoints **steatosis** and **inflamation** can be predicted.

target      | Scale   | unique values              | N samples
-----       | --------| ---------------            | -------
fibrosis    | five    | F0, F1, F2, F3, F4         | 
steatosis   | five    | S0, S1, S2, S3, S4         | 
inflamation | seven   | I0, I1, I2, I3, I4, I5, I6 | 


What is population of interest?
- population at risk
- general population (which we do not have as a "random" sample)


## based on proteomics data

In [31]:
data_ml_proteomics.fibrosis.value_counts(dropna=False)

F0-1    256
hp      136
F2      106
F3-4     93
Name: fibrosis, dtype: int64

If the models are trained on the fibrosis data only, on could expect some predictions of fibrosis patients in the untested healthy patient (hp) cohort.

#### Comment: 
- Yes, that is a good idea. One can also predict on the test set and ask for a re-examination of the patient (or re-evaluation of the histology score), or to look into the outcome data which we might get access to soon. We might expect some "false prediction" to be "under-diagnosed" cases and vice versa.

### Healthy vs Fibrosis patients

ToDo: Verify how dependent variable is exactly constructed

In [37]:
target = data_ml_proteomics['class']
X = data_ml_proteomics.iloc[:, :-2]

*Note on Cross-Validation Procedure*
- Comparing the performance on random split of the entire data will lead to overconfident predicitons.
- Performing the Cross-Validation only on a `Train`-split would allow to have a better evaluation on the test dataset. 
- Cutoff calibration would need a validation split

#### Comment:
- I agree. Ideally we would need an independent cohort for validation but is not possible in this case. So we can split a validation cohort from the beginning, and leave it out of the entire machine learning process - feature selection, model building, parameter fine-tuning. The problem is with random split, there is still a factor of randomness...I doubt if it will be a good reflection of the model performance. Not sure if there is a better solution. Maybe we can split based on the year of sample collection. Something to have in mind. 

In [35]:
from sklearn.model_selection import cross_validate
scoring = ['precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc'] # how to customize cutoff?
# in clinical setting, false-alarms are preferrable than missed detections. Yes we should find a way to customize the cutoffs
# to have a high sensitivity but also decent specificity, but I guess it risks over-tuning on this specific dataset? 
# Can one ramp over and find the optimal based on F1 score? Would MCC be a better alternative?

import pandas as pd
def run_cv_binary(clf_dict:dict, X:pd.DataFrame, y:pd.Series, cv=5, 
                  scoring=['precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc'])-> dict:
    """Run Cross Validation (cv) for binary classification example
    for a set of classifiers.
    
    
    Inputs
    ------
    clf_dict: dict
        Dictionary with keys and scikit-learn classifiers as values.
    X: 2D-array, pd.DataFrame
        Input data
    y: 1D-array, pd.Series
        Targets for classification
    cv: int
        Number of splits for Cross-Validation.
    
    Returns
    -------
    dict: dict with keys of clf_dict and computed results for each run. 
    """
    cv_results = {}
    for key, clf in clf_dict.items(): 
        cv_results[key] = cross_validate(clf, X, y=target, cv=5, scoring=scoring)
        cv_results[key]['num_feat'] = X.shape[-1]
    return cv_results
    

To add
-  [x] Stratification of input data
-  [ ] Recursive feature selection
-  [ ] cutoff determination for binary classification (ROC-Curves, Precision-Recall-Curves)
    - needs a validation split

#### Comment: 
- Yes recursive feature selection would be good. I tried forward feature selection before to see where score matrix (e.g test-auc) reaches a plateau. I in the end decided a strategy to first select best 10 features based on auc, then perform a greedy search against all n-feature combos among the 10, then select the best feature combo based on MCC score. Afterwards I validate on the validation dataset which I stratified-split at the beginning.

- As a next-step, I think we should focus on deciding on a strategy for feature selection and model performance validation. Once the pipeline is determined, we can use it for other models. To recap, still for now, 3 models to be built, fibrosis (F0-1 vs. F2-4, and F0-2 vs. F3-4), inflammation (0-1 vs. 2-5) and steatosis (0 vs. >0). Then compare each model with their respective existing best-in-class markers according to their standard cut-offs in clinic. 

In [38]:
cv_results = run_cv_binary(clf_dict, X, y=target)

In [39]:
def _get_cv_means(results_dict:dict) -> pd.DataFrame:
    """Convert result-dictionary of runs to averaged dataframe of results."""
    cv_means = pd.DataFrame(results_dict)
    cv_means = cv_means.applymap(np.mean)
    return cv_means.T

In [40]:
_get_cv_means(cv_results)

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_balanced_accuracy,test_roc_auc,num_feat
xgboost,0.734961,0.014646,0.869868,0.914286,0.890764,0.724868,0.865388,200.0
RF,0.711248,0.046033,0.815132,0.971429,0.885801,0.613757,0.885517,200.0
Logistic,0.027577,0.007629,0.889631,0.874725,0.881588,0.753236,0.878449,200.0
SVM,0.037095,0.014002,0.880168,0.843956,0.861017,0.727269,0.850584,200.0


Using Stratified Splitting is default for [`cross_validate`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate).

In [41]:
from sklearn.model_selection import StratifiedKFold

cv_results = {}
for key, clf in clf_dict.items(): 
    cv_results[key] = cross_validate(clf, X, y=target, groups=target, cv=StratifiedKFold(5), scoring=scoring)
    cv_results[key]['num_feat'] = X.shape[-1]

In [42]:
_get_cv_means(cv_results)

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_balanced_accuracy,test_roc_auc,num_feat
xgboost,0.777895,0.01506,0.869868,0.914286,0.890764,0.724868,0.865388,200.0
RF,0.736753,0.046814,0.815132,0.971429,0.885801,0.613757,0.885517,200.0
Logistic,0.033666,0.011134,0.889631,0.874725,0.881588,0.753236,0.878449,200.0
SVM,0.043512,0.017439,0.880168,0.843956,0.861017,0.727269,0.850584,200.0


[Feature selection](https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection) based on mutual information

In [43]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

k_best = SelectKBest(mutual_info_classif, k=10)
k_best.fit(X, y=target)

SelectKBest(k=10, score_func=<function mutual_info_classif at 0x118281730>)

In [44]:
mask_feat_selection = k_best.get_support()
proteins_selected = X.columns[mask_feat_selection]
key_ProteinID.loc[proteins_selected]

Unnamed: 0,Gene names
P01834,IGKC
A0A286YEY1,IGHA1
P06396,GSN
P35858,IGFALS
Q08380,LGALS3BP
A0A182DWH7,SELENOP
P01833,PIGR
P02741,CRP
H7BY64,
P05062,ALDOB


In [45]:
results_10_best_feat = run_cv_binary(clf_dict, X.loc[:,mask_feat_selection], y=target)
_get_cv_means(results_10_best_feat)

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_balanced_accuracy,test_roc_auc,num_feat
xgboost,0.070813,0.00916,0.871158,0.896703,0.883043,0.723484,0.870234,10.0
RF,0.364747,0.044608,0.866643,0.916484,0.889667,0.718824,0.888584,10.0
Logistic,0.003516,0.007544,0.86397,0.903297,0.881241,0.708527,0.88354,10.0
SVM,0.005565,0.007508,0.883372,0.894505,0.886804,0.744343,0.885758,10.0


In [46]:
fibrosis_3c = data_cli.fibrosis_class.astype('category')
fibrosis_3c = fibrosis_3c.loc[~fibrosis_3c.isna()]
fibrosis_3c.value_counts()

F0-1    258
F2      106
F3-4     94
Name: fibrosis_class, dtype: int64

In [47]:
shared_samples = fibrosis_3c.index.intersection(X.index)
fibrosis_3c = fibrosis_3c.loc[shared_samples]

In [48]:
X_selected = X.loc[fibrosis_3c.index, mask_feat_selection]
X_selected.shape

(455, 10)

In [49]:
cv_results = {}
scoring = ['balanced_accuracy']
for key, clf in clf_dict.items(): 
    cv_results[key] = cross_validate(clf, X_selected, y=fibrosis_3c, scoring=scoring)
    cv_results[key]['num_feat'] = X.shape[-1]
_get_cv_means(cv_results)

Unnamed: 0,fit_time,score_time,test_balanced_accuracy,num_feat
xgboost,0.1554,0.003692,0.645505,200.0
RF,0.37366,0.021154,0.635447,200.0
Logistic,0.004209,0.001752,0.618016,200.0
SVM,0.006886,0.002132,0.642043,200.0


### Multiclass alternative
- Predict 4 categories for fibrosis

## Based on clinical markers

## Multiclass, Multivariate Model
- predict single classes for each endpoint in a joint model.
- 

## Versions

In [32]:
pip list | grep pandas

pandas              0.25.3             
Note: you may need to restart the kernel to use updated packages.
