# Validation of DeepMECFS, DNN, GDBT with independent cohort
# Also, example of using pretrained models

- To use trained model, check the function named models_compare
- Basically, to predict y with pretrained model, use ```y_pred = DNN.ScoreYModel(omics_score_model, adjusted_score_layer).predict(X)```

In [1]:
import importlib.util
import os

def import_module_with_full_path(file_path):
    base_filename = os.path.basename(file_path)
    module_name = os.path.splitext(base_filename)[0]
    module_spec = importlib.util.spec_from_file_location(module_name, file_path)
    imported_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(imported_module)
    return imported_module

In [2]:
root="../../.."

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import joblib
import tensorflow as tf
DNN = import_module_with_full_path("%s/codes/AI/module/DNN.py"%(root))

2024-01-21 22:55:25.615678: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
def load(model_name, model_path):
    feature_meta_path = "%s/codes/AI/input/feature_meta"%(root)
    feature_meta = pd.read_csv("%s/%s_feature_meta.csv"%(feature_meta_path, model_name), index_col=0)
    omics_score_model = tf.keras.models.load_model("%s/DNN_score_%s"%(model_path, model_name))
    adjusted_score_layer = tf.keras.models.load_model("%s/Adjust_score_y_%s"%(model_path, model_name))
    DNN_model = tf.keras.models.load_model("%s/DNN_%s"%(model_path, model_name))
    GDBT_model = joblib.load("%s/GDBT_%s.joblib"%(model_path, model_name))
    return feature_meta, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model

In [5]:
def roc_curve_plot(y_true_list, y_prob_list, title):
    model_name_list = ['immune','specie', 'kegg', 'metabolome', 'quest','omics']
    colors = sns.color_palette("Set2", 6)
    color_dict = dict(zip(model_name_list, colors))

    assert(y_true_list.keys() == y_prob_list.keys())
    name_list = list(y_true_list.keys())
    sns.set(style='ticks', font_scale=0.95, font='sans-serif')
    fig = plt.figure(figsize=(4.5,4), constrained_layout=True)
    
    for i in name_list:
        y_true_i, y_prob_i = y_true_list[i], y_prob_list[i]
        fpr, tpr, _ = sk_metrics.roc_curve(y_true_i, y_prob_i)
        roc_auc = sk_metrics.auc(fpr, tpr)

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, precision, recall, f1

In [7]:
def models_compare(X, y, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model, feature_coverage=1.0):
    y_true = y
    metrics_table = pd.DataFrame(index=['DNN_score','DNN', 'GDBT'], 
                                 columns=['accuracy', 'precision', 'recall', 'f1-score'])
    y_pred_DNN_score = DNN.ScoreYModel(omics_score_model, adjusted_score_layer).predict(X)
    y_pred_DNN_score = (y_pred_DNN_score > 0.5).astype(int).flatten()
    metrics_table.loc['DNN_score',:] = calculate_metrics(y_true, y_pred_DNN_score)
    y_pred_DNN = DNN_model.predict(X)
    y_pred_DNN = (y_pred_DNN > 0.5).astype(int).flatten()
    metrics_table.loc['DNN',:] = calculate_metrics(y_true, y_pred_DNN)
    metrics_table.loc['GDBT',:] = calculate_metrics(y_true, GDBT_model.predict(X))
    metrics_table.loc[:,'feature_coverage'] = feature_coverage
    print(y.to_numpy())
    print(y_pred_DNN_score)
    print(y_pred_DNN)
    print(GDBT_model.predict(X))
    return metrics_table

In [8]:
model_folder = "%s/codes/AI/output/cross_validation/full_model/model"%(root)

In [9]:
metrics_list = []

---
# Columbia Microbiome
## Specie

In [10]:
model_name = 'specie'
feature_meta, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model = load(model_name, model_folder)

In [11]:
# Columbia, Cell Host and Microbe, 2023
data = pd.read_csv("%s/data/indenpendent_cohorts/columbia/metaphlan4_species_full.csv"%(root), index_col=0).transpose()
meta = pd.read_table("%s/data/indenpendent_cohorts/columbia/SraRunTable.txt"%(root), sep = ",", index_col=0)

In [12]:
overlap_len = len(set(data.columns).intersection(feature_meta.index))
#print("Model feature #%s"%(len(feature_meta.index)))
print("Test dataset feature #%s"%(len(data.columns)))
#print("Overlap #%s"%(overlap_len))
feature_coverage = overlap_len/len(feature_meta.index)
print("Feature Coverage %s"%(feature_coverage))

Test dataset feature #1242
Feature Coverage 1.0


In [13]:
X = data.reindex(columns=feature_meta.index, fill_value=0)
y = meta.loc[X.index, 'sampletype'].map({'Control': 0, 'Case': 1})
X = X.astype("float32")
y = y.astype("float32")
metrics = models_compare(X, y, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model, feature_coverage)
metrics_list.append(metrics)

[0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1.
 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1.
 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0.
 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0.
 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0.]
[0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 1 0 0 0 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 0
 1 0 1 1 1 1 1 1 0 1 0 0 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 0 1 1 1 1
 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 1 1
 0 1 0 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 0 1 1 0
 0 0 1 1 1 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0
 1 1 1 1 0]
[0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 1

In [14]:
metrics

Unnamed: 0,accuracy,precision,recall,f1-score,feature_coverage
DNN_score,0.721053,0.716981,0.767677,0.741463,1.0
DNN,0.678947,0.682692,0.717172,0.699507,1.0
GDBT,0.7,0.694444,0.757576,0.724638,1.0


## KEGG

In [15]:
model_name = 'kegg'
feature_meta, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model = load(model_name, model_folder)

In [16]:
# Columbia, Cell Host and Microbe, 2023
data = pd.read_table("%s/data/indenpendent_cohorts/columbia/kegg_gene_normalized.txt"%(root), sep = "\t", index_col=0).transpose()
data = data.iloc[1:,:]
meta = pd.read_table("%s/data/indenpendent_cohorts/columbia/SraRunTable.txt"%(root), sep = ",", index_col=0)

In [17]:
overlap_len = len(set(data.columns).intersection(feature_meta.index))
#print("Model feature #%s"%(len(feature_meta.index)))
print("Test dataset feature #%s"%(len(data.columns)))
#print("Overlap #%s"%(overlap_len))
feature_coverage = overlap_len/len(feature_meta.index)
print("Feature Coverage %s"%(feature_coverage))

Test dataset feature #5897
Feature Coverage 0.8759787825208386


In [18]:
# Columbia, Cell Host and Microbe, 2023
X = data.reindex(columns=feature_meta.index, fill_value=0)
y = meta.loc[X.index, 'sampletype'].map({'Control': 0, 'Case': 1})
X = X.astype("float32")
y = y.astype("float32")

metrics = models_compare(X, y, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model, feature_coverage)
metrics_list.append(metrics)

[0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1.
 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1.
 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0.
 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0.
 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0.]
[1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 1 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 0 1 0 0 0
 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1
 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0
 0 0 0 0 1 1 1 0 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0
 0 1 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [19]:
metrics

Unnamed: 0,accuracy,precision,recall,f1-score,feature_coverage
DNN_score,0.584211,0.65625,0.424242,0.515337,0.875979
DNN,0.521053,0.521053,1.0,0.685121,0.875979
GDBT,0.521053,0.521053,1.0,0.685121,0.875979


# Raijmakers, Ruud P H et al. 2020, Microbiome
## Specie

In [20]:
model_name = 'specie'
feature_meta, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model = load(model_name, model_folder)

In [21]:
data = pd.read_csv("%s/data/indenpendent_cohorts/Raijmakers/metaphlan4_species_full.csv"%(root), 
                   index_col=0).transpose()

In [22]:
overlap_len = len(set(data.columns).intersection(feature_meta.index))
#print("Model feature #%s"%(len(feature_meta.index)))
print("Test dataset feature #%s"%(len(data.columns)))
#print("Overlap #%s"%(overlap_len))
feature_coverage = overlap_len/len(feature_meta.index)
print("Feature Coverage %s"%(feature_coverage))

Test dataset feature #1165
Feature Coverage 1.0


In [23]:
filter_name = [name for name in data.index if 'CFS' in name or 'HV' in name]
data = data.loc[filter_name,:]

In [24]:
X = data.reindex(columns=feature_meta.index, fill_value=0)
y = [0 if 'HV' in i else 1 for i in data.index]
y = pd.Series(y, index=data.index)
X = X.astype("float32")
y = y.astype("float32")
metrics = models_compare(X, y, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model, feature_coverage)
metrics_list.append(metrics)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 0 0
 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0]
[0 1 0 0 1 1 1 0 0 1 1 1 0 0 1 1 1 1 1 0 0 0 1 0 1 0 0 1 1 0 1 1 0 1 0 1 0
 1 1 1 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 1 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0
 1 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [25]:
metrics

Unnamed: 0,accuracy,precision,recall,f1-score,feature_coverage
DNN_score,0.634409,0.636364,0.714286,0.673077,1.0
DNN,0.763441,0.935484,0.591837,0.725,1.0
GDBT,0.709677,0.805556,0.591837,0.682353,1.0


## KEGG

In [26]:
model_name = 'kegg'
feature_meta, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model = load(model_name, model_folder)

In [27]:
data = pd.read_table("%s/data/indenpendent_cohorts/Raijmakers/kegg_gene_normalized.txt"%(root),
                     sep = "\t", index_col=0).transpose()
data.index = [i.split('.')[0] for i in data.index]

In [28]:
overlap_len = len(set(data.columns).intersection(feature_meta.index))
#print("Model feature #%s"%(len(feature_meta.index)))
print("Test dataset feature #%s"%(len(data.columns)))
#print("Overlap #%s"%(overlap_len))
feature_coverage = overlap_len/len(feature_meta.index)
print("Feature Coverage %s"%(feature_coverage))

Test dataset feature #8485
Feature Coverage 1.0


In [29]:
filter_name = [name for name in data.index if 'CFS' in name or 'HV' in name]
data = data.loc[filter_name,:]

In [30]:
X = data.reindex(columns=feature_meta.index, fill_value=0)
y = [0 if 'HV' in i else 1 for i in data.index]
y = pd.Series(y, index=data.index)
X = X.astype("float32")
y = y.astype("float32")
metrics = models_compare(X, y, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model, feature_coverage)
metrics_list.append(metrics)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0 1 1 0 1 1 0 1 1 0 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0
 1 1 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 1 0 1 0
 0 1 1 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0]
[0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0
 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0
 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0
 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0]


In [31]:
metrics

Unnamed: 0,accuracy,precision,recall,f1-score,feature_coverage
DNN_score,0.602151,0.65,0.530612,0.58427,1.0
DNN,0.548387,0.705882,0.244898,0.363636,1.0
GDBT,0.537634,0.615385,0.326531,0.426667,1.0


---
# Metabolome

In [32]:
model_name = 'metabolome'
model_folder = "%s/codes/AI/model"%(root)
feature_meta, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model = load(model_name, model_folder)

---
## Columbia

In [33]:
data = pd.read_csv("%s/data/indenpendent_cohorts/Int.J.Mol.Sci.2022/r56_metabolomics_yali_v2.csv"%(root), 
                   index_col=0)
data_meta = data.iloc[:, :3]
data = data.iloc[:, 3:].transpose().apply(pd.to_numeric, errors='coerce').fillna(0)
print(data.shape)

(170, 888)


In [34]:
meta = pd.read_table("%s/data/indenpendent_cohorts/Int.J.Mol.Sci.2022/r56_pheno_text_binned_v2.txt"%(root), 
                     index_col=0, sep='\t')

In [35]:
overlap_len = len(set(data_meta.InChiKey).intersection(feature_meta.INCHIKEY))
print("Model feature #%s"%(len(feature_meta.index)))
print("Test dataset feature #%s"%(len(data.columns)))
print("Overlap #%s"%(overlap_len))
feature_coverage = overlap_len/len(feature_meta.index)
print("Feature Coverage %s"%(feature_coverage))

Model feature #730
Test dataset feature #888
Overlap #141
Feature Coverage 0.19315068493150686


In [36]:
X = pd.DataFrame(0, columns = feature_meta.index, index=data.index)
for i in X.columns:
    inchi = feature_meta.INCHIKEY[i]
    if any(data_meta.InChiKey == inchi):
        X.loc[:,i] = data.loc[:,data_meta.index[data_meta.InChiKey == inchi][0]].transpose()

  X.loc[:,i] = data.loc[:,data_meta.index[data_meta.InChiKey == inchi][0]].transpose()


In [37]:
X = StandardScaler().fit_transform(X)

In [38]:
y = meta.loc[data.index,'Phenotype'].map({'control': 0, 'case': 1})

In [39]:
metrics = models_compare(X, y, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model, feature_coverage)
metrics_list.append(metrics)

[1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 1
 0 1 0 1 0 1 1 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0
 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 1 1
 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1
 1 0 1 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 1 0 1 0 1 1
 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 0 0
 1 1 1 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0
 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 1 0 1 1 1 1 0]
[0 1 0 1 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 1 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 1 1 0 1 0 1 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1
 1 1 1 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 0 0
 0 0 1 1



In [40]:
metrics

Unnamed: 0,accuracy,precision,recall,f1-score,feature_coverage
DNN_score,0.594118,0.538462,0.886076,0.669856,0.193151
DNN,0.541176,0.507692,0.417722,0.458333,0.193151
GDBT,0.464706,0.464706,1.0,0.634538,0.193151


---
# Cornel

In [41]:
data = pd.read_excel("%s/data/indenpendent_cohorts/jci.insight.2022.xlsx"%(root), 
                     sheet_name='ScaledImpDataZeroDrug&Tobacco')
data_feature_meta = data.iloc[6:,:15]
data_feature_meta.columns = data_feature_meta.iloc[0]
data_feature_meta.index = data_feature_meta.loc[:,'COMP ID']
data_feature_meta = data_feature_meta.drop('COMP ID')
meta = data.iloc[:6,15:].transpose()
meta.columns = meta.iloc[0]
meta = meta.drop('ID')
data = data.iloc[7:,16:]
data.index = data_feature_meta.index
data.columns = meta.index
data = data.transpose()

In [42]:
feature_meta.index = feature_meta.COMP_ID

In [43]:
overlap_len = len(set(data.columns).intersection(feature_meta.index))
print("Model feature #%s"%(len(feature_meta.index)))
print("Test dataset feature #%s"%(len(data.columns)))
print("Overlap #%s"%(overlap_len))
feature_coverage = overlap_len/len(feature_meta.index)
print("Feature Coverage %s"%(feature_coverage))

Model feature #730
Test dataset feature #1157
Overlap #573
Feature Coverage 0.7849315068493151


In [44]:
X = data.reindex(columns=feature_meta.index, fill_value=0)
X = X.astype("float32")
y = meta.Phenotype.map({'Control': 0, 'CFS': 1})
y = y.astype("float32")

In [45]:
X = StandardScaler().fit_transform(X)

In [46]:
metrics = models_compare(X, y, omics_score_model, adjusted_score_layer, DNN_model, GDBT_model, feature_coverage)
metrics_list.append(metrics)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.



In [47]:
metrics

Unnamed: 0,accuracy,precision,recall,f1-score,feature_coverage
DNN_score,0.683168,0.733945,0.695652,0.714286,0.784932
DNN,0.589109,0.653846,0.591304,0.621005,0.784932
GDBT,0.574257,0.58011,0.913043,0.709459,0.784932


---
# Save the result

In [48]:
#pd.concat(metrics_list).to_csv("%s/output/AI_figure/Ind_cohort_compare_metrics.csv"%(root))