In [None]:
import sys
path = '/gpfs/commons/groups/gursoy_lab/mstoll/'
sys.path.append(path)

import os
import numpy as np
import pandas as pd
import time
import torch
import pickle
import shap
import tensorboard


from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report
from functools import partial
import shutil
from tqdm.auto import tqdm

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

from codes.models.data_form.DataForm import DataTransfo_1SNP
from codes.models.metrics import calculate_roc_auc

import featurewiz as gwiz

import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

from codes.models.Decision_tree.utils import get_indice, get_name

In [None]:
### framework constants:
# NOTE(review): model_type, model_version, test_name and tryout are not read by any cell visible in this notebook.
model_type = 'decision_tree'
model_version = 'gradient_boosting'
test_name = '1_test_train_transfo_V1'
tryout = True # True if we are doing a tryout, False otherwise 
### data constants:
# Most of the values below are forwarded as keyword arguments to DataTransfo_1SNP.
CHR = 1
SNP = 'rs673604'
pheno_method = 'Paul' # Paul, Abby
ld = 'no'
rollup_depth = 4
binary_classes = True #nb of classes related to an SNP (here 0 or 1)
vocab_size = None # to be defined with data
padding_token = 0
prop_train_test = 0.8 # also used to derive test_size for train_test_split
load_data = False
save_data = True
remove_none = True
decorelate = False
equalize_label = False
threshold_corr = 0.9
threshold_rare = 50
remove_rare = 'all' # None, 'all', 'one_class'
compute_features = True
padding = False
list_env_features = ['age', 'sex']
list_pheno_ids = None #list(np.load(f'/gpfs/commons/groups/gursoy_lab/mstoll/codes/Data_Files/phewas/list_associations_snps/{SNP}_paul.npy'))

### data format

batch_size = 20
data_share = 1

##### model constants


##### training constants
# Selects the GPU when torch can see one; NOTE(review): no visible cell uses `device`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Collect every loader option in one mapping, then build the data transformer from it.
data_transfo_options = dict(
    SNP=SNP,
    CHR=CHR,
    method=pheno_method,
    padding=padding,
    pad_token=padding_token,
    load_data=load_data,
    save_data=save_data,
    compute_features=compute_features,
    prop_train_test=prop_train_test,
    remove_none=remove_none,
    equalize_label=equalize_label,
    rollup_depth=rollup_depth,
    decorelate=decorelate,
    threshold_corr=threshold_corr,
    threshold_rare=threshold_rare,
    remove_rare=remove_rare,
    list_env_features=list_env_features,
    data_share=data_share,
    list_pheno_ids=list_pheno_ids,
    binary_classes=binary_classes,
    ld=ld,
)
dataT = DataTransfo_1SNP(**data_transfo_options)
#patient_list = dataT.get_patientlist()

In [None]:
# Fetch the feature matrix, the labels, the environment-feature bookkeeping and the patient ids.
tree_data = dataT.get_tree_data(
    with_env=True,
    with_counts=True,
    load_possible=True,
    only_relevant=False,
)
data, labels_patients, indices_env, name_envs, eids = tree_data

In [None]:
# Switches read by the following cells to choose the preprocessing path.
equalized = True  # if True, data/labels go through DataTransfo_1SNP.equalize_label
interest = False  # if True, keep only the first `nb_patients_interest` rows and drop the last column
keep = False  # if True, the models are fed the columns selected by `indices_keep`
scaled = True  # if True, the feature matrices are standardised with StandardScaler
remove = False  # if True, patients whose eid is listed in eids_remove_1.npy are dropped

In [None]:
# Start from the full data set and narrow it down according to the flags.
data_use, labels_use = data, labels_patients
if interest:
    # NOTE(review): `nb_patients_interest` is not defined in any visible cell; this branch
    # raises NameError unless it is set elsewhere.
    data_use = data[:nb_patients_interest, :-1]
    labels_use = labels_patients[:nb_patients_interest]

if remove:
    # Drop every patient whose eid appears in the exclusion list.
    eids_remove = np.load('/gpfs/commons/groups/gursoy_lab/mstoll/codes/Data_Files/UKBB/eids_remove_1.npy')
    indices_eids = ~np.isin(eids, eids_remove)
    eids_use = eids[indices_eids]
    data_use = data_use[indices_eids]
    labels_use = labels_use[indices_eids]

# Optionally rebalance the two labels; otherwise use the selection as is.
pheno, labels = data_use, labels_use
if equalized:
    pheno, labels = DataTransfo_1SNP.equalize_label(data=data_use, labels=labels_use)


In [None]:
# 1-based column identifiers, stored as strings.
nb_pheno_columns = pheno.shape[1]
phenos = (np.arange(nb_pheno_columns) + 1).astype(str)

In [None]:
# No feature selection is applied: the training inputs are the prepared arrays themselves.
pheno_train = pheno
labels_train = labels

### Feature selection

In [None]:
"""
wiz =gwiz.FeatureWiz(verbose=1)
nb_samples, nb_phenos = 10000, 10000
pheno_df = pd.DataFrame(data=pheno[:, :nb_phenos], columns = phenos[:nb_phenos])
label_df = pd.Series(data=labels, name = 'SNP_label')
#dataset = pheno_df
#dataset['labels'] = label_df
pheno_train, labels_train = wiz.fit_transform(pheno_df, label_df)
"""


In [None]:
# Shuffle the patients with a fixed seed so a fresh "Restart & Run All" gives the same order.
# NOTE(review): `pheno_train` / `labels_train` were bound to the un-shuffled arrays before this
# cell, so this shuffle does not reach the train/test split; confirm whether that is intended.
permutation_indices = np.random.default_rng(42).permutation(len(pheno))

# Reorder both arrays with the same permutation so rows and labels stay aligned.
pheno = pheno[permutation_indices]
labels = labels[permutation_indices]

### Hist Gradient Boosting

In [None]:
# Seeded split; the test share is the complement of prop_train_test.
train_test_parts = train_test_split(
    pheno_train,
    labels_train,
    test_size=1 - prop_train_test,
    random_state=42,
)
(diseases_patients_train, diseases_patients_test,
 label_patients_train, label_patients_test) = train_test_parts

In [None]:
# Class 0 is weighted by (#label 1 / #label 0) in the training labels; class 1 keeps weight 1.
nb_label_1_train = np.sum(label_patients_train == 1)
nb_label_0_train = np.sum(label_patients_train == 0)
class_weight = {0: nb_label_1_train / nb_label_0_train, 1: 1.0}


In [None]:
# Column sums over the full matrix and over the training split.
frequencies_ini, number_distinct_diseases = (
    np.sum(matrix, axis=0) for matrix in (data, diseases_patients_train)
)


In [None]:
# Shape of the full matrix returned by get_tree_data, before any filtering.
data.shape

In [None]:
# Keep the columns whose sum over the full matrix exceeds 100 (which already implies > 0).
# NOTE(review): the mask is built from `data`, so it only lines up with the train/test
# matrices when they have the same columns as `data` (i.e. `interest` is False).
indices_keep = frequencies_ini > 100
#indices_keep = shaps!=0

In [None]:
# Number of columns selected by the mask.
indices_keep.sum()

In [None]:
# Apply the same column mask to both splits.
diseases_patients_train_keep, diseases_patients_test_keep = (
    split[:, indices_keep]
    for split in (diseases_patients_train, diseases_patients_test)
)

In [None]:
# Feed the models either the column-filtered matrices or the full ones.
diseases_patients_train_model, diseases_patients_test_model = (
    (diseases_patients_train_keep, diseases_patients_test_keep)
    if keep
    else (diseases_patients_train, diseases_patients_test)
)



In [None]:
    
# Keep handles on the raw matrices: later cells use them for counts and per-phenotype statistics.
diseases_patients_train_model_unscaled = diseases_patients_train_model
diseases_patients_test_model_unscaled = diseases_patients_test_model

if scaled:
    # Fit the scaler on the training split only and reuse its statistics on the test split.
    # Refitting on the test set would put the two splits on different scales and leak
    # test-set statistics into the evaluation.
    scaler = StandardScaler()
    diseases_patients_train_model = scaler.fit_transform(diseases_patients_train_model)
    diseases_patients_test_model = scaler.transform(diseases_patients_test_model)



In [None]:
# Seeded so that any internal random step (e.g. an early-stopping validation split) is
# reproducible across runs.
model = HistGradientBoostingClassifier(class_weight=class_weight, random_state=42)


# Train the model on the training set
model.fit(diseases_patients_train_model, label_patients_train)

# Predict hard labels and the probability of label 1 on both splits
labels_pred_test = model.predict(diseases_patients_test_model)
labels_pred_train = model.predict(diseases_patients_train_model)
proba_test = model.predict_proba(diseases_patients_test_model)[:, 1]
proba_train = model.predict_proba(diseases_patients_train_model)[:, 1]

In [None]:
# Boolean masks shared by all the rates below.
pred_is_0_train, pred_is_1_train = labels_pred_train == 0, labels_pred_train == 1
pred_is_0_test, pred_is_1_test = labels_pred_test == 0, labels_pred_test == 1
true_is_0_train, true_is_1_train = label_patients_train == 0, label_patients_train == 1
true_is_0_test, true_is_1_test = label_patients_test == 0, label_patients_test == 1

# NOTE(review): "positive" means *predicted label 0* here and "negative" predicted label 1;
# every rate is normalised by the number of predictions of that class.
nb_positive_train = np.sum(pred_is_0_train)
nb_negative_train = np.sum(pred_is_1_train)
nb_positive_test = np.sum(pred_is_0_test)
nb_negative_test = np.sum(pred_is_1_test)

TP_test = np.sum(true_is_0_test & pred_is_0_test) / nb_positive_test
FP_test = np.sum(true_is_1_test & pred_is_0_test) / nb_positive_test
TN_test = np.sum(true_is_1_test & pred_is_1_test) / nb_negative_test
FN_test = np.sum(true_is_0_test & pred_is_1_test) / nb_negative_test

TP_train = np.sum(true_is_0_train & pred_is_0_train) / nb_positive_train
FP_train = np.sum(true_is_1_train & pred_is_0_train) / nb_positive_train
TN_train = np.sum(true_is_1_train & pred_is_1_train) / nb_negative_train
FN_train = np.sum(true_is_0_train & pred_is_1_train) / nb_negative_train

accuracy_train = accuracy_score(label_patients_train, labels_pred_train)
accuracy_test = accuracy_score(label_patients_test, labels_pred_test)

auc_test = calculate_roc_auc(label_patients_test, proba_test)
auc_train = calculate_roc_auc(label_patients_train, proba_train)

# Mean probability assigned to the true label, per label and per split.
proba_avg_zero_test = 1 - np.mean(proba_test[true_is_0_test])
proba_avg_zero_train = 1 - np.mean(proba_train[true_is_0_train])
proba_avg_one_test = np.mean(proba_test[true_is_1_test])
proba_avg_one_train = np.mean(proba_train[true_is_1_train])

In [None]:
# Each tuple prints as `name=repr(value)`; a None entry prints the single-space spacer line.
metrics_report = [
    ('TP_test', TP_test),
    ('FP_test', FP_test),
    ('TN_test', TN_test),
    ('FN_test', FN_test),
    ('TP_train', TP_train),
    ('FP_train', FP_train),
    ('TN_train', TN_train),
    ('FN_train', FN_train),
    None,
    ('auc_test', auc_test),
    ('auc_train', auc_train),
    None,
    None,
    ('accuracy_test', accuracy_test),
    ('accuracy_train', accuracy_train),
    None,
    ('proba_avg_zero_test', proba_avg_zero_test),
    ('proba_avg_zero_train', proba_avg_zero_train),
    ('proba_avg_one_test', proba_avg_one_test),
    ('proba_avg_one_train', proba_avg_one_train),
]
for report_entry in metrics_report:
    if report_entry is None:
        print(' ')
        continue
    metric_name, metric_value = report_entry
    print(f'{metric_name}={metric_value!r}')

In [None]:
# Manual permutation importance of the fitted model, measured on the training set.
# The permuted matrix must be the one the model was fitted on (same columns, same scaling);
# using the `_keep` matrix would feed the model unscaled and possibly differently-shaped input.
X_reference = diseases_patients_train_model
baseline_accuracy = accuracy_score(label_patients_train, labels_pred_train)
n_features = X_reference.shape[1]
feature_importances = np.zeros(n_features)

# One working copy for the whole loop: each column is shuffled, scored, then restored,
# instead of copying the full matrix at every iteration.
X_permuted = X_reference.copy()
for i in tqdm(range(n_features), desc='permutation importance'):
    original_column = X_permuted[:, i].copy()

    # Permute the values of the current feature
    X_permuted[:, i] = np.random.permutation(original_column)

    # Accuracy drop caused by breaking the link between this feature and the labels
    y_pred_permuted = model.predict(X_permuted)
    permuted_accuracy = accuracy_score(label_patients_train, y_pred_permuted)
    feature_importances[i] = baseline_accuracy - permuted_accuracy

    # Put the column back before moving to the next feature
    X_permuted[:, i] = original_column

# Normalize so the importances sum to 1; skipped when the sum is zero to avoid NaN/inf.
total_importance = np.sum(feature_importances)
if total_importance != 0:
    feature_importances /= total_importance

# Print or visualize feature importances
print("Feature Importances:", feature_importances)

In [None]:
# One marker per feature: normalised permutation importance computed in the previous cell.
plt.plot(feature_importances, 'o')
#plt.plot(diseases_patients_train.sum(axis=0)/len(diseases_patients_train), 'o')

In [None]:
# Number of training predictions equal to 0, then equal to 1.
np.sum(labels_pred_train==0), np.sum(labels_pred_train==1)


In [None]:
# Number of test labels equal to 1, then equal to 0 (note the order differs from the previous cell).
np.sum(label_patients_test==1), np.sum(label_patients_test==0)


### score phenos:


In [None]:
# Column sums of the unscaled training matrix; their log(1 + x) is used to colour later scatter plots.
frequencies = np.sum(diseases_patients_train_model_unscaled, axis=0)
color_values = log_freq = np.log(1 + frequencies)



In [None]:
def get_accuracy_pheno(data, labels_true, labels_pred, nb_pheno):
    """Accuracy of the predictions restricted to rows where one column equals 1.

    Args:
        data: 2-D array indexed as data[:, column].
        labels_true: array of reference labels, aligned with the rows of `data`.
        labels_pred: array of predicted labels, aligned with the rows of `data`.
        nb_pheno: 1-based column number; column `nb_pheno - 1` of `data` is used.

    Returns:
        Fraction of the selected rows where the two label arrays agree.
    """
    has_pheno = data[:, nb_pheno - 1] == 1
    matches = labels_true[has_pheno] == labels_pred[has_pheno]
    return np.sum(matches) / len(matches)
# 1-based column numbers of the model matrix.
phenos = np.arange(diseases_patients_train_model.shape[1]) + 1
# Bind the training data and predictions, then vectorise over the column number.
accuracy_pheno_par = np.vectorize(
    partial(
        get_accuracy_pheno,
        diseases_patients_train_model_unscaled,
        label_patients_train,
        labels_pred_train,
    )
)

In [None]:
# Per-column accuracy, coloured by the log column sum.
pheno_positions = np.arange(len(phenos))
per_pheno_accuracy = accuracy_pheno_par(phenos)
plt.scatter(pheno_positions, per_pheno_accuracy, c=color_values, cmap='viridis')
plt.colorbar()

### Calibration plot


In [None]:
# Histogram of the predicted probability of label 1 on the training set.
predicted_probs_ones, true_labels = proba_train, np.array(label_patients_train)
plt.hist(predicted_probs_ones, bins=100)
plt.xlabel('proba')
plt.ylabel('nb of predictions')
plt.show()

prob_true, prob_pred = calibration_curve(true_labels, predicted_probs_ones, n_bins=80)
auc = calculate_roc_auc(true_labels, predicted_probs_ones)
# Calibration curve drawn against the diagonal of a perfectly calibrated model.
plt.plot(prob_pred, prob_true, marker='o', linestyle='--', label='Calibration Plot')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly Calibrated')
plt.title('Calibration Plot')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.legend()

In [None]:
# Training patients whose predicted probability of label 1 is below 0.2.
# (Named `low_proba_mask` rather than `bin` so the builtin `bin` is not shadowed.)
low_proba_mask = predicted_probs_ones < 0.2
patients_selected = diseases_patients_train_model_unscaled[low_proba_mask]
print(f'nb patients selected = {len(patients_selected)}')
# Per-column frequency in the selected group minus the frequency over the whole training set.
frequencies_new = patients_selected.sum(axis=0) / len(patients_selected)
plt.plot(frequencies_new - frequencies/len(diseases_patients_train_model), 'o')
nb_diseases = np.sum(patients_selected, axis=1).mean()
print(f'nb diseases mean= {nb_diseases}')



### Nb diseases

In [None]:


# Row sums of the unscaled matrices, one value per patient.
nb_maladies_train, nb_maladies_test = (
    matrix.sum(axis=1)
    for matrix in (diseases_patients_train_model_unscaled, diseases_patients_test_model_unscaled)
)

In [None]:
def _accuracy_by_disease_count(nb_diseases_per_patient, labels_true, labels_pred):
    """Accuracy and group size for each distinct value of the per-patient row sum.

    Args:
        nb_diseases_per_patient: 1-D array, one value per patient.
        labels_true: reference labels aligned with `nb_diseases_per_patient`.
        labels_pred: predicted labels aligned with `nb_diseases_per_patient`.

    Returns:
        (accuracies, group_sizes): two lists ordered like np.unique(nb_diseases_per_patient).
    """
    accuracies, group_sizes = [], []
    for diseases_nb in np.unique(nb_diseases_per_patient):
        in_group = nb_diseases_per_patient == diseases_nb
        labels_group = labels_true[in_group]
        accuracies.append(np.sum(labels_group == labels_pred[in_group]) / len(labels_group))
        group_sizes.append(len(labels_group))
    return accuracies, group_sizes


accs_train, nbs_train = _accuracy_by_disease_count(
    nb_maladies_train, label_patients_train, labels_pred_train
)
nb_diseases_mean_train = np.mean(nb_maladies_train)
print(f'nb diseases mean train= {nb_diseases_mean_train}')

accs_test, nbs_test = _accuracy_by_disease_count(
    nb_maladies_test, label_patients_test, labels_pred_test
)
nb_diseases_mean_test = np.mean(nb_maladies_test)
print(f'nb diseases mean test= {nb_diseases_mean_test}')


In [None]:
# Colour each point by the number of patients in its group.
color_values_nbs_train, color_values_nbs_test = nbs_train, nbs_test


In [None]:
# Training accuracy per distinct row sum, coloured by group size.
distinct_counts_train = np.unique(nb_maladies_train)
plt.scatter(distinct_counts_train, accs_train, c=color_values_nbs_train, cmap='viridis')
plt.ylabel('accuracy')
plt.xlabel('nb of diseases')
plt.colorbar()

In [None]:
# Test accuracy per distinct row sum, coloured by group size.
distinct_counts_test = np.unique(nb_maladies_test)
plt.scatter(distinct_counts_test, accs_test, c=color_values_nbs_test, cmap='viridis')
plt.ylabel('accuracy')
plt.xlabel('nb of diseases')
plt.colorbar()

In [None]:
# Group sizes per distinct row sum; the first (smallest) value is left out of the plot.
counts_without_first = np.unique(nb_maladies_train)[1:]
sizes_without_first = nbs_train[1:]
plt.plot(counts_without_first, sizes_without_first, 'o')
plt.ylabel('number of patients')
plt.xlabel('number of diseases per patient')

### Shap values

In [None]:
# Build a tree explainer for whatever estimator `model` currently refers to.
# NOTE(review): `model` is rebound by later cells, so the result depends on run order.
explainer = shap.TreeExplainer(model)

# Compute SHAP values on the (possibly scaled) training matrix the model was fitted on
shap_values = explainer.shap_values(diseases_patients_train_model)

# Plot the SHAP values
#shap.summary_plot(shap_values, diseases_patients_test)

In [None]:
# Mean absolute SHAP value along axis 0, plotted per position and coloured by log column sum.
shaps = np.mean(np.abs(shap_values), axis=0)
shap_positions = np.arange(len(shaps))
plt.scatter(shap_positions, shaps, c=color_values, cmap='viridis')
plt.ylabel('shap values')
plt.xlabel('phenotypes')
plt.colorbar()


In [None]:
# Indices sorted by decreasing mean |SHAP|.
shaps_ordered = np.flip(np.argsort(shaps))

In [None]:
# The 30 largest mean |SHAP| values, in decreasing order.
top_30_shaps = shaps[shaps_ordered[:30]]
plt.plot(top_30_shaps, 'o')

In [None]:
# Display the indices ranked by decreasing mean |SHAP|.
shaps_ordered

In [None]:
# NOTE(review): presumably maps the ranked column indices to phenotype names — confirm against get_name.
get_name(dataT, shaps_ordered)

In [None]:
# NOTE(review): presumably returns the column index of the named phenotype — confirm against get_indice.
get_indice(dataT, 'Actinic keratosis')

In [None]:
# Column sum of column 227 over the full data matrix (227 is a hard-coded index).
frequencies_ini[227]

In [None]:
# `get_list_names` is neither imported nor defined in this notebook; `get_name` is the helper
# used for the same lookup above.
# NOTE(review): assumes get_name returns an indexable sequence aligned with `shaps_ordered` — confirm.
get_name(dataT, shaps_ordered)[:15][9]

In [None]:
# Column sum of column 236 over the unscaled training matrix (236 is a hard-coded index).
frequencies[236]

In [None]:
# Keep the columns whose mean |SHAP| exceeds 0.005. `shaps` has one entry per feature column,
# so the count reported here is a number of phenotypes, not of patients.
indices_keep = shaps > 0.005
print(f'nb phenotypes keep = {indices_keep.sum()}')

### Random forests

In [None]:
# Seeded so the bootstrap samples and feature sub-sampling are reproducible across runs.
model = rf_classifier = RandomForestClassifier(
    class_weight='balanced', n_estimators=100, max_depth=10, random_state=42
)


# Train the model on the training set
model.fit(diseases_patients_train_model, label_patients_train)

# Predict hard labels and the probability of label 1 on both splits
labels_pred_test = model.predict(diseases_patients_test_model)
labels_pred_train = model.predict(diseases_patients_train_model)
proba_test = model.predict_proba(diseases_patients_test_model)[:, 1]
proba_train = model.predict_proba(diseases_patients_train_model)[:, 1]

In [None]:
# Boolean masks shared by all the rates below.
pred_is_0_train, pred_is_1_train = labels_pred_train == 0, labels_pred_train == 1
pred_is_0_test, pred_is_1_test = labels_pred_test == 0, labels_pred_test == 1
true_is_0_train, true_is_1_train = label_patients_train == 0, label_patients_train == 1
true_is_0_test, true_is_1_test = label_patients_test == 0, label_patients_test == 1

# NOTE(review): "positive" means *predicted label 0* here and "negative" predicted label 1;
# every rate is normalised by the number of predictions of that class.
nb_positive_train = np.sum(pred_is_0_train)
nb_negative_train = np.sum(pred_is_1_train)
nb_positive_test = np.sum(pred_is_0_test)
nb_negative_test = np.sum(pred_is_1_test)

TP_test = np.sum(true_is_0_test & pred_is_0_test) / nb_positive_test
FP_test = np.sum(true_is_1_test & pred_is_0_test) / nb_positive_test
TN_test = np.sum(true_is_1_test & pred_is_1_test) / nb_negative_test
FN_test = np.sum(true_is_0_test & pred_is_1_test) / nb_negative_test

TP_train = np.sum(true_is_0_train & pred_is_0_train) / nb_positive_train
FP_train = np.sum(true_is_1_train & pred_is_0_train) / nb_positive_train
TN_train = np.sum(true_is_1_train & pred_is_1_train) / nb_negative_train
FN_train = np.sum(true_is_0_train & pred_is_1_train) / nb_negative_train

auc_test = calculate_roc_auc(label_patients_test, proba_test)
auc_train = calculate_roc_auc(label_patients_train, proba_train)

# Mean probability assigned to the true label, per label and per split.
proba_avg_zero_test = 1 - np.mean(proba_test[true_is_0_test])
proba_avg_zero_train = 1 - np.mean(proba_train[true_is_0_train])
proba_avg_one_test = np.mean(proba_test[true_is_1_test])
proba_avg_one_train = np.mean(proba_train[true_is_1_train])

In [None]:
# Each tuple prints as `name=repr(value)`; a None entry prints the single-space spacer line.
metrics_report = [
    ('TP_test', TP_test),
    ('FP_test', FP_test),
    ('TN_test', TN_test),
    ('FN_test', FN_test),
    ('TP_train', TP_train),
    ('FP_train', FP_train),
    ('TN_train', TN_train),
    ('FN_train', FN_train),
    None,
    ('auc_test', auc_test),
    ('auc_train', auc_train),
    None,
    ('proba_avg_zero_test', proba_avg_zero_test),
    ('proba_avg_zero_train', proba_avg_zero_train),
    ('proba_avg_one_test', proba_avg_one_test),
    ('proba_avg_one_train', proba_avg_one_train),
]
for report_entry in metrics_report:
    if report_entry is None:
        print(' ')
        continue
    metric_name, metric_value = report_entry
    print(f'{metric_name}={metric_value!r}')

In [None]:
# Histogram of the predicted probability of label 1 on the training set.
predicted_probs_ones, true_labels = proba_train, np.array(label_patients_train)
plt.hist(predicted_probs_ones, bins=100)
plt.show()

prob_true, prob_pred = calibration_curve(true_labels, predicted_probs_ones, n_bins=80)
auc = calculate_roc_auc(true_labels, predicted_probs_ones)
# Calibration curve drawn against the diagonal of a perfectly calibrated model.
plt.plot(prob_pred, prob_true, marker='o', linestyle='--', label='Calibration Plot')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly Calibrated')
plt.title('Calibration Plot')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.legend()

### Analysis of the data

In [None]:
def get_F_pheno(data, labels, pheno_nb):
    """Rate of label 1 among rows where a column equals 0 and among rows where it equals 1.

    Args:
        data: 2-D array indexed as data[:, column].
        labels: 1-D array aligned with the rows of `data`.
        pheno_nb: 0-based index of the column to split on.

    Returns:
        (P0, P1): fraction of labels equal to 1 in the `column == 0` group and in the
        `column == 1` group. A group with no rows yields nan (without a divide warning).
    """
    pheno_column = data[:, pheno_nb]

    def _rate_of_ones(group_labels):
        # An empty group has no defined rate; return nan explicitly instead of dividing 0 by 0.
        if len(group_labels) == 0:
            return np.nan
        return np.sum(group_labels == 1) / len(group_labels)

    P0 = _rate_of_ones(labels[pheno_column == 0])
    P1 = _rate_of_ones(labels[pheno_column == 1])
    return P0, P1
def get_plots_F(data, labels):
    """Plot, per column of `data`, the rate of label 1 without and with that column set.

    Draws two figures with pyplot: the (P0, P1) pairs returned by get_F_pheno with a
    horizontal line at the majority-label share, then |P0 - P1| * 100 coloured by the
    log of the column sum. Also prints the number of columns retained.

    Args:
        data: 2-D array, one row per patient.
        labels: 1-D array aligned with the rows of `data`.

    Returns:
        Array with one (P0, P1) row per retained column.
    """
    nb_rows = len(data)
    frequencies = np.sum(data, axis=0) / nb_rows
    # A threshold of -1 on the column sums retains every column.
    seuil_frequencies = -1
    indices = frequencies * nb_rows > seuil_frequencies
    print(indices.sum())

    share_label_0 = np.sum(labels == 0) / len(labels)
    proba_mean = max(share_label_0, 1 - share_label_0)

    phenos = np.arange(len(data[0]))[indices]
    Fs = np.array([get_F_pheno(data, labels, pheno_nb) for pheno_nb in phenos])

    # First figure: P0 and P1 per column.
    plt.plot(Fs[:, 0], 'o')
    plt.plot(Fs[:, 1], 'o')
    plt.xlabel('phenotypes')
    plt.ylabel('probas label 1')
    plt.axhline(proba_mean)
    plt.legend(['P0', 'P1'])

    color_values = np.log(frequencies * nb_rows + 1)[indices]
    diff_p = np.abs(Fs[:, 0] - Fs[:, 1]) * 100

    # Second figure: absolute gap between the two rates, in percentage points.
    fig = plt.subplots(figsize=(10, 10))
    plt.scatter(np.arange(len(diff_p)), diff_p, c=color_values, cmap='viridis')
    plt.xlabel('phenos')
    plt.ylabel('diff_p')
    plt.colorbar()
    return Fs


In [None]:
# Run the per-column analysis on the unscaled training matrix.
Fs = get_plots_F(
    data=diseases_patients_train_model_unscaled,
    labels=label_patients_train,
)


In [None]:
# (P0, P1) row and training column sum at index 93 (hard-coded index).
Fs[93], frequencies[93]

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
features_names = phenos
# Train a decision tree classifier; seeded so tie-breaking between equally good splits
# is reproducible across runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(diseases_patients_train_model, label_patients_train)

# Predict hard labels and the probability of label 1 on both splits
labels_pred_test = model.predict(diseases_patients_test_model)
labels_pred_train = model.predict(diseases_patients_train_model)
proba_test = model.predict_proba(diseases_patients_test_model)[:, 1]
proba_train = model.predict_proba(diseases_patients_train_model)[:, 1]


In [None]:
# Boolean masks shared by all the rates below.
pred_is_0_train, pred_is_1_train = labels_pred_train == 0, labels_pred_train == 1
pred_is_0_test, pred_is_1_test = labels_pred_test == 0, labels_pred_test == 1
true_is_0_train, true_is_1_train = label_patients_train == 0, label_patients_train == 1
true_is_0_test, true_is_1_test = label_patients_test == 0, label_patients_test == 1

# NOTE(review): "positive" means *predicted label 0* here and "negative" predicted label 1;
# every rate is normalised by the number of predictions of that class.
nb_positive_train = np.sum(pred_is_0_train)
nb_negative_train = np.sum(pred_is_1_train)
nb_positive_test = np.sum(pred_is_0_test)
nb_negative_test = np.sum(pred_is_1_test)

TP_test = np.sum(true_is_0_test & pred_is_0_test) / nb_positive_test
FP_test = np.sum(true_is_1_test & pred_is_0_test) / nb_positive_test
TN_test = np.sum(true_is_1_test & pred_is_1_test) / nb_negative_test
FN_test = np.sum(true_is_0_test & pred_is_1_test) / nb_negative_test

TP_train = np.sum(true_is_0_train & pred_is_0_train) / nb_positive_train
FP_train = np.sum(true_is_1_train & pred_is_0_train) / nb_positive_train
TN_train = np.sum(true_is_1_train & pred_is_1_train) / nb_negative_train
FN_train = np.sum(true_is_0_train & pred_is_1_train) / nb_negative_train

accuracy_train = accuracy_score(label_patients_train, labels_pred_train)
accuracy_test = accuracy_score(label_patients_test, labels_pred_test)

auc_test = calculate_roc_auc(label_patients_test, proba_test)
auc_train = calculate_roc_auc(label_patients_train, proba_train)

# Mean probability assigned to the true label, per label and per split.
proba_avg_zero_test = 1 - np.mean(proba_test[true_is_0_test])
proba_avg_zero_train = 1 - np.mean(proba_train[true_is_0_train])
proba_avg_one_test = np.mean(proba_test[true_is_1_test])
proba_avg_one_train = np.mean(proba_train[true_is_1_train])

In [None]:
# Each tuple prints as `name=repr(value)`; a None entry prints the single-space spacer line.
metrics_report = [
    ('TP_test', TP_test),
    ('FP_test', FP_test),
    ('TN_test', TN_test),
    ('FN_test', FN_test),
    ('TP_train', TP_train),
    ('FP_train', FP_train),
    ('TN_train', TN_train),
    ('FN_train', FN_train),
    None,
    ('auc_test', auc_test),
    ('auc_train', auc_train),
    None,
    None,
    ('accuracy_test', accuracy_test),
    ('accuracy_train', accuracy_train),
    None,
    ('proba_avg_zero_test', proba_avg_zero_test),
    ('proba_avg_zero_train', proba_avg_zero_train),
    ('proba_avg_one_test', proba_avg_one_test),
    ('proba_avg_one_train', proba_avg_one_train),
]
for report_entry in metrics_report:
    if report_entry is None:
        print(' ')
        continue
    metric_name, metric_value = report_entry
    print(f'{metric_name}={metric_value!r}')

In [None]:
# One marker per feature: importances exposed by the estimator currently bound to `model`.
plt.plot(model.feature_importances_, 'o')

In [None]:
# One marker per column: column sums of the unscaled training matrix.
plt.plot(frequencies, 'o')

### Xgboost

In [None]:
import xgboost as xgb

In [None]:
# Créer un objet de modèle XGBoost
# Create an XGBoost classifier with default settings and train it on the training set.
model = xgb.XGBClassifier()
model.fit(diseases_patients_train_model, label_patients_train)

# Predict hard labels, then the probability of label 1, on both splits.
labels_pred_train, labels_pred_test = (
    model.predict(matrix)
    for matrix in (diseases_patients_train_model, diseases_patients_test_model)
)
proba_train, proba_test = (
    model.predict_proba(matrix)[:, 1]
    for matrix in (diseases_patients_train_model, diseases_patients_test_model)
)


In [None]:
# Boolean masks shared by all the rates below.
pred_is_0_train, pred_is_1_train = labels_pred_train == 0, labels_pred_train == 1
pred_is_0_test, pred_is_1_test = labels_pred_test == 0, labels_pred_test == 1
true_is_0_train, true_is_1_train = label_patients_train == 0, label_patients_train == 1
true_is_0_test, true_is_1_test = label_patients_test == 0, label_patients_test == 1

# NOTE(review): "positive" means *predicted label 0* here and "negative" predicted label 1;
# every rate is normalised by the number of predictions of that class.
nb_positive_train = np.sum(pred_is_0_train)
nb_negative_train = np.sum(pred_is_1_train)
nb_positive_test = np.sum(pred_is_0_test)
nb_negative_test = np.sum(pred_is_1_test)

TP_test = np.sum(true_is_0_test & pred_is_0_test) / nb_positive_test
FP_test = np.sum(true_is_1_test & pred_is_0_test) / nb_positive_test
TN_test = np.sum(true_is_1_test & pred_is_1_test) / nb_negative_test
FN_test = np.sum(true_is_0_test & pred_is_1_test) / nb_negative_test

TP_train = np.sum(true_is_0_train & pred_is_0_train) / nb_positive_train
FP_train = np.sum(true_is_1_train & pred_is_0_train) / nb_positive_train
TN_train = np.sum(true_is_1_train & pred_is_1_train) / nb_negative_train
FN_train = np.sum(true_is_0_train & pred_is_1_train) / nb_negative_train

accuracy_train = accuracy_score(label_patients_train, labels_pred_train)
accuracy_test = accuracy_score(label_patients_test, labels_pred_test)

auc_test = calculate_roc_auc(label_patients_test, proba_test)
auc_train = calculate_roc_auc(label_patients_train, proba_train)

# Mean probability assigned to the true label, per label and per split.
proba_avg_zero_test = 1 - np.mean(proba_test[true_is_0_test])
proba_avg_zero_train = 1 - np.mean(proba_train[true_is_0_train])
proba_avg_one_test = np.mean(proba_test[true_is_1_test])
proba_avg_one_train = np.mean(proba_train[true_is_1_train])

In [None]:
# Each tuple prints as `name=repr(value)`; a None entry prints the single-space spacer line.
metrics_report = [
    ('TP_test', TP_test),
    ('FP_test', FP_test),
    ('TN_test', TN_test),
    ('FN_test', FN_test),
    ('TP_train', TP_train),
    ('FP_train', FP_train),
    ('TN_train', TN_train),
    ('FN_train', FN_train),
    None,
    ('auc_test', auc_test),
    ('auc_train', auc_train),
    None,
    None,
    ('accuracy_test', accuracy_test),
    ('accuracy_train', accuracy_train),
    None,
    ('proba_avg_zero_test', proba_avg_zero_test),
    ('proba_avg_zero_train', proba_avg_zero_train),
    ('proba_avg_one_test', proba_avg_one_test),
    ('proba_avg_one_train', proba_avg_one_train),
]
for report_entry in metrics_report:
    if report_entry is None:
        print(' ')
        continue
    metric_name, metric_value = report_entry
    print(f'{metric_name}={metric_value!r}')

In [None]:
from sklearn.inspection import permutation_importance

# The sklearn permutation-importance path below is disabled, so `perm_importance` and
# `importance_scores` are never assigned by this cell.
# Calculate permutation importance
#perm_importance = permutation_importance(model, diseases_patients_train_model, label_patients_train)

# Access the importance scores
#importance_scores = perm_importance.importances_mean / perm_importance.importances_mean.sum()
# Use the importances exposed by the fitted estimator instead; this overwrites the
# `feature_importances` array computed by the manual permutation loop earlier in the notebook.
feature_importances = model.feature_importances_


In [None]:
# One marker per feature: importances taken from the estimator in the previous cell.
plt.plot(feature_importances, 'o')

In [None]:
# 0-based index of the feature with the largest importance.
np.argmax(feature_importances)

In [None]:
# For each column, mean row sum among the training rows where that column equals 1.
nb_diseases_mean_pheno = []
for i in range(diseases_patients_train_model.shape[1]):
    rows_with_pheno = diseases_patients_train_model_unscaled[
        diseases_patients_train_model_unscaled[:, i] == 1
    ]
    nb_diseases_mean_pheno.append(rows_with_pheno.sum(axis=1).mean())

In [None]:
# One row per feature column, one DataFrame column per statistic gathered above.
feature_stat_names = ['diff_p', 'accuracy', 'importance', 'shaps', 'nb_diseases', 'frequencies']
feature_stat_rows = [
    np.abs(Fs[:, 0] - Fs[:, 1]) * 100,
    accuracy_pheno_par(phenos),
    feature_importances,
    shaps,
    nb_diseases_mean_pheno,
    frequencies,
]
df_features = pd.DataFrame(data=np.array(feature_stat_rows).T, columns=feature_stat_names)

In [None]:
# Pairwise correlation between the per-feature statistics.
df_features.corr()

In [None]:
# `importance_scores` is only assigned in commented-out code, so referencing it raises
# NameError; `feature_importances` is the importance array actually computed above.
# NOTE(review): the 0.015 threshold was written for normalised permutation importances —
# confirm it still suits the estimator's own importances.
indices_keep = feature_importances > 0.015