# The French Agence de la biomedecine (ABM): data processing
- Topic: Rank Order of Candidates for Heart Transplantation in France: An Explainable Machine Learning analysis
- Authors : Martin Prodel (MS, PhD), Benoit Audry (MS)
- Created in 2025 

## Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

# home-made functions
from analyses import correlation_pearson, correlation_spearman, plot_delai, plot_rank

In [None]:
%load_ext autoreload
%autoreload 2

## Import data

In [None]:
# Import the data from a pickle file : each row is the ranking of a candidate patient for of a given donor (heart)
df = pd.read_pickle("../data/df_formatted_complete.pkl")
df.info(memory_usage="deep")

## EDA & Preprocessing

### Exploratory Data Analysis (EDA) 

In [None]:
# Simple EDA with pandas profiling
profile = ProfileReport(df, title="Pandas Profiling Report")

# Exporting the report to a file
profile.to_file("../results/eda_report.html")

### Preprocessing (1/3)

In [None]:
remove_col = ["ETATLA", "MALADI3", "MALADI2"]  # useless columns
df = df.drop(columns=remove_col)
df.head()

In [None]:
# You may need to silent a certain warning, uncomment the following if needed
# pd.set_option('future.no_silent_downcasting', True)

# Encoding categorical variables : gender of the donor (SEXD) and of the receiver (SEXER)
df.SEXD = df.SEXD.replace({"M": 1, "F": 2}).astype(int)
df.SEXER = df.SEXER.replace({"M": 1, "F": 2}).astype(int)

In [None]:
# Fix the ranking system for "not ranked patients": all patients with a SCORE_C of zero are randomly ranked after the last ranked patient,
# but there is no medical sense in ordering them.
# Solution: assign all these patients the same rank, which is the worse rank observed for this donor with a score of zero
# Example: for a donor, if 10 recipients are ranked, including 4 with a score of 0, then these 4 patients are assigned at rank 6 (and not 7, 8, 9, 10)

rang_min_zero = (
    df[df.SCORE_C == 0]
    .groupby("IDD")["Rang"]
    .min()
    .reset_index()
    .rename(columns={"Rang": "rang_min_zero_score"})
)
df = df.merge(rang_min_zero, on="IDD", how="left")


In [None]:
# Create a column "Rang_clipped" which is equal to "Rang" except for patients with SCORE_C of 0, where it is equal to rang_min_zero_score
df["Rang_clipped"] = df["rang_min_zero_score"].where(df.SCORE_C == 0, df["Rang"])

In [None]:
### Step 1 : Re-scale rank from 0 to 1 for each heart (IDD) : min-max scaling (0 to 1 Linear Transformation)
# Pour chaque donneur, connaître le rang min (qui doit être 1 systématiquement) et le rang max attribué (dépend du nombre de personnes en attente à ce moment là)
bounds_rank = (
    df.groupby("IDD")["Rang_clipped"]
    .agg(["min", "max"])
    .reset_index()
    .rename(columns={"min": "rang_min_D", "max": "rang_max_D"})
)
df = df.merge(bounds_rank, on="IDD", how="left")
# min max scale
df["Rang_rescaled"] = (df["Rang_clipped"] - df["rang_min_D"]) / (
    df["rang_max_D"] - df["rang_min_D"]
)
display(df.head())

# Also, we want 1 to be the better outcome, and 0 the worse, so let's reverse it
df["Rang_rescaled"] = 1 - df["Rang_rescaled"]

### Step 2: Power transformation (Non-Linear Scaling)
p = 2  # Power factor (adjust for desired effect)
df["Rang_rescaled"] = df["Rang_rescaled"] ** p

# remove useless columns
df = df.drop(columns=["rang_min_zero_score", "Rang", "rang_min_D", "rang_max_D"])

In [None]:
# machine learning target
target = "Rang_rescaled"

### Missings values

In [None]:
print(df.CEC2.value_counts())
df["CEC2"] = df["CEC2"].fillna("N")
print(df.CEC2.value_counts())

In [None]:
print(df.CAT2.value_counts())
df["CAT2"] = df["CAT2"].fillna("N")
print(df.CAT2.value_counts())
df.CAT2 = df.CAT2.replace({"?": "N"})
print(df.CAT2.value_counts())

In [None]:
print(df.DIA2.value_counts())
df["DIA2"] = df["DIA2"].fillna("N")
print(df.DIA2.value_counts())
df.DIA2 = df.DIA2.replace({"?": "N"})
print(df.DIA2.value_counts())

In [None]:
print(df.DIA_AVI.value_counts())
df["DIA_AVI"] = df["DIA_AVI"].fillna("N")
print(df.DIA_AVI.value_counts())
df.DIA_AVI = df.DIA_AVI.replace({"?": "N"})
print(df.DIA_AVI.value_counts())

In [None]:
print(df.DRG2.value_counts())
df["DRG2"] = df["DRG2"].fillna("N")
print(df.DRG2.value_counts())
# If there is another device support (CEC, CAT, SIAV), then we set the drugs to N (="No")
df["DRG2"] = np.where(df["CEC2"] == "O", "N", df["DRG2"])
print(df.DRG2.value_counts())
df["DRG2"] = np.where(df["CAT2"] == "O", "N", df["DRG2"])
print(df.DRG2.value_counts())
df["DRG2"] = np.where(df["SIAV2"].isna(), df["DRG2"], "N")
print(df.DRG2.value_counts())

### Preprocessing (2/3)

In [None]:
# Addition of a composite variable of 3 others: glomerular filtration rate (GFR)
# Warning: very high GFR values are obtained for infants (aged between 1 and 3 years)
df["DFG"] = (
    186.3
    * (df["CREAT2"] / 88.4) ** (-1.154)
    * df["AGER"] ** (-0.203)
    * df["SEXER"].mask(df["SEXER"] == 2, 0.742)
)

# Note: there are missing values for this calculated variable DFG, when CREAT2 is NaN  (N=4961 rows in the df)
# # df[df.DFG.isna()][["CREAT2", "AGER", "SEXER"]]

print(df.DFG.describe())
# Imputation of DFG to 15 when DIA2="O"
print(df.DIA2.value_counts())
df["DFG"] = np.where(df["DIA2"] == "O", 15, df["DFG"])
print(df.DFG.describe())

In [None]:
# Same as the cell above, but for the variables with suffix _AVI
df["DFG_AVI"] = (
    186.3
    * (df["CRE_AVI"] / 88.4) ** (-1.154)
    * df["AGER"] ** (-0.203)
    * df["SEXER"].mask(df["SEXER"] == 2, 0.742)
)

print(df.DFG_AVI.describe())
print(df.DIA_AVI.value_counts())
df["DFG_AVI"] = np.where(df["DIA_AVI"] == "O", 15, df["DFG_AVI"])
print(df.DFG_AVI.describe())

In [None]:
# useless columns
drop_crea_cols = [
    "CREAT2",
    "DelaiCREAT2",
    "DIA2",
    "CRE_AVI",
    "DIA_AVI",
]

In [None]:
# Decision for variables DecilePEPT vs. CEC2, CAR2, SIAV2: set DecilePEPT to NaN if any of these variables have certain values.
# This affects 7,515 rows in the data.
# If (CEC2=‘O’ OR CAT2=‘O’ OR SIAV2=‘BV’) then DECILE_PEPT=NaN
# df.CAT2 = df.CAT2.astype(str)
df["DecilePEPT"] = df.DecilePEPT.mask(
    (df.CEC2 == "O") | (df.CAT2 == "O") | (df.SIAV2 == "BV"), other=np.nan
)
df[df.DecilePEPT.isna()]

In [None]:
# Set to NA also when BNP2 and PROBNP2 are NA
print(df.DecilePEPT.value_counts())
df["DecilePEPT"] = np.where(
    df["BNP2"].isna() & df["PROBNP2"].isna(), np.nan, df["DecilePEPT"]
)
print(df.DecilePEPT.value_counts())
df[df.DecilePEPT.isna()]

In [None]:
# Decision for variables DecilePEPT vs. CEC2, CAR2, SIAV2: set DecilePEPT to NaN if any of these variables have certain values.
# This affects 7,515 rows in the data.
# If (CEC2=‘O’ OR CAT2=‘O’ OR SIAV2=‘BV’) then DECILE_PEPT=NaN
# df.CAT2 = df.CAT2.astype(str)
df["DecilePEPT_AVI"] = df.DecilePEPT.mask(
    (df.CEC2 == "O") | (df.CAT2 == "O") | (df.SIAV2 == "BV"), other=np.nan
)
df[df.DecilePEPT.isna()]

In [None]:
# Also set to NA when BNP2 and PROBNP2 are NA
print(df.DecilePEPT_AVI.value_counts())
df["DecilePEPT_AVI"] = np.where(
    df["BNP_AVI"].isna() & df["PBN_AVI"].isna(), np.nan, df["DecilePEPT_AVI"]
)
print(df.DecilePEPT_AVI.value_counts())
df[df.DecilePEPT_AVI.isna()]

In [None]:
# Transform the variable MALADI into a categorical variable with 4 categories
print(df["MALADI"].value_counts())
df["MAL"] = "Other"
df.loc[df["MALADI"] == 159, "MAL"] = "Coronary artery disease"
df.loc[df["MALADI"] == 162, "MAL"] = "Valvular or Congenital heart disease"
df.loc[df["MALADI"] == 163, "MAL"] = "Valvular or Congenital heart disease"
df.loc[df["MALADI"] == 815, "MAL"] = "Valvular or Congenital heart disease"
df.loc[df["MALADI"] == 151, "MAL"] = "Dilated cardiomyopathy"
df.loc[df["MALADI"] == 152, "MAL"] = "Dilated cardiomyopathy"
df.loc[df["MALADI"] == 154, "MAL"] = "Dilated cardiomyopathy"
df.loc[df["MALADI"] == 155, "MAL"] = "Dilated cardiomyopathy"
df.loc[df["MALADI"] == 164, "MAL"] = "Dilated cardiomyopathy"
print(df["MAL"].value_counts())

In [None]:
# Merge CAT and BV
print(df.SIAV2.value_counts())
print(df.CAT2.value_counts())
df["CAT_BV"] = np.where(df["SIAV2"] == "BV", "O", df["CAT2"])
print(df.CAT_BV.value_counts())
df["SIAV2"] = np.where(df["SIAV2"] == "BV", np.nan, df["SIAV2"])
print(df.SIAV2.value_counts())

In [None]:
## Modify ADULTURGENCE to have SIAV
print(df.ALLOC.value_counts())
print(df.SIAV2.value_counts())
df["ALLOC"] = np.where(
    (df["ALLOC"] == "ADULTURG") & (np.invert(df["SIAV2"].isna())),
    "ADULTURG_AV",
    df["ALLOC"],
)
print(df.ALLOC.value_counts())

In [None]:
# Set SIAV2 to NA when ALLOC is in emergency mode
print(df.SIAV2.value_counts())
df["SIAV2"] = np.where(df["ALLOC"] == "ADULTURG_AV", np.nan, df["SIAV2"])
print(df.SIAV2.value_counts())

In [None]:
# Analysis of missing values
missing_data = df.isnull().mean() * 100  # % of missing values
missing_by_target = df.groupby(target).apply(lambda group: group.isnull().mean() * 100)

# Display missing data
print("\nGlobal percentage of missing values:\n", missing_data)
print("\nPercentage of missing values by target group:\n", missing_by_target)

# Visualisation of missing values
plt.figure(figsize=(15, 6))
sns.heatmap(df.isnull(), cbar=False, cmap="viridis")
plt.title("Map of missing values")
plt.show()

### Correlations: pearson

In [None]:
correlation_pearson(df, target)

### Correlations: spearman

In [None]:
# (warning) 2 min of computation time
correlation_spearman(df, target)

### Preprocessing (3/3)

In [None]:
# Number of patients (reveivers) for each heart
df.groupby("IDD")["IDR"].count().describe()

In [None]:
# Number of times a patient was ranked while waiting for a heart
df.groupby("IDR")["IDD"].count().describe()

In [None]:
# Search for the patients who were ranked a lot of times ... (it migth biased the ML algorithms)
x = df.groupby("IDR")["IDD"].count().reset_index()

In [None]:
# Max value of TimelineD for each patient
delai = df.groupby("IDR")["TimelineD"].agg(["max"]).reset_index().astype(int)

In [None]:
delai = df.groupby("IDR")["TimelineD"].agg(["min", "max"]).reset_index().astype(int)
delai["delai"] = delai["max"] - delai["min"]
delai["delai"].describe()

In [None]:
plot_delai(delai)

In [None]:
x_range = [-0.05, 1.05]
xdtick = 0.1
target_col = "Rang_rescaled"
plot_rank(df, target_col, x_range, xdtick, color="goldenrod", nbins=150)

In [None]:
# First rank when entering the waiting list
# Step 1 : find the row with the earliest time for each patient
first = df.loc[df.groupby("IDR")["TimelineD"].idxmin()]

# Step 2 : keep only the IDR columns and the corresponding rank
first = first[["IDR", target]]

# Step 3 : reset the index for a clean presentation
first = first.reset_index(drop=True)

plot_rank(first, target, x_range, xdtick, color="mediumvioletred", nbins=50)
first[target].describe()

In [None]:
# Study of the best rank ever obtained for each patient
# Warning : if the rank is rescaled in 0-1, then the "best rank" is 1, so we use the max function
best = first.merge(df.groupby("IDR")[target].max().reset_index(), on="IDR").rename(
    columns={f"{target}_x": "1st rank", f"{target}_y": "Best rank"}
)
plot_rank(best, "Best rank", x_range, xdtick, color="seagreen", nbins=90)
display(best.head())
best["Best rank"].describe()

In [None]:
# New variable : number of patients for each donor (heart), which is an estimation of the competition for this heart
nb_pat_for_idd = (
    df[df.SCORE_C > 0]
    .groupby("IDD")["IDR"]
    .count()
    .reset_index()
    .rename(columns={"IDR": "nb_pat_for_idd"})
)
df = df.merge(nb_pat_for_idd, on="IDD", how="left")
nb_pat_for_idd.head()

In [None]:
df.nb_pat_for_idd.describe()

### Feature selection

In [None]:
# Very important decision : we will only train the ML algorithms on patients who have a SCORE_C > 0
# Indeed, patients with a SCORE_C of 0 are not really ranked nor suited for the transplant

df_sup0 = df[df.SCORE_C > 0].copy()

In [None]:
# All observations in the datasets were given the same weight
df_sup0["observation_weight"] = 1

In [None]:
# export in pickle for quicker reuse (and drop useless columns)
df_sup0.drop(columns=["Rang_clipped", "MALADI"] + drop_crea_cols).to_pickle(
    "../data/df_for_ml_article.pkl"
)

In [None]:
# Redo the Pandas Profiling after the preprocessing
profile = ProfileReport(df, title="Pandas Profiling Report of 'df_for_article'")
# Exporting the report to a file
profile.to_file("../results/eda_report_df_for_article.html")