<a href="https://colab.research.google.com/github/olivierduranteau/Neptune/blob/main/Method_Supplemental_material.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#################################################
#                                               #
# Written by Jean Urung and Olivier Duranteau   #
# Commented and annotated by Olivier Duranteau  #
# Corrected by Axel Abels and Benjamin Popoff   #
#                                               #
#################################################

###################################################################
# import of the different modules and display of the procedure    #
###################################################################


import pandas as pd         # a library for data manipulation and analysis, including tools for reading and writing data in various formats.
import numpy as np          # a library for numerical computing in Python.
import datetime             # a module for working with dates and times in Python.
from tqdm import tqdm       # a library for adding progress bars to Python loops and functions.
import pdb                  # a module for debugging Python code.
import os                   # a module for interacting with the operating system.
import sys                  # a module that provides access to some variables used or maintained by the interpreter and to functions that interact strongly with the interpreter.

import matplotlib.pyplot as plt                                 # a plotting library for Python.
from sklearn.impute import SimpleImputer                        # a class for imputing missing values in datasets.
from sklearn.experimental import enable_iterative_imputer       # a module that enables iterative imputation methods in scikit-learn.
from sklearn.impute import IterativeImputer,KNNImputer          # a class for iterative imputation of missing values & class for imputing missing values using k-nearest neighbors.
from sklearn.neighbors import KNeighborsRegressor               # The KNeighborsRegressor class is a regression model that uses k-nearest neighbors to predict a continuous output variable.
from sklearn.model_selection import train_test_split            # The train_test_split function is used to split your dataset into training and testing sets.
from sklearn.pipeline import make_pipeline                      # The make_pipeline function is used to assemble several steps that can be cross-validated together while setting different parameters.
from sklearn.preprocessing import Normalizer                    # The Normalizer class is used to normalize samples individually to unit norm.
from pandas.plotting import parallel_coordinates                # The parallel_coordinates function is a plotting utility for comparing many variables without subplots.
from sklearn.ensemble import *                                  # The sklearn.ensemble module contains ensemble-based methods for both classification and regression. These include methods such as Random Forests and Gradient Boosting.
from sklearn.tree import DecisionTreeRegressor                  # a class for building decision tree regression models.
from sklearn.neural_network import MLPRegressor                 # a class for building multi-layer perceptron regression models.
from sklearn.pipeline import make_pipeline                      # a function for constructing pipelines of estimators.
from sklearn.preprocessing import StandardScaler                # a class for scaling data by removing the mean and scaling to unit variance.
from sklearn.ensemble import RandomForestRegressor              # a class for building random forest regression models.
from sklearn.ensemble import GradientBoostingRegressor          # a class for building gradient boosting regression models.
from sklearn.linear_model import LinearRegression               # a class for building linear regression models.
from sklearn import datasets, linear_model                      # sklearn.datasets (sample datasets) and sklearn.linear_model (linear models such as LinearRegression).
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_percentage_error,mean_absolute_error        # regression metrics: mean squared error, R-squared score, mean absolute percentage error and mean absolute error.
from lineartree import *                                        # you'll need to install the linear-tree package
import math                                                     # The 'math' module provides mathematical functions, including trigonometric, logarithmic, and other special functions.
import warnings                                                 # The 'warnings' module is used to issue warning messages and to manage the process of handling those warnings in your code.
warnings.filterwarnings("ignore")

##############################################
# Removing nulls and duplicates              #
##############################################

# The cleanDataframe function uses pandas.DataFrame.dropna() and pandas.DataFrame.drop_duplicates() to clean the raw tables.
# It takes a DataFrame as input and first removes the rows in which every value is missing (axis=0, how="all"),
# then the columns in which every value is missing (axis=1, how="all"). Rows or columns that are only partially empty are kept.
# The drop_duplicates() method is then used to remove any duplicate rows from the DataFrame, and the cleaned DataFrame is returned.


def cleanDataframe(df):
    """Return a copy of ``df`` without empty rows/columns and without duplicate rows.

    Steps, in order:
      1. drop rows in which *every* value is missing (``axis=0, how="all"``);
      2. drop columns in which *every* value is missing (``axis=1, how="all"``);
      3. drop duplicated rows, keeping the first occurrence.

    Rows or columns that are only partially empty are kept. The input
    DataFrame is not modified.

    NOTE: ``drop_duplicates`` compares column values only; index labels are
    not taken into account when deciding whether two rows are duplicates.
    """
    without_empty_rows = df.dropna(axis=0, how="all")
    without_empty_columns = without_empty_rows.dropna(axis=1, how="all")
    return without_empty_columns.drop_duplicates()


###################################
# Cleaning of dates and times     #
###################################

# The formatTime function is a helper that takes a date/time string and parses it into a datetime.datetime object using the datetime.datetime.strptime() method, trying each candidate format in turn.
# If the string matches none of the given formats, it raises a ValueError. Non-string inputs are handled on a best-effort basis and yield NaN when they cannot be converted.

def formatTime(value, format=["%d/%m/%Y %H:%M", "%d/%m/%Y", "%d/%m/%Y "]):
    """Parse ``value`` into a ``datetime.datetime``.

    Parameters
    ----------
    value : str, datetime.datetime, iterable or anything else
        * ``str``: parsed with the first format in ``format`` that matches.
        * ``datetime.datetime`` (including subclasses): returned unchanged.
        * any other iterable: each element is parsed with ``formatTime`` and
          a list of results is returned.
        * anything else (e.g. a float NaN for a missing cell): ``np.nan``.
    format : str or sequence of str
        One ``strptime`` format or several candidate formats, tried in order.
        The default list is never mutated.

    Returns
    -------
    datetime.datetime, list or float
        The parsed value(s), or ``np.nan`` when a non-string input cannot be
        converted.

    Raises
    ------
    ValueError
        If ``value`` is a string that matches none of the formats.
    """
    # Accept a single format string as well as any sequence of formats.
    formats = [format] if isinstance(format, str) else list(format)

    if isinstance(value, str):
        for f in formats:
            try:
                return datetime.datetime.strptime(value, f)
            except ValueError:
                # This format does not fit; try the next candidate.
                continue
        raise ValueError("no time format fit with", value)

    if isinstance(value, datetime.datetime):
        # Already parsed: nothing to do.
        return value

    try:
        # The original code called an undefined `strToTime` here, so this
        # branch always fell through to NaN; recurse into formatTime instead.
        return [formatTime(v, formats) for v in value]
    except (TypeError, ValueError):
        # TypeError: value is not iterable (e.g. NaN for a missing cell).
        # ValueError: one element could not be parsed.
        # Non-string inputs are best-effort and never raise.
        return np.nan

#################################################################
# Retrieving the previous and next value                        #
# Using the pandas module                                       #
# Use of the formula pandas.index.duplicated                    #
# Defined with parameter (keep=)                                #
# Using the formula pandas.index.get_loc.                       #
# Defined with parameters (Key, method, tolerance)              #
#################################################################

# The function getDelaiValeurFromHour takes a DataFrame indexed by sampling time, a reference time (heure), the name of the value column and an optional flag to also return index positions.
# It returns a dictionary with the time delay and associated value for the closest sample before and after the reference time in the DataFrame.
# It first parses the DataFrame's index using the formatTime function, keeps only the first row for each duplicated time value (Index.duplicated) and sorts the rows by time.
# It then looks up the closest index position before and after the reference time and calculates the time delay and associated value for each of them; a side with no sample is simply left out of the dictionary.


def getDelaiValeurFromHour(df, heure, value_column_name='Valeur', return_index=False):
    """Return the closest measurement before and after a given time.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements indexed by sampling time. The index is parsed with
        ``formatTime``; the caller's DataFrame is left untouched.
    heure : datetime.datetime
        Reference time (e.g. the start of a transfusion).
    value_column_name : str
        Name of the column holding the measured value.
    return_index : bool
        When True, also return the integer position of each sample in the
        de-duplicated, time-sorted table.

    Returns
    -------
    dict
        May contain, for the last sample at or before ``heure``:
        ``'delai avant'`` (``heure`` minus sample time), ``'valeur avant'``
        and, if requested, ``'index avant'``; and for the first sample at or
        after ``heure``: ``'delai apres'`` (sample time minus ``heure``),
        ``'valeur apres'`` and ``'index apres'``. A side with no usable
        sample is omitted, so the dictionary can be empty.
    """
    d = {}

    # Build a new frame instead of assigning to df.index, so the caller's
    # object is not modified.
    df = df.set_axis(df.index.map(formatTime), axis=0)
    # Rows whose timestamp is missing cannot be ordered; they would make the
    # index non-monotonic and defeat every nearest-sample lookup below.
    df = df[df.index.notna()]
    # In case of duplicated timestamps keep only the first row, then sort by time.
    df = df[~df.index.duplicated(keep='first')].sort_index()

    # 'ffill' -> last sample at or before `heure`; 'bfill' -> first sample at or after.
    # Index.get_loc(..., method=...) no longer exists in pandas >= 2.0;
    # Index.get_indexer offers the same lookup and reports "not found" as -1.
    for suffix, method in (('avant', 'ffill'), ('apres', 'bfill')):
        try:
            position = int(df.index.get_indexer([heure], method=method)[0])
            if position < 0:
                # No sample on this side of `heure`.
                continue
            sample_time = df.index[position]
            delay = heure - sample_time if suffix == 'avant' else sample_time - heure
            value = df.iloc[position, df.columns.get_loc(value_column_name)]
        except (KeyError, TypeError, ValueError, IndexError):
            # Best effort: a missing column or an unusable `heure` (e.g. NaN)
            # just leaves this side out of the result.
            continue
        d['delai ' + suffix] = delay
        d['valeur ' + suffix] = value
        if return_index:
            d['index ' + suffix] = position
    return d

#########################################
# Definition of the working directory   #
#########################################


# Input CSV files are read from `datapath`; every generated file is written
# to its 'out' sub-directory, which is created on first run if necessary.
datapath = "."
outdir = os.path.join(datapath, "out")
os.makedirs(outdir, exist_ok=True)

##########################################################################################################
##########################################################################################################
###########                                  Biological value                    #########################
##########################################################################################################
##########################################################################################################

#############################################################
# Retrieving data from raw source files                     #
# transfusion.csv provides the patient name ('Nom') and the #
# prescription type; calcium.csv the calcium measurements   #
#############################################################
# Transfusions, indexed by the 2nd and 3rd CSV columns (used below as the
# 'Nom' and 'Parametre principal de la prescription' index levels).
csv_transfusion = cleanDataframe(
    pd.read_csv(
        os.path.join(datapath, "transfusion.csv"),
        sep=';',
        encoding="latin-1",
        header=0,
        index_col=[1, 2],
    )
)
patients_inTransfusion = csv_transfusion.index.get_level_values('Nom').unique().astype(str)

# Calcium measurements: the header row is the 6th line of the file and the
# index is built from columns 0, 3 and 9, renamed right after loading.
csv_calcium = cleanDataframe(
    pd.read_csv(
        os.path.join(datapath, "calcium.csv"),
        sep=';',
        encoding="latin-1",
        header=5,
        index_col=[0, 3, 9],
    )
)
csv_calcium.index.names=['Nom du patient', 'Nom du paramètre', 'Heure']
patients_inCalcium = csv_calcium.index.get_level_values('Nom du patient').unique().astype(str)

# Same transfusion file again, this time with a default integer index and
# without cleaning; it is merged on the 'Nom' column later on.
csv_transfu = pd.read_csv(
    os.path.join(datapath, "transfusion.csv"),
    sep=';',
    encoding="latin-1",
)

######################################################################

transfusion_types = csv_transfusion.index.get_level_values('Parametre principal de la prescription').unique()

# Maps (patient, transfusion id) -> {(parameter, field): value}; turned into
# the `outdf` DataFrame once the loop is finished.
patient_parameter_levels = {}

# For every transfusion of every patient, collect the calcium values measured
# just before and just after the start of the transfusion.
for patient in tqdm(patients_inTransfusion, desc="Patient iteration"):
    # The goal is to predict calcium, so patients without any calcium
    # measurement are skipped straight away.
    if patient not in patients_inCalcium:
        continue
    sub_calcium = csv_calcium.loc[patient]
    unique_param_names = sub_calcium.index.get_level_values('Nom du paramètre').unique()

    # Running number of transfusions seen so far for this patient, per type.
    count_per_transfusion_type = dict.fromkeys(transfusion_types, 0)

    # All transfusion rows of the current patient, indexed by transfusion type.
    sub_trans = csv_transfusion.loc[patient]
    for transfusion_type, entry in sub_trans.iterrows():
        count_per_transfusion_type[transfusion_type] += 1
        # "<type> <running number>" uniquely identifies a transfusion of this patient.
        transfusion_id = ' '.join((transfusion_type, str(count_per_transfusion_type[transfusion_type])))
        trans_heure = formatTime(entry["Heure de debut"])

        levels = {('heure', ''): trans_heure}
        patient_parameter_levels[(patient, transfusion_id)] = levels

        for param in unique_param_names:
            nearest_samples = getDelaiValeurFromHour(sub_calcium.loc[param], trans_heure)
            for delay, value in nearest_samples.items():
                # Store '.TP %' under the name 'TP %'.
                if param == ".TP %":
                    param = "TP %"
                levels[(param, delay)] = value

# Build one row per (patient, transfusion id): from_dict puts each dictionary key in a column, the transpose turns those keys into the row index.
outdf = pd.DataFrame.from_dict(patient_parameter_levels).T
# Sort the rows by index in descending order.
outdf = outdf.sort_index(ascending=False)
# Insert a constant "Centre" column (value 'PSL') in first position; allow_duplicates=False makes insert fail if that column already exists.
outdf.insert(0, "Centre", 'PSL', allow_duplicates=False)

# Move the (patient, transfusion id) index into regular columns.
ca_data = outdf.reset_index()
# Positional renaming to flat column names.
# NOTE(review): this assumes exactly 12 columns in exactly this order, which depends on the order in which parameters and before/after values were inserted in patient_parameter_levels - confirm against the data.
ca_data.columns = ['Nom', 'Type', 'centre', 'heure', 'ca_del_avt', 'ca_avt', 'ca_del_ap', 'ca_ap', 'caio_del_ap', 'caio_ap', 'caio_del_avt', 'caio_avt']

# Keep only the first 8 characters of the transfusion id.
# NOTE(review): presumably meant to strip the running number appended in the loop above; this only works if every type name is at least 8 characters long - confirm.
ca_data['Type'] = ca_data['Type'].astype(str).str[:8]

# Independent copy used for the counting step below.
ca_data_df = ca_data.copy()

# Number of rows for each (Nom, ca_avt, Type) combination.
ca_data_groupby = ca_data_df.groupby(by=['Nom', 'ca_avt', 'Type']).size()
# Series -> single-column DataFrame.
ca_data_groupby_df = ca_data_groupby.to_frame()
# One row per (Nom, ca_avt), one column per Type, each cell holding the summed count.
ca_data_groupby_df_pivot = pd.pivot_table(ca_data_groupby_df, index=['Nom', 'ca_avt'], columns=['Type'], aggfunc=np.sum)
# Attach the per-type counts to the original rows via the Nom and ca_avt keys.
ca_merged = ca_data_groupby_df_pivot.merge(ca_data, on=['Nom','ca_avt'])
# Remove the Type column and the four caio_* columns.
ca_drop = ca_merged.drop(['Type', 'caio_del_ap', 'caio_ap', 'caio_del_avt', 'caio_avt'], axis=1)
# Keep the first row for each (Nom, ca_avt) pair.
ca_final = ca_drop.drop_duplicates(subset =['Nom', 'ca_avt'])
# Positional renaming of columns 2 to 11 to the blood-product names.
# NOTE(review): assumes the pivot produced exactly these ten type columns in this order - confirm against the data.
ca_final = ca_final.rename(columns={ ca_final.columns[2]: "CGR", ca_final.columns[3]: "ATIII", ca_final.columns[4]: "Cell_Saver",
                                    ca_final.columns[5]: "Clottafac", ca_final.columns[6]: "CPA", ca_final.columns[7]: "Novoseven" ,
                                    ca_final.columns[8]: "Fibrinogene", ca_final.columns[9]: "Melange_plq", ca_final.columns[10]: "PPSB",
                                    ca_final.columns[11]: "PFC"})
# A missing count means no row of that type for this (Nom, ca_avt) pair: replace NaN with 0 in the ten product columns.
ca_final[['CGR', 'ATIII', 'Cell_Saver', 'Clottafac', 'CPA', 'Novoseven',
        'Fibrinogene', 'Melange_plq', 'PPSB', 'PFC']] = ca_final[['CGR', 'ATIII', 'Cell_Saver', 'Clottafac', 'CPA', 'Novoseven',
                                                                        'Fibrinogene', 'Melange_plq', 'PPSB', 'PFC']].fillna(0)
# Replace decimal commas with dots in the calcium values. The columns remain strings here; no conversion to float is done in these two lines.
ca_final['ca_avt'] = ca_final['ca_avt'].str.replace(',','.')
ca_final['ca_ap'] = ca_final['ca_ap'].str.replace(',','.')

# Columns shared by every merge below: the ten blood-product counts, plus the
# patient name as merge key.
product_columns = ['CGR', 'ATIII', 'Cell_Saver', 'Clottafac', 'CPA', 'Novoseven',
                   'Fibrinogene', 'Melange_plq', 'PPSB', 'PFC']
merge_keys = ['Nom'] + product_columns
dedup_keys = ['Nom', 'ca_avt', 'ca_ap']

# Thresholds as real Timedelta objects: comparing a column of Timedelta
# values with a string such as "0 days 06:00:00" only works when the column
# happens to have a timedelta64 dtype.
max_sample_delay = pd.Timedelta(hours=6)
one_day = pd.Timedelta(days=1)
one_week = pd.Timedelta(days=7)

# Attach the raw transfusion columns (incision time, sex, weight, height, ...)
# to the per-patient calcium table, then drop the columns that are not needed.
df3 = ca_final.merge(csv_transfu, on='Nom', how='inner')
df3 = df3.drop_duplicates(subset=dedup_keys)
df3 = df3.drop(['NIPP', 'Parametre principal de la prescription', 'Quantite donnee', 'Quantite donnee UnitName',
                'Heure de debut', 'Heure de fin', 'date_TH', 'heure fermeture',
                '3Lprenom', '3Lnom'], axis=1)

# ca_avt_2: calcium before transfusion, only when it was sampled at most 6 h before.
df4 = df3[pd.to_timedelta(df3["ca_del_avt"]) <= max_sample_delay]
df5 = df4[['Nom', 'ca_avt'] + product_columns].rename(columns={'ca_avt': 'ca_avt_2'})
df6 = df3.merge(df5, on=merge_keys, how='outer')
df6 = df6.drop_duplicates(subset=dedup_keys)

# ca_ap_2: calcium after transfusion, only when it was sampled at most 6 h after.
df7 = df6[pd.to_timedelta(df6["ca_del_ap"]) <= max_sample_delay]
df8 = df7[['Nom', 'ca_ap'] + product_columns].rename(columns={'ca_ap': 'ca_ap_2'})
df9 = df6.merge(df8, on=merge_keys, how='outer')
df9 = df9.drop_duplicates(subset=dedup_keys)

df9.to_csv(os.path.join(outdir, "calcium_total2.csv"), sep='\t', encoding="latin-1")

# ca_ap_3: same as ca_ap_2 without the rows whose value is the string "0".
df10 = df9[df9["ca_ap_2"] != "0"]
df11 = df10[['Nom', 'ca_ap_2'] + product_columns].rename(columns={'ca_ap_2': 'ca_ap_3'})
df12 = df9.merge(df11, on=merge_keys, how='outer')
df12 = df12.drop_duplicates(subset=dedup_keys)

# ca_avt_3: same as ca_avt_2 without the rows whose value is the string "0".
df13 = df12[df12["ca_avt_2"] != "0"]
df14 = df13[['Nom', 'ca_avt_2'] + product_columns].rename(columns={'ca_avt_2': 'ca_avt_3'})
df15 = df12.merge(df14, on=merge_keys, how='outer')
df15 = df15.drop_duplicates(subset=dedup_keys)

# Explicit copy: new columns are assigned on `df` just below.
df = df15[['Nom'] + product_columns + ['centre', 'heure', 'heure incision', 'Sexe', 'poids', 'taille',
                                      'ca_del_avt', 'ca_del_ap', 'ca_ap', 'ca_avt', 'ca_ap_3', 'ca_avt_3']].copy()

# NOTE(review): 'heure' was parsed day-first by formatTime ("%d/%m/%Y ...");
# pd.to_datetime is called here without dayfirst=True - confirm the format
# of the 'heure incision' column.
df['heure incision'] = pd.to_datetime(df['heure incision'])
# Delay between incision and transfusion, as a proper timedelta column.
df['delais'] = pd.to_datetime(df['heure']) - df['heure incision']

# J1: transfusions up to 1 day after incision; S1: between 1 and 7 days.
# NOTE(review): a delay of exactly 1 day falls in both J1 and S1, as in the
# original code - confirm whether that is intended.
dfJ1 = df[df['delais'] <= one_day]
dfS1_temp = df[df['delais'] >= one_day]
dfS1 = dfS1_temp[dfS1_temp['delais'] <= one_week]

df.to_csv(os.path.join(outdir, "calcium_total.csv"), sep='\t', encoding="latin-1")
dfJ1.to_csv(os.path.join(outdir, "calcium_J1.csv"), sep='\t', encoding="latin-1")
dfS1.to_csv(os.path.join(outdir, "calcium_S1.csv"), sep='\t', encoding="latin-1")

################################################################################################################################
# Only rows with both a usable "before" and "after" calcium value can be used.
dfJ1 = dfJ1.dropna(subset=['ca_avt_3', 'ca_ap_3'])
dfS1 = dfS1.dropna(subset=['ca_avt_3', 'ca_ap_3'])

# Train and compare the same set of regression models on each period.
for (period_name, period_df) in (("J1", dfJ1), ("S1", dfS1)):
    print("#"*100)
    print("Traitement de la période", period_name)
    print("-"*50)

    # Features: the ten blood-product counts plus the calcium level before transfusion.
    X = period_df[['CGR', 'ATIII', 'Cell_Saver', 'Clottafac', 'CPA', 'Novoseven', 'Fibrinogene', 'Melange_plq', 'PPSB', 'PFC', 'ca_avt_3']]
    # Target: the calcium level after transfusion.
    y = period_df['ca_ap_3']

    # TODO (Axel): with a row-wise split the same patient can end up in both
    # the train and the test set; a split grouped by patient would avoid that.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # The calcium columns were produced with .str.replace, i.e. they hold
    # strings: convert the targets to float arrays once. This also replaces
    # `y_train[:, None]`, which is no longer supported on a pandas Series.
    y_train_values = np.asarray(y_train, dtype=float)
    y_test_values = np.asarray(y_test, dtype=float)

    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train_values[:, None])    # StandardScaler expects a 2D array

    # The scaled data is identical for every model: compute it once.
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    y_train_scaled = y_scaler.transform(y_train_values[:, None]).ravel()

    tree = DecisionTreeRegressor()
    # Multi-layer perceptron with two hidden layers of 100 units, preceded by its own scaler.
    mlp = make_pipeline(StandardScaler(), MLPRegressor(hidden_layer_sizes=(100, 100), tol=1e-2, max_iter=500, random_state=0),)
    rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
    # NOTE: this is scikit-learn's GradientBoostingRegressor; only the label says "XGBoost".
    XGboost = GradientBoostingRegressor(random_state=0)
    linregr = linear_model.LinearRegression()
    # NOTE: k-nearest-neighbours regressor; the 'kmeans' label below is kept as is.
    kneighbors = KNeighborsRegressor()
    lintree = LinearForestRegressor(base_estimator=LinearRegression())

    models = { 'tree':tree,'MLP':mlp,'RandomForest':rf,'XGBoost':XGboost,'Linear Model':linregr,'kmeans':kneighbors,'lintree':lintree}
    # VotingRegressor takes a list of (name, estimator) tuples; the original
    # ('tree':tree) form is not valid Python.
    average = VotingRegressor([('RandomForest', rf), ('XGBoost', XGboost), ('lintree', lintree), ('tree', tree),
                               ('MLP', mlp), ('kmeans', kneighbors), ('Linear Model', linregr)])

    models["average"] = average

    colors =  { name:f'C{i}' for i,name in enumerate(models)}
    model_metric_list=[]
    for name,model in models.items():

        model.fit(X_train_scaled, y_train_scaled)
        # Predictions are made on the scaled target; bring them back to the original unit.
        model_prediction = y_scaler.inverse_transform(model.predict(X_test_scaled)[:, None]).ravel()

        print("graphique et valeurs pour ",name)
        plt.title(name)
        # Measured vs predicted value (the original drew the same points twice).
        plt.scatter(y_test_values, model_prediction, color=colors[name])

        plt.axis('equal') # use same scale for x and y axis, makes comparison between figures easier
        plt.show()
        R2 = r2_score(y_test_values, model_prediction)
        MSE = mean_squared_error(y_test_values, model_prediction)
        RMSE = math.sqrt(MSE)
        # Sample standard deviation of the measured values (`stdev` was never imported).
        sd = float(np.std(y_test_values, ddof=1))
        print("Coefficient of determination: %.5f" % R2)
        print("Root Mean squared error: %.5f" % RMSE)
        print("Standard deviation %.5f" % sd)
        model_metric_list.append([name,-R2,RMSE, sd]) # negating R2 since, unlike other metrics it's something we want to maximize
        print()

    # Overall ranking: sum of the per-metric ranks of each model.
    print("Model average rank for", period_name)
    metric_df = pd.DataFrame(model_metric_list,columns=["model","R2","RMSE", "sd"])
    average_ranks = metric_df[["R2","RMSE","sd"]].rank().sum(axis=1)
    model_ranks = np.argsort(np.argsort(average_ranks.to_numpy()))
    for model,average_rank,model_rank in zip(metric_df.model.values,average_ranks,model_ranks):
        print(model,"average rank:",average_rank,"model rank:",model_rank)
    print()

    # display ranking for the different models and according to different metrics
    metric_df_rank = metric_df.copy()
    metric_df_rank[['R2','RMSE','sd']]=metric_df_rank[['R2','RMSE','sd']].rank()
    metric_df_rank = metric_df_rank.sort_values(by='R2',ascending=False)
    parallel_coordinates(metric_df_rank,'model',colormap='tab20')
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.title("Model ranking per metric")
    plt.show()

    print("#"*100)

# Demographic / timing columns together with the two calcium values, split
# 75 % / 25 % with a fixed seed.
X_demo = df[[
    'heure',
    'Sexe',
    'poids',
    'taille',
    'heure incision',
    'ca_ap_3',
    'ca_avt_3',
]]
X_demo_train, X_demo_test = train_test_split(
    X_demo,
    test_size=0.25,
    random_state=42,
)