## Import

In [23]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
from statannotations.Annotator import Annotator
from lifelines import KaplanMeierFitter
from lifelines.utils import median_survival_times

In [24]:
df = pd.read_excel("./data/df_for_analysis.xlsx",index_col=0)

## Utils function

In [25]:
def keep_weight_post_infection(x,weight_end = "weight_T13"):
    """
    Given a DataFrame `x`, returns a Series containing the weights post-infection.

    *Arguments*
    - x: DataFrame containing the data.
    - weight_end: Column name of the last weight measurement.

    *Returns*
    - shifted_series: Series containing the weights post-infection.
    """
    # Extract relevant columns from the DataFrame
    dates = x['Dates']
    t_infection = x['Time_infection']
    datas = x['weight_T_infection':weight_end]

    # Find the date closest to the infection time
    new_time_infection = dates[dates <= t_infection][-1]
    location_of_TI = dates.get_loc(new_time_infection)

    # Return the original series if the data at the infection time is NaN
    if np.isnan(datas[location_of_TI]):
        return datas
    
    # Shift the values of the input series by the specified index
    shifted_series = pd.Series([np.nan] * len(datas), index=datas.index)
    if location_of_TI == 0:
        return datas
    else:
        shifted_series[:-location_of_TI] = datas.values.tolist()[location_of_TI:]
    # Shift the values of the input series by the specified index
    return shifted_series

## Transform data to long format

Keep only data that are after the time of infeciton

In [26]:
# change dates column to datetimindex and transform weight datas to numeric only
df.loc[:,"weight_T_infection":"weight_T13"] = df.loc[:,"weight_T_infection":"weight_T13"].apply(pd.to_numeric,errors='coerce')
serie_dates = df['Time_point'].apply(lambda x: pd.to_datetime(x.split(','),dayfirst=True))
df['Dates'] = serie_dates
data = df.apply(lambda x: keep_weight_post_infection(x),axis=1)
data

  df.loc[:,"weight_T_infection":"weight_T13"] = df.loc[:,"weight_T_infection":"weight_T13"].apply(pd.to_numeric,errors='coerce')


Unnamed: 0,weight_T_infection,weight_T1,weight_T2,weight_T3,weight_T4,weight_T5,weight_T6,weight_T7,weight_T8,weight_T9,weight_T10,weight_T11,weight_T12,weight_T13
0,23.92,21.72,20.96,19.38,18.16,16.44,15.49,15.44,15.05,,,,,
1,21.40,19.45,18.84,17.82,16.80,15.02,14.14,14.40,14.73,,,,,
2,22.56,21.45,20.83,18.67,16.82,15.30,14.74,14.80,14.42,,,,,
3,20.39,18.69,16.60,15.58,14.17,13.54,,,,,,,,
4,23.72,21.74,20.29,19.56,18.50,16.80,16.78,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,27.70,25.00,22.80,21.10,,,,,,,,,,
2353,26.10,24.90,22.60,,,,,,,,,,,
2354,24.60,25.80,25.70,24.90,22.80,,,,,,,,,
2355,27.40,27.30,28.20,27.80,27.70,27.80,27.60,,,,,,,


Normalization by weight at T infection and replace the normalize data into the original dataframe

In [27]:

normalize = data.div(data['weight_T_infection'],axis=0)*100
df_normalize = df.copy()
df_normalize.loc[:,"weight_T_infection":"weight_T13"] = normalize

df_normalize['min_weight'] = df_normalize.loc[:,"weight_T_infection":"weight_T13"].min(axis=1)
df_normalize['t_origin'] = df_normalize['time_original']
df_normalize

Unnamed: 0,Mouse_ID,ID_Experiment,Cage,Strain,Date,Experiment,Group,Group_info,H0,Pre_traitment,...,survival_0.06,time_0.05,survival_0.05,time_original,survival_original,max_loss_weight_percentage,exp,sub_exp,min_weight,t_origin
0,TRO-05432,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,9.0,1,0.629181,1,A,62.918060,9.0
1,TRO-05433,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,9.0,1,0.660748,1,A,66.074766,9.0
2,TRO-05434,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,2.5,1,9.0,1,0.639184,1,A,63.918440,9.0
3,TRO-05435,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,6.0,1,0.664051,1,A,66.405101,6.0
4,TRO-05456,ID_001,B,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,7.0,1,0.707420,1,A,70.741990,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,TRO-028337,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,1,1.5,1,5.0,1,0.761733,3,no,76.173285,5.0
2353,TRO-028338,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,1,2.5,1,4.0,1,0.865900,3,no,86.590038,4.0
2354,TRO-028339,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,1,5.5,1,6.0,1,0.926829,3,no,92.682927,6.0
2355,TRO-028342,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,0,8.0,0,8.0,0,0.996350,3,no,99.635036,8.0


Find only releavent columns


In [28]:
columns = df_normalize.loc[:,"weight_T_infection":"weight_T13"].columns.tolist()
columns_index = df_normalize.loc[:, ~df_normalize.columns.isin(columns)]
column_time = [n for n in df_normalize.columns.tolist() if "time_" in n]
column_time = [column_time[-1]] + column_time[:-1]
columns_index_time = ['ID_Experiment','Mouse_ID','Date','Infection','Group','exp','survival_original','t_origin']

Transform to tidy data for releavent columns

In [29]:
df_longer_weight = df_normalize.melt(id_vars=columns_index_time,value_vars=columns,var_name="Time",value_name="weight")
df_longer_weight['Time'] = df_longer_weight['Time'].apply(lambda x: "".join(x.split("_")[1:]))
df_longer_weight

Unnamed: 0,ID_Experiment,Mouse_ID,Date,Infection,Group,exp,survival_original,t_origin,Time,weight
0,ID_001,TRO-05432,2014-06-05,C. albicans,1A,1,1,9.0,Tinfection,100.0
1,ID_001,TRO-05433,2014-06-05,C. albicans,1A,1,1,9.0,Tinfection,100.0
2,ID_001,TRO-05434,2014-06-05,C. albicans,1A,1,1,9.0,Tinfection,100.0
3,ID_001,TRO-05435,2014-06-05,C. albicans,1A,1,1,6.0,Tinfection,100.0
4,ID_001,TRO-05456,2014-06-05,C. albicans,1A,1,1,7.0,Tinfection,100.0
...,...,...,...,...,...,...,...,...,...,...
32993,ID_096,TRO-028337,2023-03-03,S. pneumoniae,3,3,1,5.0,T13,
32994,ID_096,TRO-028338,2023-03-03,S. pneumoniae,3,3,1,4.0,T13,
32995,ID_096,TRO-028339,2023-03-03,S. pneumoniae,3,3,1,6.0,T13,
32996,ID_096,TRO-028342,2023-03-03,S. pneumoniae,3,3,0,8.0,T13,


save the datas

In [30]:
df_longer_weight.to_excel("./data/df_long_format_for_analysis.xlsx")

## Weight loss by day of infection

In [31]:
df_longer_weight.groupby(["Infection",'Time'])['weight'].agg(['mean',"median","count","std"])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,count,std
Infection,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C. albicans,T1,93.279207,93.953488,251,3.862355
C. albicans,T10,88.030905,90.481172,117,11.002648
C. albicans,T11,89.093038,91.262136,109,11.051094
C. albicans,T12,89.51235,92.839506,103,11.733767
C. albicans,T13,91.540737,94.016363,36,8.440398
C. albicans,T2,87.984191,88.369218,234,5.431728
C. albicans,T3,85.058591,83.976864,232,7.532595
C. albicans,T4,85.233445,83.467694,230,9.461354
C. albicans,T5,85.575387,85.300926,224,11.491712
C. albicans,T6,84.637715,85.512367,153,12.198206


### weight at day 6 non survivor mice

In [32]:
df_dead = df_longer_weight[df_longer_weight['survival_original']==1]
df_survive = df_longer_weight[df_longer_weight['survival_original']==0]
df_dead_T6 = df_dead[df_dead['Time'] == 'T6']

In [33]:
stats = df_dead_T6.groupby(['Infection'])['weight'].agg(['mean','median','count','std'])
stats['ci95_hi'] = stats['median'] + 1.96*stats['std']/np.sqrt(stats['count'])
stats['ci95_low'] = stats['median'] - 1.96*stats['std']/np.sqrt(stats['count'])
stats


Unnamed: 0_level_0,mean,median,count,std,ci95_hi,ci95_low
Infection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C. albicans,72.756838,70.700637,51,7.88219,72.863943,68.537331
H1N1,74.404462,72.855544,66,6.899504,74.520114,71.190974
Listeria,74.637131,74.015748,7,3.225896,76.405525,71.625971
S. pneumoniae,91.887905,91.666667,57,8.13382,93.778274,89.555059


## Median time to death

In [34]:
df = pd.read_excel("./data/df_long_format_for_analysis.xlsx",index_col=0)

In [35]:
df

Unnamed: 0,ID_Experiment,Mouse_ID,Date,Infection,Group,exp,survival_original,t_origin,Time,weight
0,ID_001,TRO-05432,2014-06-05,C. albicans,1A,1,1,9.0,Tinfection,100.0
1,ID_001,TRO-05433,2014-06-05,C. albicans,1A,1,1,9.0,Tinfection,100.0
2,ID_001,TRO-05434,2014-06-05,C. albicans,1A,1,1,9.0,Tinfection,100.0
3,ID_001,TRO-05435,2014-06-05,C. albicans,1A,1,1,6.0,Tinfection,100.0
4,ID_001,TRO-05456,2014-06-05,C. albicans,1A,1,1,7.0,Tinfection,100.0
...,...,...,...,...,...,...,...,...,...,...
32993,ID_096,TRO-028337,2023-03-03,S. pneumoniae,3,3,1,5.0,T13,
32994,ID_096,TRO-028338,2023-03-03,S. pneumoniae,3,3,1,4.0,T13,
32995,ID_096,TRO-028339,2023-03-03,S. pneumoniae,3,3,1,6.0,T13,
32996,ID_096,TRO-028342,2023-03-03,S. pneumoniae,3,3,0,8.0,T13,


### cleaning datas for computation

In [36]:
df_clean = df.copy()
status_mapping = {0: 'Alive', 1: 'Dead'}
df_clean["survival_original"] = df_clean['survival_original'].replace(status_mapping)

infection_mapping = {'Listeria':"L. monocytogenes"}
df_clean['Infection'] =df_clean['Infection'].replace(infection_mapping)

df_clean.rename(columns={'survival_original':'Mice'},inplace=True)

time_rename = list(np.arange(0,15,1))
time_unique = df_clean['Time'].unique()
time_mapping = dict(zip(time_unique,time_rename))
df_clean['Time'] = df_clean['Time'].replace(time_mapping)
df_clean


Unnamed: 0,ID_Experiment,Mouse_ID,Date,Infection,Group,exp,Mice,t_origin,Time,weight
0,ID_001,TRO-05432,2014-06-05,C. albicans,1A,1,Dead,9.0,0,100.0
1,ID_001,TRO-05433,2014-06-05,C. albicans,1A,1,Dead,9.0,0,100.0
2,ID_001,TRO-05434,2014-06-05,C. albicans,1A,1,Dead,9.0,0,100.0
3,ID_001,TRO-05435,2014-06-05,C. albicans,1A,1,Dead,6.0,0,100.0
4,ID_001,TRO-05456,2014-06-05,C. albicans,1A,1,Dead,7.0,0,100.0
...,...,...,...,...,...,...,...,...,...,...
32993,ID_096,TRO-028337,2023-03-03,S. pneumoniae,3,3,Dead,5.0,13,
32994,ID_096,TRO-028338,2023-03-03,S. pneumoniae,3,3,Dead,4.0,13,
32995,ID_096,TRO-028339,2023-03-03,S. pneumoniae,3,3,Dead,6.0,13,
32996,ID_096,TRO-028342,2023-03-03,S. pneumoniae,3,3,Alive,8.0,13,


### time to event

In [37]:
kmf = KaplanMeierFitter()
# A4 page dimensions in inches

df_KP = df_clean.copy()
df_KP['Mice'] = df_KP['Mice'].replace({'Dead':1,'Alive':0})

kmf.fit(df_KP["t_origin"], df_KP["Mice"], label='Total')
median_ = kmf.median_survival_time_
median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
print(median_)
print(median_confidence_interval_)

for name, grouped_df in df_KP.groupby('Infection'):
    print(name)
    kmf.fit(grouped_df["t_origin"], grouped_df["Mice"], label=name)
    median_ = kmf.median_survival_time_
    median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
    print(median_)
    print(median_confidence_interval_)


11.0
     Total_lower_0.95  Total_upper_0.95
0.5              11.0              11.0
C. albicans
inf
     C. albicans_lower_0.95  C. albicans_upper_0.95
0.5                     inf                     inf
H1N1
inf
     H1N1_lower_0.95  H1N1_upper_0.95
0.5              inf              inf
L. monocytogenes
6.0
     L. monocytogenes_lower_0.95  L. monocytogenes_upper_0.95
0.5                          6.0                          6.0
S. pneumoniae
9.0
     S. pneumoniae_lower_0.95  S. pneumoniae_upper_0.95
0.5                       8.0                       9.0


In [38]:
kmf = KaplanMeierFitter()

df_KP = df_clean.copy()
df_KP['Mice'] = df_KP['Mice'].replace({'Dead':1,'Alive':0})
df_dead = df_KP[df_KP['Mice']==1]

kmf.fit(df_dead["t_origin"], df_dead["Mice"], label='Total')
median_ = kmf.median_survival_time_
median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
print(median_)
print(median_confidence_interval_)

for name, grouped_df in df_dead.groupby('Infection'):
    kmf.fit(grouped_df["t_origin"], grouped_df["Mice"], label=name)
    median_ = kmf.median_survival_time_
    median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
    print(median_)
    print(median_confidence_interval_)


5.0
     Total_lower_0.95  Total_upper_0.95
0.5               5.0               5.0
8.0
     C. albicans_lower_0.95  C. albicans_upper_0.95
0.5                     8.0                     8.0
7.0
     H1N1_lower_0.95  H1N1_upper_0.95
0.5              7.0              7.0
4.0
     L. monocytogenes_lower_0.95  L. monocytogenes_upper_0.95
0.5                          4.0                          4.0
5.0
     S. pneumoniae_lower_0.95  S. pneumoniae_upper_0.95
0.5                       5.0                       5.0


In [39]:
df = pd.read_excel("./data/df_for_analysis.xlsx",index_col=0)
df.groupby('Infection')['time_original'].mean()


Infection
C. albicans      11.472222
H1N1              9.764881
Listeria          5.665076
S. pneumoniae     7.554785
Name: time_original, dtype: float64

In [40]:
df['time_original'].mean()

7.448451421298261