# Statistical Analysis
## Import packages and load data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import chisquare
import ast

In [2]:
# Load the analysis table; the first spreadsheet column is used as the row index.
data_path = "./data/df_for_analysis.xlsx"
df = pd.read_excel(data_path, index_col=0)

## General information

In [3]:
# Analysis window: rows dated strictly after start_date and up to the moment
# the notebook is executed (inclusive).
start_date = datetime(2013,1,1)
end_date = datetime.now()

# Select data between the two dates. The explicit copy makes `df` an
# independent frame, so the later in-place column assignment on
# 'survival_original' cannot raise SettingWithCopyWarning.
mask_date = (df['Date'] > start_date) & (df['Date'] <= end_date)
df = df.loc[mask_date].copy()

In [4]:
# Display the date-filtered table for a visual check.
df

Unnamed: 0,Mouse_ID,ID_Experiment,Cage,Strain,Date,Experiment,Group,Group_info,H0,Pre_traitment,...,survival_0.07,time_0.06,survival_0.06,time_0.05,survival_0.05,time_original,survival_original,max_loss_weight_percentage,exp,sub_exp
0,TRO-05432,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,1.5,1,9.0,1,0.629181,1,A
1,TRO-05433,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,1.5,1,9.0,1,0.660748,1,A
2,TRO-05434,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,2.5,1,2.5,1,9.0,1,0.639184,1,A
3,TRO-05435,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,1.5,1,6.0,1,0.664051,1,A
4,TRO-05456,ID_001,B,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,1.5,1,7.0,1,0.707420,1,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,TRO-028337,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,1,1.5,1,1.5,1,5.0,1,0.761733,3,no
2353,TRO-028338,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,1,2.5,1,2.5,1,4.0,1,0.865900,3,no
2354,TRO-028339,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,1,5.5,1,5.5,1,6.0,1,0.926829,3,no
2355,TRO-028342,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,0,8.0,0,8.0,0,8.0,0,0.996350,3,no


In [5]:
# Relabel the numeric outcome codes (1 -> 'Dead', 0 -> 'Alive') for result formatting
outcome_labels = {1: 'Dead', 0: 'Alive'}
df['survival_original'] = df['survival_original'].replace(outcome_labels)

In [6]:
#RESULT: N_Experiment; N_Mice
# Number of distinct values per column within each infection. The two columns
# of interest are picked by name so the result does not depend on the column
# order of df.
group_by_infection = df.groupby(['Infection'])
n_unique = group_by_infection.nunique()
n_unique_infos = n_unique.loc[:, ['Mouse_ID', 'ID_Experiment']]
n_unique_infos = n_unique_infos.rename(columns={'ID_Experiment':'N_Experiment','Mouse_ID':'N_Mice'})

#RESULT: Alive; Dead
# One row per infection, one column per outcome label. fill_value=0 reports an
# outcome that never occurs for an infection as 0 instead of NaN, so the totals
# and mortality computed downstream stay numeric.
dead_alive = group_by_infection['survival_original'].value_counts().sort_index(ascending=False).unstack(fill_value=0)

In [7]:
# Put the distinct-count columns and the Alive/Dead tallies side by side,
# aligned on the Infection index.
result = pd.concat((n_unique_infos, dead_alive), axis='columns')
result

Unnamed: 0_level_0,N_Mice,N_Experiment,Alive,Dead
Infection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C. albicans,252,6,164,88
H1N1,336,19,203,133
Listeria,1048,39,555,493
S. pneumoniae,721,32,328,393


In [8]:
# Column-wise totals over all infections (a Series indexed by column name)
total_sum = result.sum()

# Reshape the totals into a one-row frame labelled 'Total', with the index
# named 'Infection' so it lines up with `result`
total_df = total_sum.to_frame().T
total_df.index = pd.Index(['Total'], name='Infection')
total_df

Unnamed: 0_level_0,N_Mice,N_Experiment,Alive,Dead
Infection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total,2357,96,1250,1107


In [9]:
# Append the 'Total' row underneath the per-infection rows.
general_info = pd.concat((result, total_df), axis='index')

In [10]:
# Mortality in percent: dead mice over all mice (alive + dead), one decimal.
n_dead = general_info['Dead']
n_total = general_info['Alive'] + n_dead
general_info['mortality'] = (n_dead / n_total * 100).round(1)
general_info

Unnamed: 0_level_0,N_Mice,N_Experiment,Alive,Dead,mortality
Infection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C. albicans,252,6,164,88,34.9
H1N1,336,19,203,133,39.6
Listeria,1048,39,555,493,47.0
S. pneumoniae,721,32,328,393,54.5
Total,2357,96,1250,1107,47.0


## Survival percentage

In [11]:
# Keep the infection label plus the original outcome and the outcome columns
# for the 0.3 ... 0.1 thresholds, then turn any remaining 0/1 codes into
# 'Alive'/'Dead' labels.
survival_columns = ['survival_original','survival_0.3','survival_0.25','survival_0.2','survival_0.15','survival_0.1']
data = df[['Infection'] + survival_columns].replace({0:'Alive',1:'Dead'})
data

Unnamed: 0,Infection,survival_original,survival_0.3,survival_0.25,survival_0.2,survival_0.15,survival_0.1
0,C. albicans,Dead,Dead,Dead,Dead,Dead,Dead
1,C. albicans,Dead,Dead,Dead,Dead,Dead,Dead
2,C. albicans,Dead,Dead,Dead,Dead,Dead,Dead
3,C. albicans,Dead,Dead,Dead,Dead,Dead,Dead
4,C. albicans,Dead,Dead,Dead,Dead,Dead,Dead
...,...,...,...,...,...,...,...
2352,S. pneumoniae,Dead,Dead,Dead,Dead,Dead,Dead
2353,S. pneumoniae,Dead,Dead,Dead,Dead,Dead,Dead
2354,S. pneumoniae,Dead,Dead,Dead,Dead,Dead,Dead
2355,S. pneumoniae,Alive,Alive,Alive,Alive,Alive,Alive


In [12]:
# Long format: one row per (mouse, threshold) with its 'Alive'/'Dead' label.
survival = data.melt(id_vars=["Infection"],value_name="survival",var_name="threshold")
# Count mice per (Infection, threshold, survival). Naming the count column via
# reset_index(name=...) works whether value_counts leaves the Series unnamed
# (older pandas) or names it 'count' (pandas >= 2.0); renaming column 0 only
# worked in the former case.
survival = survival.groupby('Infection').value_counts().reset_index(name="number_of_mice")
survival

Unnamed: 0,Infection,threshold,survival,number_of_mice
0,C. albicans,survival_0.1,Dead,220
1,C. albicans,survival_0.15,Dead,187
2,C. albicans,survival_original,Alive,164
3,C. albicans,survival_0.2,Dead,149
4,C. albicans,survival_0.3,Alive,148
5,C. albicans,survival_0.25,Alive,131
6,C. albicans,survival_0.25,Dead,121
7,C. albicans,survival_0.3,Dead,104
8,C. albicans,survival_0.2,Alive,103
9,C. albicans,survival_original,Dead,88


In [13]:
# Pivot so that every (Infection, threshold) pair is one row and the
# 'Alive' / 'Dead' counts become columns, then compute the dead fraction.
statistics = survival.pivot(index=['Infection',"threshold"], columns='survival', values='number_of_mice')
statistics['Ratio'] = statistics['Dead'] / (statistics['Dead'] + statistics['Alive'])

# Extra deaths relative to the 'survival_original' row of the same infection.
# The baseline is looked up explicitly per infection instead of relying on
# groupby.apply over a duplicated index and positional re-assembly.
original_dead = statistics['Dead'].xs('survival_original', level='threshold')
baseline_dead = original_dead.reindex(statistics.index.get_level_values('Infection')).to_numpy()
statistics['supplementary_death'] = statistics['Dead'] - baseline_dead

# Move 'Infection' and 'threshold' back into regular columns.
statistics = statistics.reset_index()
statistics

survival,Infection,threshold,Alive,Dead,Ratio,supplementary_death
0,C. albicans,survival_0.1,32,220,0.873016,132
1,C. albicans,survival_0.15,65,187,0.742063,99
2,C. albicans,survival_0.2,103,149,0.59127,61
3,C. albicans,survival_0.25,131,121,0.480159,33
4,C. albicans,survival_0.3,148,104,0.412698,16
5,C. albicans,survival_original,164,88,0.349206,0
6,H1N1,survival_0.1,53,283,0.842262,150
7,H1N1,survival_0.15,87,249,0.741071,116
8,H1N1,survival_0.2,127,209,0.622024,76
9,H1N1,survival_0.25,158,178,0.529762,45


## Chi2 tests

In [14]:
def chi_square_in_lambda_function(x):
    """Chi-square p-value of each row of one infection group against the original outcome.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group with the columns 'threshold', 'Dead' and
        'Alive'. The first row whose threshold is 'survival_original' is
        used as the reference.

    Returns
    -------
    pandas.Series
        One p-value per row of ``x``, indexed by the index labels of ``x``.
        The reference row compared with itself yields 1.0.

    Raises
    ------
    IndexError
        If ``x`` contains no 'survival_original' row.

    Notes
    -----
    The reference [Dead, Alive] counts are passed to
    ``scipy.stats.chisquare`` as the observed frequencies and each row's
    counts as the expected frequencies.
    NOTE(review): a row with a zero count ends up as a zero expected
    frequency - confirm this cannot occur in the data.
    """
    count_columns = ['Dead', 'Alive']
    is_reference = x['threshold'] == 'survival_original'
    # Plain float arrays keep scipy away from object-dtype row Series.
    observed = x.loc[is_reference, count_columns].iloc[0].to_numpy(dtype=float)

    p_values = [
        chisquare(observed, expected).pvalue
        for expected in x[count_columns].to_numpy(dtype=float)
    ]
    return pd.Series(p_values, x.index.values)

In [15]:
# Run the test once per infection and write the p-values back positionally;
# note that the 'chi2' column therefore holds p-values, not test statistics.
chi_result = statistics.groupby("Infection").apply(chi_square_in_lambda_function)
statistics["chi2"] = chi_result.to_numpy()
statistics


survival,Infection,threshold,Alive,Dead,Ratio,supplementary_death,chi2
0,C. albicans,survival_0.1,32,220,0.873016,132,1.172261e-137
1,C. albicans,survival_0.15,65,187,0.742063,99,4.191188e-46
2,C. albicans,survival_0.2,103,149,0.59127,61,5.426556e-15
3,C. albicans,survival_0.25,131,121,0.480159,33,3.170145e-05
4,C. albicans,survival_0.3,148,104,0.412698,16,0.04063269
5,C. albicans,survival_original,164,88,0.349206,0,1.0
6,H1N1,survival_0.1,53,283,0.842262,150,1.259931e-111
7,H1N1,survival_0.15,87,249,0.741071,116,2.630289e-47
8,H1N1,survival_0.2,127,209,0.622024,76,1.222074e-17
9,H1N1,survival_0.25,158,178,0.529762,45,8.715478e-07


### Rearrange the dataframe

In [16]:
# Re-index by (threshold, Infection). Each pair occurs once in `statistics`,
# so pivot_table's default aggregation only reshapes the values.
value_columns = ["Alive","Dead","Ratio","supplementary_death","chi2"]
statistics.pivot_table(values=value_columns, index=["threshold","Infection"])

Unnamed: 0_level_0,survival,Alive,Dead,Ratio,chi2,supplementary_death
threshold,Infection,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
survival_0.1,C. albicans,32,220,0.873016,1.172261e-137,132
survival_0.1,H1N1,53,283,0.842262,1.259931e-111,150
survival_0.1,Listeria,272,776,0.740458,1.786476e-88,283
survival_0.1,S. pneumoniae,275,446,0.618585,4.832398e-05,53
survival_0.15,C. albicans,65,187,0.742063,4.191188e-46,99
survival_0.15,H1N1,87,249,0.741071,2.630289e-47,116
survival_0.15,Listeria,371,677,0.645992,1.40803e-32,184
survival_0.15,S. pneumoniae,306,415,0.575589,0.09737833,22
survival_0.2,C. albicans,103,149,0.59127,5.426556e-15,61
survival_0.2,H1N1,127,209,0.622024,1.222074e-17,76


## Mice under 30% THR

In [17]:
# Display the table again; 'survival_original' now holds 'Dead'/'Alive' labels.
df

Unnamed: 0,Mouse_ID,ID_Experiment,Cage,Strain,Date,Experiment,Group,Group_info,H0,Pre_traitment,...,survival_0.07,time_0.06,survival_0.06,time_0.05,survival_0.05,time_original,survival_original,max_loss_weight_percentage,exp,sub_exp
0,TRO-05432,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,1.5,1,9.0,Dead,0.629181,1,A
1,TRO-05433,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,1.5,1,9.0,Dead,0.660748,1,A
2,TRO-05434,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,2.5,1,2.5,1,9.0,Dead,0.639184,1,A
3,TRO-05435,ID_001,A,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,1.5,1,6.0,Dead,0.664051,1,A
4,TRO-05456,ID_001,B,BALB/cByJ,2014-06-05,Candida/Propionate,1A,Propionate / 2*10^5,1,propionate,...,1,1.5,1,1.5,1,7.0,Dead,0.707420,1,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,TRO-028337,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,1,1.5,1,1.5,1,5.0,Dead,0.761733,3,no
2353,TRO-028338,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,1,2.5,1,2.5,1,4.0,Dead,0.865900,3,no
2354,TRO-028339,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,1,5.5,1,5.5,1,6.0,Dead,0.926829,3,no
2355,TRO-028342,ID_096,ETRO-01911,C57BL/6J,2023-03-03,Pneumococcus/Training/Cross-fostering/male,3,D. Zy-Zy,1,training/cross-fostering,...,0,8.0,0,8.0,0,8.0,Alive,0.996350,3,no


In [18]:
# Mice whose 'max_loss_weight_percentage' is below 0.7.
# NOTE(review): presumably the column stores the lowest weight as a fraction of
# the initial weight, so < 0.7 means more than 30% weight loss - confirm.
below_threshold = df['max_loss_weight_percentage'].lt(0.7)
df_under_30 = df.loc[below_threshold]
df_under_30.groupby('Infection')['Mouse_ID'].count()

Infection
C. albicans      64
H1N1             68
Listeria          8
S. pneumoniae     2
Name: Mouse_ID, dtype: int64

In [19]:
# Total number of mice below the threshold, all infections combined.
df_under_30.shape[0]

142

In [20]:
# Of the mice below the threshold, keep those labelled 'Alive' and count them
# per infection.
is_alive = df_under_30['survival_original'].eq('Alive')
df_under_30_survivor = df_under_30.loc[is_alive]
df_under_30_survivor.groupby('Infection')['Mouse_ID'].count()

Infection
C. albicans      16
H1N1             17
Listeria          5
S. pneumoniae     1
Name: Mouse_ID, dtype: int64

In [21]:
# Total number of surviving mice below the threshold.
df_under_30_survivor.shape[0]

39

## Number of mice per ethical authorization (included in analysis)

In [27]:
# Split the mice at the cutoff date separating the two authorizations.
auth_cutoff = datetime(2018,5,1)
df_auth1 = df.loc[df['Date'].lt(auth_cutoff)]  # 2 were already excluded from the data
df_auth2 = df.loc[df['Date'].ge(auth_cutoff)]  # 41 were already excluded from the data
df_auth2.head()

Unnamed: 0,Mouse_ID,ID_Experiment,Cage,Strain,Date,Experiment,Group,Group_info,H0,Pre_traitment,...,survival_0.07,time_0.06,survival_0.06,time_0.05,survival_0.05,time_original,survival_original,max_loss_weight_percentage,exp,sub_exp
777,TRO-18099,ID_029,ETRO-00940,C57BL/6J,2018-05-28,Listeria/Clodronate/Training,1,Zymosam + PBS liposome + Listeria,-1,training/zymosan,...,0,6.5,1,2.5,1,8.0,Alive,0.935644,1,no
778,TRO-18100,ID_029,ETRO-00940,C57BL/6J,2018-05-28,Listeria/Clodronate/Training,1,Zymosam + PBS liposome + Listeria,-1,training/zymosan,...,1,2.5,1,2.5,1,8.0,Alive,0.876289,1,no
779,TRO-18101,ID_029,ETRO-00940,C57BL/6J,2018-05-28,Listeria/Clodronate/Training,1,Zymosam + PBS liposome + Listeria,-1,training/zymosan,...,0,8.0,0,8.0,0,8.0,Alive,0.973404,1,no
780,TRO-18102,ID_029,ETRO-00940,C57BL/6J,2018-05-28,Listeria/Clodronate/Training,1,Zymosam + PBS liposome + Listeria,-1,training/zymosan,...,1,2.5,1,2.5,1,8.0,Alive,0.893023,1,no
781,TRO-18103,ID_029,ETRO-00940,C57BL/6J,2018-05-28,Listeria/Clodronate/Training,1,Zymosam + PBS liposome + Listeria,-1,training/zymosan,...,0,6.5,1,2.5,1,8.0,Alive,0.935135,1,no


### authorization 1, from 2013-2018

In [31]:
# Mice per infection under authorization 1 (non-null Mouse_ID entries).
mice_per_infection_auth1 = df_auth1.groupby('Infection')['Mouse_ID'].count()
mice_per_infection_auth1

Infection
C. albicans      252
H1N1              63
Listeria         263
S. pneumoniae    211
Name: Mouse_ID, dtype: int64

In [33]:
# Total number of mice under authorization 1: per-infection counts added up.
auth1_counts = df_auth1.groupby('Infection')['Mouse_ID'].count()
auth1_counts.sum()

789

### authorization 2, from 2018-2023

In [32]:
# Mice per infection under authorization 2 (non-null Mouse_ID entries).
mice_per_infection_auth2 = df_auth2.groupby('Infection')['Mouse_ID'].count()
mice_per_infection_auth2

Infection
H1N1             273
Listeria         785
S. pneumoniae    510
Name: Mouse_ID, dtype: int64

In [34]:
# Total number of mice under authorization 2: per-infection counts added up.
auth2_counts = df_auth2.groupby('Infection')['Mouse_ID'].count()
auth2_counts.sum()

1568

## Average maximum weight loss for non-surviving mice

In [42]:
# Restrict to mice labelled 'Dead' and average 'max_loss_weight_percentage'
# per infection, expressed in percent.
# NOTE(review): the values look like remaining weight relative to the initial
# weight rather than the amount lost - confirm the column's meaning.
df_dead = df.loc[df['survival_original'].eq('Dead')]
df_dead.groupby('Infection')['max_loss_weight_percentage'].mean().mul(100)

Infection
C. albicans      72.624314
H1N1             75.015886
Listeria         81.643642
S. pneumoniae    84.984221
Name: max_loss_weight_percentage, dtype: float64

In [43]:
# Same average over all dead mice, regardless of infection, in percent.
df_dead['max_loss_weight_percentage'].mean() * 100

81.31940006860864