# Dataset statistics

In this file we compute the statistics of the dataset so that they can be used when writing the project report. 
The statistics only concern the clinical data. 

Author: Pierre-Louis Benveniste

## Library import

In [100]:
# Import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import yaml
from prettytable import PrettyTable
# import ptitprince as pt

## Dataset import

Here we load the merged dataset containing both the clinical and the radiomics data. 

In [101]:
# Read the merged table (clinical + radiomics data) from disk
merged_data_path = '~/Documents/lung-treatment-response/data/merged_data.csv'
dataset = pd.read_csv(merged_data_path)

# Show the dimensions of the loaded table
print('The shape of the dataset is:', dataset.shape)

# Show the name of every column
print('The columns of the dataset are:\n', dataset.columns.tolist())

The shape of the dataset is: (181, 161)
The columns of the dataset are:
 ['sexe', 'age', 'BMI', 'score_charlson', 'OMS', 'tabac', 'tabac_PA', 'tabac_sevre', 'histo', 'T', 'centrale', 'dose_tot', 'etalement', 'vol_GTV', 'vol_PTV', 'vol_ITV', 'couv_PTV', 'BED_10', 'DC', 'DDD', 'cause_DC', 'Date_R_PTV', 'Date_R_homo', 'Date_R_med', 'Date_R_contro', 'Date_R_horspoum', 'Reponse', 'rechute_PTV', 'rechute_homo', 'rechute_med', 'rechute_contro', 'rechute_horspoum', 'delai_fin_DC', 'delai_fin_rechutePTV', 'delai_fin_rechuteHomo', 'delai_fin_rechuteMed', 'delai_fin_rechuteContro', 'delai_fin_rechuteHorspoum', 'MORPHOLOGICAL_Volume', 'MORPHOLOGICAL_ApproximateVolume', 'MORPHOLOGICAL_voxelsCounting', 'MORPHOLOGICAL_SurfaceArea', 'MORPHOLOGICAL_SurfaceToVolumeRatio', 'MORPHOLOGICAL_Compacity', 'MORPHOLOGICAL_Compactness1', 'MORPHOLOGICAL_Compactness2', 'MORPHOLOGICAL_SphericalDisproportion', 'MORPHOLOGICAL_Sphericity', 'MORPHOLOGICAL_Asphericity', 'MORPHOLOGICAL_CentreOfMassShift', 'MORPHOLOGICAL_M

## Participants statistics

In this first part, we detail the dataset statistics that are specific to the participants. 

To do so, we average the different features across nodules (if a participant has multiple nodules). 

In [102]:
# Drop the date columns, together with the 'subject_nodule' and 'nodule' columns,
# which are not used in the participant-level analysis
columns_to_drop = ['DDD', 'Date_R_PTV', 'Date_R_homo', 'Date_R_med', 'Date_R_contro',
                   'Date_R_horspoum', 'subject_nodule', 'nodule']
data = dataset.drop(columns=columns_to_drop)

# One row per 'subject_id': every remaining column is averaged over the rows of that subject
data_grouped = data.groupby('subject_id').mean().reset_index()

# Averaging a yes/no flag can produce a fractional value, so each relapse flag is set back
# to 1 when its mean is strictly positive and to 0 otherwise.
# Note: NaN > 0 is False, so a missing value also ends up as 0.
for relapse_flag in ['rechute_homo', 'rechute_med', 'rechute_contro', 'rechute_horspoum', 'rechute_PTV']:
    data_grouped[relapse_flag] = data_grouped[relapse_flag].map(lambda mean_value: int(mean_value > 0))

# A missing 'DC' value is replaced by 0
data_grouped['DC'] = data_grouped['DC'].fillna(0)


Now we perform data analysis for each of the following features: 
- `sexe`: sex of the participant (1=female, 0=male)
- `age`: age of the participant
- `BMI`: Body Mass Index of the participant
- `score_charlson`: The Charlson score of the participant
- `OMS`: WHO performance status of the participant (OMS is the French acronym for WHO; graded from 0 to 4)
- `tabac`: if the participant smokes (1=yes, 0=no)
- `tabac_PA`: pack-years (the number of packs smoked per day multiplied by the number of years smoked)
- `tabac_sevre`: if the participant is weaned off tobacco
- `DC`: if the participant died (1=yes, 0=no)
- `cause_DC`: the cause of death (1=SPE, 2=infection, 3=AEG, 4=progression, 5=other)
- `Reponse`: how the participant reacted to the treatment (1=stable, 0= no response)
- `rechute_PTV`: if the participant had a local relapse (1=yes, 0=no)
- `rechute_homo`: if the participant had a homo-lateral relapse (1=yes, 0=no)
- `rechute_med`: if the participant had a mediastinal relapse (1=yes, 0=no)
- `rechute_contro`: if the participant had a controlateral relapse (1=yes, 0=no)
- `rechute_horspoum`: if the participant had a relapse outside of the lungs (1=yes, 0=no)
- `delai_fin_DC`: the interval between the end of the treatment and death (if death occurs)
- `delai_fin_rechutePTV`: the interval between the end of the treatment and a local relapse (if it occurs)
- `delai_fin_rechuteHomo`: the interval between the end of the treatment and a homolateral relapse (if it occurs)
- `delai_fin_rechuteMed`: the interval between the end of the treatment and a mediastinal relapse (if it occurs)
- `delai_fin_rechuteContro`: the interval between the end of the treatment and a controlateral relapse (if it occurs)
- `delai_fin_rechuteHorspoum`: the interval between the end of the treatment and a relapse outside of the lungs (if it occurs)

In [103]:
# Sex distribution of the participants (1=female, 0=male).
# total_participant is the number of rows of data_grouped; it is the denominator of the percentages.
total_participant = len(data_grouped)
table = PrettyTable()
table.field_names = ['Sex', 'Count', '%']
for sex_label, sex_code in [('Female', 1), ('Male', 0)]:
    n_with_code = (data_grouped["sexe"] == sex_code).sum()
    table.add_row([sex_label, n_with_code, round(n_with_code / total_participant * 100, 1)])
print(table)

+--------+-------+------+
|  Sex   | Count |  %   |
+--------+-------+------+
| Female |   63  | 38.7 |
|  Male  |  100  | 61.3 |
+--------+-------+------+


In [104]:
# Age of the participants: summary statistics first, then the distribution over age brackets.
age_col = data_grouped["age"]

# Table 1: mean / min / max / median / standard deviation
table = PrettyTable()
table.field_names = ['Age', '']
age_summary = [
    ('Mean', round(age_col.mean(), 1)),
    ('Min', age_col.min()),
    ('Max', age_col.max()),
    ('Median', age_col.median()),
    ('Std', round(age_col.std(), 1)),
]
for stat_name, stat_value in age_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Table 2: number and percentage (of all participants) in each age bracket
table = PrettyTable()
table.field_names = ['Age', 'Count', '%']
age_brackets = [
    ('<=50', age_col <= 50),
    ('50<...<=60', (age_col > 50) & (age_col <= 60)),
    ('60<...<=70', (age_col > 60) & (age_col <= 70)),
    ('70<...<=80', (age_col > 70) & (age_col <= 80)),
    ('>80', age_col > 80),
    # Missing ages are counted by summing the null mask. Series.count() ignores NaN,
    # so calling count() on the rows selected by isnull() would always return 0.
    ('Missing', age_col.isnull()),
]
for bracket_label, bracket_mask in age_brackets:
    n_in_bracket = bracket_mask.sum()
    table.add_row([bracket_label, n_in_bracket, round(n_in_bracket / total_participant * 100, 1)])
print(table)

+--------+------+
|  Age   |      |
+--------+------+
|  Mean  | 71.6 |
|  Min   | 46.0 |
|  Max   | 93.0 |
| Median | 71.0 |
|  Std   | 9.7  |
+--------+------+
+------------+-------+------+
|    Age     | Count |  %   |
+------------+-------+------+
|    <=50    |   3   | 1.8  |
| 50<...<=60 |   16  | 9.8  |
| 60<...<=70 |   58  | 35.6 |
| 70<...<=80 |   56  | 34.4 |
|    >80     |   30  | 18.4 |
|  Missing   |   0   | 0.0  |
+------------+-------+------+


In [105]:
# BMI of the participants: summary statistics first, then the distribution over BMI brackets.
bmi_col = data_grouped["BMI"]

# Table 1: mean / min / max / median / standard deviation
table = PrettyTable()
table.field_names = ['BMI', '']
bmi_summary = [
    ('Mean', round(bmi_col.mean(), 1)),
    ('Min', bmi_col.min()),
    ('Max', bmi_col.max()),
    ('Median', bmi_col.median()),
    ('Std', round(bmi_col.std(), 1)),
]
for stat_name, stat_value in bmi_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Table 2: number and percentage (of all participants) in each BMI bracket
table = PrettyTable()
table.field_names = ['BMI', 'Count', '%']
bmi_brackets = [
    ('<=18.5', bmi_col <= 18.5),
    ('18.5<...<=25', (bmi_col > 18.5) & (bmi_col <= 25)),
    ('25<...<=30', (bmi_col > 25) & (bmi_col <= 30)),
    ('30<...<=35', (bmi_col > 30) & (bmi_col <= 35)),
    ('>35', bmi_col > 35),
    ('Missing', bmi_col.isnull()),
]
for bracket_label, bracket_mask in bmi_brackets:
    n_in_bracket = bracket_mask.sum()
    table.add_row([bracket_label, n_in_bracket, round(n_in_bracket / total_participant * 100, 1)])
print(table)

+--------+------+
|  BMI   |      |
+--------+------+
|  Mean  | 25.4 |
|  Min   | 15.6 |
|  Max   | 41.9 |
| Median | 24.2 |
|  Std   | 6.0  |
+--------+------+
+--------------+-------+------+
|     BMI      | Count |  %   |
+--------------+-------+------+
|    <=18.5    |   3   | 1.8  |
| 18.5<...<=25 |   29  | 17.8 |
|  25<...<=30  |   14  | 8.6  |
|  30<...<=35  |   4   | 2.5  |
|     >35      |   5   | 3.1  |
|   Missing    |  108  | 66.3 |
+--------------+-------+------+


In [106]:
# Charlson score of the participants: summary statistics first, then the distribution over score brackets.
charlson_col = data_grouped["score_charlson"]

# Table 1: mean / min / max / median / standard deviation
table = PrettyTable()
table.field_names = ['Charlson score', '']
charlson_summary = [
    ('Mean', round(charlson_col.mean(), 1)),
    ('Min', charlson_col.min()),
    ('Max', charlson_col.max()),
    ('Median', charlson_col.median()),
    ('Std', round(charlson_col.std(), 1)),
]
for stat_name, stat_value in charlson_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Table 2: number and percentage (of all participants) in each score bracket
table = PrettyTable()
table.field_names = ['Charlson score', 'Count', '%']
charlson_brackets = [
    ('<=3', charlson_col <= 3),
    ('3<...<=6', (charlson_col > 3) & (charlson_col <= 6)),
    ('6<...<=9', (charlson_col > 6) & (charlson_col <= 9)),
    ('>9', charlson_col > 9),
    ('Missing', charlson_col.isnull()),
]
for bracket_label, bracket_mask in charlson_brackets:
    n_in_bracket = bracket_mask.sum()
    table.add_row([bracket_label, n_in_bracket, round(n_in_bracket / total_participant * 100, 1)])
print(table)

+----------------+------+
| Charlson score |      |
+----------------+------+
|      Mean      | 7.2  |
|      Min       | 2.0  |
|      Max       | 13.0 |
|     Median     | 7.0  |
|      Std       | 2.3  |
+----------------+------+
+----------------+-------+------+
| Charlson score | Count |  %   |
+----------------+-------+------+
|      <=3       |   4   | 2.5  |
|    3<...<=6    |   69  | 42.3 |
|    6<...<=9    |   58  | 35.6 |
|       >9       |   32  | 19.6 |
|    Missing     |   0   | 0.0  |
+----------------+-------+------+


In [107]:
# OMS of the participants: summary statistics first, then the count for each value from 0 to 4.
oms_col = data_grouped["OMS"]

# Table 1: mean / min / max / median / standard deviation
table = PrettyTable()
table.field_names = ['OMS', '']
oms_summary = [
    ('Mean', round(oms_col.mean(), 1)),
    ('Min', oms_col.min()),
    ('Max', oms_col.max()),
    ('Median', oms_col.median()),
    ('Std', round(oms_col.std(), 1)),
]
for stat_name, stat_value in oms_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Table 2: number and percentage (of all participants) with each OMS value, plus the missing ones
table = PrettyTable()
table.field_names = ['OMS', 'Count', '%']
for oms_value in range(5):
    n_with_value = (oms_col == oms_value).sum()
    table.add_row([str(oms_value), n_with_value, round(n_with_value / total_participant * 100, 1)])
n_oms_missing = oms_col.isnull().sum()
table.add_row(['Missing', n_oms_missing, round(n_oms_missing / total_participant * 100, 1)])
print(table)

+--------+-----+
|  OMS   |     |
+--------+-----+
|  Mean  | 0.8 |
|  Min   | 0.0 |
|  Max   | 2.0 |
| Median | 1.0 |
|  Std   | 0.5 |
+--------+-----+
+---------+-------+------+
|   OMS   | Count |  %   |
+---------+-------+------+
|    0    |   38  | 23.3 |
|    1    |  106  | 65.0 |
|    2    |   11  | 6.7  |
|    3    |   0   | 0.0  |
|    4    |   0   | 0.0  |
| Missing |   8   | 4.9  |
+---------+-------+------+


In [108]:
# Smoking status (1=yes, 0=no): number and percentage (of all participants) per category
tabac_col = data_grouped["tabac"]
table = PrettyTable()
table.field_names = ['Smoking status', 'Count', '%']
smoking_categories = [
    ('Yes', tabac_col == 1),
    ('No', tabac_col == 0),
    ('Missing', tabac_col.isnull()),
]
for category_label, category_mask in smoking_categories:
    n_in_category = category_mask.sum()
    table.add_row([category_label, n_in_category, round(n_in_category / total_participant * 100, 1)])
print(table)

+----------------+-------+------+
| Smoking status | Count |  %   |
+----------------+-------+------+
|      Yes       |  123  | 75.5 |
|       No       |   27  | 16.6 |
|    Missing     |   13  | 8.0  |
+----------------+-------+------+


In [109]:
# Pack-years of the participants: summary statistics first, then the distribution over brackets.
pack_years_col = data_grouped["tabac_PA"]

# Table 1: mean / min / max / median / standard deviation
table = PrettyTable()
table.field_names = ['Pack-years', '']
pack_years_summary = [
    ('Mean', round(pack_years_col.mean(), 1)),
    ('Min', pack_years_col.min()),
    ('Max', pack_years_col.max()),
    ('Median', pack_years_col.median()),
    ('Std', round(pack_years_col.std(), 1)),
]
for stat_name, stat_value in pack_years_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Table 2: number and percentage (of all participants) in each pack-years bracket
table = PrettyTable()
table.field_names = ['Pack-years', 'Count', '%']
pack_years_brackets = [
    ('<=20', pack_years_col <= 20),
    ('20<...<=40', (pack_years_col > 20) & (pack_years_col <= 40)),
    ('40<...<=60', (pack_years_col > 40) & (pack_years_col <= 60)),
    ('>60', pack_years_col > 60),
    ('Missing', pack_years_col.isnull()),
]
for bracket_label, bracket_mask in pack_years_brackets:
    n_in_bracket = bracket_mask.sum()
    table.add_row([bracket_label, n_in_bracket, round(n_in_bracket / total_participant * 100, 1)])
print(table)

+------------+-------+
| Pack-years |       |
+------------+-------+
|    Mean    |  48.8 |
|    Min     |  7.0  |
|    Max     | 120.0 |
|   Median   |  45.0 |
|    Std     |  24.3 |
+------------+-------+
+------------+-------+------+
| Pack-years | Count |  %   |
+------------+-------+------+
|    <=20    |   11  | 6.7  |
| 20<...<=40 |   39  | 23.9 |
| 40<...<=60 |   36  | 22.1 |
|    >60     |   23  | 14.1 |
|  Missing   |   54  | 33.1 |
+------------+-------+------+


In [110]:
# Weaned off smoking: number and percentage (of all participants) per category
weaned_col = data_grouped["tabac_sevre"]
table = PrettyTable()
table.field_names = ['Weaned off smoking', 'Count', '%']
weaned_categories = [
    ('Yes', weaned_col == 1),
    ('No', weaned_col == 0),
    ('Missing', weaned_col.isnull()),
]
for category_label, category_mask in weaned_categories:
    n_in_category = category_mask.sum()
    table.add_row([category_label, n_in_category, round(n_in_category / total_participant * 100, 1)])
print(table)

+--------------------+-------+------+
| Weaned off smoking | Count |  %   |
+--------------------+-------+------+
|        Yes         |   80  | 49.1 |
|         No         |   37  | 22.7 |
|      Missing       |   46  | 28.2 |
+--------------------+-------+------+


In [111]:
# Death status (1=yes, 0=no): number and percentage (of all participants) per category
death_col = data_grouped["DC"]
table = PrettyTable()
table.field_names = ['Death', 'Count', '%']
for death_label, death_code in [('Yes', 1), ('No', 0)]:
    n_with_code = (death_col == death_code).sum()
    table.add_row([death_label, n_with_code, round(n_with_code / total_participant * 100, 1)])
print(table)

# Interval between the end of the treatment and death: summary statistics
death_delay_col = data_grouped["delai_fin_DC"]
table = PrettyTable()
table.field_names = ['Interval between end of treatment and death (days)', '']
death_delay_summary = [
    ('Mean', round(death_delay_col.mean(), 1)),
    ('Min', death_delay_col.min()),
    ('Max', death_delay_col.max()),
    ('Median', death_delay_col.median()),
    ('Std', round(death_delay_col.std(), 1)),
]
for stat_name, stat_value in death_delay_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Same interval, binned with cut-offs at 365, 730 and 1095; percentages are relative to all participants
table = PrettyTable()
table.field_names = ['Interval between end of treatment and death (days)', 'Count', '%']
death_delay_brackets = [
    ('<=365', death_delay_col <= 365),
    ('365<...<=730', (death_delay_col > 365) & (death_delay_col <= 730)),
    ('730<...<=1095', (death_delay_col > 730) & (death_delay_col <= 1095)),
    ('>1095', death_delay_col > 1095),
    ('Missing', death_delay_col.isnull()),
]
for bracket_label, bracket_mask in death_delay_brackets:
    n_in_bracket = bracket_mask.sum()
    table.add_row([bracket_label, n_in_bracket, round(n_in_bracket / total_participant * 100, 1)])
print(table)

+-------+-------+------+
| Death | Count |  %   |
+-------+-------+------+
|  Yes  |   47  | 28.8 |
|   No  |  116  | 71.2 |
+-------+-------+------+
+----------------------------------------------------+--------+
| Interval between end of treatment and death (days) |        |
+----------------------------------------------------+--------+
|                        Mean                        | 801.5  |
|                        Min                         |  0.0   |
|                        Max                         | 2033.0 |
|                       Median                       | 739.0  |
|                        Std                         | 474.8  |
+----------------------------------------------------+--------+
+----------------------------------------------------+-------+------+
| Interval between end of treatment and death (days) | Count |  %   |
+----------------------------------------------------+-------+------+
|                       <=365                        |   10  | 6

In [112]:
# Cause of death (1=SPE, 2=infection, 3=AEG, 4=progression, 5=other).
# Percentages are relative to the number of participants with DC == 1, not to all participants.
cause_col = data_grouped["cause_DC"]
n_deceased = (data_grouped["DC"] == 1).sum()
n_not_deceased = (data_grouped["DC"] == 0).sum()

table = PrettyTable()
table.field_names = ['Cause of death', 'Count', '% of total death']
cause_codes = [('SPE', 1), ('Infection', 2), ('AEG', 3), ('Progression', 4), ('Other', 5)]
for cause_label, cause_code in cause_codes:
    n_with_cause = (cause_col == cause_code).sum()
    table.add_row([cause_label, n_with_cause, round(n_with_cause / n_deceased * 100, 1)])

# Missing causes: null 'cause_DC' entries minus the participants with DC == 0
n_cause_missing = cause_col.isnull().sum() - n_not_deceased
table.add_row(['Missing', n_cause_missing, round(n_cause_missing / n_deceased * 100, 1)])
print(table)

+----------------+-------+------------------+
| Cause of death | Count | % of total death |
+----------------+-------+------------------+
|      SPE       |   7   |       14.9       |
|   Infection    |   4   |       8.5        |
|      AEG       |   2   |       4.3        |
|  Progression   |   3   |       6.4        |
|     Other      |   2   |       4.3        |
|    Missing     |   29  |       61.7       |
+----------------+-------+------------------+


In [113]:
# Response to the treatment (1=stable, 0=no response): number and percentage (of all participants)
response_col = data_grouped['Reponse']
table = PrettyTable()
table.field_names = ['Response to treatment', 'Count', '%']
response_categories = [
    ('Stable', response_col == 1),
    ('No response', response_col == 0),
    ('Missing', response_col.isnull()),
]
for category_label, category_mask in response_categories:
    n_in_category = category_mask.sum()
    table.add_row([category_label, n_in_category, round(n_in_category / total_participant * 100, 1)])
print(table)

+-----------------------+-------+------+
| Response to treatment | Count |  %   |
+-----------------------+-------+------+
|         Stable        |  112  | 68.7 |
|      No response      |   3   | 1.8  |
|        Missing        |   48  | 29.4 |
+-----------------------+-------+------+


In [114]:
# Local relapse (1=yes, 0=no): number and percentage (of all participants) per category
local_relapse_col = data_grouped['rechute_PTV']
table = PrettyTable()
table.field_names = ['Local relapse', 'Count', '%']
local_relapse_categories = [
    ('Yes', local_relapse_col == 1),
    ('No', local_relapse_col == 0),
    ('Missing', local_relapse_col.isnull()),
]
for category_label, category_mask in local_relapse_categories:
    n_in_category = category_mask.sum()
    table.add_row([category_label, n_in_category, round(n_in_category / total_participant * 100, 1)])
print(table)

# Interval between the end of the treatment and the local relapse: summary statistics
local_delay_col = data_grouped["delai_fin_rechutePTV"]
table = PrettyTable()
table.field_names = ['Local relapse interval (days)', '']
local_delay_summary = [
    ('Mean', round(local_delay_col.mean(), 1)),
    ('Min', local_delay_col.min()),
    ('Max', local_delay_col.max()),
    ('Median', local_delay_col.median()),
    ('Std', round(local_delay_col.std(), 1)),
]
for stat_name, stat_value in local_delay_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Same interval, binned with cut-offs at 365, 730 and 1095; percentages are relative to all participants
table = PrettyTable()
table.field_names = ['Local relapse interval (days)', 'Count', '%']
local_delay_brackets = [
    ('<=365', local_delay_col <= 365),
    ('365<...<=730', (local_delay_col > 365) & (local_delay_col <= 730)),
    ('730<...<=1095', (local_delay_col > 730) & (local_delay_col <= 1095)),
    ('>1095', local_delay_col > 1095),
    ('Missing', local_delay_col.isnull()),
]
for bracket_label, bracket_mask in local_delay_brackets:
    n_in_bracket = bracket_mask.sum()
    table.add_row([bracket_label, n_in_bracket, round(n_in_bracket / total_participant * 100, 1)])
print(table)


+---------------+-------+------+
| Local relapse | Count |  %   |
+---------------+-------+------+
|      Yes      |   24  | 14.7 |
|       No      |  139  | 85.3 |
|    Missing    |   0   | 0.0  |
+---------------+-------+------+
+-------------------------------+--------+
| Local relapse interval (days) |        |
+-------------------------------+--------+
|              Mean             | 451.5  |
|              Min              |  0.0   |
|              Max              | 1175.0 |
|             Median            | 405.5  |
|              Std              | 292.2  |
+-------------------------------+--------+
+-------------------------------+-------+------+
| Local relapse interval (days) | Count |  %   |
+-------------------------------+-------+------+
|             <=365             |   10  | 6.1  |
|          365<...<=730         |   11  | 6.7  |
|         730<...<=1095         |   1   | 0.6  |
|             >1095             |   2   | 1.2  |
|            Missing            |  139 

In [115]:
# Homolateral relapse (1=yes, 0=no): number and percentage (of all participants) per category
homo_relapse_col = data_grouped['rechute_homo']
table = PrettyTable()
table.field_names = ['Homolateral relapse', 'Count', '%']
homo_relapse_categories = [
    ('Yes', homo_relapse_col == 1),
    ('No', homo_relapse_col == 0),
    ('Missing', homo_relapse_col.isnull()),
]
for category_label, category_mask in homo_relapse_categories:
    n_in_category = category_mask.sum()
    table.add_row([category_label, n_in_category, round(n_in_category / total_participant * 100, 1)])
print(table)

# Interval between the end of the treatment and the homolateral relapse: summary statistics
homo_delay_col = data_grouped["delai_fin_rechuteHomo"]
table = PrettyTable()
table.field_names = ['Homolateral relapse interval (days)', '']
homo_delay_summary = [
    ('Mean', round(homo_delay_col.mean(), 1)),
    ('Min', homo_delay_col.min()),
    ('Max', homo_delay_col.max()),
    ('Median', homo_delay_col.median()),
    ('Std', round(homo_delay_col.std(), 1)),
]
for stat_name, stat_value in homo_delay_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Same interval, binned with cut-offs at 365, 730 and 1095; percentages are relative to all participants
table = PrettyTable()
table.field_names = ['Homolateral relapse interval (days)', 'Count', '%']
homo_delay_brackets = [
    ('<=365', homo_delay_col <= 365),
    ('365<...<=730', (homo_delay_col > 365) & (homo_delay_col <= 730)),
    ('730<...<=1095', (homo_delay_col > 730) & (homo_delay_col <= 1095)),
    ('>1095', homo_delay_col > 1095),
    ('Missing', homo_delay_col.isnull()),
]
for bracket_label, bracket_mask in homo_delay_brackets:
    n_in_bracket = bracket_mask.sum()
    table.add_row([bracket_label, n_in_bracket, round(n_in_bracket / total_participant * 100, 1)])
print(table)

+---------------------+-------+------+
| Homolateral relapse | Count |  %   |
+---------------------+-------+------+
|         Yes         |   25  | 15.3 |
|          No         |  138  | 84.7 |
|       Missing       |   0   | 0.0  |
+---------------------+-------+------+
+-------------------------------------+--------+
| Homolateral relapse interval (days) |        |
+-------------------------------------+--------+
|                 Mean                | 516.8  |
|                 Min                 |  64.0  |
|                 Max                 | 1077.0 |
|                Median               | 503.0  |
|                 Std                 | 308.4  |
+-------------------------------------+--------+
+-------------------------------------+-------+------+
| Homolateral relapse interval (days) | Count |  %   |
+-------------------------------------+-------+------+
|                <=365                |   9   | 5.5  |
|             365<...<=730            |   9   | 5.5  |
|          

In [116]:
# Mediastinal relapse (1=yes, 0=no): number and percentage (of all participants) per category
med_relapse_col = data_grouped['rechute_med']
table = PrettyTable()
table.field_names = ['Mediastinal relapse', 'Count', '%']
med_relapse_categories = [
    ('Yes', med_relapse_col == 1),
    ('No', med_relapse_col == 0),
    ('Missing', med_relapse_col.isnull()),
]
for category_label, category_mask in med_relapse_categories:
    n_in_category = category_mask.sum()
    table.add_row([category_label, n_in_category, round(n_in_category / total_participant * 100, 1)])
print(table)

# Interval between the end of the treatment and the mediastinal relapse: summary statistics
med_delay_col = data_grouped["delai_fin_rechuteMed"]
table = PrettyTable()
table.field_names = ['Mediastinal relapse interval (days)', '']
med_delay_summary = [
    ('Mean', round(med_delay_col.mean(), 1)),
    ('Min', med_delay_col.min()),
    ('Max', med_delay_col.max()),
    ('Median', med_delay_col.median()),
    ('Std', round(med_delay_col.std(), 1)),
]
for stat_name, stat_value in med_delay_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Same interval, binned with cut-offs at 365, 730 and 1095; percentages are relative to all participants
table = PrettyTable()
table.field_names = ['Mediastinal relapse interval (days)', 'Count', '%']
med_delay_brackets = [
    ('<=365', med_delay_col <= 365),
    ('365<...<=730', (med_delay_col > 365) & (med_delay_col <= 730)),
    ('730<...<=1095', (med_delay_col > 730) & (med_delay_col <= 1095)),
    ('>1095', med_delay_col > 1095),
    ('Missing', med_delay_col.isnull()),
]
for bracket_label, bracket_mask in med_delay_brackets:
    n_in_bracket = bracket_mask.sum()
    table.add_row([bracket_label, n_in_bracket, round(n_in_bracket / total_participant * 100, 1)])
print(table)

+---------------------+-------+------+
| Mediastinal relapse | Count |  %   |
+---------------------+-------+------+
|         Yes         |   19  | 11.7 |
|          No         |  144  | 88.3 |
|       Missing       |   0   | 0.0  |
+---------------------+-------+------+
+-------------------------------------+--------+
| Mediastinal relapse interval (days) |        |
+-------------------------------------+--------+
|                 Mean                | 419.4  |
|                 Min                 |  59.0  |
|                 Max                 | 1413.0 |
|                Median               | 413.0  |
|                 Std                 | 327.3  |
+-------------------------------------+--------+
+-------------------------------------+-------+------+
| Mediastinal relapse interval (days) | Count |  %   |
+-------------------------------------+-------+------+
|                <=365                |   8   | 4.9  |
|             365<...<=730            |   9   | 5.5  |
|          

In [117]:
# Contralateral lung relapse: first the relapse status, then the interval
# between the end of the treatment and the relapse. Every percentage below is
# computed against total_participant (all participants, not only relapsers).
contro_flag = data_grouped['rechute_contro']
contro_delay = data_grouped['delai_fin_rechuteContro']

# Table 1: number of participants with / without a contralateral relapse
table = PrettyTable()
table.field_names = ['Contralateral relapse', 'Count', '%']
for row_label, flag_value in (('Yes', 1), ('No', 0)):
    n_rows = contro_flag[contro_flag == flag_value].count()
    table.add_row([row_label, n_rows, round(n_rows/total_participant*100, 1)])
n_missing = contro_flag.isnull().sum()
table.add_row(['Missing', n_missing, round(n_missing/total_participant*100, 1)])
print(table)

# Table 2: summary statistics of the interval (only mean and std are rounded)
table = PrettyTable()
table.field_names = ['Contralateral relapse interval (days)', '']
contro_delay_stats = (
    ('Mean', round(contro_delay.mean(), 1)),
    ('Min', contro_delay.min()),
    ('Max', contro_delay.max()),
    ('Median', contro_delay.median()),
    ('Std', round(contro_delay.std(), 1)),
)
for row_label, stat_value in contro_delay_stats:
    table.add_row([row_label, stat_value])
print(table)

# Table 3: interval split into bins with 365 / 730 / 1095 day boundaries
table = PrettyTable()
table.field_names = ['Contralateral relapse interval (days)', 'Count', '%']
contro_delay_bins = (
    ('<=365', contro_delay <= 365),
    ('365<...<=730', (contro_delay > 365) & (contro_delay <= 730)),
    ('730<...<=1095', (contro_delay > 730) & (contro_delay <= 1095)),
    ('>1095', contro_delay > 1095),
)
for row_label, bin_mask in contro_delay_bins:
    n_rows = contro_delay[bin_mask].count()
    table.add_row([row_label, n_rows, round(n_rows/total_participant*100, 1)])
n_missing = contro_delay.isnull().sum()
table.add_row(['Missing', n_missing, round(n_missing/total_participant*100, 1)])
print(table)


+-----------------------+-------+------+
| Contralateral relapse | Count |  %   |
+-----------------------+-------+------+
|          Yes          |   23  | 14.1 |
|           No          |  140  | 85.9 |
|        Missing        |   0   | 0.0  |
+-----------------------+-------+------+
+---------------------------------------+--------+
| Contralateral relapse interval (days) |        |
+---------------------------------------+--------+
|                  Mean                 | 503.5  |
|                  Min                  |  0.0   |
|                  Max                  | 1409.0 |
|                 Median                | 503.0  |
|                  Std                  | 347.1  |
+---------------------------------------+--------+
+---------------------------------------+-------+------+
| Contralateral relapse interval (days) | Count |  %   |
+---------------------------------------+-------+------+
|                 <=365                 |   8   | 4.9  |
|              365<...<=73

In [120]:
# Relapse outside the lungs: first the relapse status, then the interval
# between the end of the treatment and the relapse. Every percentage below is
# computed against total_participant (all participants, not only relapsers).
horspoum_flag = data_grouped['rechute_horspoum']
horspoum_delay = data_grouped['delai_fin_rechuteHorspoum']

# Table 1: number of participants with / without a relapse outside the lungs
table = PrettyTable()
table.field_names = ['Outside lung relapse', 'Count', '%']
for row_label, flag_value in (('Yes', 1), ('No', 0)):
    n_rows = horspoum_flag[horspoum_flag == flag_value].count()
    table.add_row([row_label, n_rows, round(n_rows/total_participant*100, 1)])
n_missing = horspoum_flag.isnull().sum()
table.add_row(['Missing', n_missing, round(n_missing/total_participant*100, 1)])
print(table)

# Table 2: summary statistics of the interval (only mean and std are rounded)
table = PrettyTable()
table.field_names = ['Outside lung relapse interval (days)', '']
horspoum_delay_stats = (
    ('Mean', round(horspoum_delay.mean(), 1)),
    ('Min', horspoum_delay.min()),
    ('Max', horspoum_delay.max()),
    ('Median', horspoum_delay.median()),
    ('Std', round(horspoum_delay.std(), 1)),
)
for row_label, stat_value in horspoum_delay_stats:
    table.add_row([row_label, stat_value])
print(table)

# Table 3: interval split into bins with 365 / 730 / 1095 day boundaries
table = PrettyTable()
table.field_names = ['Outside lung relapse interval (days)', 'Count', '%']
horspoum_delay_bins = (
    ('<=365', horspoum_delay <= 365),
    ('365<...<=730', (horspoum_delay > 365) & (horspoum_delay <= 730)),
    ('730<...<=1095', (horspoum_delay > 730) & (horspoum_delay <= 1095)),
    ('>1095', horspoum_delay > 1095),
)
for row_label, bin_mask in horspoum_delay_bins:
    n_rows = horspoum_delay[bin_mask].count()
    table.add_row([row_label, n_rows, round(n_rows/total_participant*100, 1)])
n_missing = horspoum_delay.isnull().sum()
table.add_row(['Missing', n_missing, round(n_missing/total_participant*100, 1)])
print(table)

+----------------------+-------+------+
| Outside lung relapse | Count |  %   |
+----------------------+-------+------+
|         Yes          |   35  | 21.5 |
|          No          |  128  | 78.5 |
|       Missing        |   0   | 0.0  |
+----------------------+-------+------+
+--------------------------------------+--------+
| Outside lung relapse interval (days) |        |
+--------------------------------------+--------+
|                 Mean                 | 497.7  |
|                 Min                  |  59.0  |
|                 Max                  | 1413.0 |
|                Median                | 476.5  |
|                 Std                  | 299.7  |
+--------------------------------------+--------+
+--------------------------------------+-------+------+
| Outside lung relapse interval (days) | Count |  %   |
+--------------------------------------+-------+------+
|                <=365                 |   14  | 8.6  |
|             365<...<=730             |   12 