# Dataset statistics

In this file we compute the statistics of the dataset so that they can be used when writing the project report. 
The statistics only concern the clinical data. 

Author: Pierre-Louis Benveniste

## Library import

In [1]:
# Import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import yaml
from prettytable import PrettyTable

## Dataset import

Here we load the merged dataset containing both the clinical and the radiomics data. 

In [2]:
# Read the merged table (clinical + radiomics features) from its CSV export
dataset_path = '~/Documents/lung-treatment-response/data/merged_data.csv'
dataset = pd.read_csv(dataset_path)

# Report the dimensions of the table as (rows, columns)
dataset_shape = dataset.shape
print('The shape of the dataset is:', dataset_shape)

# List the name of every column that was loaded
column_names = list(dataset.columns)
print('The columns of the dataset are:\n', column_names)

The shape of the dataset is: (181, 162)
The columns of the dataset are:
 ['sexe', 'age', 'BMI', 'score_charlson', 'OMS', 'tabac', 'tabac_PA', 'tabac_sevre', 'histo', 'T', 'centrale', 'dose_tot', 'etalement', 'vol_GTV', 'vol_PTV', 'vol_ITV', 'couv_PTV', 'BED_10', 'DC', 'DDD', 'cause_DC', 'Date_R_PTV', 'Date_R_homo', 'Date_R_med', 'Date_R_contro', 'Date_R_horspoum', 'Reponse', 'rechute_PTV', 'rechute_homo', 'rechute_med', 'rechute_contro', 'rechute_horspoum', 'delai_fin_DC', 'delai_fin_rechutePTV', 'delai_fin_rechuteHomo', 'delai_fin_rechuteMed', 'delai_fin_rechuteContro', 'delai_fin_rechuteHorspoum', 'follow_up', 'MORPHOLOGICAL_Volume', 'MORPHOLOGICAL_ApproximateVolume', 'MORPHOLOGICAL_voxelsCounting', 'MORPHOLOGICAL_SurfaceArea', 'MORPHOLOGICAL_SurfaceToVolumeRatio', 'MORPHOLOGICAL_Compacity', 'MORPHOLOGICAL_Compactness1', 'MORPHOLOGICAL_Compactness2', 'MORPHOLOGICAL_SphericalDisproportion', 'MORPHOLOGICAL_Sphericity', 'MORPHOLOGICAL_Asphericity', 'MORPHOLOGICAL_CentreOfMassShift', 'MO

## Participants statistics

In this first part we detail the statistics of the dataset that are specific to the participants. 

To do so, we average the different features across nodules (if a participant has multiple nodules). 

In [3]:
# Columns left out of the analysis: the dates (not useful here) and,
# presumably, the nodule-level identifiers -- TODO confirm for 'subject_nodule' / 'nodule'
columns_to_discard = [
    'DDD', 'Date_R_PTV', 'Date_R_homo', 'Date_R_med', 'Date_R_contro',
    'Date_R_horspoum', 'subject_nodule', 'nodule',
]
data = dataset.drop(columns=columns_to_discard)

# One row per participant: every remaining column is averaged over that participant's rows
data_grouped = data.groupby('subject_id').mean().reset_index()

# Averaging a yes/no flag can give a fraction, so each relapse flag is turned back
# into 0/1 (any positive mean -> 1). A NaN also becomes 0 because `NaN > 0` is False.
relapse_flags = ['rechute_homo', 'rechute_med', 'rechute_contro', 'rechute_horspoum', 'rechute_PTV']
for relapse_flag in relapse_flags:
    data_grouped[relapse_flag] = data_grouped[relapse_flag].apply(lambda value: 1 if value > 0 else 0)

# A missing death flag is counted as "not deceased"
data_grouped['DC'] = data_grouped['DC'].fillna(0)



Now we perform data analysis for each of the following features: 
- `sexe`: sex of the participant (1=female, 0=male)
- `age`: age of the participant
- `BMI`: Body Mass Index of the participant
- `score_charlson`: The Charlson score of the participant
- `OMS`: WHO (OMS) performance status of the participant (0 to 4)
- `tabac`: if the participant smokes (1=yes, 0=no)
- `tabac_PA`: pack-years (the number of packs smoked per day multiplied by the number of years smoked)
- `tabac_sevre`: if the participant is weaned off tobacco
- `DC`: if the participant died (1=yes, 0=no)
- `cause_DC`: the cause of death (1=SPE, 2=infection, 3=AEG, 4=progression, 5=other)
- `Reponse`: how the participant reacted to the treatment (1=stable, 0=no response)
- `rechute_PTV`: if the participant had a local relapse (1=yes, 0=no)
- `rechute_homo`: if the participant had a homolateral relapse (1=yes, 0=no)
- `rechute_med`: if the participant had a mediastinal relapse (1=yes, 0=no)
- `rechute_contro`: if the participant had a contralateral relapse (1=yes, 0=no)
- `rechute_horspoum`: if the participant had a relapse outside of the lungs (1=yes, 0=no)
- `delai_fin_DC`: the interval between the end of the treatment and death (if death occurs)
- `delai_fin_rechutePTV`: the interval between the end of the treatment and a local relapse (if it occurs)
- `delai_fin_rechuteHomo`: the interval between the end of the treatment and a homolateral relapse (if it occurs)
- `delai_fin_rechuteMed`: the interval between the end of the treatment and a mediastinal relapse (if it occurs)
- `delai_fin_rechuteContro`: the interval between the end of the treatment and a contralateral relapse (if it occurs)
- `delai_fin_rechuteHorspoum`: the interval between the end of the treatment and a relapse outside of the lungs (if it occurs)

In [4]:
# First let's look into sex
# Number of participants (one row per participant after the groupby on 'subject_id').
# This value is reused as the percentage denominator by the following cells.
total_participant = data_grouped.shape[0]

table = PrettyTable()
table.field_names = ['Sex', 'Count', '%']
# `sexe` coding: 1 = female, 0 = male
for sex_label, sex_code in [('Female', 1), ('Male', 0)]:
    # Each count is computed once and reused for the percentage
    sex_count = (data_grouped["sexe"] == sex_code).sum()
    table.add_row([sex_label, sex_count, round(sex_count/total_participant*100,1)])
# Label fixed: it used to read 'Total, ' (stray comma and trailing space)
table.add_row(['Total', total_participant, '100'])
print(table)

+---------+-------+------+
|   Sex   | Count |  %   |
+---------+-------+------+
|  Female |   63  | 38.7 |
|   Male  |  100  | 61.3 |
| Total,  |  163  | 100  |
+---------+-------+------+


In [5]:
# Now let's look into age
age = data_grouped["age"]

# Descriptive statistics of the age
table = PrettyTable()
table.field_names = ['Age', '']
table.add_row(['Mean', round(age.mean(),1)])
table.add_row(['Min', age.min()])
table.add_row(['Max', age.max()])
table.add_row(['Median', age.median()])
table.add_row(['Std', round(age.std(),1)])
print(table)

# Distribution of the participants over age groups (percentages are over all participants)
table = PrettyTable()
table.field_names = ['Age', 'Count', '%']
age_groups = [
    ('<=50', age <= 50),
    ('50<...<=60', (age > 50) & (age <= 60)),
    ('60<...<=70', (age > 60) & (age <= 70)),
    ('70<...<=80', (age > 70) & (age <= 80)),
    ('>80', age > 80),
    # Missing ages are counted by summing the null mask. The previous
    # `data_grouped[age.isnull()]["age"].count()` always returned 0 because
    # Series.count() ignores NaN values.
    ('Missing', age.isnull()),
]
for group_label, in_group in age_groups:
    group_count = in_group.sum()
    table.add_row([group_label, group_count, round(group_count/total_participant*100,1)])
print(table)

+--------+------+
|  Age   |      |
+--------+------+
|  Mean  | 71.6 |
|  Min   | 46.0 |
|  Max   | 93.0 |
| Median | 71.0 |
|  Std   | 9.7  |
+--------+------+
+------------+-------+------+
|    Age     | Count |  %   |
+------------+-------+------+
|    <=50    |   3   | 1.8  |
| 50<...<=60 |   16  | 9.8  |
| 60<...<=70 |   58  | 35.6 |
| 70<...<=80 |   56  | 34.4 |
|    >80     |   30  | 18.4 |
|  Missing   |   0   | 0.0  |
+------------+-------+------+


In [6]:
# Now let's focus on the BMI
bmi = data_grouped["BMI"]

# Descriptive statistics of the BMI (pandas skips missing values by default)
table = PrettyTable()
table.field_names = ['BMI', '']
bmi_summary = [
    ('Mean', round(bmi.mean(),1)),
    ('Min', bmi.min()),
    ('Max', bmi.max()),
    ('Median', bmi.median()),
    ('Std', round(bmi.std(),1)),
]
for stat_name, stat_value in bmi_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Distribution over BMI ranges; percentages are over all participants
table = PrettyTable()
table.field_names = ['BMI', 'Count', '%']
bmi_ranges = [
    ('<=18.5', bmi <= 18.5),
    ('18.5<...<=25', (bmi > 18.5) & (bmi <= 25)),
    ('25<...<=30', (bmi > 25) & (bmi <= 30)),
    ('30<...<=35', (bmi > 30) & (bmi <= 35)),
    ('>35', bmi > 35),
    ('Missing', bmi.isnull()),
]
for range_label, in_range in bmi_ranges:
    # A comparison with NaN is False, so missing values only land in the 'Missing' row
    range_count = in_range.sum()
    table.add_row([range_label, range_count, round(range_count/total_participant*100,1)])
print(table)

+--------+------+
|  BMI   |      |
+--------+------+
|  Mean  | 25.4 |
|  Min   | 15.6 |
|  Max   | 41.9 |
| Median | 24.2 |
|  Std   | 6.0  |
+--------+------+
+--------------+-------+------+
|     BMI      | Count |  %   |
+--------------+-------+------+
|    <=18.5    |   3   | 1.8  |
| 18.5<...<=25 |   29  | 17.8 |
|  25<...<=30  |   14  | 8.6  |
|  30<...<=35  |   4   | 2.5  |
|     >35      |   5   | 3.1  |
|   Missing    |  108  | 66.3 |
+--------------+-------+------+


In [7]:
# Now let's focus on the Charlson score
charlson = data_grouped["score_charlson"]

# Descriptive statistics of the Charlson score
table = PrettyTable()
table.field_names = ['Charlson score', '']
charlson_summary = [
    ('Mean', round(charlson.mean(),1)),
    ('Min', charlson.min()),
    ('Max', charlson.max()),
    ('Median', charlson.median()),
    ('Std', round(charlson.std(),1)),
]
for stat_name, stat_value in charlson_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Distribution over score ranges; percentages are over all participants
table = PrettyTable()
table.field_names = ['Charlson score', 'Count', '%']
charlson_ranges = [
    ('<=3', charlson <= 3),
    ('3<...<=6', (charlson > 3) & (charlson <= 6)),
    ('6<...<=9', (charlson > 6) & (charlson <= 9)),
    ('>9', charlson > 9),
    ('Missing', charlson.isnull()),
]
for range_label, in_range in charlson_ranges:
    range_count = in_range.sum()
    table.add_row([range_label, range_count, round(range_count/total_participant*100,1)])
print(table)

+----------------+------+
| Charlson score |      |
+----------------+------+
|      Mean      | 7.2  |
|      Min       | 2.0  |
|      Max       | 13.0 |
|     Median     | 7.0  |
|      Std       | 2.3  |
+----------------+------+
+----------------+-------+------+
| Charlson score | Count |  %   |
+----------------+-------+------+
|      <=3       |   4   | 2.5  |
|    3<...<=6    |   69  | 42.3 |
|    6<...<=9    |   58  | 35.6 |
|       >9       |   32  | 19.6 |
|    Missing     |   0   | 0.0  |
+----------------+-------+------+


In [8]:
# Now the OMS
oms = data_grouped["OMS"]

# Descriptive statistics of the OMS score
table = PrettyTable()
table.field_names = ['OMS', '']
oms_summary = [
    ('Mean', round(oms.mean(),1)),
    ('Min', oms.min()),
    ('Max', oms.max()),
    ('Median', oms.median()),
    ('Std', round(oms.std(),1)),
]
for stat_name, stat_value in oms_summary:
    table.add_row([stat_name, stat_value])
print(table)

# One row per OMS grade from 0 to 4, then the missing values
table = PrettyTable()
table.field_names = ['OMS', 'Count', '%']
oms_rows = [(str(grade), oms == grade) for grade in range(5)]
oms_rows.append(('Missing', oms.isnull()))
for row_label, is_selected in oms_rows:
    row_count = is_selected.sum()
    table.add_row([row_label, row_count, round(row_count/total_participant*100,1)])
print(table)

+--------+-----+
|  OMS   |     |
+--------+-----+
|  Mean  | 0.8 |
|  Min   | 0.0 |
|  Max   | 2.0 |
| Median | 1.0 |
|  Std   | 0.5 |
+--------+-----+
+---------+-------+------+
|   OMS   | Count |  %   |
+---------+-------+------+
|    0    |   38  | 23.3 |
|    1    |  106  | 65.0 |
|    2    |   11  | 6.7  |
|    3    |   0   | 0.0  |
|    4    |   0   | 0.0  |
| Missing |   8   | 4.9  |
+---------+-------+------+


In [9]:
# Now the smoking status
smoking = data_grouped["tabac"]

table = PrettyTable()
table.field_names = ['Smoking status', 'Count', '%']
# `tabac` coding: 1 = smoker, 0 = non-smoker
smoking_rows = [
    ('Yes', smoking == 1),
    ('No', smoking == 0),
    ('Missing', smoking.isnull()),
]
for row_label, is_selected in smoking_rows:
    row_count = is_selected.sum()
    table.add_row([row_label, row_count, round(row_count/total_participant*100,1)])
print(table)

+----------------+-------+------+
| Smoking status | Count |  %   |
+----------------+-------+------+
|      Yes       |  123  | 75.5 |
|       No       |   27  | 16.6 |
|    Missing     |   13  | 8.0  |
+----------------+-------+------+


In [10]:
# Now the pack-years
pack_years = data_grouped["tabac_PA"]

# Descriptive statistics of the pack-years
table = PrettyTable()
table.field_names = ['Pack-years', '']
pack_years_summary = [
    ('Mean', round(pack_years.mean(),1)),
    ('Min', pack_years.min()),
    ('Max', pack_years.max()),
    ('Median', pack_years.median()),
    ('Std', round(pack_years.std(),1)),
]
for stat_name, stat_value in pack_years_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Distribution over pack-year ranges; percentages are over all participants
table = PrettyTable()
table.field_names = ['Pack-years', 'Count', '%']
pack_years_ranges = [
    ('<=20', pack_years <= 20),
    ('20<...<=40', (pack_years > 20) & (pack_years <= 40)),
    ('40<...<=60', (pack_years > 40) & (pack_years <= 60)),
    ('>60', pack_years > 60),
    ('Missing', pack_years.isnull()),
]
for range_label, in_range in pack_years_ranges:
    range_count = in_range.sum()
    table.add_row([range_label, range_count, round(range_count/total_participant*100,1)])
print(table)

+------------+-------+
| Pack-years |       |
+------------+-------+
|    Mean    |  48.8 |
|    Min     |  7.0  |
|    Max     | 120.0 |
|   Median   |  45.0 |
|    Std     |  24.3 |
+------------+-------+
+------------+-------+------+
| Pack-years | Count |  %   |
+------------+-------+------+
|    <=20    |   11  | 6.7  |
| 20<...<=40 |   39  | 23.9 |
| 40<...<=60 |   36  | 22.1 |
|    >60     |   23  | 14.1 |
|  Missing   |   54  | 33.1 |
+------------+-------+------+


In [11]:
# Now if the participant had weaned off smoking
weaned = data_grouped["tabac_sevre"]

table = PrettyTable()
table.field_names = ['Weaned off smoking', 'Count', '%']
weaned_rows = [
    ('Yes', weaned == 1),
    ('No', weaned == 0),
    ('Missing', weaned.isnull()),
]
for row_label, is_selected in weaned_rows:
    row_count = is_selected.sum()
    table.add_row([row_label, row_count, round(row_count/total_participant*100,1)])
print(table)

+--------------------+-------+------+
| Weaned off smoking | Count |  %   |
+--------------------+-------+------+
|        Yes         |   80  | 49.1 |
|         No         |   37  | 22.7 |
|      Missing       |   46  | 28.2 |
+--------------------+-------+------+


In [12]:
# Now if the person has died or not
death_flag = data_grouped["DC"]

table = PrettyTable()
table.field_names = ['Death', 'Count', '%']
# `DC` coding: 1 = deceased, 0 = not deceased
for row_label, flag_value in [('Yes', 1), ('No', 0)]:
    row_count = (death_flag == flag_value).sum()
    table.add_row([row_label, row_count, round(row_count/total_participant*100,1)])
print(table)

# And the interval between the end of treatment and death
death_delay = data_grouped["delai_fin_DC"]

table = PrettyTable()
table.field_names = ['Interval between end of treatment and death (days)', '']
death_delay_summary = [
    ('Mean', round(death_delay.mean(),1)),
    ('Min', death_delay.min()),
    ('Max', death_delay.max()),
    ('Median', death_delay.median()),
    ('Std', round(death_delay.std(),1)),
]
for stat_name, stat_value in death_delay_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Distribution of the interval in steps of 365 days; percentages are over all participants
table = PrettyTable()
table.field_names = ['Interval between end of treatment and death (days)', 'Count', '%']
death_delay_ranges = [
    ('<=365', death_delay <= 365),
    ('365<...<=730', (death_delay > 365) & (death_delay <= 730)),
    ('730<...<=1095', (death_delay > 730) & (death_delay <= 1095)),
    ('>1095', death_delay > 1095),
    ('Missing', death_delay.isnull()),
]
for range_label, in_range in death_delay_ranges:
    range_count = in_range.sum()
    table.add_row([range_label, range_count, round(range_count/total_participant*100,1)])
print(table)

+-------+-------+------+
| Death | Count |  %   |
+-------+-------+------+
|  Yes  |   47  | 28.8 |
|   No  |  116  | 71.2 |
+-------+-------+------+
+----------------------------------------------------+--------+
| Interval between end of treatment and death (days) |        |
+----------------------------------------------------+--------+
|                        Mean                        | 801.5  |
|                        Min                         |  0.0   |
|                        Max                         | 2033.0 |
|                       Median                       | 739.0  |
|                        Std                         | 474.8  |
+----------------------------------------------------+--------+
+----------------------------------------------------+-------+------+
| Interval between end of treatment and death (days) | Count |  %   |
+----------------------------------------------------+-------+------+
|                       <=365                        |   10  | 6

In [13]:
# Now we look at the cause of death
# Only deceased participants (DC == 1) are considered, so that every row of the
# table is a subset of the denominator used for '% of total death'.
deceased = data_grouped[data_grouped["DC"]==1]
total_death = deceased.shape[0]

table = PrettyTable()
table.field_names = ['Cause of death', 'Count', '% of total death']
# `cause_DC` coding: 1=SPE, 2=infection, 3=AEG, 4=progression, 5=other
cause_rows = [
    ('SPE', deceased["cause_DC"] == 1),
    ('Infection', deceased["cause_DC"] == 2),
    ('AEG', deceased["cause_DC"] == 3),
    ('Progression', deceased["cause_DC"] == 4),
    ('Other', deceased["cause_DC"] == 5),
    # Deceased participants without a recorded cause are counted directly. The previous
    # formula (all null causes minus all non-deceased participants) was only right when
    # no non-deceased participant had a cause filled in.
    ('Missing', deceased["cause_DC"].isnull()),
]
for cause_label, has_cause in cause_rows:
    cause_count = has_cause.sum()
    # Guard against a division by zero when nobody died
    cause_percent = round(cause_count/total_death*100,1) if total_death else 0.0
    table.add_row([cause_label, cause_count, cause_percent])
print(table)

+----------------+-------+------------------+
| Cause of death | Count | % of total death |
+----------------+-------+------------------+
|      SPE       |   7   |       14.9       |
|   Infection    |   4   |       8.5        |
|      AEG       |   2   |       4.3        |
|  Progression   |   3   |       6.4        |
|     Other      |   2   |       4.3        |
|    Missing     |   29  |       61.7       |
+----------------+-------+------------------+


In [14]:
# Now we look at the response to the treatment
response = data_grouped['Reponse']

table = PrettyTable()
table.field_names = ['Response to treatment', 'Count', '%']
# `Reponse` coding: 1 = stable, 0 = no response
response_rows = [
    ('Stable', response == 1),
    ('No response', response == 0),
    ('Missing', response.isnull()),
]
for row_label, is_selected in response_rows:
    row_count = is_selected.sum()
    table.add_row([row_label, row_count, round(row_count/total_participant*100,1)])
print(table)

+-----------------------+-------+------+
| Response to treatment | Count |  %   |
+-----------------------+-------+------+
|         Stable        |  112  | 68.7 |
|      No response      |   3   | 1.8  |
|        Missing        |   48  | 29.4 |
+-----------------------+-------+------+


In [15]:
# Now we look at local relapse
local_relapse = data_grouped['rechute_PTV']

table = PrettyTable()
table.field_names = ['Local relapse', 'Count', '%']
local_relapse_rows = [
    ('Yes', local_relapse == 1),
    ('No', local_relapse == 0),
    ('Missing', local_relapse.isnull()),
]
for row_label, is_selected in local_relapse_rows:
    row_count = is_selected.sum()
    table.add_row([row_label, row_count, round(row_count/total_participant*100,1)])
print(table)

# And the interval between the end of the treatment and the local relapse
local_delay = data_grouped["delai_fin_rechutePTV"]

table = PrettyTable()
table.field_names = ['Local relapse interval (days)', '']
local_delay_summary = [
    ('Mean', round(local_delay.mean(),1)),
    ('Min', local_delay.min()),
    ('Max', local_delay.max()),
    ('Median', local_delay.median()),
    ('Std', round(local_delay.std(),1)),
]
for stat_name, stat_value in local_delay_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Distribution of the interval in steps of 365 days; percentages are over all participants
table = PrettyTable()
table.field_names = ['Local relapse interval (days)', 'Count', '%']
local_delay_ranges = [
    ('<=365', local_delay <= 365),
    ('365<...<=730', (local_delay > 365) & (local_delay <= 730)),
    ('730<...<=1095', (local_delay > 730) & (local_delay <= 1095)),
    ('>1095', local_delay > 1095),
    ('Missing', local_delay.isnull()),
]
for range_label, in_range in local_delay_ranges:
    range_count = in_range.sum()
    table.add_row([range_label, range_count, round(range_count/total_participant*100,1)])
print(table)


+---------------+-------+------+
| Local relapse | Count |  %   |
+---------------+-------+------+
|      Yes      |   24  | 14.7 |
|       No      |  139  | 85.3 |
|    Missing    |   0   | 0.0  |
+---------------+-------+------+
+-------------------------------+--------+
| Local relapse interval (days) |        |
+-------------------------------+--------+
|              Mean             | 451.5  |
|              Min              |  0.0   |
|              Max              | 1175.0 |
|             Median            | 405.5  |
|              Std              | 292.2  |
+-------------------------------+--------+
+-------------------------------+-------+------+
| Local relapse interval (days) | Count |  %   |
+-------------------------------+-------+------+
|             <=365             |   10  | 6.1  |
|          365<...<=730         |   11  | 6.7  |
|         730<...<=1095         |   1   | 0.6  |
|             >1095             |   2   | 1.2  |
|            Missing            |  139 

In [16]:
# Now we look at relapse in the homolateral lung
homo_relapse = data_grouped['rechute_homo']

table = PrettyTable()
table.field_names = ['Homolateral relapse', 'Count', '%']
homo_relapse_rows = [
    ('Yes', homo_relapse == 1),
    ('No', homo_relapse == 0),
    ('Missing', homo_relapse.isnull()),
]
for row_label, is_selected in homo_relapse_rows:
    row_count = is_selected.sum()
    table.add_row([row_label, row_count, round(row_count/total_participant*100,1)])
print(table)

# And the interval between the end of the treatment and the homolateral relapse
homo_delay = data_grouped["delai_fin_rechuteHomo"]

table = PrettyTable()
table.field_names = ['Homolateral relapse interval (days)', '']
homo_delay_summary = [
    ('Mean', round(homo_delay.mean(),1)),
    ('Min', homo_delay.min()),
    ('Max', homo_delay.max()),
    ('Median', homo_delay.median()),
    ('Std', round(homo_delay.std(),1)),
]
for stat_name, stat_value in homo_delay_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Distribution of the interval in steps of 365 days; percentages are over all participants
table = PrettyTable()
table.field_names = ['Homolateral relapse interval (days)', 'Count', '%']
homo_delay_ranges = [
    ('<=365', homo_delay <= 365),
    ('365<...<=730', (homo_delay > 365) & (homo_delay <= 730)),
    ('730<...<=1095', (homo_delay > 730) & (homo_delay <= 1095)),
    ('>1095', homo_delay > 1095),
    ('Missing', homo_delay.isnull()),
]
for range_label, in_range in homo_delay_ranges:
    range_count = in_range.sum()
    table.add_row([range_label, range_count, round(range_count/total_participant*100,1)])
print(table)

+---------------------+-------+------+
| Homolateral relapse | Count |  %   |
+---------------------+-------+------+
|         Yes         |   25  | 15.3 |
|          No         |  138  | 84.7 |
|       Missing       |   0   | 0.0  |
+---------------------+-------+------+
+-------------------------------------+--------+
| Homolateral relapse interval (days) |        |
+-------------------------------------+--------+
|                 Mean                | 516.8  |
|                 Min                 |  64.0  |
|                 Max                 | 1077.0 |
|                Median               | 503.0  |
|                 Std                 | 308.4  |
+-------------------------------------+--------+
+-------------------------------------+-------+------+
| Homolateral relapse interval (days) | Count |  %   |
+-------------------------------------+-------+------+
|                <=365                |   9   | 5.5  |
|             365<...<=730            |   9   | 5.5  |
|          

In [17]:
# Now we look at relapse in the mediastinum
med_relapse = data_grouped['rechute_med']

table = PrettyTable()
table.field_names = ['Mediastinal relapse', 'Count', '%']
med_relapse_rows = [
    ('Yes', med_relapse == 1),
    ('No', med_relapse == 0),
    ('Missing', med_relapse.isnull()),
]
for row_label, is_selected in med_relapse_rows:
    row_count = is_selected.sum()
    table.add_row([row_label, row_count, round(row_count/total_participant*100,1)])
print(table)

# And the interval between the end of the treatment and the mediastinum relapse
med_delay = data_grouped["delai_fin_rechuteMed"]

table = PrettyTable()
table.field_names = ['Mediastinal relapse interval (days)', '']
med_delay_summary = [
    ('Mean', round(med_delay.mean(),1)),
    ('Min', med_delay.min()),
    ('Max', med_delay.max()),
    ('Median', med_delay.median()),
    ('Std', round(med_delay.std(),1)),
]
for stat_name, stat_value in med_delay_summary:
    table.add_row([stat_name, stat_value])
print(table)

# Distribution of the interval in steps of 365 days; percentages are over all participants
table = PrettyTable()
table.field_names = ['Mediastinal relapse interval (days)', 'Count', '%']
med_delay_ranges = [
    ('<=365', med_delay <= 365),
    ('365<...<=730', (med_delay > 365) & (med_delay <= 730)),
    ('730<...<=1095', (med_delay > 730) & (med_delay <= 1095)),
    ('>1095', med_delay > 1095),
    ('Missing', med_delay.isnull()),
]
for range_label, in_range in med_delay_ranges:
    range_count = in_range.sum()
    table.add_row([range_label, range_count, round(range_count/total_participant*100,1)])
print(table)

+---------------------+-------+------+
| Mediastinal relapse | Count |  %   |
+---------------------+-------+------+
|         Yes         |   19  | 11.7 |
|          No         |  144  | 88.3 |
|       Missing       |   0   | 0.0  |
+---------------------+-------+------+
+-------------------------------------+--------+
| Mediastinal relapse interval (days) |        |
+-------------------------------------+--------+
|                 Mean                | 419.4  |
|                 Min                 |  59.0  |
|                 Max                 | 1413.0 |
|                Median               | 413.0  |
|                 Std                 | 327.3  |
+-------------------------------------+--------+
+-------------------------------------+-------+------+
| Mediastinal relapse interval (days) | Count |  %   |
+-------------------------------------+-------+------+
|                <=365                |   8   | 4.9  |
|             365<...<=730            |   9   | 5.5  |
|          

In [18]:
# Contralateral lung relapse, per participant: occurrence table first
contro_relapse = data_grouped['rechute_contro']
table = PrettyTable()
table.field_names = ['Contralateral relapse', 'Count', '%']
for row_label, row_count in [
    ('Yes', (contro_relapse == 1).sum()),
    ('No', (contro_relapse == 0).sum()),
    ('Missing', contro_relapse.isnull().sum()),
]:
    # Percentages are expressed relative to the total number of participants
    table.add_row([row_label, row_count, round(row_count / total_participant * 100, 1)])
print(table)

# Summary statistics of the interval between the end of the treatment
# and the contralateral relapse (only mean and std are rounded)
contro_delay = data_grouped["delai_fin_rechuteContro"]
table = PrettyTable()
table.field_names = ['Contralateral relapse interval (days)', '']
for row_label, row_value in [
    ('Mean', round(contro_delay.mean(), 1)),
    ('Min', contro_delay.min()),
    ('Max', contro_delay.max()),
    ('Median', contro_delay.median()),
    ('Std', round(contro_delay.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# Same interval split into 365-day bins; NaN values only match the 'Missing' mask
table = PrettyTable()
table.field_names = ['Contralateral relapse interval (days)', 'Count', '%']
for row_label, row_mask in [
    ('<=365', contro_delay.le(365)),
    ('365<...<=730', contro_delay.gt(365) & contro_delay.le(730)),
    ('730<...<=1095', contro_delay.gt(730) & contro_delay.le(1095)),
    ('>1095', contro_delay.gt(1095)),
    ('Missing', contro_delay.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_participant * 100, 1)])
print(table)


+-----------------------+-------+------+
| Contralateral relapse | Count |  %   |
+-----------------------+-------+------+
|          Yes          |   23  | 14.1 |
|           No          |  140  | 85.9 |
|        Missing        |   0   | 0.0  |
+-----------------------+-------+------+
+---------------------------------------+--------+
| Contralateral relapse interval (days) |        |
+---------------------------------------+--------+
|                  Mean                 | 503.5  |
|                  Min                  |  0.0   |
|                  Max                  | 1409.0 |
|                 Median                | 503.0  |
|                  Std                  | 347.1  |
+---------------------------------------+--------+
+---------------------------------------+-------+------+
| Contralateral relapse interval (days) | Count |  %   |
+---------------------------------------+-------+------+
|                 <=365                 |   8   | 4.9  |
|              365<...<=73

In [19]:
# Relapse outside the lungs, per participant: occurrence table first
horspoum_relapse = data_grouped['rechute_horspoum']
table = PrettyTable()
table.field_names = ['Outside lung relapse', 'Count', '%']
for row_label, row_count in [
    ('Yes', (horspoum_relapse == 1).sum()),
    ('No', (horspoum_relapse == 0).sum()),
    ('Missing', horspoum_relapse.isnull().sum()),
]:
    # Percentages are expressed relative to the total number of participants
    table.add_row([row_label, row_count, round(row_count / total_participant * 100, 1)])
print(table)

# Summary statistics of the interval between the end of the treatment
# and the outside lung relapse (only mean and std are rounded)
horspoum_delay = data_grouped["delai_fin_rechuteHorspoum"]
table = PrettyTable()
table.field_names = ['Outside lung relapse interval (days)', '']
for row_label, row_value in [
    ('Mean', round(horspoum_delay.mean(), 1)),
    ('Min', horspoum_delay.min()),
    ('Max', horspoum_delay.max()),
    ('Median', horspoum_delay.median()),
    ('Std', round(horspoum_delay.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# Same interval split into 365-day bins; NaN values only match the 'Missing' mask
table = PrettyTable()
table.field_names = ['Outside lung relapse interval (days)', 'Count', '%']
for row_label, row_mask in [
    ('<=365', horspoum_delay.le(365)),
    ('365<...<=730', horspoum_delay.gt(365) & horspoum_delay.le(730)),
    ('730<...<=1095', horspoum_delay.gt(730) & horspoum_delay.le(1095)),
    ('>1095', horspoum_delay.gt(1095)),
    ('Missing', horspoum_delay.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_participant * 100, 1)])
print(table)

+----------------------+-------+------+
| Outside lung relapse | Count |  %   |
+----------------------+-------+------+
|         Yes          |   35  | 21.5 |
|          No          |  128  | 78.5 |
|       Missing        |   0   | 0.0  |
+----------------------+-------+------+
+--------------------------------------+--------+
| Outside lung relapse interval (days) |        |
+--------------------------------------+--------+
|                 Mean                 | 497.7  |
|                 Min                  |  59.0  |
|                 Max                  | 1413.0 |
|                Median                | 476.5  |
|                 Std                  | 299.7  |
+--------------------------------------+--------+
+--------------------------------------+-------+------+
| Outside lung relapse interval (days) | Count |  %   |
+--------------------------------------+-------+------+
|                <=365                 |   14  | 8.6  |
|             365<...<=730             |   12 

In [20]:
# Follow-up time per participant: summary statistics
# (only mean and std are rounded)
follow_up_days = data_grouped["follow_up"]
table = PrettyTable()
table.field_names = ['Follow-up time (days)', '']
for row_label, row_value in [
    ('Mean', round(follow_up_days.mean(), 1)),
    ('Min', follow_up_days.min()),
    ('Max', follow_up_days.max()),
    ('Median', follow_up_days.median()),
    ('Std', round(follow_up_days.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# Follow-up time split into 365-day bins; NaN values only match the 'Missing' mask.
# Percentages are expressed relative to the total number of participants.
table = PrettyTable()
table.field_names = ['Follow-up time (days)', 'Count', '%']
for row_label, row_mask in [
    ('<=365', follow_up_days.le(365)),
    ('365<...<=730', follow_up_days.gt(365) & follow_up_days.le(730)),
    ('730<...<=1095', follow_up_days.gt(730) & follow_up_days.le(1095)),
    ('>1095', follow_up_days.gt(1095)),
    ('Missing', follow_up_days.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_participant * 100, 1)])
print(table)

+-----------------------+--------+
| Follow-up time (days) |        |
+-----------------------+--------+
|          Mean         | 949.7  |
|          Min          |  21.0  |
|          Max          | 2025.0 |
|         Median        | 782.0  |
|          Std          | 512.8  |
+-----------------------+--------+
+-----------------------+-------+------+
| Follow-up time (days) | Count |  %   |
+-----------------------+-------+------+
|         <=365         |   9   | 5.5  |
|      365<...<=730     |   54  | 33.1 |
|     730<...<=1095     |   41  | 25.2 |
|         >1095         |   44  | 27.0 |
|        Missing        |   15  | 9.2  |
+-----------------------+-------+------+


## Nodule analysis

In this section we focus on the nodule-level features of the dataset.

We will specifically detail the following features:
- `histo`: the nodule histology (1=ADK, 2=CE, 3=other, 4=NC)
- `T`: T stage of the nodule (1=T1=<3cm, 2=T2: Tumor > 3 cm but ≤ 5 cm, or involving bronchus, visceral pleura, causing atelectasis or obstructive pneumonia, 3=T3: Tumor > 5 cm but ≤ 7 cm, or involving pericardium, parietal pleura, phrenic nerve, or separate tumor nodules in the same lobe, 4=T4: Tumor > 7 cm or involving critical organs, or separate tumor nodules in different lobes, 5=Primary tumor size cannot be assessed or information is insufficient.)
- `centrale`: whether the tumor is centrally located within the lung (1=yes=<2cm, 0=no)
- `dose_tot`: total dose of radiation (in Gray)
- `etalement`: the duration of the radiation therapy (in days)
- `vol_GTV`: volume of the Gross Tumor Volume (in cm3)
- `vol_PTV`: volume of the Planning Target Volume (in cm3)
- `vol_ITV`: volume of the Internal Target Volume (in cm3)
- `couv_PTV`: coverage of the Planning Target Volume (in %)
- `BED_10`: Biologically Effective Dose with an α/β ratio of 10

In [21]:
# Overview of the nodule-level data: the row count of `data` is reported as
# the number of nodules, and the row count of `data_grouped` is used as the
# number of patients for the average.
n_nodule_rows = len(data)
n_patient_rows = len(data_grouped)
print("Number of nodules:", n_nodule_rows)
print("Average number of nodules per patient:", round(n_nodule_rows / n_patient_rows, 2))

Number of nodules: 181
Average number of nodules per patient: 1.11


In [22]:
# Histology of the nodules (codes: 1=ADK, 2=CE, 3=other, 4=NC)
# total_nodules is the denominator of every '% of nodules' column below
total_nodules = data.shape[0]
histo_codes = data['histo']
table = PrettyTable()
table.field_names = ['Histology', 'Count', '% of nodules']
for histo_value, row_label in enumerate(['ADK', 'CE', 'Other', 'NC'], start=1):
    row_count = (histo_codes == histo_value).sum()
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
# Nodules without any histology code
row_count = histo_codes.isnull().sum()
table.add_row(['Missing', row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+-----------+-------+--------------+
| Histology | Count | % of nodules |
+-----------+-------+--------------+
|    ADK    |   29  |     16.0     |
|     CE    |   15  |     8.3      |
|   Other   |   6   |     3.3      |
|     NC    |  131  |     72.4     |
|  Missing  |   0   |     0.0      |
+-----------+-------+--------------+


In [23]:
# T stage of the nodules (codes 1 to 4 are T1 to T4, code 5 is reported as Tx)
t_stage = data['T']
table = PrettyTable()
table.field_names = ['T stage', 'Count', '% of nodules']
for stage_value, row_label in enumerate(['T1', 'T2', 'T3', 'T4', 'Tx'], start=1):
    row_count = (t_stage == stage_value).sum()
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
# Nodules without any T stage
row_count = t_stage.isnull().sum()
table.add_row(['Missing', row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+---------+-------+--------------+
| T stage | Count | % of nodules |
+---------+-------+--------------+
|    T1   |   97  |     53.6     |
|    T2   |   16  |     8.8      |
|    T3   |   0   |     0.0      |
|    T4   |   0   |     0.0      |
|    Tx   |   1   |     0.6      |
| Missing |   67  |     37.0     |
+---------+-------+--------------+


In [24]:
# Central location of the nodules ('centrale': 1=yes, 0=no)
central_flag = data['centrale']
table = PrettyTable()
table.field_names = ['Central', 'Count', '% of nodules']
for row_label, row_count in [
    ('Yes, <=2cm', (central_flag == 1).sum()),
    ('No', (central_flag == 0).sum()),
    ('Missing', central_flag.isnull().sum()),
]:
    # Percentages are expressed relative to the total number of nodules
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+------------+-------+--------------+
|  Central   | Count | % of nodules |
+------------+-------+--------------+
| Yes, <=2cm |   48  |     26.5     |
|     No     |  133  |     73.5     |
|  Missing   |   0   |     0.0      |
+------------+-------+--------------+


In [25]:
# Total dose (in Gray): summary statistics (only mean and std are rounded)
total_dose = data["dose_tot"]
table = PrettyTable()
table.field_names = ['Total dose (in Gray)', '']
for row_label, row_value in [
    ('Mean', round(total_dose.mean(), 1)),
    ('Min', total_dose.min()),
    ('Max', total_dose.max()),
    ('Median', total_dose.median()),
    ('Std', round(total_dose.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# Total dose split into bins; NaN values only match the 'Missing' mask.
# Percentages are expressed relative to the total number of nodules.
table = PrettyTable()
table.field_names = ['Total dose (in Gray)', 'Count', '%']
for row_label, row_mask in [
    ('<=50', total_dose.le(50)),
    ('50<...<=55', total_dose.gt(50) & total_dose.le(55)),
    ('>55', total_dose.gt(55)),
    ('Missing', total_dose.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+----------------------+------+
| Total dose (in Gray) |      |
+----------------------+------+
|         Mean         | 56.2 |
|         Min          | 30.0 |
|         Max          | 60.0 |
|        Median        | 60.0 |
|         Std          | 5.8  |
+----------------------+------+
+----------------------+-------+------+
| Total dose (in Gray) | Count |  %   |
+----------------------+-------+------+
|         <=50         |   32  | 17.7 |
|      50<...<=55      |   41  | 22.7 |
|         >55          |  108  | 59.7 |
|       Missing        |   0   | 0.0  |
+----------------------+-------+------+


In [26]:
# Duration of the treatment ('etalement', in days): summary statistics
# (only mean and std are rounded)
treatment_days = data["etalement"]
table = PrettyTable()
table.field_names = ['Treatment duration (in days)', '']
for row_label, row_value in [
    ('Mean', round(treatment_days.mean(), 1)),
    ('Min', treatment_days.min()),
    ('Max', treatment_days.max()),
    ('Median', treatment_days.median()),
    ('Std', round(treatment_days.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# Duration split into 5-day bins; NaN values only match the 'Missing' mask.
# Percentages are expressed relative to the total number of nodules.
table = PrettyTable()
table.field_names = ['Treatment duration (in days)', 'Count', '%']
for row_label, row_mask in [
    ('<=5', treatment_days.le(5)),
    ('5<...<=10', treatment_days.gt(5) & treatment_days.le(10)),
    ('10<...<=15', treatment_days.gt(10) & treatment_days.le(15)),
    ('15<...<=20', treatment_days.gt(15) & treatment_days.le(20)),
    ('>20', treatment_days.gt(20)),
    ('Missing', treatment_days.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+------------------------------+------+
| Treatment duration (in days) |      |
+------------------------------+------+
|             Mean             | 11.2 |
|             Min              |  2   |
|             Max              |  46  |
|            Median            | 10.0 |
|             Std              | 6.4  |
+------------------------------+------+
+------------------------------+-------+------+
| Treatment duration (in days) | Count |  %   |
+------------------------------+-------+------+
|             <=5              |   33  | 18.2 |
|          5<...<=10           |   69  | 38.1 |
|          10<...<=15          |   35  | 19.3 |
|          15<...<=20          |   32  | 17.7 |
|             >20              |   12  | 6.6  |
|           Missing            |   0   | 0.0  |
+------------------------------+-------+------+


In [27]:
# Volume of the GTV (in cm3): summary statistics (only mean and std are rounded)
gtv_volume = data["vol_GTV"]
table = PrettyTable()
table.field_names = ['Volume of GTV (in cm3)', '']
for row_label, row_value in [
    ('Mean', round(gtv_volume.mean(), 1)),
    ('Min', gtv_volume.min()),
    ('Max', gtv_volume.max()),
    ('Median', gtv_volume.median()),
    ('Std', round(gtv_volume.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# GTV volume split into bins; NaN values only match the 'Missing' mask.
# Percentages are expressed relative to the total number of nodules.
table = PrettyTable()
table.field_names = ['Volume of GTV (in cm3)', 'Count', '%']
for row_label, row_mask in [
    ('<=5', gtv_volume.le(5)),
    ('5<...<=10', gtv_volume.gt(5) & gtv_volume.le(10)),
    ('>10', gtv_volume.gt(10)),
    ('Missing', gtv_volume.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+------------------------+-------+
| Volume of GTV (in cm3) |       |
+------------------------+-------+
|          Mean          |  6.9  |
|          Min           |  0.13 |
|          Max           | 51.24 |
|         Median         |  3.83 |
|          Std           |  8.3  |
+------------------------+-------+
+------------------------+-------+------+
| Volume of GTV (in cm3) | Count |  %   |
+------------------------+-------+------+
|          <=5           |   99  | 54.7 |
|       5<...<=10        |   39  | 21.5 |
|          >10           |   35  | 19.3 |
|        Missing         |   8   | 4.4  |
+------------------------+-------+------+


In [28]:
# Volume of the PTV (in cm3): summary statistics (only mean and std are rounded)
ptv_volume = data["vol_PTV"]
table = PrettyTable()
table.field_names = ['Volume of PTV (in cm3)', '']
for row_label, row_value in [
    ('Mean', round(ptv_volume.mean(), 1)),
    ('Min', ptv_volume.min()),
    ('Max', ptv_volume.max()),
    ('Median', ptv_volume.median()),
    ('Std', round(ptv_volume.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# PTV volume split into bins; NaN values only match the 'Missing' mask.
# Percentages are expressed relative to the total number of nodules.
table = PrettyTable()
table.field_names = ['Volume of PTV (in cm3)', 'Count', '%']
for row_label, row_mask in [
    ('<=10', ptv_volume.le(10)),
    ('10<...<=20', ptv_volume.gt(10) & ptv_volume.le(20)),
    ('20<...<=30', ptv_volume.gt(20) & ptv_volume.le(30)),
    ('>30', ptv_volume.gt(30)),
    ('Missing', ptv_volume.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+------------------------+-------+
| Volume of PTV (in cm3) |       |
+------------------------+-------+
|          Mean          |  24.2 |
|          Min           |  1.79 |
|          Max           | 97.21 |
|         Median         |  18.1 |
|          Std           |  19.8 |
+------------------------+-------+
+------------------------+-------+------+
| Volume of PTV (in cm3) | Count |  %   |
+------------------------+-------+------+
|          <=10          |   41  | 22.7 |
|       10<...<=20       |   46  | 25.4 |
|       20<...<=30       |   36  | 19.9 |
|          >30           |   40  | 22.1 |
|        Missing         |   18  | 9.9  |
+------------------------+-------+------+


In [29]:
# Volume of the ITV (in cm3): summary statistics (only mean and std are rounded)
itv_volume = data["vol_ITV"]
table = PrettyTable()
table.field_names = ['Volume of ITV (in cm3)', '']
for row_label, row_value in [
    ('Mean', round(itv_volume.mean(), 1)),
    ('Min', itv_volume.min()),
    ('Max', itv_volume.max()),
    ('Median', itv_volume.median()),
    ('Std', round(itv_volume.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# ITV volume split into bins; NaN values only match the 'Missing' mask.
# Percentages are expressed relative to the total number of nodules.
table = PrettyTable()
table.field_names = ['Volume of ITV (in cm3)', 'Count', '%']
for row_label, row_mask in [
    ('<=5', itv_volume.le(5)),
    ('5<...<=10', itv_volume.gt(5) & itv_volume.le(10)),
    ('10<...<=15', itv_volume.gt(10) & itv_volume.le(15)),
    ('>15', itv_volume.gt(15)),
    ('Missing', itv_volume.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+------------------------+-------+
| Volume of ITV (in cm3) |       |
+------------------------+-------+
|          Mean          |  11.5 |
|          Min           |  0.59 |
|          Max           | 47.25 |
|         Median         |  6.98 |
|          Std           |  10.8 |
+------------------------+-------+
+------------------------+-------+------+
| Volume of ITV (in cm3) | Count |  %   |
+------------------------+-------+------+
|          <=5           |   34  | 18.8 |
|       5<...<=10        |   26  | 14.4 |
|       10<...<=15       |   8   | 4.4  |
|          >15           |   27  | 14.9 |
|        Missing         |   86  | 47.5 |
+------------------------+-------+------+


In [30]:
# Coverage of the PTV ('couv_PTV', in %): summary statistics
# (only mean and std are rounded)
ptv_coverage = data["couv_PTV"]
table = PrettyTable()
table.field_names = ['Coverage PTV (%)', '']
for row_label, row_value in [
    ('Mean', round(ptv_coverage.mean(), 1)),
    ('Min', ptv_coverage.min()),
    ('Max', ptv_coverage.max()),
    ('Median', ptv_coverage.median()),
    ('Std', round(ptv_coverage.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# PTV coverage split into bins; NaN values only match the 'Missing' mask.
# Percentages are expressed relative to the total number of nodules.
table = PrettyTable()
table.field_names = ['Coverage PTV (%)', 'Count', '%']
for row_label, row_mask in [
    ('<=90', ptv_coverage.le(90)),
    ('90<...<=95', ptv_coverage.gt(90) & ptv_coverage.le(95)),
    ('95<...<=100', ptv_coverage.gt(95) & ptv_coverage.le(100)),
    ('>100', ptv_coverage.gt(100)),
    ('Missing', ptv_coverage.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+------------------+--------+
| Coverage PTV (%) |        |
+------------------+--------+
|       Mean       |  96.5  |
|       Min        |  70.2  |
|       Max        | 107.6  |
|      Median      | 98.095 |
|       Std        |  4.9   |
+------------------+--------+
+------------------+-------+------+
| Coverage PTV (%) | Count |  %   |
+------------------+-------+------+
|       <=90       |   13  | 7.2  |
|    90<...<=95    |   24  | 13.3 |
|   95<...<=100    |  140  | 77.3 |
|       >100       |   1   | 0.6  |
|     Missing      |   3   | 1.7  |
+------------------+-------+------+


In [31]:
# BED_10 (in Gray): summary statistics (only mean and std are rounded)
bed_10 = data["BED_10"]
table = PrettyTable()
table.field_names = ['BED_10 (in Gray)', '']
for row_label, row_value in [
    ('Mean', round(bed_10.mean(), 1)),
    ('Min', bed_10.min()),
    ('Max', bed_10.max()),
    ('Median', bed_10.median()),
    ('Std', round(bed_10.std(), 1)),
]:
    table.add_row([row_label, row_value])
print(table)

# BED_10 split into bins; NaN values only match the 'Missing' mask.
# Percentages are expressed relative to the total number of nodules.
table = PrettyTable()
table.field_names = ['BED_10 (in Gray)', 'Count', '%']
for row_label, row_mask in [
    ('<=80', bed_10.le(80)),
    ('80<...<=100', bed_10.gt(80) & bed_10.le(100)),
    ('100<...<=120', bed_10.gt(100) & bed_10.le(120)),
    ('120<...<=140', bed_10.gt(120) & bed_10.le(140)),
    ('>140', bed_10.gt(140)),
    ('Missing', bed_10.isnull()),
]:
    row_count = row_mask.sum()
    table.add_row([row_label, row_count, round(row_count / total_nodules * 100, 1)])
print(table)

+------------------+-------+
| BED_10 (in Gray) |       |
+------------------+-------+
|       Mean       | 127.4 |
|       Min        |  45.0 |
|       Max        | 180.0 |
|      Median      | 132.0 |
|       Std        |  33.2 |
+------------------+-------+
+------------------+-------+------+
| BED_10 (in Gray) | Count |  %   |
+------------------+-------+------+
|       <=80       |   16  | 8.8  |
|   80<...<=100    |   14  | 7.7  |
|   100<...<=120   |   55  | 30.4 |
|   120<...<=140   |   39  | 21.5 |
|       >140       |   57  | 31.5 |
|     Missing      |   0   | 0.0  |
+------------------+-------+------+
