In [1]:
import pandas as pd
import numpy as np
import traceback
import scipy.stats as stats

In [2]:
#Export created using https://github.com/id-b3/BPAnalysis
df=pd.read_csv("LifeLines_emphysema_demographics.csv") #Read demographics tables for participants in emphysema experiment (LifeLines variables)
df.columns

Index(['patientID', 'weight+AF8-at+AF8-scan', 'length+AF8-at+AF8-scan',
       'first+AF8-scan+AF8-date', 'age+AF8-at+AF8-scan', 'gender',
       'bp+AF8-tlv', 'age+AF8-5yr', 'age+AF8-10yr', 'never+AF8-smoker',
       'ever+AF8-smoker', 'current+AF8-smoker', 'ex+AF8-smoker',
       'pack+AF8-years', 'smoking+AF8-end+AC0-age',
       'smoking+AF8-start+AC0-age', 'smoking+AF8-duration',
       'total+AF8-frequency', 'copd+AF8-diagnosis', 'asthma+AF8-diagnosis',
       'cancer+AF8-type', 'resp+AF8-other', 'fev1', 'fvc', 'fev1+AF8-pp',
       'fev1fvc+AF8-lln', 'fev1+AF8-fvc', 'GOLD+AF8-stage'],
      dtype='object')

In [3]:
df=df[df['patientID']!=592863] #Ignore participant with only one FN>300mm3

In [4]:
noemph=[136154, 184429, 295789, 335382, 341417, 353491, 369762, 370347, 382098, 383275, 384136, 395464, 406668, 410655, 
        427498, 429789, 435703, 440453, 451989, 452500, 493907, 537519, 570103, 591162, 789586, 808262, 146007, 248597, 
        388787, 428859, 449790, 475503, 485925, 585377, 632817, 673634, 817358, 135915, 136470,  225858, 225969, 
        278319, 320656, 425409, 490144, 499832, 518709, 582854, 663854, 706029, 870199, 910698, 986374, 988394,
        662368, 199391, 427158, 429703, 458362, 545508, 720754, 845334, 891238, 951248, 100785, 113137, 135984, 136012, 
        136109, 136116, 136185, 136307, 136321,162158, 136418,136432,136456,136487,136494,136425,138310 ,144629]

adv=[163557, 197239, 512145, 670208, 998310] 

mod=[136550, 136581, 200637, 215387, 240819, 255903, 283229, 294019, 331182, 332758, 438820, 503788, 507704, 609065,
     633549, 640431, 660928, 757591, 810826, 811041, 860079,  873698, 971099, 985215, 991277, 101191, 944714]

conf=[552612,866164, 282528, 370941, 617769, 754238, 845594] #Assuming that >300mm3 nodules not taken into account - 592863 excluded

print("Total participants in emphysema experiments are",len(noemph+adv+mod+conf))
print('From those',len(noemph),'have no emphysema')

Total participants in emphysema experiments are 121
From those 82 have no emphysema


In [5]:
#Replace encoding issues in columns
df.columns=[c.replace("+AF8-","_") for c in list(df.columns)]
df.columns=[c.replace("+AC0-","_") for c in list(df.columns)]
df.columns

#We will ignore issues with 'age_5yr', 'age_10yr' since we won't use these columns
#'copd_diagnosis' not gives emphysema - will also be ignored

Index(['patientID', 'weight_at_scan', 'length_at_scan', 'first_scan_date',
       'age_at_scan', 'gender', 'bp_tlv', 'age_5yr', 'age_10yr',
       'never_smoker', 'ever_smoker', 'current_smoker', 'ex_smoker',
       'pack_years', 'smoking_end_age', 'smoking_start_age',
       'smoking_duration', 'total_frequency', 'copd_diagnosis',
       'asthma_diagnosis', 'cancer_type', 'resp_other', 'fev1', 'fvc',
       'fev1_pp', 'fev1fvc_lln', 'fev1_fvc', 'GOLD_stage'],
      dtype='object')

In [6]:
#Split information between emphysema and non-emphysema participants
df_noemph=df[df['patientID'].isin(noemph)]
df_emph=df[df['patientID'].isin(adv+mod+conf)]

#Confirm that all participants taken into account - df is used below for statistics over all participants
assert len(df)==len(df_emph)+len(df_noemph) 

In [7]:
#Check if there are any missing values
gender=df['gender']
age=df['age_at_scan']
weight=df['weight_at_scan']
height=df['length_at_scan']

never_smoker=df['never_smoker']
ever_smoker=df['ever_smoker']
current_smoker=df['current_smoker']
ex_smoker=df['ex_smoker']
pack_years=df['pack_years']

try: #Confirm that there are no missing values for age, gender, weight, height
    assert np.sum(gender.value_counts().values)==len(df)
    assert np.sum(age.value_counts().values)==len(df)
    assert np.sum(weight.value_counts().values)==len(df)
    assert np.sum(height.value_counts().values)==len(df)
except:
    print(traceback.format_exc())
    print('\n')


#Report missing values for smoking attributes
try:
    assert np.sum(never_smoker.value_counts().values)==len(df)
except:
    print("For never_smoker we have {} values out of {}".format(np.sum(never_smoker.value_counts().values),len(df)))

try:
    assert np.sum(ever_smoker.value_counts().values)==len(df)
except:
    print("For ever_smoker we have {} values out of {}".format(np.sum(ever_smoker.value_counts().values),len(df)))
    
try:
    assert np.sum(current_smoker.value_counts().values)==len(df)
except:
    print("For current_smoker we have {} values out of {}".format(np.sum(current_smoker.value_counts().values),len(df)))
    
try:
    assert np.sum(ex_smoker.value_counts().values)==len(df)
except:
    print("For ex_smoker we have {} values out of {}".format(np.sum(ex_smoker.value_counts().values),len(df)))
    
try:
    assert np.sum(pack_years.value_counts().values)==len(df)
except:
    print("For pack_years we have {} values out of {}".format(np.sum(pack_years.value_counts().values),len(df)))

For never_smoker we have 120 values out of 121
For ever_smoker we have 120 values out of 121
For current_smoker we have 120 values out of 121
For ex_smoker we have 120 values out of 121
For pack_years we have 85 values out of 121


In [8]:
#Create df with demographics to be filled below for each of the emphysema/non-emphysema categories
df_statistics=pd.DataFrame(columns=['All participants','Participants with Emphysema','Participants without Emphysema','P value'],
                            index=['Number of participants','Gender Male', 'Gender Female',
                                   'Age', 'Weight', 'Height',
                                   'Never smoker','Ever smoker','Pack years',
                                   'Advanced Emphysema','Confluent Emphysema' ,'Moderate Emphysema']) #'Current smoker', 'Ex smoker',

df_statistics.index.name = 'Characteristics' 

#All participants statistics
num_participants=len(df)
males=np.sum(df['gender']=='MALE')
females=np.sum(df['gender']=='FEMALE')
age_mean=np.mean(df['age_at_scan'])
age_std=np.std(df['age_at_scan'])
weight_mean=np.mean(df['weight_at_scan'])
weight_std=np.std(df['weight_at_scan'])
length_mean=np.mean(df['length_at_scan'])
length_std=np.std(df['length_at_scan'])

never_smoker=np.sum(df['never_smoker']==True)
ever_smoker=np.sum(df['ever_smoker']==True)
current_smoker=np.sum(df['current_smoker']==True)
ex_smoker=np.sum(df['ex_smoker']==True)
pack_years_mean=np.mean(df['pack_years'])
pack_years_std=np.std(df['pack_years'])

#Emphysema participants statistics
num_participants_emph=len(df_emph)
males_emph=np.sum(df_emph['gender']=='MALE')
females_emph=np.sum(df_emph['gender']=='FEMALE')
age_mean_emph=np.mean(df_emph['age_at_scan'])
age_std_emph=np.std(df_emph['age_at_scan'])
weight_mean_emph=np.mean(df_emph['weight_at_scan'])
weight_std_emph=np.std(df_emph['weight_at_scan'])
length_mean_emph=np.mean(df_emph['length_at_scan'])
length_std_emph=np.std(df_emph['length_at_scan'])

never_smoker_emph=np.sum(df_emph['never_smoker']==True)
ever_smoker_emph=np.sum(df_emph['ever_smoker']==True)
current_smoker_emph=np.sum(df_emph['current_smoker']==True)
ex_smoker_emph=np.sum(df_emph['ex_smoker']==True)
pack_years_mean_emph=np.mean(df_emph['pack_years'])
pack_years_std_emph=np.std(df_emph['pack_years'])

#Non-emphysema participants statistics
num_participants_noemph=len(df_noemph)
males_noemph=np.sum(df_noemph['gender']=='MALE')
females_noemph=np.sum(df_noemph['gender']=='FEMALE')
age_mean_noemph=np.mean(df_noemph['age_at_scan'])
age_std_noemph=np.std(df_noemph['age_at_scan'])
weight_mean_noemph=np.mean(df_noemph['weight_at_scan'])
weight_std_noemph=np.std(df_noemph['weight_at_scan'])
length_mean_noemph=np.mean(df_noemph['length_at_scan'])
length_std_noemph=np.std(df_noemph['length_at_scan'])

never_smoker_noemph=np.sum(df_noemph['never_smoker']==True)
ever_smoker_noemph=np.sum(df_noemph['ever_smoker']==True)
current_smoker_noemph=np.sum(df_noemph['current_smoker']==True)
ex_smoker_noemph=np.sum(df_noemph['ex_smoker']==True)
pack_years_mean_noemph=np.mean(df_noemph['pack_years'])
pack_years_std_noemph=np.std(df_noemph['pack_years'])


#Add them to df
df_statistics['All participants']=[
                        num_participants,
                        str(males)+' ('+str(100*np.round(males/num_participants,2))+'%)',
                        str(females)+' ('+str(100*np.round(females/num_participants,2))+'%)',
                        str(np.round(age_mean,2))+str('±')+str(np.round(age_std,2)),
                        str(np.round(weight_mean,2))+str('±')+str(np.round(weight_std,2)),
                        str(np.round(length_mean,2))+str('±')+str(np.round(length_std,2)),
                        str(never_smoker)+' ('+str(100*np.round(never_smoker/num_participants,2))+'%)', 
                        str(ever_smoker)+' ('+str(100*np.round(ever_smoker/num_participants,2))+'%)',
                        str(np.round(pack_years_mean,2))+str('±')+str(np.round(pack_years_std,2)),'','','' ]

df_statistics['Participants with Emphysema']=[
                        num_participants_emph,
                        str(males_emph)+' ('+str(100*np.round(males_emph/num_participants_emph,2))+'%)',
                        str(females_emph)+' ('+str(100*np.round(females_emph/num_participants_emph,2))+'%)',
                        str(np.round(age_mean_emph,2))+str('±')+str(np.round(age_std_emph,2)),
                        str(np.round(weight_mean_emph,2))+str('±')+str(np.round(weight_std_emph,2)),
                        str(np.round(length_mean_emph,2))+str('±')+str(np.round(length_std_emph,2)),
                        str(never_smoker_emph)+' ('+str(100*np.round(never_smoker_emph/num_participants_emph,2))+'%)', 
                        str(ever_smoker_emph)+' ('+str(100*np.round(ever_smoker_emph/num_participants_emph,2))+'%)', 
                        str(np.round(pack_years_mean_emph,2))+str('±')+str(np.round(pack_years_std_emph,2)),'5','7','27'
]

df_statistics['Participants without Emphysema']=[
                        num_participants_noemph,
                        str(males_noemph)+' ('+str(np.round(100*males_noemph/num_participants_noemph,2))+'%)',
                        str(females_noemph)+' ('+str(np.round(100*females_noemph/num_participants_noemph,2))+'%)',       
                        str(np.round(age_mean_noemph,2))+str('±')+str(np.round(age_std_noemph,2)),
                        str(np.round(weight_mean_noemph,2))+str('±')+str(np.round(weight_std_noemph,2)),
                        str(np.round(length_mean_noemph,2))+str('±')+str(np.round(length_std_noemph,2)),
                        str(never_smoker_noemph)+' ('+str(np.round(100*never_smoker_noemph/num_participants_noemph,2))+'%)', 
                        str(ever_smoker_noemph)+' ('+str(np.round(100*ever_smoker_noemph/num_participants_noemph,2))+'%)',
                        str(np.round(pack_years_mean_noemph,2))+str('±')+str(np.round(pack_years_std_noemph,2)),'','','']

df_statistics
#Symbol '±' copied from online
#'ever smoker' is 'current smoker'+'ex smoker'. 

Unnamed: 0_level_0,All participants,Participants with Emphysema,Participants without Emphysema,P value
Characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Number of participants,121,39,82,
Gender Male,58 (48.0%),23 (59.0%),35 (42.68%),
Gender Female,63 (52.0%),16 (41.0%),47 (57.32%),
Age,60.55±8.3,62.72±7.96,59.52±8.25,
Weight,81.77±14.21,78.46±12.66,83.34±14.64,
Height,174.66±9.77,176.29±9.44,173.88±9.82,
Never smoker,32 (26.0%),4 (10.0%),28 (34.15%),
Ever smoker,88 (73.0%),35 (90.0%),53 (64.63%),
Pack years,17.28±16.46,28.95±16.15,9.88±11.67,
Advanced Emphysema,,5,,


### P-value calculations

In [9]:
P_vals=['','gender=','','age=','weight=','height=','smoking status=','','pack years=','','',''] #For those column we calculate p values

In [10]:
#Create two groups of categorical data
group1 = [males_emph, males_noemph]  # frequencies of each category in group 1 - males
group2 = [females_emph, females_noemph]  # frequencies of each category in group 2 - females

# calculate the chi-square statistic and p-value
chi2_statistic, p_value, dof, expected = stats.chi2_contingency([group1, group2])

# print the p-value
print("P-value for gender:", p_value)
P_vals[1]=str(p_value)


# create two groups of numeric data
group1 = df_noemph['age_at_scan']
group2 = df_emph['age_at_scan']

# calculate the t-statistic and p-value
t_statistic, p_value = stats.ttest_ind(group1, group2)

# print the p-value
print("P-value for age:", p_value)
P_vals[3]=str(p_value)


#Repeat above for other categorical and numeric data

# create two groups of numeric data
group1 = df_noemph['weight_at_scan']
group2 = df_emph['weight_at_scan']

# calculate the t-statistic and p-value
t_statistic, p_value = stats.ttest_ind(group1, group2)

# print the p-value
print("P-value for weight:", p_value)
P_vals[4]=str(p_value)


# create two groups of numeric data
group1 = df_noemph['length_at_scan']
group2 = df_emph['length_at_scan']

# calculate the t-statistic and p-value
t_statistic, p_value = stats.ttest_ind(group1, group2)

# print the p-value
print("P-value for length:", p_value)
P_vals[5]=str(p_value)

# create two groups of numeric data
group1 = df_noemph['pack_years'].dropna()
group2 = df_emph['pack_years'].dropna()

# calculate the t-statistic and p-value
t_statistic, p_value = stats.ttest_ind(group1, group2)

# print the p-value
print("P-value for pack_years:", p_value)
P_vals[8]=str(p_value) #was 10


# create two groups of categorical data
group1 = [never_smoker_noemph, never_smoker_emph]  # frequencies of each category in group 1 - never smokers
group2 = [ever_smoker_noemph, ever_smoker_emph]  # frequencies of each category in group 2 - even smokers

# calculate the chi-square statistic and p-value
chi2_statistic, p_value, dof, expected = stats.chi2_contingency([group1, group2])

# print the p-value
print("P-value for smoking status (never, ever):", p_value)
P_vals[6]=str(p_value)

P-value for gender: 0.1383851389603378
P-value for age: 0.048340570903044104
P-value for weight: 0.07875154741279795
P-value for length: 0.20701184353713686
P-value for pack_years: 1.8265763055882205e-08
P-value for smoking status (never, ever): 0.009312720223910172


In [11]:
#P values smaller than 0.001 set to '<0.001' and others are rounded to 2 digits
for ind,val in enumerate(P_vals):
    if val!='':
        if float(val)<0.001:
            P_vals[ind]='<0.001'
        else:
            P_vals[ind]=np.round(float(val),2)
    else:
        pass
    
df_statistics['P value']=P_vals
df_statistics

Unnamed: 0_level_0,All participants,Participants with Emphysema,Participants without Emphysema,P value
Characteristics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Number of participants,121,39,82,
Gender Male,58 (48.0%),23 (59.0%),35 (42.68%),0.14
Gender Female,63 (52.0%),16 (41.0%),47 (57.32%),
Age,60.55±8.3,62.72±7.96,59.52±8.25,0.05
Weight,81.77±14.21,78.46±12.66,83.34±14.64,0.08
Height,174.66±9.77,176.29±9.44,173.88±9.82,0.21
Never smoker,32 (26.0%),4 (10.0%),28 (34.15%),0.01
Ever smoker,88 (73.0%),35 (90.0%),53 (64.63%),
Pack years,17.28±16.46,28.95±16.15,9.88±11.67,<0.001
Advanced Emphysema,,5,,


In [12]:
df_statistics.style.to_latex()

'\\begin{tabular}{lllll}\n & All participants & Participants with Emphysema & Participants without Emphysema & P value \\\\\nCharacteristics &  &  &  &  \\\\\nNumber of participants & 121 & 39 & 82 &  \\\\\nGender Male & 58 (48.0%) & 23 (59.0%) & 35 (42.68%) & 0.140000 \\\\\nGender Female & 63 (52.0%) & 16 (41.0%) & 47 (57.32%) &  \\\\\nAge & 60.55±8.3 & 62.72±7.96 & 59.52±8.25 & 0.050000 \\\\\nWeight & 81.77±14.21 & 78.46±12.66 & 83.34±14.64 & 0.080000 \\\\\nHeight & 174.66±9.77 & 176.29±9.44 & 173.88±9.82 & 0.210000 \\\\\nNever smoker & 32 (26.0%) & 4 (10.0%) & 28 (34.15%) & 0.010000 \\\\\nEver smoker & 88 (73.0%) & 35 (90.0%) & 53 (64.63%) &  \\\\\nPack years & 17.28±16.46 & 28.95±16.15 & 9.88±11.67 & <0.001 \\\\\nAdvanced Emphysema &  & 5 &  &  \\\\\nConfluent Emphysema &  & 7 &  &  \\\\\nModerate Emphysema &  & 27 &  &  \\\\\n\\end{tabular}\n'