In [55]:
import numpy as np
from scipy.stats import chi2_contingency

import pandas as pd

from IPython.display import display, display_html

In [56]:
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

In [57]:
data = pd.read_csv('./Prostate_cancer_micronutrients.csv')

# Sun Exposure and the Risk of Prostate Cancer in the Singapore Prostate Cancer Study: Table 1

### Basic overview

In [58]:
data.shape

(524, 833)

In [59]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(data.head().transpose())

Unnamed: 0,0,1,2,3,4
CAP_SN,1-001-44,1-002-41,1-003-38,1-004-39,1-005-39
PSA,10.58,0.53,,9.55,6.82
UNIT,ug/L,ug/L,,ug/L,ug/L
F,H,,,H,H
REFERENCE RANGE,0.00 - 4.00,0.00 - 4.00,,0.00 - 4.00,0.00 - 4.00
serial1st,1,1,1,1,1
age,62.7844,66.0671,69.1307,67.4305,67.5592
ageCat,2,2,2,2,2
age5,3,4,4,4,4
casectrl,1,1,1,1,1


"Family history of any cancer in the first degree relatives" could be anything from emorehistory to histo_sgh. Gotta get back to this later on.

In [60]:
def printNullCount(columns):
    for c in columns:
        if len(data[data[c].isna()]) > 0:
            print(c, ":", len(data[data[c].isna()]))

Are textual and their categorical columns identical when it comes to the number of NaNs?

In [61]:
printNullCount(['ethnic', 'ethnic0'])

ethnic : 11
ethnic0 : 11


In [62]:
printNullCount(['housing', 'housingGrp'])

housing : 11
housingGrp : 19


Nope!

In [63]:
# Sun Exposure and the Risk of Prostate Cancer in the Singapore Prostate Cancer Study: A Case-control Study
# Table 1 - couldn't find the family cancer history column.
columns = ['age', 'ethnic', 'ethnic0', 'housing', 'housingGrp', 'school_gp', 'marital', 'BMI', 'BMI_gp']

### Characteristics

In [64]:
set(data.casectrl)

{0.0, 1.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan}

In [65]:
len(data[data.casectrl == 0]), len(data[data.casectrl == 1])

(268, 245)

As you can see, the number of casectrl == 0 is correct, but the number of casectrl == 1 is a bit too high compared to the 240 in the paper. This means that we have to find five rows to drop that don't affect the casectrl == 0 count.

In [66]:
printNullCount(columns)

age : 11
ethnic : 11
ethnic0 : 11
housing : 11
housingGrp : 19
school_gp : 14
marital : 13
BMI : 123
BMI_gp : 123


In [67]:
#for c in columns:
#    data = data[data[c].isnull() == False]
#printNullCount(columns)

In [68]:
controls_data = data[data.casectrl == 0]
case_data = data[data.casectrl == 1]

In [69]:
len(controls_data), len(case_data)

(268, 245)

### Age

In [70]:
age_controls_table = {}
age_case_table = {}

In [71]:
# n
age_controls_table['age_count_50-59'] = len(controls_data[(controls_data.age >= 50) & (controls_data.age <= 59)])
age_controls_table['age_count_60-69'] = len(controls_data[(controls_data.age >= 60) & (controls_data.age <= 69)])
age_controls_table['age_count_70-'] = len(controls_data[controls_data.age >= 70])

# (%)
age_controls_table['age_percentage_50-59'] = age_controls_table['age_count_50-59'] / len(controls_data.age)
age_controls_table['age_percentage_60-69'] = age_controls_table['age_count_60-69'] / len(controls_data.age)
age_controls_table['age_percentage_70-'] = age_controls_table['age_count_70-'] / len(controls_data.age)

# n
age_case_table['age_count_50-59'] = len(case_data[(case_data.age >= 50) & (case_data.age <= 59)])
age_case_table['age_count_60-69'] = len(case_data[(case_data.age >= 60) & (case_data.age <= 69)])
age_case_table['age_count_70-'] = len(case_data[case_data.age >= 70])

# (%)
age_case_table['age_percentage_50-59'] = age_case_table['age_count_50-59'] / len(case_data.age)
age_case_table['age_percentage_60-69'] = age_case_table['age_count_60-69'] / len(case_data.age)
age_case_table['age_percentage_70-'] = age_case_table['age_count_70-'] / len(case_data.age)

In [72]:
df1 = pd.DataFrame(age_controls_table.items(), columns=['Age', 'n / (%)'])
df2 = pd.DataFrame(age_case_table.items(), columns=['Age', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Age,n / (%)
0,age_count_50-59,91.0
1,age_count_60-69,86.0
2,age_count_70-,69.0
3,age_percentage_50-59,0.339552
4,age_percentage_60-69,0.320896
5,age_percentage_70-,0.257463

Unnamed: 0,Age,n / (%)
0,age_count_50-59,36.0
1,age_count_60-69,98.0
2,age_count_70-,90.0
3,age_percentage_50-59,0.146939
4,age_percentage_60-69,0.4
5,age_percentage_70-,0.367347


In [73]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.00000185


### Ethnic

In [74]:
set(data.ethnic)

{'Chinese', 'Indian', 'Malay', 'Others', nan}

In [75]:
ethnic_controls_table = {}
ethnic_case_table = {}

In [76]:
# n
ethnic_controls_table['chinese_count'] = len(controls_data[(controls_data.ethnic == 'Chinese')])
ethnic_controls_table['malay_count'] = len(controls_data[(controls_data.ethnic == 'Malay')])
ethnic_controls_table['indian_count'] = len(controls_data[controls_data.ethnic == 'Indian'])
ethnic_controls_table['others_count'] = len(controls_data[controls_data.ethnic == 'Others'])

# (%)
ethnic_controls_table['chinese_percentage'] = ethnic_controls_table['chinese_count'] / len(controls_data.ethnic)
ethnic_controls_table['malay_percentage'] = ethnic_controls_table['malay_count'] / len(controls_data.ethnic)
ethnic_controls_table['indian_percentage'] = ethnic_controls_table['indian_count'] / len(controls_data.ethnic)
ethnic_controls_table['others_percentage'] = ethnic_controls_table['others_count'] / len(controls_data.ethnic)

# n
ethnic_case_table['chinese_count'] = len(case_data[(case_data.ethnic == 'Chinese')])
ethnic_case_table['malay_count'] = len(case_data[(case_data.ethnic == 'Malay')])
ethnic_case_table['indian_count'] = len(case_data[case_data.ethnic == 'Indian'])
ethnic_case_table['others_count'] = len(case_data[case_data.ethnic == 'Others'])

# (%)
ethnic_case_table['chinese_percentage'] = ethnic_case_table['chinese_count'] / len(case_data.ethnic)
ethnic_case_table['malay_percentage'] = ethnic_case_table['malay_count'] / len(case_data.ethnic)
ethnic_case_table['indian_percentage'] = ethnic_case_table['indian_count'] / len(case_data.ethnic)
ethnic_case_table['others_percentage'] = ethnic_case_table['others_count'] / len(case_data.ethnic)

In [77]:
df1 = pd.DataFrame(ethnic_controls_table.items(), columns=['Ethnic', 'n / (%)'])
df2 = pd.DataFrame(ethnic_case_table.items(), columns=['Ethnic', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Ethnic,n / (%)
0,chinese_count,225.0
1,malay_count,13.0
2,indian_count,23.0
3,others_count,7.0
4,chinese_percentage,0.839552
5,malay_percentage,0.048507
6,indian_percentage,0.085821
7,others_percentage,0.026119

Unnamed: 0,Ethnic,n / (%)
0,chinese_count,214.0
1,malay_count,13.0
2,indian_count,13.0
3,others_count,5.0
4,chinese_percentage,0.873469
5,malay_percentage,0.053061
6,indian_percentage,0.053061
7,others_percentage,0.020408


In [78]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]],
              [df1.iloc[3,1], df2.iloc[3,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.50106888


### Housing

In [79]:
set(data.housing)

{'HDB 1-3room', 'HDB 4room', 'Others', 'Private condo', nan}

In [80]:
housing_controls_table = {}
housing_case_table = {}

In [81]:
# n
housing_controls_table['hdb_1-3_room_count'] = len(controls_data[(controls_data.housing == 'HDB 1-3room')])
housing_controls_table['hdb_4_room_count'] = len(controls_data[(controls_data.housing == 'HDB 4room')])
housing_controls_table['private_condo_count'] = len(controls_data[controls_data.housing == 'Private condo'])
housing_controls_table['others_count'] = len(controls_data[controls_data.housing == 'Others'])

# (%)
housing_controls_table['hdb_1-3_room_percentage'] = housing_controls_table['hdb_1-3_room_count'] / len(controls_data.housing)
housing_controls_table['hdb_4_room_percentage'] = housing_controls_table['hdb_4_room_count'] / len(controls_data.housing)
housing_controls_table['private_condo_percentage'] = housing_controls_table['private_condo_count'] / len(controls_data.housing)
housing_controls_table['others_percentage'] = housing_controls_table['others_count'] / len(controls_data.housing)

# n
housing_case_table['hdb_1-3_room_count'] = len(case_data[(case_data.housing == 'HDB 1-3room')])
housing_case_table['hdb_4_room_count'] = len(case_data[(case_data.housing == 'HDB 4room')])
housing_case_table['private_condo_count'] = len(case_data[case_data.housing == 'Private condo'])
housing_case_table['others_count'] = len(case_data[case_data.housing == 'Others'])

# (%)
housing_case_table['hdb_1-3_room_percentage'] = housing_case_table['hdb_1-3_room_count'] / len(case_data.housing)
housing_case_table['hdb_4_room_percentage'] = housing_case_table['hdb_4_room_count'] / len(case_data.housing)
housing_case_table['private_condo_percentage'] = housing_case_table['private_condo_count'] / len(case_data.housing)
housing_case_table['others_percentage'] = housing_case_table['others_count'] / len(case_data.housing)

In [82]:
df1 = pd.DataFrame(housing_controls_table.items(), columns=['Housing', 'n / (%)'])
df2 = pd.DataFrame(housing_case_table.items(), columns=['Housing', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Housing,n / (%)
0,hdb_1-3_room_count,73.0
1,hdb_4_room_count,154.0
2,private_condo_count,29.0
3,others_count,12.0
4,hdb_1-3_room_percentage,0.272388
5,hdb_4_room_percentage,0.574627
6,private_condo_percentage,0.108209
7,others_percentage,0.044776

Unnamed: 0,Housing,n / (%)
0,hdb_1-3_room_count,40.0
1,hdb_4_room_count,115.0
2,private_condo_count,86.0
3,others_count,4.0
4,hdb_1-3_room_percentage,0.163265
5,hdb_4_room_percentage,0.469388
6,private_condo_percentage,0.35102
7,others_percentage,0.016327


In [83]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]],
              [df1.iloc[3,1], df2.iloc[3,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.00000000


### Education

In [84]:
set(data.school_gp)

{0.0,
 1.0,
 2.0,
 3.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan}

I believe that the values 0 to 3 map straight to the categories on the paper, but let's make sure:

In [85]:
print(len(data[data.school_gp == 0])) # should be ~ 26
print(len(data[data.school_gp == 1])) # should be ~ 144
print(len(data[data.school_gp == 2])) # should be ~ 184
print(len(data[data.school_gp == 3])) # should be ~ 151

26
146
185
153


Okay, looks good.

In [86]:
education_controls_table = {}
education_case_table = {}

In [87]:
# n
education_controls_table['never_count'] = len(controls_data[(controls_data.school_gp == 0)])
education_controls_table['1-6_years_count'] = len(controls_data[(controls_data.school_gp == 1)])
education_controls_table['7-10_years_count'] = len(controls_data[controls_data.school_gp == 2])
education_controls_table['10-_count'] = len(controls_data[controls_data.school_gp == 3])

# (%)
education_controls_table['never_percentage'] = education_controls_table['never_count'] / len(controls_data.school_gp)
education_controls_table['1-6_years_percentage'] = education_controls_table['1-6_years_count'] / len(controls_data.school_gp)
education_controls_table['7-10_years_percentage'] = education_controls_table['7-10_years_count'] / len(controls_data.school_gp)
education_controls_table['10-_percentage'] = education_controls_table['10-_count'] / len(controls_data.school_gp)

# n
education_case_table['never_count'] = len(case_data[(case_data.school_gp == 0)])
education_case_table['1-6_years_count'] = len(case_data[(case_data.school_gp == 1)])
education_case_table['7-10_years_count'] = len(case_data[case_data.school_gp == 2])
education_case_table['10-_count'] = len(case_data[case_data.school_gp == 3])

# (%)
education_case_table['never_percentage'] = education_case_table['never_count'] / len(case_data.school_gp)
education_case_table['1-6_years_percentage'] = education_case_table['1-6_years_count'] / len(case_data.school_gp)
education_case_table['7-10_years_percentage'] = education_case_table['7-10_years_count'] / len(case_data.school_gp)
education_case_table['10-_percentage'] = education_case_table['10-_count'] / len(case_data.school_gp)

In [88]:
df1 = pd.DataFrame(education_controls_table.items(), columns=['Education', 'n / (%)'])
df2 = pd.DataFrame(education_case_table.items(), columns=['Education', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Education,n / (%)
0,never_count,20.0
1,1-6_years_count,89.0
2,7-10_years_count,102.0
3,10-_count,55.0
4,never_percentage,0.074627
5,1-6_years_percentage,0.33209
6,7-10_years_percentage,0.380597
7,10-_percentage,0.205224

Unnamed: 0,Education,n / (%)
0,never_count,6.0
1,1-6_years_count,57.0
2,7-10_years_count,83.0
3,10-_count,98.0
4,never_percentage,0.02449
5,1-6_years_percentage,0.232653
6,7-10_years_percentage,0.338776
7,10-_percentage,0.4


In [89]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]],
              [df1.iloc[3,1], df2.iloc[3,1]],])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.00000422


### Marital status

In [90]:
set(data.marital)

{0.0,
 1.0,
 2.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan}

Once again, I'm expected 0 to 2 to map to the categories in the paper - married, separated, and never married that is.

In [91]:
print(len(data[data.marital== 0])) # should be ~ 443
print(len(data[data.marital == 1])) # should be ~ 35
print(len(data[data.marital == 2])) # should be ~ 28

448
35
28


Yup, it seems to work as planned.

In [92]:
marital_controls_table = {}
marital_case_table = {}

In [93]:
# n
marital_controls_table['currently_married_count'] = len(controls_data[(controls_data.marital == 0)])
marital_controls_table['separated_count'] = len(controls_data[(controls_data.marital == 1)])
marital_controls_table['never_married_count'] = len(controls_data[controls_data.marital == 2])

# (%)
marital_controls_table['currently_married_percentage'] = marital_controls_table['currently_married_count'] / len(controls_data.marital)
marital_controls_table['separated_percentage'] = marital_controls_table['separated_count'] / len(controls_data.marital)
marital_controls_table['never_married_percentage'] = marital_controls_table['never_married_count'] / len(controls_data.marital)

# n
marital_case_table['currently_married_count'] = len(case_data[(case_data.marital == 0)])
marital_case_table['separated_count'] = len(case_data[(case_data.marital == 1)])
marital_case_table['never_married_count'] = len(case_data[case_data.marital == 2])

# (%)
marital_case_table['currently_married_percentage'] = marital_case_table['currently_married_count'] / len(case_data.marital)
marital_case_table['separated_percentage'] = marital_case_table['separated_count'] / len(case_data.marital)
marital_case_table['never_married_percentage'] = marital_case_table['never_married_count'] / len(case_data.marital)

In [94]:
df1 = pd.DataFrame(marital_controls_table.items(), columns=['Marital status', 'n / (%)'])
df2 = pd.DataFrame(marital_case_table.items(), columns=['Marital status', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Marital status,n / (%)
0,currently_married_count,224.0
1,separated_count,26.0
2,never_married_count,17.0
3,currently_married_percentage,0.835821
4,separated_percentage,0.097015
5,never_married_percentage,0.063433

Unnamed: 0,Marital status,n / (%)
0,currently_married_count,224.0
1,separated_count,9.0
2,never_married_count,11.0
3,currently_married_percentage,0.914286
4,separated_percentage,0.036735
5,never_married_percentage,0.044898


In [95]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.01408773


### Family history of any cancer in the dirst degree relatives

In [129]:
set(data.q4A1)

{nan,
 1.0,
 2.0,
 3.0,
 4.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 99.0,
 333.0}

In [133]:
print("No cancer in family:", len(data[(data.q4A1 == 1)]))
print("Cancer in family:", len(data[(data.q4A1 == 2) | (data.q4A1 == 3)]))

No cancer in family: 324
Cancer in family: 152


In [135]:
cif_controls_table = {}
cif_case_table = {}

In [136]:
# n
cif_controls_table['cancer_in_family_count'] = len(controls_data[(controls_data.q4A1 == 2) | (controls_data.q4A1 == 3)])
cif_controls_table['no_cancer_in_family_count'] = len(controls_data[(controls_data.q4A1 == 1)])

# (%)
cif_controls_table['cancer_in_family_percentage'] = cif_controls_table['cancer_in_family_count'] / len(controls_data.q4A1)
cif_controls_table['no_cancer_in_family_percentage'] = cif_controls_table['no_cancer_in_family_count'] / len(controls_data.q4A1)

# n
cif_case_table['cancer_in_family_count'] = len(case_data[(case_data.q4A1 == 2) | (case_data.q4A1 == 3)])
cif_case_table['no_cancer_in_family_count'] = len(case_data[(case_data.q4A1 == 1)])

# (%)
cif_case_table['cancer_in_family_percentage'] = cif_case_table['cancer_in_family_count'] / len(case_data.q4A1)
cif_case_table['no_cancer_in_family_percentage'] = cif_case_table['no_cancer_in_family_count'] / len(case_data.q4A1)


In [140]:
df1 = pd.DataFrame(cif_controls_table.items(), columns=['Cancer in family', 'n / (%)'])
df2 = pd.DataFrame(cif_case_table.items(), columns=['Cancer in family', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Cancer in family,n / (%)
0,cancer_in_family_count,51.0
1,no_cancer_in_family_count,198.0
2,cancer_in_family_percentage,0.190299
3,no_cancer_in_family_percentage,0.738806

Unnamed: 0,Cancer in family,n / (%)
0,cancer_in_family_count,101.0
1,no_cancer_in_family_count,126.0
2,cancer_in_family_percentage,0.412245
3,no_cancer_in_family_percentage,0.514286


In [142]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.00000004


### BMI (kg/m2)

In [100]:
set(data.BMI_gp)

{nan,
 1.0,
 2.0,
 3.0,
 4.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan}

In [101]:
print(len(data[data.BMI_gp == 1])) # should be ~ 114
print(len(data[data.BMI_gp == 2])) # should be ~ 126
print(len(data[data.BMI_gp == 3])) # should be ~ 86
print(len(data[data.BMI_gp == 4])) # should be ~ 70

118
126
87
70


In [102]:
bmi_controls_table = {}
bmi_case_table = {}

In [151]:
# n
bmi_controls_table['quartile1_count'] = len(controls_data[(controls_data.BMI_gp == 1)])
bmi_controls_table['quartile2_count'] = len(controls_data[(controls_data.BMI_gp == 2)])
bmi_controls_table['quartile3_count'] = len(controls_data[controls_data.BMI_gp == 3])
bmi_controls_table['quartile4_count'] = len(controls_data[controls_data.BMI_gp == 4])

# (%)
bmi_controls_table['quartile1_percentage'] = bmi_controls_table['quartile1_count'] / len(controls_data.BMI_gp)
bmi_controls_table['quartile2_percentage'] = bmi_controls_table['quartile2_count'] / len(controls_data.BMI_gp)
bmi_controls_table['quartile3_percentage'] = bmi_controls_table['quartile3_count'] / len(controls_data.BMI_gp)
bmi_controls_table['quartile4_percentage'] = bmi_controls_table['quartile4_count'] / len(controls_data.BMI_gp)

# n
bmi_case_table['quartile1_count'] = len(case_data[(case_data.BMI_gp == 1)])
bmi_case_table['quartile2_count'] = len(case_data[(case_data.BMI_gp == 2)])
bmi_case_table['quartile3_count'] = len(case_data[case_data.BMI_gp == 3])
bmi_case_table['quartile4_count'] = len(case_data[case_data.BMI_gp == 4])

# (%)
bmi_case_table['quartile1_percentage'] = bmi_case_table['quartile1_count'] / len(case_data.BMI_gp)
bmi_case_table['quartile2_percentage'] = bmi_case_table['quartile2_count'] / len(case_data.BMI_gp)
bmi_case_table['quartile3_percentage'] = bmi_case_table['quartile3_count'] / len(case_data.BMI_gp)
bmi_case_table['quartile4_percentage'] = bmi_case_table['quartile4_count'] / len(case_data.BMI_gp)


In [152]:
df1 = pd.DataFrame(bmi_controls_table.items(), columns=['BMI (kg/m2)', 'n / (%)'])
df2 = pd.DataFrame(bmi_case_table.items(), columns=['BMI (kg/m2)', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,BMI (kg/m2),n / (%)
0,quartile1_count,45.0
1,quartile2_count,47.0
2,quartile3_count,46.0
3,quartile4_count,46.0
4,quartile1_percentage,0.16791
5,quartile2_percentage,0.175373
6,quartile3_percentage,0.171642
7,quartile4_percentage,0.171642

Unnamed: 0,BMI (kg/m2),n / (%)
0,quartile1_count,73.0
1,quartile2_count,79.0
2,quartile3_count,41.0
3,quartile4_count,24.0
4,quartile1_percentage,0.297959
5,quartile2_percentage,0.322449
6,quartile3_percentage,0.167347
7,quartile4_percentage,0.097959


In [153]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]],
              [df1.iloc[3,1], df2.iloc[3,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.00022724


# Sun Exposure and the Risk of Prostate Cancer in the Singapore Prostate Cancer Study: Table 2

### Eye colour

In [144]:
set(data.eye)

{nan,
 1.0,
 2.0,
 nan,
 nan,
 3.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan}

In [143]:
eye_controls_table = {}
eye_case_table = {}

In [154]:
# n
eye_controls_table['black_or_dark_brown_count'] = len(controls_data[(controls_data.eye == 1)])
eye_controls_table['light_brown_count'] = len(controls_data[(controls_data.eye == 2)])

# (%)
eye_controls_table['black_or_dark_brown_percentage'] = eye_controls_table['black_or_dark_brown_count'] / len(controls_data.eye)
eye_controls_table['light_brown_percentage'] = eye_controls_table['light_brown_count'] / len(controls_data.eye)

# n
eye_case_table['black_or_dark_brown_count'] = len(case_data[(case_data.eye == 1)])
eye_case_table['light_brown_count'] = len(case_data[(case_data.eye == 2)])

# (%)
eye_case_table['black_or_dark_brown_percentage'] = eye_case_table['black_or_dark_brown_count'] / len(case_data.eye)
eye_case_table['light_brown_percentage'] = eye_case_table['light_brown_count'] / len(case_data.eye)


In [155]:
df1 = pd.DataFrame(eye_controls_table.items(), columns=['BMI (kg/m2)', 'n / (%)'])
df2 = pd.DataFrame(eye_case_table.items(), columns=['BMI (kg/m2)', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,BMI (kg/m2),n / (%)
0,black_or_dark_brown_count,169.0
1,light_brown_count,97.0
2,black_or_dark_brown_percentage,0.630597
3,light_brown_percentage,0.36194

Unnamed: 0,BMI (kg/m2),n / (%)
0,black_or_dark_brown_count,214.0
1,light_brown_count,25.0
2,black_or_dark_brown_percentage,0.873469
3,light_brown_percentage,0.102041


In [158]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.12f'))

0.000000000019


### Skin colour

In [165]:
set(data.skin2)

{nan,
 1.0,
 nan,
 3.0,
 4.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan}

In [169]:
len(data[data.skin2 == 1]), len(data[data.skin2 == 3]), len(data[data.skin2 == 4])

(106, 248, 153)

In [171]:
skin_controls_table = {}
skin_case_table = {}

In [172]:
# n
skin_controls_table['white_count'] = len(controls_data[(controls_data.skin2 == 1)])
skin_controls_table['light_tan_count'] = len(controls_data[controls_data.skin2 == 3])
skin_controls_table['tan_brown_black_count'] = len(controls_data[controls_data.skin2 == 4])

# (%)
skin_controls_table['white_percentage'] = skin_controls_table['white_count'] / len(controls_data.skin2)
skin_controls_table['light_tan_percentage'] = skin_controls_table['light_tan_count'] / len(controls_data.skin2)
skin_controls_table['tan_brown_black_percentage'] = skin_controls_table['tan_brown_black_count'] / len(controls_data.skin2)

# n
skin_case_table['white_count'] = len(case_data[(case_data.skin2 == 1)])
skin_case_table['light_tan_count'] = len(case_data[case_data.skin2 == 3])
skin_case_table['tan_brown_black_count'] = len(case_data[case_data.skin2 == 4])

# (%)
skin_case_table['white_percentage'] = skin_case_table['white_count'] / len(case_data.skin2)
skin_case_table['light_tan_percentage'] = skin_case_table['light_tan_count'] / len(case_data.skin2)
skin_case_table['tan_brown_black_percentage'] = skin_case_table['tan_brown_black_count'] / len(case_data.skin2)


In [175]:
df1 = pd.DataFrame(skin_controls_table.items(), columns=['Skin colour', 'n / (%)'])
df2 = pd.DataFrame(skin_case_table.items(), columns=['Skin colour', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Skin colour,n / (%)
0,white_count,80.0
1,light_tan_count,116.0
2,tan_brown_black_count,72.0
3,white_percentage,0.298507
4,light_tan_percentage,0.432836
5,tan_brown_black_percentage,0.268657

Unnamed: 0,Skin colour,n / (%)
0,white_count,26.0
1,light_tan_count,132.0
2,tan_brown_black_count,81.0
3,white_percentage,0.106122
4,light_tan_percentage,0.538776
5,tan_brown_black_percentage,0.330612


In [176]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.00000107


### Sunburn frequency

In [182]:
set(data.sunburn_fq)

{'Frequently', 'None', 'Occasionally', 'Seldom', nan}

In [184]:
len(data[data.sunburn_fq == "None"]), 147+112

(261, 259)

In [185]:
len(data[data.sunburn_fq == "Seldom"]), 70+56

(127, 126)

In [186]:
len(data[data.sunburn_fq == "Occasionally"]), 24+33

(59, 57)

In [187]:
len(data[data.sunburn_fq == "Frequently"]), 15+30

(45, 45)

In [189]:
sunburn_controls_table = {}
sunburn_case_table = {}

In [190]:
# n
sunburn_controls_table['never_count'] = len(controls_data[(controls_data.sunburn_fq == "None")])
sunburn_controls_table['seldom_count'] = len(controls_data[(controls_data.sunburn_fq == "Seldom")])
sunburn_controls_table['occasionally_count'] = len(controls_data[controls_data.sunburn_fq == "Occasionally"])
sunburn_controls_table['frequently_count'] = len(controls_data[controls_data.sunburn_fq == "Frequently"])

# (%)
sunburn_controls_table['never_percentage'] = sunburn_controls_table['never_count'] / len(controls_data.sunburn_fq)
sunburn_controls_table['seldom_percentage'] = sunburn_controls_table['seldom_count'] / len(controls_data.sunburn_fq)
sunburn_controls_table['occasionally_percentage'] = sunburn_controls_table['occasionally_count'] / len(controls_data.sunburn_fq)
sunburn_controls_table['frequently_percentage'] = sunburn_controls_table['frequently_count'] / len(controls_data.sunburn_fq)

# n
sunburn_case_table['never_count'] = len(case_data[(case_data.sunburn_fq == "None")])
sunburn_case_table['seldom_count'] = len(case_data[(case_data.sunburn_fq == "Seldom")])
sunburn_case_table['occasionally_count'] = len(case_data[case_data.sunburn_fq == "Occasionally"])
sunburn_case_table['frequently_count'] = len(case_data[case_data.sunburn_fq == "Frequently"])

# (%)
sunburn_case_table['never_percentage'] = sunburn_case_table['never_count'] / len(case_data.sunburn_fq)
sunburn_case_table['seldom_percentage'] = sunburn_case_table['seldom_count'] / len(case_data.sunburn_fq)
sunburn_case_table['occasionally_percentage'] = sunburn_case_table['occasionally_count'] / len(case_data.sunburn_fq)
sunburn_case_table['frequently_percentage'] = sunburn_case_table['frequently_count'] / len(case_data.sunburn_fq)


In [191]:
df1 = pd.DataFrame(sunburn_controls_table.items(), columns=['Sunburn frequency', 'n / (%)'])
df2 = pd.DataFrame(sunburn_case_table.items(), columns=['Sunburn frequency', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Sunburn frequency,n / (%)
0,never_count,147.0
1,seldom_count,70.0
2,occasionally_count,24.0
3,frequently_count,15.0
4,never_percentage,0.548507
5,seldom_percentage,0.261194
6,occasionally_percentage,0.089552
7,frequently_percentage,0.05597

Unnamed: 0,Sunburn frequency,n / (%)
0,never_count,114.0
1,seldom_count,57.0
2,occasionally_count,35.0
3,frequently_count,30.0
4,never_percentage,0.465306
5,seldom_percentage,0.232653
6,occasionally_percentage,0.142857
7,frequently_percentage,0.122449


In [192]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]],
              [df1.iloc[3,1], df2.iloc[3,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.00825068


### Adult sun exposure

In [240]:
for c in data.columns:
    if "adult" in c:
        print(c)

adultwork
adultwk
adultwork_y
adultwork_ygp
adultwkend_y
adultwkend_ygp
adult_pw
adultpw_gp
adultpw_gp3
adultpw_y
adultpw_gp_y


In [232]:
for c in data.columns:
    if len(data[data[c] == 0]) > 309 and len(data[data[c] == 0]) < 329:
        if len(data[data[c] == 1]) > 79 and len(data[data[c] == 1]) < 99:
            print(c)

adultpw_gp3


In [233]:
set(data.adultpw_gp3)
#set(data.adultwk)

{0.0, 1.0, 2.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan}

In [237]:
len(data[data.adultpw_gp3 == 0]), 188+131

(322, 319)

In [238]:
len(data[data.adultpw_gp3 == 1]), 36+53

(90, 89)

In [239]:
len(data[data.adultpw_gp3 == 2]), 44+56

(101, 100)

In [241]:
sun_controls_table = {}
sun_case_table = {}

In [244]:
# n
sun_controls_table['less_than_0.5h_week_count'] = len(controls_data[(controls_data.adultpw_gp3 == 0)])
sun_controls_table['0.5-10h_week_count'] = len(controls_data[(controls_data.adultpw_gp3 == 1)])
sun_controls_table['10.1-56h_week_count'] = len(controls_data[controls_data.adultpw_gp3 == 2])

# (%)
sun_controls_table['less_than_0.5h_week_percentage'] = sun_controls_table['less_than_0.5h_week_count'] / len(controls_data.adultpw_gp3)
sun_controls_table['0.5-10h_week_percentage'] = sun_controls_table['0.5-10h_week_count'] / len(controls_data.adultpw_gp3)
sun_controls_table['10.1-56h_week_percentage'] = sun_controls_table['10.1-56h_week_count'] / len(controls_data.adultpw_gp3)

# n
sun_case_table['less_than_0.5h_week_count'] = len(case_data[(case_data.adultpw_gp3 == 0)])
sun_case_table['0.5-10h_week_count'] = len(case_data[(case_data.adultpw_gp3 == 1)])
sun_case_table['10.1-56h_week_count'] = len(case_data[case_data.adultpw_gp3 == 2])

# (%)
sun_case_table['less_than_0.5h_week_percentage'] = sun_case_table['less_than_0.5h_week_count'] / len(case_data.adultpw_gp3)
sun_case_table['0.5-10h_week_percentage'] = sun_case_table['0.5-10h_week_count'] / len(case_data.adultpw_gp3)
sun_case_table['10.1-56h_week_percentage'] = sun_case_table['10.1-56h_week_count'] / len(case_data.adultpw_gp3)

In [245]:
df1 = pd.DataFrame(sun_controls_table.items(), columns=['Sun exposure', 'n / (%)'])
df2 = pd.DataFrame(sun_case_table.items(), columns=['Sun exposure', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Sun exposure,n / (%)
0,less_than_0.5h_week_count,188.0
1,0.5-10h_week_count,36.0
2,10.1-56h_week_count,44.0
3,less_than_0.5h_week_percentage,0.701493
4,0.5-10h_week_percentage,0.134328
5,10.1-56h_week_percentage,0.164179

Unnamed: 0,Sun exposure,n / (%)
0,less_than_0.5h_week_count,134.0
1,0.5-10h_week_count,54.0
2,10.1-56h_week_count,57.0
3,less_than_0.5h_week_percentage,0.546939
4,0.5-10h_week_percentage,0.220408
5,10.1-56h_week_percentage,0.232653


In [246]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.00127810


### Physical activities (MET/wk)

In [271]:
for c in data.columns:
    try:
        if len(data[data[c] <= 222]) > 95 and len(data[data[c] <= 222]) < 115:
            print(c)
    except TypeError as e:
        continue

MET_pw
redtea
GREENtea
soymilk_s
stage


In [273]:
for c in data.columns:
    if "MET" in c:
        print(c)

MET_work
MET_rest
MET_pw
MET_pwgp3
MET_pwgp2
MET_pwgp
MET_loco
MET_log
lo_METgp


In [272]:
set(data.MET_pw)

{nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 151.5,
 184.8000031,
 201.5999908,
 204.0,
 205.8000031,
 210.0,
 211.8000031,
 212.09999080000003,
 214.1999969,
 216.0,
 217.2000122,
 218.40000919999997,
 219.0,
 219.1999969,
 219.35000609999997,
 220.5,
 221.52500919999997,
 221.90000919999997,
 222.0,
 222.59999080000003,
 222.77500919999997,
 222.89999390000003,
 222.9499969,
 223.40000919999997,
 223.90000919999997,
 224.17501830000003,
 224.69998169999997,
 226.80001830000003,
 227.15000919999997,
 229.25,
 229.65000919999997,
 231.10000609999997,
 231.17501830000003,
 231.80001830000003,
 233.05001830000003,
 233.09999080000003,
 233.5,
 233.625,
 233.80001830000003,
 234.0,
 234.30001830000003,
 234.5,
 234.65000919999997,
 234.90000919999997,
 235.0,
 235.0500031,
 235.09999080000003,
 235.15000919999997,
 235.19998169999997,
 235.55001830000003,
 235.72499080000003,
 235.75,
 235.90000919999997,
 236.0,
 236.3000031,
 236.80001830000003,
 236.90000919999997,
 237.25,
 

In [274]:
set(data.MET_pwgp)

{nan, 1.0, 2.0, 3.0, 4.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan}

In [275]:
len(data[data.MET_pwgp == 1]), 65+40

(105, 105)

In [276]:
len(data[data.MET_pwgp == 2]), 66+57

(126, 123)

In [277]:
len(data[data.MET_pwgp == 3]), 69+64

(133, 133)

In [278]:
len(data[data.MET_pwgp == 4]), 68+79

(149, 147)

In [279]:
pa_controls_table = {}
pa_case_table = {}

In [280]:
# n
pa_controls_table['quartile1_count'] = len(controls_data[(controls_data.MET_pwgp == 1)])
pa_controls_table['quartile2_count'] = len(controls_data[(controls_data.MET_pwgp == 2)])
pa_controls_table['quartile3_count'] = len(controls_data[controls_data.MET_pwgp == 3])
pa_controls_table['quartile4_count'] = len(controls_data[controls_data.MET_pwgp == 4])

# (%)
pa_controls_table['quartile1_percentage'] = pa_controls_table['quartile1_count'] / len(controls_data.MET_pwgp)
pa_controls_table['quartile2_percentage'] = pa_controls_table['quartile2_count'] / len(controls_data.MET_pwgp)
pa_controls_table['quartile3_percentage'] = pa_controls_table['quartile3_count'] / len(controls_data.MET_pwgp)
pa_controls_table['quartile4_percentage'] = pa_controls_table['quartile4_count'] / len(controls_data.MET_pwgp)

# n
pa_case_table['quartile1_count'] = len(case_data[(case_data.MET_pwgp == 1)])
pa_case_table['quartile2_count'] = len(case_data[(case_data.MET_pwgp == 2)])
pa_case_table['quartile3_count'] = len(case_data[case_data.MET_pwgp == 3])
pa_case_table['quartile4_count'] = len(case_data[case_data.MET_pwgp == 4])

# (%)
pa_case_table['quartile1_percentage'] = pa_case_table['quartile1_count'] / len(case_data.MET_pwgp)
pa_case_table['quartile2_percentage'] = pa_case_table['quartile2_count'] / len(case_data.MET_pwgp)
pa_case_table['quartile3_percentage'] = pa_case_table['quartile3_count'] / len(case_data.MET_pwgp)
pa_case_table['quartile4_percentage'] = pa_case_table['quartile4_count'] / len(case_data.MET_pwgp)


In [281]:
df1 = pd.DataFrame(pa_controls_table.items(), columns=['', 'n / (%)'])
df2 = pd.DataFrame(pa_case_table.items(), columns=['', 'n / (%)'])
display_side_by_side(df1, df2)

Unnamed: 0,Unnamed: 1,n / (%)
0,quartile1_count,65.0
1,quartile2_count,66.0
2,quartile3_count,69.0
3,quartile4_count,68.0
4,quartile1_percentage,0.242537
5,quartile2_percentage,0.246269
6,quartile3_percentage,0.257463
7,quartile4_percentage,0.253731

Unnamed: 0,Unnamed: 1,n / (%)
0,quartile1_count,40.0
1,quartile2_count,60.0
2,quartile3_count,64.0
3,quartile4_count,81.0
4,quartile1_percentage,0.163265
5,quartile2_percentage,0.244898
6,quartile3_percentage,0.261224
7,quartile4_percentage,0.330612


In [282]:
a = np.array([[df1.iloc[0,1], df2.iloc[0,1]], 
              [df1.iloc[1,1], df2.iloc[1,1]], 
              [df1.iloc[2,1], df2.iloc[2,1]],
              [df1.iloc[3,1], df2.iloc[3,1]]])

_, p_value, _, _ = chi2_contingency(a)
print(format(p_value, '.8f'))

0.08801089
