In [1]:
import numpy as np
import statsmodels.api as sm
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns;sns.set()

In [10]:
dataset = pd.read_csv('NHANES.csv')

In [11]:
dataset["SMQ020x"] = dataset.SMQ020.replace({1:'Yes',2:"No",9:np.nan,7:np.nan})
dataset.SMQ020x.head()

0    Yes
1    Yes
2    Yes
3     No
4     No
Name: SMQ020x, dtype: object

In [12]:
dataset["RIAGENDRx"] = dataset.RIAGENDR.replace({1:'Male',2:'Female'})
dataset.RIAGENDRx.head()

0      Male
1      Male
2      Male
3    Female
4    Female
Name: RIAGENDRx, dtype: object

In [13]:
dx = dataset[["SMQ020x","RIAGENDRx"]].dropna()
pd.crosstab(dx.RIAGENDRx , dx.SMQ020x)

SMQ020x,No,Yes
RIAGENDRx,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,2066,906
Male,1340,1413


In [14]:
dx["SMQ020x"] = dx.SMQ020x.replace({'Yes':1,'No':0})

In [15]:
dz = dx.groupby(["RIAGENDRx"]).agg({'SMQ020x':[np.mean,np.size]})
dz.columns = ["Proportion" , "Total n"]
dz

Unnamed: 0_level_0,Proportion,Total n
RIAGENDRx,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.304845,2972
Male,0.513258,2753


In [16]:
# We can use the above information to calculate confidence intervals for difference of two population proportions

p = dz.Proportion.Female
n = dz["Total n"].Female
se_female = np.sqrt(p*(1-p)/n)
se_female

0.008444152146214435

In [17]:
p = dz.Proportion.Male
n = dz["Total n"].Male
se_male = np.sqrt(p*(1-p)/n)
se_male

0.009526078653689868

In [18]:
se = np.sqrt(se_female**2 + se_male**2)
se

0.012729881381407434

In [19]:
best_estimate = 0.513258 - 0.304845
tstar = 1.96

In [20]:
lcb = best_estimate - tstar*se
ucb = best_estimate + tstar*se
(lcb,ucb)

(0.18346243249244146, 0.23336356750755857)

In [32]:
#Estimating confidence intervals for difference for BMI for male and female
dx = dataset.groupby(["RIAGENDRx"]).agg({'BMXBMI':[np.mean,np.std,np.size]})
dx.columns = ['Mean','Std','Size']
dx

Unnamed: 0_level_0,Mean,Std,Size
RIAGENDRx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,29.939946,7.753319,2976.0
Male,28.778072,6.252568,2759.0


In [34]:
se_female = dx.Std.Female/np.sqrt(dx.Size.Female)
se_male = dx.Std.Male/np.sqrt(dx.Size.Male)
se = np.sqrt(se_female**2 + se_male**2)
se

0.18538992862064455

In [35]:
best_estimate = 28.778072 - 29.939946
lcb = best_estimate - tstar*se
ucb = best_estimate + tstar*se

(lcb,ucb)

(-1.5252382600964607, -0.7985097399035341)