In [303]:
import pandas as pd 
import numpy as np 

In [304]:
share_w1 = '../data_sources/share/stata_sharew1_rel2-5-0__all_capi_modules/'
share_w2 = '../data_sources/share/stata_sharew2_rel2-5-0__all_capi_modules/'


# Wave 1

## Demographics

In [305]:
dn = pd.read_stata(share_w1+'sharew1_rel2-5-0_dn.dta',convert_categoricals=False)

In [306]:
dn['rage_w1'] = 2004 - dn['dn003_']
dn.loc[dn['rage_w1']>120,'rage_w1'] = np.nan

In [307]:
dn['hhidpn'] = dn['hhid1'] + dn['cvid'].astype('str')

In [308]:
dn = dn.loc[:,['hhidpn','mergeid','country','rage_w1']]
dn.set_index('hhidpn',inplace=True)

In [309]:
dn.rage_w1.value_counts().sort_index()

26.0     2
28.0     1
29.0     1
30.0     1
31.0     6
        ..
100.0    5
101.0    1
102.0    2
103.0    1
104.0    1
Name: rage_w1, Length: 78, dtype: int64

In [310]:
dn.groupby('country').count()

Unnamed: 0_level_0,mergeid,rage_w1
country,Unnamed: 1_level_1,Unnamed: 2_level_1
11,1893,1893
12,3008,3007
13,3053,3053
14,2979,2972
15,2396,2396
16,2559,2559
17,3193,3193
18,1707,1707
19,2898,2898
20,1004,1003


In [311]:
len(dn)

31115

## BMI

In [312]:
gv = pd.read_stata(share_w1+'sharew1_rel2-5-0_gv_health.dta',convert_categoricals=False)

In [313]:
gv['hhidpn'] = gv['hhid1'] + gv['cvid'].astype('str')

In [314]:
gv = gv.rename({'bmi':'rbmi_w1'},axis=1)

In [315]:
gv = gv.loc[:,['hhidpn','rbmi_w1']]
gv.set_index('hhidpn',inplace=True)

In [316]:
gv.head()

Unnamed: 0_level_0,rbmi_w1
hhidpn,Unnamed: 1_level_1
AT-000327-A1,32.787994
AT-000327-A2,31.161373
AT-001816-A1,23.722811
AT-001816-A2,27.440599
AT-002132-A1,34.04903


In [317]:
len(gv)

31115

## Health Conditions

In [318]:
ph = pd.read_stata(share_w1+'sharew1_rel2-5-0_ph.dta',convert_categoricals=False)

In [319]:
ph['hhidpn'] = ph['hhid1'] + ph['cvid'].astype('str')

In [320]:
ph = ph.rename({'ph006d1':'rhearte_w1','ph006d2':'rhibpe_w1','ph006d5':'rdiabe_w1','ph006d10':'rcancre_w1','ph006d6':'rlunge_w1','ph006d4':'rstroke_w1'},axis=1)

In [321]:
for c in ['ph049d1','ph049d2','ph049d3','ph049d4','ph049d5']:
	ph[c] = np.where(ph[c]<0,np.nan,ph[c])
ph['radla_w1'] = ph[['ph049d1','ph049d2','ph049d3','ph049d4','ph049d5']].sum(axis=1)

In [322]:
ph = ph.loc[:,['hhidpn','rhearte_w1','rhibpe_w1','rdiabe_w1','rcancre_w1','rlunge_w1','rstroke_w1','radla_w1']]

In [323]:
ph.set_index('hhidpn',inplace=True)

In [324]:
ph.head()

Unnamed: 0_level_0,rhearte_w1,rhibpe_w1,rdiabe_w1,rcancre_w1,rlunge_w1,rstroke_w1,radla_w1
hhidpn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AT-000327-A1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AT-000327-A2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AT-001816-A1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AT-001816-A2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AT-002132-A1,0.0,0.0,0.0,0.0,1.0,0.0,2.0


In [325]:
len(ph)

31115

## Income and Self-reported Health

In [326]:
im = pd.read_stata(share_w1+'sharew1_rel2-5-0_imputations.dta',convert_categoricals=False)



In [327]:
im = im.rename({'pppx2004':'pppx_w1'},axis=1)

In [328]:
im['hhidpn'] = im['hhid1'] + im['cvid'].astype('str')

In [329]:
im = im.rename({'hgtincv':'hitot_w1'},axis=1)

In [330]:
im = im.rename({'srhealtha':'rshlt_w1'},axis=1)
 

In [331]:
im = im.loc[:,['hhidpn','hitot_w1','rshlt_w1','pppx_w1']]

In [332]:
im.drop_duplicates(subset=['hhidpn'], keep='first',inplace=True)

In [333]:
im.set_index('hhidpn',inplace=True)

In [334]:
len(im)

32405

## Weights

In [335]:
wt = pd.read_stata(share_w1+'sharew1_rel2-5-0_gv_weights.dta',convert_categoricals=False)

In [336]:
wt['hhidpn'] = wt['hhid1'] + wt['cvid'].astype('str')

In [337]:
wt = wt.rename({'w1aci':'wgid_w1'},axis=1)

In [338]:
wt = wt.loc[:,['hhidpn','wgid_w1']]
wt.set_index('hhidpn',inplace=True)

## Behavioral Risk Factors

In [339]:
br = pd.read_stata(share_w1+'sharew1_rel2-5-0_br.dta',convert_categoricals=False)

In [340]:
br['hhidpn'] = br['hhid1'] + br['cvid'].astype('str')

In [341]:
br = br.rename({'br001_':'rsmokev_w1'},axis=1)
br['rsmokev_w1'] = br['rsmokev_w1'].replace({5:0})

In [342]:
br = br.rename({'br015_':'rvgactx_w1'},axis=1)
br.loc[br['rvgactx_w1']>4,'rvgactx_w1'] = np.nan
br = br.rename({'br016_':'rmdactx_w1'},axis=1)
br.loc[br['rmdactx_w1']>4,'rmdactx_w1'] = np.nan

In [343]:
br = br.rename({'br010_':'rdrinkv_w1'},axis=1)
br.loc[br['rdrinkv_w1']>7,'rdrinkv_w1'] = np.nan

In [344]:
br = br.loc[:,['hhidpn','rsmokev_w1','rvgactx_w1','rmdactx_w1','rdrinkv_w1']]
br.set_index('hhidpn',inplace=True)

In [345]:
br.head()

Unnamed: 0_level_0,rsmokev_w1,rvgactx_w1,rmdactx_w1,rdrinkv_w1
hhidpn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AT-000327-A1,0.0,2.0,1.0,4.0
AT-000327-A2,0.0,3.0,1.0,6.0
AT-001816-A1,1.0,3.0,1.0,4.0
AT-001816-A2,0.0,2.0,1.0,4.0
AT-002132-A1,0.0,4.0,4.0,-1.0


## Merging

In [346]:
sh_w1 = dn.merge(gv,left_index=True,right_index=True,how='left').merge(ph,left_index=True,right_index=True,how='left').merge(im,left_index=True,right_index=True,how='left').merge(wt,left_index=True,right_index=True,how='left').merge(br,left_index=True,right_index=True,how='left')

In [347]:
sh_w1.head()

Unnamed: 0_level_0,mergeid,country,rage_w1,rbmi_w1,rhearte_w1,rhibpe_w1,rdiabe_w1,rcancre_w1,rlunge_w1,rstroke_w1,radla_w1,hitot_w1,rshlt_w1,pppx_w1,wgid_w1,rsmokev_w1,rvgactx_w1,rmdactx_w1,rdrinkv_w1
hhidpn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AT-000327-A1,AT-000327-01,11,52.0,32.787994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27189.0,1.0,0.95925,1726.401367,0.0,2.0,1.0,4.0
AT-000327-A2,AT-000327-02,11,49.0,31.161373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27189.0,1.0,0.95925,,0.0,3.0,1.0,6.0
AT-001816-A1,AT-001816-01,11,61.0,23.722811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,275700.0,2.0,0.95925,999.33551,1.0,3.0,1.0,4.0
AT-001816-A2,AT-001816-02,11,56.0,27.440599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,275700.0,2.0,0.95925,1726.401367,0.0,2.0,1.0,4.0
AT-002132-A1,AT-002132-01,11,71.0,34.04903,0.0,0.0,0.0,0.0,1.0,0.0,2.0,13583.0,5.0,0.95925,1292.320679,0.0,4.0,4.0,-1.0


In [348]:
sh_w1.columns.to_list()

['mergeid',
 'country',
 'rage_w1',
 'rbmi_w1',
 'rhearte_w1',
 'rhibpe_w1',
 'rdiabe_w1',
 'rcancre_w1',
 'rlunge_w1',
 'rstroke_w1',
 'radla_w1',
 'hitot_w1',
 'rshlt_w1',
 'pppx_w1',
 'wgid_w1',
 'rsmokev_w1',
 'rvgactx_w1',
 'rmdactx_w1',
 'rdrinkv_w1']

In [349]:
sh_w1.to_pickle('../data_sources/share/share_w1.pkl')

In [350]:
len(sh_w1)

31115

# Wave 2

## Demographics

In [351]:
dn = pd.read_stata(share_w2+'sharew2_rel2-5-0_dn.dta',convert_categoricals=False)

In [352]:
dn['rage_w2'] = 2006 - dn['dn003_']
dn.loc[dn['rage_w2']>120,'rage_w2'] = np.nan

In [353]:
dn['hhidpn'] = dn['hhid2'] + dn['cvid'].astype('str')

In [354]:
dn = dn.loc[:,['hhidpn','mergeid','country','rage_w2']]
dn.set_index('hhidpn',inplace=True)

## BMI

We will keep BMI in wave 2 but note missing for most because height was not asked again. In paper, only use wave 1 to define obesity. 

In [355]:
gv = pd.read_stata(share_w2+'sharew2_rel2-5-0_gv_health.dta',convert_categoricals=False)

In [356]:
gv['hhidpn'] = gv['hhid2'] + gv['cvid'].astype('str')
gv = gv.rename({'bmi':'rbmi_w2'},axis=1)
gv = gv.loc[:,['hhidpn','rbmi_w2']]
gv.set_index('hhidpn',inplace=True)

In [357]:
gv.head()

Unnamed: 0_level_0,rbmi_w2
hhidpn,Unnamed: 1_level_1
AT-000327-A1,
AT-000327-A2,
AT-001816-A1,
AT-002132-A1,
AT-004234-A2,


## Health Conditions

In [358]:
ph = pd.read_stata(share_w2+'sharew2_rel2-5-0_ph.dta',convert_categoricals=False)
ph['hhidpn'] = ph['hhid2'] + ph['cvid'].astype('str')
ph = ph.rename({'ph006d1':'rhearte_w2','ph006d2':'rhibpe_w2','ph006d5':'rdiabe_w2','ph006d10':'rcancre_w2','ph006d6':'rlunge_w2','ph006d4':'rstroke_w2'},axis=1)
for c in ['ph049d1','ph049d2','ph049d3','ph049d4','ph049d5']:
	ph[c] = np.where(ph[c]<0,np.nan,ph[c])
ph['radla_w2'] = ph[['ph049d1','ph049d2','ph049d3','ph049d4','ph049d5']].sum(axis=1)
ph = ph.loc[:,['hhidpn','rhearte_w2','rhibpe_w2','rdiabe_w2','rcancre_w2','rlunge_w2','rstroke_w2','radla_w2']]
ph.set_index('hhidpn',inplace=True)

In [359]:
ph.head()

Unnamed: 0_level_0,rhearte_w2,rhibpe_w2,rdiabe_w2,rcancre_w2,rlunge_w2,rstroke_w2,radla_w2
hhidpn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AT-000327-A1,0.0,0.0,0.0,1.0,0.0,0.0,0.0
AT-000327-A2,0.0,1.0,0.0,0.0,0.0,0.0,0.0
AT-001816-A1,0.0,1.0,0.0,0.0,0.0,0.0,0.0
AT-002132-A1,0.0,0.0,0.0,0.0,1.0,0.0,5.0
AT-004234-A2,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Income and Self-reported Health

In [360]:
im = pd.read_stata(share_w2+'sharew2_rel2-5-0_imputations.dta',convert_categoricals=False)
im['hhidpn'] = im['hhid2'] + im['cvid'].astype('str')
im = im.rename({'hgtincv':'hitot_w2'},axis=1)
im = im.rename({'pppx2006':'pppx_w2'},axis=1)
im = im.rename({'srhealtha':'rshlt_w2'},axis=1)
im = im.loc[:,['hhidpn','hitot_w2','rshlt_w2','pppx_w2']]
im.drop_duplicates(subset=['hhidpn'], keep='first',inplace=True)
im.set_index('hhidpn',inplace=True)

In [361]:
im.head()

Unnamed: 0_level_0,hitot_w2,rshlt_w2,pppx_w2
hhidpn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AT-000327-A1,25300,5,0.99545
AT-000327-A2,25300,2,0.99545
AT-001816-A1,57008,3,0.99545
AT-002132-A1,14300,5,0.99545
AT-004234-A2,23620,4,0.99545


## Weights

In [362]:
wt = pd.read_stata(share_w2+'sharew2_rel2-5-0_gv_weights.dta',convert_categoricals=False)
wt['hhidpn'] = wt['hhid2'] + wt['cvid'].astype('str')
wt = wt.rename({'w2aci':'wgid_w2'},axis=1)
wt = wt.loc[:,['hhidpn','wgid_w2']]
wt.set_index('hhidpn',inplace=True)


## Behavioral risk factors

In [363]:
br = pd.read_stata(share_w2+'sharew2_rel2-5-0_br.dta',convert_categoricals=False)
br['hhidpn'] = br['hhid2'] + br['cvid'].astype('str')
br = br.rename({'br001_':'rsmokev_w2'},axis=1)
br['rsmokev_w2'] = br['rsmokev_w2'].replace({5:0})
br = br.rename({'br015_':'rvgactx_w2'},axis=1)
br.loc[br['rvgactx_w2']>4,'rvgactx_w2'] = np.nan
br = br.rename({'br016_':'rmdactx_w2'},axis=1)
br.loc[br['rmdactx_w2']>4,'rmdactx_w2'] = np.nan
br = br.rename({'br010_':'rdrinkv_w2'},axis=1)
br.loc[br['rdrinkv_w2']>7,'rdrinkv_w2'] = np.nan
br = br.rename({'br019_':'rdrinkn_w2'},axis=1)
br = br.loc[:,['hhidpn','rsmokev_w2','rvgactx_w2','rmdactx_w2','rdrinkv_w2','rdrinkn_w2']]
br.set_index('hhidpn',inplace=True)

## Merging

In [364]:
dn.groupby('country').count()

Unnamed: 0_level_0,mergeid,rage_w2
country,Unnamed: 1_level_1,Unnamed: 2_level_1
11,1341,1341
12,2568,2568
13,2745,2745
14,2661,2661
15,2228,2228
16,2983,2983
17,2968,2968
18,2616,2616
19,3243,3242
20,1462,1462


In [365]:
sh_w2 = dn.merge(gv,left_index=True,right_index=True,how='left').merge(ph,left_index=True,right_index=True,how='left').merge(im,left_index=True,right_index=True,how='left').merge(wt,left_index=True,right_index=True,how='left').merge(br,left_index=True,right_index=True,how='left')

In [366]:
sh_w2.head()

Unnamed: 0_level_0,mergeid,country,rage_w2,rbmi_w2,rhearte_w2,rhibpe_w2,rdiabe_w2,rcancre_w2,rlunge_w2,rstroke_w2,radla_w2,hitot_w2,rshlt_w2,pppx_w2,wgid_w2,rsmokev_w2,rvgactx_w2,rmdactx_w2,rdrinkv_w2,rdrinkn_w2
hhidpn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AT-000327-A1,AT-000327-01,11,54.0,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,25300.0,5.0,0.99545,3402.772949,,4.0,4.0,7.0,
AT-000327-A2,AT-000327-02,11,51.0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,25300.0,2.0,0.99545,2391.937988,,1.0,1.0,7.0,
AT-001816-A1,AT-001816-02,11,58.0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,57008.0,3.0,0.99545,3402.772949,,1.0,2.0,3.0,1.0
AT-002132-A1,AT-002132-01,11,73.0,,0.0,0.0,0.0,0.0,1.0,0.0,5.0,14300.0,5.0,0.99545,1778.268433,,4.0,4.0,7.0,
AT-004234-A2,AT-004234-01,11,56.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23620.0,4.0,0.99545,3548.626221,,3.0,1.0,4.0,1.0


In [367]:
sh_w2.to_pickle('../data_sources/share/share_w2.pkl')

# Creating Wide Form SHARE dataset

In [368]:
sh_w1 = sh_w1.reset_index()
sh_w1.set_index('mergeid',inplace=True)
sh_w2 = sh_w2.reset_index()
sh_w2.set_index('mergeid',inplace=True)
sh = sh_w1.merge(sh_w2,left_index=True,right_index=True,how='left')
sh = sh.rename({'hhidpn_x':'hhidpn_w1','hhidpn_y':'hhidpn_w2','country_x':'country'},axis=1)
sh = sh.drop('country_y',axis=1)



Adjust to euros using PPPs in SHARE and then express in US dollars. Exchange rate in 2004 is 1.24 dollar per euro. 

In [369]:
sh['hitot_w1'] = sh['hitot_w1']/sh['pppx_w1']*1.24
sh['hitot_w2'] = sh['hitot_w2']/sh['pppx_w2']*1.24


In [370]:
sh.to_pickle('../data_sources/share/share_wide_select.pkl')