In [60]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect, text

## Original wave (wave 1) demographic info

In [2]:
# Load the combined wave 1-3 HCMST Stata file and preview its first three rows
df_wv1_3 = pd.read_stata("HCMST_ver_3.04.dta")
df_wv1_3.iloc[:3]

Unnamed: 0,caseid_new,weight1,weight2,ppage,ppagecat,ppagect4,ppeduc,ppeducat,ppethm,ppgender,...,w3_mbtiming_year,w3_mbtiming_month,w3_q5,w3_q6,w3_q7,w3_q8,w3_q9,w3_q10,w3_nonmbtiming_year,w3_nonmbtiming_month
0,22526,4265,4265.0,52,45-54,45-59,bachelors degree,bachelor's degree or higher,hispanic,female,...,,,yes,yes,"no, did not marry [xNameP]","No, we have not gotten a domestic partnership ...",,,,
1,23286,16485,16485.0,28,25-34,18-29,masters degree,bachelor's degree or higher,"white, non-hispanic",female,...,,,,,,,,,,
2,25495,52464,,49,45-54,45-59,high school graduate - high school diploma or ...,high school,"black, non-hispanic",female,...,,,,,,,,,,


In [3]:
# Report (rows, columns) of the combined wave 1-3 file
n_rows_wv1_3, n_cols_wv1_3 = df_wv1_3.shape
(n_rows_wv1_3, n_cols_wv1_3)

(4002, 387)

In [4]:
# Write the wave 1-3 column names, one per line, to a CSV for easy inspection
columns = df_wv1_3.columns
df = columns.to_frame(index=False)
df.to_csv("wave_1_to_3_columns.csv", header=False, index=False)

In [5]:
# Keep the first 51 columns (positions 0-50): caseid_new, the weights and the
# wave 1 demographic ("pp...") variables, ending at pppadate_yrmo.
df_wv1_pp = df_wv1_3.iloc[:, 0:51]
# Preview only the first rows instead of dumping the whole 4002-row frame
df_wv1_pp.head(10)

Unnamed: 0,caseid_new,weight1,weight2,ppage,ppagecat,ppagect4,ppeduc,ppeducat,ppethm,ppgender,...,pprace_guamanian,pprace_samoan,pprace_otherpacificislander,pprace_someotherrace,papglb_friend,pppartyid3,papevangelical,papreligion,ppppcmdate_yrmo,pppadate_yrmo
0,22526,4265,4265.0,52,45-54,45-59,bachelors degree,bachelor's degree or higher,hispanic,female,...,no,no,no,no,"yes, friends",democrat,yes,catholic,200711.0,200709.0
1,23286,16485,16485.0,28,25-34,18-29,masters degree,bachelor's degree or higher,"white, non-hispanic",female,...,no,no,no,no,"yes, both",democrat,no,jewish,200711.0,200709.0
2,25495,52464,,49,45-54,45-59,high school graduate - high school diploma or ...,high school,"black, non-hispanic",female,...,no,no,no,no,"yes, both",democrat,yes,baptist-any denomination,200711.0,200709.0
3,26315,4575,4575.0,31,25-34,30-44,associate degree,some college,"white, non-hispanic",male,...,no,no,no,no,"yes, both",democrat,no,"other non-christian, please specify:",200710.0,200709.0
4,27355,12147,,35,35-44,30-44,high school graduate - high school diploma or ...,high school,"white, non-hispanic",male,...,no,no,no,no,"yes, friends",democrat,no,other christian,200710.0,200709.0
5,27695,1799,,69,65-74,60+,"some college, no degree",some college,"white, non-hispanic",male,...,no,no,no,no,"yes, both",democrat,no,"protestant (e.g., methodist, lutheran, presbyt...",200711.0,200709.0
6,28536,1924,1924.0,53,45-54,45-59,bachelors degree,bachelor's degree or higher,"white, non-hispanic",male,...,no,no,no,no,"yes, friends",democrat,no,"protestant (e.g., methodist, lutheran, presbyt...",200710.0,200709.0
7,29584,3173,3173.0,58,55-64,45-59,masters degree,bachelor's degree or higher,"white, non-hispanic",male,...,no,no,no,no,"yes, both",democrat,no,"protestant (e.g., methodist, lutheran, presbyt...",200710.0,200709.0
8,30393,68772,,39,35-44,30-44,high school graduate - high school diploma or ...,high school,"white, non-hispanic",male,...,no,no,no,no,"yes, friends",democrat,no,catholic,200710.0,200709.0
9,31456,1021,1021.0,45,45-54,45-59,"some college, no degree",some college,"white, non-hispanic",male,...,no,no,no,no,"yes, friends",democrat,no,"other non-christian, please specify:",200803.0,200711.0


In [6]:
# Quick exploration: frame shape, then the category frequencies of the age and
# education variables, each preceded by a separator line.
#   ppagecat - categorized age (7 groups)
#   ppagect4 - categorized age (4 groups)
print(df_wv1_pp.shape)
for col in ['ppagecat', 'ppagect4', 'ppeduc', 'ppeducat']:
    print("---------------------")
    print(df_wv1_pp[col].value_counts())

(4002, 51)
---------------------
45-54    810
35-44    808
55-64    728
25-34    655
65-74    395
18-24    346
75+      260
Name: ppagecat, dtype: int64
---------------------
45-59    1246
30-44    1086
60+       947
18-29     723
Name: ppagect4, dtype: int64
---------------------
high school graduate - high school diploma or the equivalent (ged)    987
some college, no degree                                               905
bachelors degree                                                      864
masters degree                                                        383
associate degree                                                      305
professional or doctorate degree                                      160
12th grade no diploma                                                 118
11th grade                                                             98
10th grade                                                             71
7th or 8th grade                                    

In [7]:
# Category frequencies for the household-related variables, with a separator
# line between consecutive tables.
household_cols = ['pphhhead', 'pphouseholdsize', 'pphouse']
for position, col in enumerate(household_cols):
    if position > 0:
        print("---------------------")
    print(df_wv1_pp[col].value_counts())

yes    3253
no      749
Name: pphhhead, dtype: int64
---------------------
2     1461
1      970
3      581
4      566
5      252
6      102
7       42
8       14
9        7
10       3
11       2
15       1
12       1
Name: pphouseholdsize, dtype: int64
---------------------
a one-family house detached from any other house     2779
a building with 2 or more apartments                  720
a one-family house attached to one or more houses     334
a mobile home                                         167
boat, rv, van, etc.                                     2
Name: pphouse, dtype: int64


In [8]:
# Number of respondents in each income bracket (ppincimp)
ppincimp_counts = df_wv1_pp['ppincimp'].value_counts()
print(ppincimp_counts)

$60,000 to $74,999      461
$50,000 to $59,999      422
$40,000 to $49,999      397
$85,000 to $99,999      311
$100,000 to $124,999    300
$75,000 to $84,999      296
$35,000 to $39,999      260
$20,000 to $24,999      219
$25,000 to $29,999      212
$30,000 to $34,999      201
$125,000 to $149,999    162
$15,000 to $19,999      157
$175,000 or more        127
$10,000 to $12,499      106
$7,500 to $9,999        100
$150,000 to $174,999     87
$12,500 to $14,999       85
$5,000 to $7,499         54
less than $5,000         45
Name: ppincimp, dtype: int64


In [9]:
# Number of respondents per numeric hhinc value.
# NOTE(review): the counts line up one-to-one with the ppincimp brackets, so hhinc
# looks like a numeric stand-in for each bracket - confirm against the codebook.
hhinc_counts = df_wv1_pp['hhinc'].value_counts()
print(hhinc_counts)

67250.0     461
55000.0     422
45000.0     397
92250.0     311
112250.0    300
80000.0     296
37250.0     260
22250.0     219
27250.0     212
32250.0     201
137250.0    162
17250.0     157
200000.0    127
11250.0     106
8750.0      100
162250.0     87
13750.0      85
6250.0       54
2500.0       45
Name: hhinc, dtype: int64


In [10]:
# Respondent counts by the 4-category (ppreg4) and 9-category (ppreg9) region variables
print(
    df_wv1_pp['ppreg4'].value_counts(),
    "---------------------",
    df_wv1_pp['ppreg9'].value_counts(),
    sep="\n",
)

south        1262
west          975
midwest       955
northeast     810
Name: ppreg4, dtype: int64
---------------------
south atlantic        681
pacific               677
east-north central    653
mid-atlantic          594
west-south central    378
west-north central    302
mountain              298
new england           216
east-south central    203
Name: ppreg9, dtype: int64


In [11]:
# Home ownership status (pprent). The categories are:
#   - owned or being bought by you or someone in your household
#   - rented for cash
#   - occupied without payment of cash rent

pprent_counts = df_wv1_pp['pprent'].value_counts()
print(pprent_counts)

owned or being bought by you or someone in your household    3040
rented for cash                                               894
occupied without payment of cash rent                          68
Name: pprent, dtype: int64


In [12]:
# Race-related questions.
# "ppq14arace" is "not asked" for most respondents, so the binary indicator
# columns such as "pprace_white" and "pprace_black" have to be kept as well.

race_cols = ['ppq14arace', 'pphispan', 'pprace_white', 'pprace_black']
race_tables = [str(df_wv1_pp[col].value_counts()) for col in race_cols]
print("\n---------------------\n".join(race_tables))

not asked                           3789
white                                124
some other race                       33
black, or  african american           22
american indian or alaska native      14
japanese                               5
filipino                               5
refused                                3
native hawaiian                        2
other asian                            2
other pacific islander                 1
guamanian or chamorro                  1
chinese                                1
Name: ppq14arace, dtype: int64
---------------------
no, i am not                               3615
yes, mexican, mexican-american, chicano     170
yes, other spanish/hispanic/latino           84
yes, puerto rican                            66
yes, south american                          24
yes, cuban                                   16
yes, caribbean                               14
yes, central american                        13
Name: pphispan, dtype: int64
-

In [13]:
##### TBD: save to DB

## Wave 2 demographic info

In [14]:
# Re-display the first three rows of the combined wave 1-3 file; the wave 2
# demographic columns are selected from this same frame.
df_wv1_3.iloc[:3]

Unnamed: 0,caseid_new,weight1,weight2,ppage,ppagecat,ppagect4,ppeduc,ppeducat,ppethm,ppgender,...,w3_mbtiming_year,w3_mbtiming_month,w3_q5,w3_q6,w3_q7,w3_q8,w3_q9,w3_q10,w3_nonmbtiming_year,w3_nonmbtiming_month
0,22526,4265,4265.0,52,45-54,45-59,bachelors degree,bachelor's degree or higher,hispanic,female,...,,,yes,yes,"no, did not marry [xNameP]","No, we have not gotten a domestic partnership ...",,,,
1,23286,16485,16485.0,28,25-34,18-29,masters degree,bachelor's degree or higher,"white, non-hispanic",female,...,,,,,,,,,,
2,25495,52464,,49,45-54,45-59,high school graduate - high school diploma or ...,high school,"black, non-hispanic",female,...,,,,,,,,,,


In [15]:
# Read the wave 2 demographic column names (one name per line) into a list.
# The file is split on lines directly because newer pandas releases raise a
# ValueError when "\n" is passed as the read_csv delimiter.
# Surrounding whitespace and blank lines are dropped so the names match the
# DataFrame columns exactly.
with open("wv2_pp_columns.txt", encoding="utf-8") as fh:
    wv2_pp_cls_list = [line.strip() for line in fh if line.strip()]
# Kept as a one-column DataFrame (column 0) for compatibility with the old variable
wv2_pp_cls = pd.DataFrame(wv2_pp_cls_list)
wv2_pp_cls_list

['pp2_afterp1',
 'pp2_pphhhead',
 'pp2_pphhsize',
 'pp2_pphouse',
 'pp2_ppincimp',
 'pp2_ppmarit',
 'pp2_ppmsacat',
 'pp2_ppeduc',
 'pp2_ppeducat',
 'pp2_respondent_yrsed',
 'pp2_ppethm',
 'pp2_ppreg4',
 'pp2_ppreg9',
 'pp2_pprent',
 'pp2_ppt01',
 'pp2_ppt1317',
 'pp2_ppt18ov',
 'pp2_ppt25',
 'pp2_ppt612',
 'pp2_ppwork',
 'pp2_ppnet',
 'pp2_ppcmdate_yrmo',
 'pp_igdr1',
 'pp_ieduc1',
 'pp2_igdr2',
 'pp2_ieduc2']

In [16]:
# Select the columns that represent the wave 2 demographic info into their own table.
# NOTE(review): unlike the wave 4-6 lists, this list has no 'caseid_new', so rows
# can only be matched to respondents through the positional index - confirm intended.
df_wv2_pp = df_wv1_3.loc[:, wv2_pp_cls_list]
df_wv2_pp.head(5)

Unnamed: 0,pp2_afterp1,pp2_pphhhead,pp2_pphhsize,pp2_pphouse,pp2_ppincimp,pp2_ppmarit,pp2_ppmsacat,pp2_ppeduc,pp2_ppeducat,pp2_respondent_yrsed,...,pp2_ppt18ov,pp2_ppt25,pp2_ppt612,pp2_ppwork,pp2_ppnet,pp2_ppcmdate_yrmo,pp_igdr1,pp_ieduc1,pp2_igdr2,pp2_ieduc2
0,Yes second background survey,yes,2.0,a building with 2 or more apartments,"$20,000 to $24,999",living with partner,metro,bachelors degree,bachelor's degree or higher,16.0,...,2.0,0.0,0.0,not working - looking for work,yes,200905.0,value not imputed,value not imputed,value not imputed,value not imputed
1,Yes second background survey,yes,2.0,a building with 2 or more apartments,"$100,000 to $124,999",married,metro,professional or doctorate degree,bachelor's degree or higher,20.0,...,2.0,0.0,0.0,working - as a paid employee,yes,200904.0,value not imputed,value not imputed,value not imputed,value not imputed
2,Yes second background survey,yes,4.0,a building with 2 or more apartments,"$5,000 to $7,499",never married,metro,high school graduate - high school diploma or ...,high school,12.0,...,3.0,0.0,0.0,working - as a paid employee,yes,200904.0,value not imputed,value not imputed,value not imputed,value not imputed
3,Yes second background survey,yes,1.0,a building with 2 or more apartments,"$40,000 to $49,999",never married,metro,associate degree,some college,14.0,...,1.0,0.0,0.0,working - as a paid employee,yes,200905.0,value not imputed,value not imputed,value not imputed,value not imputed
4,Yes second background survey,yes,2.0,a one-family house attached to one or more houses,"$40,000 to $49,999",never married,metro,high school graduate - high school diploma or ...,high school,12.0,...,2.0,0.0,0.0,working - self-employed,yes,200904.0,value not imputed,value not imputed,value not imputed,value not imputed


## Wave 3 demographic info

In [17]:
# Read the wave 3 demographic column names (one name per line) into a list.
# The file is split on lines directly because newer pandas releases raise a
# ValueError when "\n" is passed as the read_csv delimiter.
# Surrounding whitespace and blank lines are dropped so the names match the
# DataFrame columns exactly.
with open("wv3_pp_columns.txt", encoding="utf-8") as fh:
    wv3_pp_cls_list = [line.strip() for line in fh if line.strip()]
# Kept as a one-column DataFrame (column 0) for compatibility with the old variable
wv3_pp_cls = pd.DataFrame(wv3_pp_cls_list)
wv3_pp_cls_list

['pp3_pphhhead',
 'pp3_pphhsize',
 'pp3_pphouse',
 'pp3_ppincimp',
 'pp3_ppmarit',
 'pp3_ppmsacat',
 'pp3_pprent',
 'pp3_ppreg4',
 'pp3_ppreg9',
 'interstate_mover_pp1_pp2',
 'interstate_mover_pp2_pp3',
 'interstate_mover_pp1_pp3',
 'pp3_ppt01',
 'pp3_ppt1317',
 'pp3_ppt18ov',
 'pp3_ppt25',
 'pp3_ppt612',
 'pp3_ppwork',
 'pp3_ppnet',
 'pp3_ppcmdate_yrmo',
 'pp3_ppeduc',
 'pp3_ppeducat',
 'pp3_respondent_yrsed',
 'pp3_ppethm',
 'pp3_newer']

In [18]:
# Select the columns that represent the wave 3 demographic info into their own table.
# NOTE(review): this list has no 'caseid_new' either, so rows can only be matched
# to respondents through the positional index - confirm intended.
df_wv3_pp = df_wv1_3.loc[:, wv3_pp_cls_list]
df_wv3_pp.head(5)

Unnamed: 0,pp3_pphhhead,pp3_pphhsize,pp3_pphouse,pp3_ppincimp,pp3_ppmarit,pp3_ppmsacat,pp3_pprent,pp3_ppreg4,pp3_ppreg9,interstate_mover_pp1_pp2,...,pp3_ppt25,pp3_ppt612,pp3_ppwork,pp3_ppnet,pp3_ppcmdate_yrmo,pp3_ppeduc,pp3_ppeducat,pp3_respondent_yrsed,pp3_ppethm,pp3_newer
0,Yes,2.0,A building with 2 or more apartments,"$15,000 to $19,999",Living with partner,Metro,Rented for cash,Midwest,East-North Central,stayer,...,0.0,0.0,Not working - looking for work,Yes,201107.0,Bachelors degree,Bachelor's degree or higher,16.0,Hispanic,"Yes, pp3 data is newer and available"
1,Yes,2.0,A one-family house attached to one or more houses,"$85,000 to $99,999",Living with partner,Metro,Rented for cash,West,Pacific,stayer,...,0.0,0.0,Working - as a paid employee,Yes,201106.0,Professional or Doctorate degree,Bachelor's degree or higher,20.0,"White, Non-Hispanic","Yes, pp3 data is newer and available"
2,Yes,3.0,A building with 2 or more apartments,"Less than $5,000",Never married,Metro,Rented for cash,South,West-South Central,stayer,...,0.0,0.0,Working - as a paid employee,Yes,201106.0,HIGH SCHOOL GRADUATE - high school DIPLOMA or ...,High school,12.0,"Black, Non-Hispanic","Yes, pp3 data is newer and available"
3,Yes,1.0,A building with 2 or more apartments,"$40,000 to $49,999",Never married,Metro,Owned or being bought by you or someone in you...,South,South Atlantic,stayer,...,0.0,0.0,Working - as a paid employee,Yes,201106.0,Associate degree,Some college,14.0,"White, Non-Hispanic","Yes, pp3 data is newer and available"
4,Yes,2.0,A one-family house attached to one or more houses,"$40,000 to $49,999",Never married,Metro,Owned or being bought by you or someone in you...,South,South Atlantic,stayer,...,0.0,0.0,Working - self-employed,Yes,201106.0,HIGH SCHOOL GRADUATE - high school DIPLOMA or ...,High school,12.0,"White, Non-Hispanic","Yes, pp3 data is newer and available"


## Wave 4 demographic info

In [19]:
# Load the wave 4 supplement Stata file and preview its first five rows
df_wv4 = pd.read_stata("wave_4_supplement_v1_2.dta")
df_wv4.iloc[:5]

Unnamed: 0,caseid_new,w4_xqualified,w4_status,w4_complete,w4_source,w4_duration,w4_HCMST_interview_fin_yrmo,w4_xtype,w4_xyear,w4_xmonth,...,pp4_ppt25,pp4_ppt612,pp4_ppt1317,pp4_ppt18ov,pp4_ppwork,pp4_ppnet,pp4_ppcmdate_yrmo,pp4_newer,ppa2009_services_yrmo,ppa2009_how_often_services
0,22526,qualified for wave 4,Active,Yes,On-line,3.0,201303.0,,,,...,0,0,0,2,Not working - disabled,Yes,,Yes,,
1,23286,qualified for wave 4,Active,Yes,On-line,0.0,201303.0,,,,...,0,0,0,2,Working - as a paid employee,Yes,,Yes,,
2,25495,unqualified for wave 4,,,,,,,,,...,0,0,0,3,Working - as a paid employee,Yes,201305.0,Yes,200903.0,More than once a week
3,26315,unqualified for wave 4,,,,,,,,,...,0,0,0,1,Working - as a paid employee,Yes,201309.0,Yes,200903.0,Never
4,27355,unqualified for wave 4,,,,,,,,,...,0,0,0,2,Working - self-employed,Yes,201305.0,Yes,200903.0,A few times a year


In [20]:
# Report (rows, columns) of the wave 4 supplement
n_rows_wv4, n_cols_wv4 = df_wv4.shape
(n_rows_wv4, n_cols_wv4)

(4002, 62)

In [21]:
# Write the wave 4 column names, one per line, to a CSV for easy inspection
columns = df_wv4.columns
df = pd.DataFrame({0: columns})
df.to_csv("wv4_columns.csv", header=False, index=False)

In [22]:
# Read the wave 4 demographic column names (one name per line) into a list.
# The file is split on lines directly because newer pandas releases raise a
# ValueError when "\n" is passed as the read_csv delimiter.
# Surrounding whitespace and blank lines are dropped so the names match the
# DataFrame columns exactly.
with open("wv4_pp_columns.txt", encoding="utf-8") as fh:
    wv4_pp_cls_list = [line.strip() for line in fh if line.strip()]
# Kept as a one-column DataFrame (column 0) for compatibility with the old variable
wv4_pp_cls = pd.DataFrame(wv4_pp_cls_list)
wv4_pp_cls_list

['caseid_new',
 'pp4_ppeduc',
 'pp4_ppeducat',
 'pp4_ppethm',
 'pp4_pphhhead',
 'pp4_pphhsize',
 'pp4_pphouse',
 'pp4_ppincimp',
 'pp4_ppmarit',
 'pp4_ppmsacat',
 'pp4_ppreg4',
 'pp4_ppreg9',
 'pp4_pprent',
 'pp4_ppt01',
 'pp4_ppt25',
 'pp4_ppt612',
 'pp4_ppt1317',
 'pp4_ppt18ov',
 'pp4_ppwork',
 'pp4_ppnet',
 'pp4_ppcmdate_yrmo',
 'pp4_newer',
 'ppa2009_services_yrmo',
 'ppa2009_how_often_services']

In [23]:
# Select the columns that represent the wave 4 demographic info (including
# caseid_new) into their own table.
df_wv4_pp = df_wv4.loc[:, wv4_pp_cls_list]
df_wv4_pp.head(5)

Unnamed: 0,caseid_new,pp4_ppeduc,pp4_ppeducat,pp4_ppethm,pp4_pphhhead,pp4_pphhsize,pp4_pphouse,pp4_ppincimp,pp4_ppmarit,pp4_ppmsacat,...,pp4_ppt25,pp4_ppt612,pp4_ppt1317,pp4_ppt18ov,pp4_ppwork,pp4_ppnet,pp4_ppcmdate_yrmo,pp4_newer,ppa2009_services_yrmo,ppa2009_how_often_services
0,22526,Bachelors degree,Bachelor's degree or higher,Hispanic,Yes,2,A building with 2 or more apartments,"$12,500 to $14,999",Living with partner,Metro,...,0,0,0,2,Not working - disabled,Yes,,Yes,,
1,23286,Professional or Doctorate degree,Bachelor's degree or higher,"White, Non-Hispanic",Yes,3,A one-family house attached to one or more houses,"$60,000 to $74,999",Living with partner,Metro,...,0,0,0,2,Working - as a paid employee,Yes,,Yes,,
2,25495,HIGH SCHOOL GRADUATE - high school DIPLOMA or ...,High school,"Black, Non-Hispanic",Yes,3,A building with 2 or more apartments,"$30,000 to $34,999",Never married,Metro,...,0,0,0,3,Working - as a paid employee,Yes,201305.0,Yes,200903.0,More than once a week
3,26315,Associate degree,Some college,"White, Non-Hispanic",Yes,1,A building with 2 or more apartments,"$50,000 to $59,999",Never married,Metro,...,0,0,0,1,Working - as a paid employee,Yes,201309.0,Yes,200903.0,Never
4,27355,HIGH SCHOOL GRADUATE - high school DIPLOMA or ...,High school,"White, Non-Hispanic",Yes,2,A one-family house detached from any other house,"$40,000 to $49,999",Never married,Metro,...,0,0,0,2,Working - self-employed,Yes,201305.0,Yes,200903.0,A few times a year


## Wave 5 demographic info

In [24]:
# Load the wave 5 supplement Stata file and preview its first five rows
df_wv5 = pd.read_stata("HCMST_wave_5_supplement_ver_1.dta")
df_wv5.iloc[:5]

Unnamed: 0,caseid_new,w5_duration,w5_source,w5_complete,w5_status,w5x_qualified,w5x_marry,w5x_last,w5x_year,w5x_civil,...,ppmarit_2012,ppcmarit_2012_yrmo,ppmarit_2011,ppcmarit_2011_yrmo,ppmarit_2010,ppcmarit_2010_yrmo,ppmarit_2009,ppcmarit_2009_yrmo,ppmarit_2007,ppcmarit_2007_yrmo
0,22526,1.0,online,wave 5 completed,Active,qualified for wave 5,unmarried partners,1.0,2013.0,no civil union or DP prior to wave 5,...,Living with partner,201207.0,Living with partner,201107.0,Living with partner,201007.0,Living with partner,200905.0,Living with partner,200711.0
1,23286,0.0,online,wave 5 completed,Active,qualified for wave 5,married,1.0,2013.0,yes civil union or DP prior to wave 5,...,Living with partner,201208.0,Married,201106.0,Living with partner,201007.0,Married,200904.0,Living with partner,200711.0
2,25495,,,,,not qualified for wave 5,,,,,...,Never married,201206.0,Never married,201106.0,Never married,201007.0,Never married,200904.0,Never married,200711.0
3,26315,,,,,not qualified for wave 5,,,,,...,Never married,201206.0,Never married,201106.0,Never married,201007.0,Never married,200905.0,Never married,200710.0
4,27355,,,,,not qualified for wave 5,,,,,...,Never married,201206.0,Never married,201106.0,Never married,201005.0,Never married,200904.0,Never married,200710.0


In [25]:
# Report (rows, columns) of the wave 5 supplement
n_rows_wv5, n_cols_wv5 = df_wv5.shape
(n_rows_wv5, n_cols_wv5)

(4002, 78)

In [26]:
# Write the wave 5 column names, one per line, to a CSV for easy inspection
columns = df_wv5.columns
df = columns.to_frame(index=False)
df.to_csv("wv5_columns.csv", header=False, index=False)

In [27]:
# Read the wave 5 demographic column names (one name per line) into a list.
# The file is split on lines directly because newer pandas releases raise a
# ValueError when "\n" is passed as the read_csv delimiter.
# Surrounding whitespace and blank lines are dropped so the names match the
# DataFrame columns exactly.
with open("wv5_pp_columns.txt", encoding="utf-8") as fh:
    wv5_pp_cls_list = [line.strip() for line in fh if line.strip()]
# Kept as a one-column DataFrame (column 0) for compatibility with the old variable
wv5_pp_cls = pd.DataFrame(wv5_pp_cls_list)
wv5_pp_cls_list

['caseid_new',
 'pp5_ppage',
 'pp5_ppagecat',
 'pp5_ppagect4',
 'pp5_ppeduc',
 'pp5_ppeducat',
 'pp5_ppethm',
 'pp5_ppgender',
 'pp5_pphhhead',
 'pp5_pphhsize',
 'pp5_pphouse',
 'pp5_ppincimp',
 'pp5_ppmarit',
 'pp5_ppcmdate_yrmo',
 'pp5_newer',
 'pp5_ppmsacat',
 'pp5_ppreg4',
 'pp5_ppreg9',
 'pp5_pprent',
 'pp5_ppt01',
 'pp5_ppt25',
 'pp5_ppt612',
 'pp5_ppt1317',
 'pp5_ppt18ov',
 'pp5_ppwork',
 'pp5_ppnet',
 'ppmarit_2014',
 'ppcmarit_2014_yrmo',
 'ppmarit_2013',
 'ppcmarit_2013_yrmo',
 'ppmarit_2012',
 'ppcmarit_2012_yrmo',
 'ppmarit_2011',
 'ppcmarit_2011_yrmo',
 'ppmarit_2010',
 'ppcmarit_2010_yrmo',
 'ppmarit_2009',
 'ppcmarit_2009_yrmo',
 'ppmarit_2007',
 'ppcmarit_2007_yrmo']

In [28]:
# Select the columns that represent the wave 5 demographic info (including
# caseid_new) into their own table.
df_wv5_pp = df_wv5.loc[:, wv5_pp_cls_list]
df_wv5_pp.head(5)

Unnamed: 0,caseid_new,pp5_ppage,pp5_ppagecat,pp5_ppagect4,pp5_ppeduc,pp5_ppeducat,pp5_ppethm,pp5_ppgender,pp5_pphhhead,pp5_pphhsize,...,ppmarit_2012,ppcmarit_2012_yrmo,ppmarit_2011,ppcmarit_2011_yrmo,ppmarit_2010,ppcmarit_2010_yrmo,ppmarit_2009,ppcmarit_2009_yrmo,ppmarit_2007,ppcmarit_2007_yrmo
0,22526,58,55-64,45-59,Bachelors degree,Bachelor's degree or higher,Hispanic,Female,1,2,...,Living with partner,201207.0,Living with partner,201107.0,Living with partner,201007.0,Living with partner,200905.0,Living with partner,200711.0
1,23286,34,25-34,30-44,Professional or Doctorate degree,Bachelor's degree or higher,"White, Non-Hispanic",Female,1,4,...,Living with partner,201208.0,Married,201106.0,Living with partner,201007.0,Married,200904.0,Living with partner,200711.0
2,25495,54,45-54,45-59,HIGH SCHOOL GRADUATE - high school DIPLOMA or ...,High school,"Black, Non-Hispanic",Female,1,2,...,Never married,201206.0,Never married,201106.0,Never married,201007.0,Never married,200904.0,Never married,200711.0
3,26315,37,35-44,30-44,Associate degree,Some college,"White, Non-Hispanic",Male,1,1,...,Never married,201206.0,Never married,201106.0,Never married,201007.0,Never married,200905.0,Never married,200710.0
4,27355,40,35-44,30-44,HIGH SCHOOL GRADUATE - high school DIPLOMA or ...,High school,"White, Non-Hispanic",Male,1,2,...,Never married,201206.0,Never married,201106.0,Never married,201005.0,Never married,200904.0,Never married,200710.0


## Wave 6 demographic info

In [29]:
# Load the wave 6 public Stata file and preview its first five rows
df_wv6 = pd.read_stata("HCMST_wave_6_public_v1.dta")
df_wv6.iloc[:5]

Unnamed: 0,caseid_new,xw6_year,xw6_month,xw6_last,xw6_marry,w6_still_mar,w6_live_together,w6_otherdate,w6_how_many,w6_otherdate_app,...,w6_met_through_as_coworkers,w6_met_online_dating,w6_met_bar_restaurant,w6_how_met_story_length,w6_combo_live_together,w6_otherdate_combo,w6_how_many_combo,w6_otherdate_app_combo,w6_how_many_app_combo,w6_HCMST_yrmo
0,22526,2014,December,3,unmarried partners,,,,,,...,,,,0,yes,"No, I have not met anyone for dating, romance,...",met no people,,,201707
1,28536,2014,December,3,unmarried partners,,,,,,...,no,no,no,6,yes,"Yes, I have met at least one person for dating...",Six to Twenty people. I met between six and tw...,"Yes, I have used a phone dating app in the pas...","Two to Five people. Using phone apps, I met be...",201707
2,29584,2014,December,3,married,Yes,Yes,"No, I have not met anyone for dating, romance,...",,,...,,,,0,yes,"No, I have not met anyone for dating, romance,...",met no people,,,201707
3,48225,2014,December,3,married,Yes,Yes,"No, I have not met anyone for dating, romance,...",,,...,,,,0,yes,"No, I have not met anyone for dating, romance,...",met no people,,,201707
4,49731,2014,December,3,married,Yes,Yes,"No, I have not met anyone for dating, romance,...",,,...,,,,0,yes,"No, I have not met anyone for dating, romance,...",met no people,,,201707


In [30]:
# Report (rows, columns) of the wave 6 file: (640, 77).
# TODO: find out why wave 6 has so few rows compared with the 4002 in earlier waves.
n_rows_wv6, n_cols_wv6 = df_wv6.shape
(n_rows_wv6, n_cols_wv6)

(640, 77)

In [31]:
# Write the wave 6 column names, one per line, to a CSV for easy inspection
columns = df_wv6.columns
df = pd.DataFrame({0: columns})
df.to_csv("wv6_columns.csv", header=False, index=False)

In [32]:
# Read the wave 6 demographic column names (one name per line) into a list.
# The file is split on lines directly because newer pandas releases raise a
# ValueError when "\n" is passed as the read_csv delimiter.
# Surrounding whitespace and blank lines are dropped so the names match the
# DataFrame columns exactly.
with open("wv6_pp_columns.txt", encoding="utf-8") as fh:
    wv6_pp_cls_list = [line.strip() for line in fh if line.strip()]
# Kept as a one-column DataFrame (column 0) for compatibility with the old variable
wv6_pp_cls = pd.DataFrame(wv6_pp_cls_list)
wv6_pp_cls_list

['caseid_new',
 'xw6_marry',
 'pp6_ppage',
 'pp6_ppagecat',
 'pp6_ppagect4',
 'pp6_ppeduc',
 'pp6_ppeducat',
 'pp6_ppethm',
 'pp6_ppgender',
 'pp6_pphhhead',
 'pp6_pphhsize',
 'pp6_pphouse',
 'pp6_ppincimp',
 'pp6_ppmarit',
 'pp6_ppmsacat',
 'pp6_PPREG4',
 'pp6_ppreg9',
 'pp6_pprent',
 'pp6_PPT01',
 'pp6_PPT25',
 'pp6_PPT612',
 'pp6_PPT1317',
 'pp6_PPT18OV',
 'pp6_ppwork',
 'w6_relate_status']

In [33]:
# Select the columns that represent the wave 6 demographic info (including
# caseid_new) into their own table.
df_wv6_pp = df_wv6.loc[:, wv6_pp_cls_list]
df_wv6_pp.head(5)

Unnamed: 0,caseid_new,xw6_marry,pp6_ppage,pp6_ppagecat,pp6_ppagect4,pp6_ppeduc,pp6_ppeducat,pp6_ppethm,pp6_ppgender,pp6_pphhhead,...,pp6_PPREG4,pp6_ppreg9,pp6_pprent,pp6_PPT01,pp6_PPT25,pp6_PPT612,pp6_PPT1317,pp6_PPT18OV,pp6_ppwork,w6_relate_status
0,22526,unmarried partners,60,55-64,60+,Bachelors degree,Bachelor's degree or higher,Hispanic,Female,Yes,...,Midwest,East-North Central,Rented for cash,0,0,0,0,2,Not working - retired,unmarried still together
1,28536,unmarried partners,62,55-64,60+,Bachelors degree,Bachelor's degree or higher,"White, Non-Hispanic",Male,Yes,...,Midwest,West-North Central,Owned or being bought by you or someone in you...,0,0,0,0,2,Not working - retired,unmarried still together
2,29584,married,66,65-74,60+,Masters degree,Bachelor's degree or higher,"White, Non-Hispanic",Male,No,...,South,South Atlantic,Owned or being bought by you or someone in you...,0,0,0,0,2,Not working - looking for work,"was married, remained married"
3,48225,married,31,25-34,30-44,Masters degree,Bachelor's degree or higher,"White, Non-Hispanic",Male,Yes,...,Midwest,West-North Central,Owned or being bought by you or someone in you...,0,0,0,0,1,Working - as a paid employee,"was married, remained married"
4,49731,married,81,75+,60+,7th or 8th grade,Less than high school,"White, Non-Hispanic",Male,Yes,...,Northeast,New England,Owned or being bought by you or someone in you...,0,0,0,0,2,Not working - retired,"was married, remained married"


## Save to MySQL

In [44]:
# Build the MySQL engine for the local "hcmst" database.
# Credentials come from the environment (HCMST_MYSQL_USER / HCMST_MYSQL_PASSWORD),
# with an interactive prompt as fallback, so no password is stored in the notebook.
# The password that used to be hardcoded here should be rotated.
mysql_user = os.environ.get("HCMST_MYSQL_USER", "root")
mysql_password = os.environ.get("HCMST_MYSQL_PASSWORD") or getpass("MySQL password: ")
# quote_plus escapes characters such as "@" or "/" that would otherwise break the URL
connection_string = f"{mysql_user}:{quote_plus(mysql_password)}@localhost/hcmst"
engine = create_engine(f'mysql://{connection_string}')
# Engine.table_names() was removed in SQLAlchemy 2.0; the inspector works on 1.x and 2.x
inspect(engine).get_table_names() # now it's empty

[]

In [49]:
# Write every wave's demographic table to MySQL, replacing any existing table of
# the same name. index=True stores the DataFrame index as an extra column.
mysql_tables = {
    "wv6_pp": df_wv6_pp,
    "wv5_pp": df_wv5_pp,
    "wv4_pp": df_wv4_pp,
    "wv3_pp": df_wv3_pp,
    "wv2_pp": df_wv2_pp,
    "wv1_pp": df_wv1_pp,
}
for table_name, frame in mysql_tables.items():
    frame.to_sql(name=table_name, con=engine, if_exists='replace', index=True)
# Engine.table_names() was removed in SQLAlchemy 2.0; the inspector works on 1.x and 2.x
inspect(engine).get_table_names()

['wv1_pp', 'wv2_pp', 'wv3_pp', 'wv4_pp', 'wv5_pp', 'wv6_pp']

In [68]:
# Check that the data has been stored properly: read wv2_pp back from MySQL and
# print its first five rows.
# Engine.execute() was removed in SQLAlchemy 2.0; an explicit connection plus
# text() works on both 1.x and 2.x and closes the connection when done.

with engine.connect() as conn:
    mysql_wv2_pp = conn.execute(text("SELECT * FROM wv2_pp")).fetchall()
for elem in mysql_wv2_pp[0:5]:
    print(elem)
    print("\n")

(0, 'Yes second background survey', 'yes', '2.0', 'a building with 2 or more apartments', '$20,000 to $24,999', 'living with partner', 'metro', 'bachelors degree', "bachelor's degree or higher", 16.0, 'hispanic', 'midwest', 'east-north central', 'rented for cash', '0.0', '0.0', '2.0', '0.0', '0.0', 'not working - looking for work', 'yes', 200905.0, 'value not imputed', 'value not imputed', 'value not imputed', 'value not imputed')


(1, 'Yes second background survey', 'yes', '2.0', 'a building with 2 or more apartments', '$100,000 to $124,999', 'married', 'metro', 'professional or doctorate degree', "bachelor's degree or higher", 20.0, 'white, non-hispanic', 'west', 'pacific', 'rented for cash', '0.0', '0.0', '2.0', '0.0', '0.0', 'working - as a paid employee', 'yes', 200904.0, 'value not imputed', 'value not imputed', 'value not imputed', 'value not imputed')


(2, 'Yes second background survey', 'yes', '4.0', 'a building with 2 or more apartments', '$5,000 to $7,499', 'never married'

## Save to SQLite

In [65]:
# The database file (HCMST.sqlite) was created beforehand with gitbash,
# outside of the Jupyter notebook.
# NOTE(review): SQLite normally creates the file on first connection, so the
# manual step may be unnecessary - confirm.

connection_string = "HCMST.sqlite"
engine2 = create_engine(f'sqlite:///{connection_string}')
# Engine.table_names() was removed in SQLAlchemy 2.0; the inspector works on 1.x and 2.x
inspect(engine2).get_table_names() # now it's empty

[]

In [66]:
# Write every wave's demographic table to SQLite, replacing any existing table of
# the same name. index=True stores the DataFrame index as an extra column.
sqlite_tables = {
    "wv6_pp": df_wv6_pp,
    "wv5_pp": df_wv5_pp,
    "wv4_pp": df_wv4_pp,
    "wv3_pp": df_wv3_pp,
    "wv2_pp": df_wv2_pp,
    "wv1_pp": df_wv1_pp,
}
for table_name, frame in sqlite_tables.items():
    frame.to_sql(name=table_name, con=engine2, if_exists='replace', index=True)
# Engine.table_names() was removed in SQLAlchemy 2.0; the inspector works on 1.x and 2.x
inspect(engine2).get_table_names()

['wv1_pp', 'wv2_pp', 'wv3_pp', 'wv4_pp', 'wv5_pp', 'wv6_pp']

In [67]:
# Check that the data has been stored properly: read wv2_pp back from SQLite and
# print its first five rows.
# Engine.execute() was removed in SQLAlchemy 2.0; an explicit connection plus
# text() works on both 1.x and 2.x and closes the connection when done.

with engine2.connect() as conn:
    sqlite_wv2_pp = conn.execute(text("SELECT * FROM wv2_pp")).fetchall()
for elem in sqlite_wv2_pp[0:5]:
    print(elem)
    print("\n")

(0, 'Yes second background survey', 'yes', '2.0', 'a building with 2 or more apartments', '$20,000 to $24,999', 'living with partner', 'metro', 'bachelors degree', "bachelor's degree or higher", 16.0, 'hispanic', 'midwest', 'east-north central', 'rented for cash', '0.0', '0.0', '2.0', '0.0', '0.0', 'not working - looking for work', 'yes', 200905.0, 'value not imputed', 'value not imputed', 'value not imputed', 'value not imputed')


(1, 'Yes second background survey', 'yes', '2.0', 'a building with 2 or more apartments', '$100,000 to $124,999', 'married', 'metro', 'professional or doctorate degree', "bachelor's degree or higher", 20.0, 'white, non-hispanic', 'west', 'pacific', 'rented for cash', '0.0', '0.0', '2.0', '0.0', '0.0', 'working - as a paid employee', 'yes', 200904.0, 'value not imputed', 'value not imputed', 'value not imputed', 'value not imputed')


(2, 'Yes second background survey', 'yes', '4.0', 'a building with 2 or more apartments', '$5,000 to $7,499', 'never married'