In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine, inspect, text

## Original wave (wave 1) demographic info

In [4]:
# Load the combined wave 1-3 Stata file and preview its first three rows
wv1_3_dta = "elle_work_data_and_codes/HCMST_ver_3.04.dta"
df_wv1_3 = pd.read_stata(wv1_3_dta)
df_wv1_3.head(n=3)

Unnamed: 0,caseid_new,weight1,weight2,ppage,ppagecat,ppagect4,ppeduc,ppeducat,ppethm,ppgender,...,w3_mbtiming_year,w3_mbtiming_month,w3_q5,w3_q6,w3_q7,w3_q8,w3_q9,w3_q10,w3_nonmbtiming_year,w3_nonmbtiming_month
0,22526,4265,4265.0,52,45-54,45-59,bachelors degree,bachelor's degree or higher,hispanic,female,...,,,yes,yes,"no, did not marry [xNameP]","No, we have not gotten a domestic partnership ...",,,,
1,23286,16485,16485.0,28,25-34,18-29,masters degree,bachelor's degree or higher,"white, non-hispanic",female,...,,,,,,,,,,
2,25495,52464,,49,45-54,45-59,high school graduate - high school diploma or ...,high school,"black, non-hispanic",female,...,,,,,,,,,,


In [5]:
# check the frame's dimensions: .shape is (number of rows, number of columns)
df_wv1_3.shape

(4002, 387)

In [6]:
# Dump the wave 1-3 column names, one per line, to a CSV for easy inspection
columns = df_wv1_3.columns
df = pd.DataFrame(list(columns))
df.to_csv("wave_1_to_3_columns.csv", header=False, index=False)

In [7]:
# Keep the first 51 columns (positions 0-50, caseid_new through pppadate_yrmo);
# these hold the wave 1 demographic info.
# .copy() makes the subset independent of df_wv1_3, so later edits to it cannot
# trigger SettingWithCopyWarning or be mistaken for changes to the source frame.
df_wv1_pp = df_wv1_3.iloc[:, 0:51].copy()
# Preview the first rows only instead of rendering the whole frame
df_wv1_pp.head(10)

Unnamed: 0,caseid_new,weight1,weight2,ppage,ppagecat,ppagect4,ppeduc,ppeducat,ppethm,ppgender,...,pprace_guamanian,pprace_samoan,pprace_otherpacificislander,pprace_someotherrace,papglb_friend,pppartyid3,papevangelical,papreligion,ppppcmdate_yrmo,pppadate_yrmo
0,22526,4265,4265.0,52,45-54,45-59,bachelors degree,bachelor's degree or higher,hispanic,female,...,no,no,no,no,"yes, friends",democrat,yes,catholic,200711.0,200709.0
1,23286,16485,16485.0,28,25-34,18-29,masters degree,bachelor's degree or higher,"white, non-hispanic",female,...,no,no,no,no,"yes, both",democrat,no,jewish,200711.0,200709.0
2,25495,52464,,49,45-54,45-59,high school graduate - high school diploma or ...,high school,"black, non-hispanic",female,...,no,no,no,no,"yes, both",democrat,yes,baptist-any denomination,200711.0,200709.0
3,26315,4575,4575.0,31,25-34,30-44,associate degree,some college,"white, non-hispanic",male,...,no,no,no,no,"yes, both",democrat,no,"other non-christian, please specify:",200710.0,200709.0
4,27355,12147,,35,35-44,30-44,high school graduate - high school diploma or ...,high school,"white, non-hispanic",male,...,no,no,no,no,"yes, friends",democrat,no,other christian,200710.0,200709.0
5,27695,1799,,69,65-74,60+,"some college, no degree",some college,"white, non-hispanic",male,...,no,no,no,no,"yes, both",democrat,no,"protestant (e.g., methodist, lutheran, presbyt...",200711.0,200709.0
6,28536,1924,1924.0,53,45-54,45-59,bachelors degree,bachelor's degree or higher,"white, non-hispanic",male,...,no,no,no,no,"yes, friends",democrat,no,"protestant (e.g., methodist, lutheran, presbyt...",200710.0,200709.0
7,29584,3173,3173.0,58,55-64,45-59,masters degree,bachelor's degree or higher,"white, non-hispanic",male,...,no,no,no,no,"yes, both",democrat,no,"protestant (e.g., methodist, lutheran, presbyt...",200710.0,200709.0
8,30393,68772,,39,35-44,30-44,high school graduate - high school diploma or ...,high school,"white, non-hispanic",male,...,no,no,no,no,"yes, friends",democrat,no,catholic,200710.0,200709.0
9,31456,1021,1021.0,45,45-54,45-59,"some college, no degree",some college,"white, non-hispanic",male,...,no,no,no,no,"yes, friends",democrat,no,"other non-christian, please specify:",200803.0,200711.0


In [8]:
# Quick exploration: frame dimensions, then the frequency table of each
# age / education column, separated by a dashed rule.
#   ppagecat - categorized age (7 groups)
#   ppagect4 - categorized age (4 groups)
print(df_wv1_pp.shape)
for age_edu_col in ("ppagecat", "ppagect4", "ppeduc", "ppeducat"):
    print("---------------------")
    print(df_wv1_pp[age_edu_col].value_counts())

(4002, 51)
---------------------
45-54    810
35-44    808
55-64    728
25-34    655
65-74    395
18-24    346
75+      260
Name: ppagecat, dtype: int64
---------------------
45-59    1246
30-44    1086
60+       947
18-29     723
Name: ppagect4, dtype: int64
---------------------
high school graduate - high school diploma or the equivalent (ged)    987
some college, no degree                                               905
bachelors degree                                                      864
masters degree                                                        383
associate degree                                                      305
professional or doctorate degree                                      160
12th grade no diploma                                                 118
11th grade                                                             98
10th grade                                                             71
7th or 8th grade                                    

In [9]:
# Frequency tables for the household columns, with a dashed rule between tables
for position, household_col in enumerate(("pphhhead", "pphouseholdsize", "pphouse")):
    if position > 0:
        print("---------------------")
    print(df_wv1_pp[household_col].value_counts())

yes    3253
no      749
Name: pphhhead, dtype: int64
---------------------
2     1461
1      970
3      581
4      566
5      252
6      102
7       42
8       14
9        7
10       3
11       2
15       1
12       1
Name: pphouseholdsize, dtype: int64
---------------------
a one-family house detached from any other house     2779
a building with 2 or more apartments                  720
a one-family house attached to one or more houses     334
a mobile home                                         167
boat, rv, van, etc.                                     2
Name: pphouse, dtype: int64


In [10]:
# frequency of each income-bracket label in 'ppincimp'
print(df_wv1_pp['ppincimp'].value_counts())

$60,000 to $74,999      461
$50,000 to $59,999      422
$40,000 to $49,999      397
$85,000 to $99,999      311
$100,000 to $124,999    300
$75,000 to $84,999      296
$35,000 to $39,999      260
$20,000 to $24,999      219
$25,000 to $29,999      212
$30,000 to $34,999      201
$125,000 to $149,999    162
$15,000 to $19,999      157
$175,000 or more        127
$10,000 to $12,499      106
$7,500 to $9,999        100
$150,000 to $174,999     87
$12,500 to $14,999       85
$5,000 to $7,499         54
less than $5,000         45
Name: ppincimp, dtype: int64


In [11]:
# frequency of each numeric value in 'hhinc'
# NOTE(review): the counts line up one-for-one with the 'ppincimp' brackets, so this
# looks like a numeric coding of the same variable - confirm against the codebook
print(df_wv1_pp['hhinc'].value_counts())

67250.0     461
55000.0     422
45000.0     397
92250.0     311
112250.0    300
80000.0     296
37250.0     260
22250.0     219
27250.0     212
32250.0     201
137250.0    162
17250.0     157
200000.0    127
11250.0     106
8750.0      100
162250.0     87
13750.0      85
6250.0       54
2500.0       45
Name: hhinc, dtype: int64


In [12]:
# Frequency tables for the two region columns, with a dashed rule between them
region_tables = [df_wv1_pp[region_col].value_counts() for region_col in ("ppreg4", "ppreg9")]
print(region_tables[0])
print("---------------------")
print(region_tables[1])

south        1262
west          975
midwest       955
northeast     810
Name: ppreg4, dtype: int64
---------------------
south atlantic        681
pacific               677
east-north central    653
mid-atlantic          594
west-south central    378
west-north central    302
mountain              298
new england           216
east-south central    203
Name: ppreg9, dtype: int64


In [13]:
# Whether the home is owned or rented, stored in 'pprent'.
# The values are:
#   - owned or being bought by you or someone in your household
#   - rented for cash
#   - occupied without payment of cash rent

print(df_wv1_pp['pprent'].value_counts())

owned or being bought by you or someone in your household    3040
rented for cash                                               894
occupied without payment of cash rent                          68
Name: pprent, dtype: int64


In [14]:
# Inspect the race-related questions.
# "ppq14arace" is "not asked" for most respondents, so the binary indicator
# columns (e.g. "pprace_white", "pprace_black") have to be kept as well.

race_cols = ("ppq14arace", "pphispan", "pprace_white", "pprace_black")
for position, race_col in enumerate(race_cols):
    if position > 0:
        print("---------------------")
    print(df_wv1_pp[race_col].value_counts())

not asked                           3789
white                                124
some other race                       33
black, or  african american           22
american indian or alaska native      14
japanese                               5
filipino                               5
refused                                3
native hawaiian                        2
other asian                            2
other pacific islander                 1
guamanian or chamorro                  1
chinese                                1
Name: ppq14arace, dtype: int64
---------------------
no, i am not                               3615
yes, mexican, mexican-american, chicano     170
yes, other spanish/hispanic/latino           84
yes, puerto rican                            66
yes, south american                          24
yes, cuban                                   16
yes, caribbean                               14
yes, central american                        13
Name: pphispan, dtype: int64
-

In [15]:
# TODO: save the wave 1 demographic table (df_wv1_pp) to the database

## Wave 2 demographic info

In [16]:
# preview the combined wave 1-3 frame again (first three rows)
df_wv1_3.head(3)

Unnamed: 0,caseid_new,weight1,weight2,ppage,ppagecat,ppagect4,ppeduc,ppeducat,ppethm,ppgender,...,w3_mbtiming_year,w3_mbtiming_month,w3_q5,w3_q6,w3_q7,w3_q8,w3_q9,w3_q10,w3_nonmbtiming_year,w3_nonmbtiming_month
0,22526,4265,4265.0,52,45-54,45-59,bachelors degree,bachelor's degree or higher,hispanic,female,...,,,yes,yes,"no, did not marry [xNameP]","No, we have not gotten a domestic partnership ...",,,,
1,23286,16485,16485.0,28,25-34,18-29,masters degree,bachelor's degree or higher,"white, non-hispanic",female,...,,,,,,,,,,
2,25495,52464,,49,45-54,45-59,high school graduate - high school diploma or ...,high school,"black, non-hispanic",female,...,,,,,,,,,,


In [17]:
# Load the wave 2 demographic column names (one name per line) into a list.
# pd.read_csv(..., delimiter="\n") is rejected by current pandas because "\n" is
# the line terminator and cannot also be the separator, so the lines are read
# directly. Blank lines are skipped and surrounding whitespace is stripped.
with open("wv2_pp_columns.txt", encoding="utf-8") as wv2_cols_file:
    wv2_pp_cls = pd.DataFrame(
        {0: [line.strip() for line in wv2_cols_file if line.strip()]}
    )
# Column 0 holds the names, mirroring the header=None layout used before
wv2_pp_cls_list = wv2_pp_cls[0].tolist()
wv2_pp_cls_list

FileNotFoundError: [Errno 2] File b'wv2_pp_columns.txt' does not exist: b'wv2_pp_columns.txt'

In [None]:
# Pick the columns that represent the wave 2 demographic info and keep them
# as their own table
df_wv2_pp = df_wv1_3.loc[:, wv2_pp_cls_list]
df_wv2_pp.head(n=5)

## Wave 3 demographic info

In [None]:
# Load the wave 3 demographic column names (one name per line) into a list.
# pd.read_csv(..., delimiter="\n") is rejected by current pandas because "\n" is
# the line terminator and cannot also be the separator, so the lines are read
# directly. Blank lines are skipped and surrounding whitespace is stripped.
with open("wv3_pp_columns.txt", encoding="utf-8") as wv3_cols_file:
    wv3_pp_cls = pd.DataFrame(
        {0: [line.strip() for line in wv3_cols_file if line.strip()]}
    )
# Column 0 holds the names, mirroring the header=None layout used before
wv3_pp_cls_list = wv3_pp_cls[0].tolist()
wv3_pp_cls_list

In [None]:
# Pick the columns that represent the wave 3 demographic info and keep them
# as their own table
df_wv3_pp = df_wv1_3.loc[:, wv3_pp_cls_list]
df_wv3_pp.head(n=5)

## Wave 4 demographic info

In [None]:
# Load the wave 4 supplement Stata file and preview its first rows
wv4_dta = "wave_4_supplement_v1_2.dta"
df_wv4 = pd.read_stata(wv4_dta)
df_wv4.head(n=5)

In [None]:
# (number of rows, number of columns) of the wave 4 frame
df_wv4.shape

In [None]:
# Dump the wave 4 column names, one per line, to a CSV for easy inspection
columns = df_wv4.columns
df = pd.DataFrame(list(columns))
df.to_csv("wv4_columns.csv", header=False, index=False)

In [None]:
# Load the wave 4 demographic column names (one name per line) into a list.
# pd.read_csv(..., delimiter="\n") is rejected by current pandas because "\n" is
# the line terminator and cannot also be the separator, so the lines are read
# directly. Blank lines are skipped and surrounding whitespace is stripped.
with open("wv4_pp_columns.txt", encoding="utf-8") as wv4_cols_file:
    wv4_pp_cls = pd.DataFrame(
        {0: [line.strip() for line in wv4_cols_file if line.strip()]}
    )
# Column 0 holds the names, mirroring the header=None layout used before
wv4_pp_cls_list = wv4_pp_cls[0].tolist()
wv4_pp_cls_list

In [None]:
# Pick the columns that represent the wave 4 demographic info and keep them
# as their own table
df_wv4_pp = df_wv4.loc[:, wv4_pp_cls_list]
df_wv4_pp.head(n=5)

## Wave 5 demographic info

In [None]:
# Load the wave 5 supplement Stata file and preview its first rows
wv5_dta = "HCMST_wave_5_supplement_ver_1.dta"
df_wv5 = pd.read_stata(wv5_dta)
df_wv5.head(n=5)

In [None]:
# (number of rows, number of columns) of the wave 5 frame
df_wv5.shape

In [None]:
# Dump the wave 5 column names, one per line, to a CSV for easy inspection
columns = df_wv5.columns
df = pd.DataFrame(list(columns))
df.to_csv("wv5_columns.csv", header=False, index=False)

In [None]:
# Load the wave 5 demographic column names (one name per line) into a list.
# pd.read_csv(..., delimiter="\n") is rejected by current pandas because "\n" is
# the line terminator and cannot also be the separator, so the lines are read
# directly. Blank lines are skipped and surrounding whitespace is stripped.
with open("wv5_pp_columns.txt", encoding="utf-8") as wv5_cols_file:
    wv5_pp_cls = pd.DataFrame(
        {0: [line.strip() for line in wv5_cols_file if line.strip()]}
    )
# Column 0 holds the names, mirroring the header=None layout used before
wv5_pp_cls_list = wv5_pp_cls[0].tolist()
wv5_pp_cls_list

In [None]:
# Pick the columns that represent the wave 5 demographic info and keep them
# as their own table
df_wv5_pp = df_wv5.loc[:, wv5_pp_cls_list]
df_wv5_pp.head(n=5)

## Wave 6 demographic info

In [None]:
# Load the wave 6 public Stata file and preview its first rows
wv6_dta = "HCMST_wave_6_public_v1.dta"
df_wv6 = pd.read_stata(wv6_dta)
df_wv6.head(n=5)

In [None]:
df_wv6.shape
# Author's note: this returned (640, 77).
# TODO: find out why wave 6 has so few rows compared with the 4002 rows of the wave 1-3 file.

In [None]:
# Dump the wave 6 column names, one per line, to a CSV for easy inspection
columns = df_wv6.columns
df = pd.DataFrame(list(columns))
df.to_csv("wv6_columns.csv", header=False, index=False)

In [None]:
# Load the wave 6 demographic column names (one name per line) into a list.
# pd.read_csv(..., delimiter="\n") is rejected by current pandas because "\n" is
# the line terminator and cannot also be the separator, so the lines are read
# directly. Blank lines are skipped and surrounding whitespace is stripped.
with open("wv6_pp_columns.txt", encoding="utf-8") as wv6_cols_file:
    wv6_pp_cls = pd.DataFrame(
        {0: [line.strip() for line in wv6_cols_file if line.strip()]}
    )
# Column 0 holds the names, mirroring the header=None layout used before
wv6_pp_cls_list = wv6_pp_cls[0].tolist()
wv6_pp_cls_list

In [None]:
# Pick the columns that represent the wave 6 demographic info and keep them
# as their own table
df_wv6_pp = df_wv6.loc[:, wv6_pp_cls_list]
df_wv6_pp.head(n=5)

## Save to MySQL

In [None]:
# Connect to the "hcmst" MySQL database on localhost.
# The credentials are read from the environment instead of being hardcoded in
# the notebook: set HCMST_MYSQL_CREDENTIALS to "user:password" before running.
# A missing variable raises KeyError here rather than failing later at connect time.
connection_string = f'{os.environ["HCMST_MYSQL_CREDENTIALS"]}@localhost/hcmst'
engine = create_engine(f'mysql://{connection_string}')
# Engine.table_names() was removed in SQLAlchemy 2.0; the inspector API lists
# the existing tables on both old and new versions.
inspect(engine).get_table_names()

In [None]:
# Write each per-wave demographic frame to its own MySQL table, replacing any
# existing table of the same name. The DataFrame index is stored as a column.
mysql_tables = [
    ("wv6_pp", df_wv6_pp),
    ("wv5_pp", df_wv5_pp),
    ("wv4_pp", df_wv4_pp),
    ("wv3_pp", df_wv3_pp),
    ("wv2_pp", df_wv2_pp),
    ("wv1_pp", df_wv1_pp),
]
for table_name, wave_frame in mysql_tables:
    wave_frame.to_sql(name=table_name, con=engine, if_exists='replace', index=True)
# Engine.table_names() was removed in SQLAlchemy 2.0; use the inspector to list tables
inspect(engine).get_table_names()

In [None]:
# Check that the data was stored properly by reading wv2_pp back from MySQL
# and printing its first five rows.
# Engine.execute() with a raw SQL string was removed in SQLAlchemy 2.0, so the
# query runs on an explicit connection and is wrapped in text().
with engine.connect() as mysql_conn:
    mysql_wv2_pp = mysql_conn.execute(text("SELECT * FROM wv2_pp")).fetchall()
for elem in mysql_wv2_pp[0:5]:
    print(elem)
    print("\n")

## Save to SQLite

In [18]:
# Open the SQLite database file Relationship_survey.sqlite.
# NOTE(review): the original note said the database (HCMST.sqlite) was created
# beforehand from git bash, outside the notebook, but this cell opens
# Relationship_survey.sqlite - confirm which file is intended.

connection_string = "Relationship_survey.sqlite"
engine2 = create_engine(f'sqlite:///{connection_string}')
# List the tables currently in the database. Engine.table_names() was removed
# in SQLAlchemy 2.0; the inspector API works on both old and new versions.
inspect(engine2).get_table_names()

['relationship_qs', 'wv2_w', 'wv3_w', 'wv4_w', 'wv5_w', 'wv6_w']

In [19]:
# Write each per-wave demographic frame to its own SQLite table, replacing any
# existing table of the same name. The DataFrame index is stored as a column.
# Every df_wv*_pp frame must already exist: run the wave cells first.
sqlite_tables = [
    ("wv6_pp", df_wv6_pp),
    ("wv5_pp", df_wv5_pp),
    ("wv4_pp", df_wv4_pp),
    ("wv3_pp", df_wv3_pp),
    ("wv2_pp", df_wv2_pp),
    ("wv1_pp", df_wv1_pp),
]
for table_name, wave_frame in sqlite_tables:
    wave_frame.to_sql(name=table_name, con=engine2, if_exists='replace', index=True)
# Engine.table_names() was removed in SQLAlchemy 2.0; use the inspector to list tables
inspect(engine2).get_table_names()

NameError: name 'df_wv6_pp' is not defined

In [20]:
# Check that the data was stored properly by reading wv2_pp back from SQLite
# and printing its first five rows.
# Engine.execute() with a raw SQL string was removed in SQLAlchemy 2.0, so the
# query runs on an explicit connection and is wrapped in text().

with engine2.connect() as sqlite_conn:
    sqlite_wv2_pp = sqlite_conn.execute(text("SELECT * FROM wv2_pp")).fetchall()
for elem in sqlite_wv2_pp[0:5]:
    print(elem)
    print("\n")

OperationalError: (sqlite3.OperationalError) no such table: wv2_pp
[SQL: SELECT * FROM wv2_pp]
(Background on this error at: http://sqlalche.me/e/e3q8)