### Capstone 1 - Washington state linkage of infant death, birth, and mother's hospitalization discharge data

##### Maya Bhat-Gregerson

January 7, 2020

### B. PREPARATION OF DEATH DATA 2016-2018

### I. Data acquisition

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyodbc

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

I use SQL queries to get the death and birth variables I am likely to need for linking the records. 

NOTE: I renamed all variables from death records so that they begin with 'd' to distinguish the fields from those in birth records with the same names.

**DEATH DATA 2016, 2017, 2018**

In [7]:
## CONNECT TO WHALES & USE SQL QUERY FOR DEATH DATA SET

driver = '{SQL Server Native Client 11.0}'

conn = pyodbc.connect(
        Trusted_Connection='Yes',
        Driver='{ODBC Driver 13 for SQL Server}',
        Server='###',
        Database='###'
        )

querystring = ("SELECT SFN_NUM as 'dsfn'," + 
        "BIRTH_SFN_NUM as 'dbirsfn', " + 
        "SSN as 'dssn', " +
        "ISNULL(GNAME, 'NaN') as 'dfname', " +
        "ISNULL(MNAME, 'NaN') as 'dmname', " +
        "ISNULL(LNAME, 'NaN') as 'dlname', " +
        "ISNULL(MOTHER_GNAME, 'NaN') as 'dmom_fname', " +
        "ISNULL(MOTHER_MNAME, 'NaN') as 'dmom_mname', " + 
        "ISNULL(MOTHER_LNAME, 'NaN') as 'dmom_maiden', " +
        "ISNULL(FATHER_LNAME, 'NaN') as 'ddad_lname', " +
        "ISNULL(FATHER_MNAME, 'NaN') as 'ddad_mname', " +    
        "ISNULL(FATHER_GNAME, 'NaN') as 'ddad_fname', " +           
        "ISNULL(SEX, 'NaN') as 'dsex', " + 
		"DOB as 'ddob', " + 
		"ISNULL(SUBSTRING(DOB, 1,2), '99') as 'ddobm', " + 
		"ISNULL(SUBSTRING(DOB, 4,2), '99') as 'ddobd', " +
		"ISNULL(SUBSTRING(DOB, 7,4), '9999') as 'ddoby', " + 
		"ISNULL(DOD, '  ') as 'ddod'," + 
		"ISNULL(SUBSTRING(DOD, 1,2), '99') as 'ddodm', " + 
		"ISNULL(SUBSTRING(DOD, 4,2), '99') as 'ddodd', " + 
		"ISNULL(SUBSTRING(DOD, 7,4), '9999') as 'ddody', " +
		"ISNULL(BPLACE_ST_FIPS_CD, '  ') as 'dbirplstatefips', " + 
		"ISNULL(BPLACE_CNT, '  ') as 'dbircountryl'," + 
		"ISNULL(DNAME_CITY, '  ') as 'ddthcityl', " + 
		"ISNULL(DCOUNTY, '  ') as 'ddthcountyl'," + 
		"ISNULL(DSTATEL, '  ') as 'ddthstatel', " + 
		"ISNULL(SUBSTRING(DZIP9, 1,5), '99999') as 'ddthzip', " + 
		"ISNULL(RES_CITY, '  ') as 'drescity', " + 
		"RIGHT('00000' + ISNULL(RES_CITY_FIPS_CD, '99999'), 5) as 'drescityfips', " + 
		"ISNULL(RES_COUNTY, '  ') as 'drescountyl', " + 
		"RIGHT('000' + ISNULL(RES_COUNTY_FIPS_CD, '999'), 3) as 'drescntyfips', " + 
		"ISNULL(RES_STATE_FIPS_CD, '  ') as 'dresstatefips', " + 
		"ISNULL(SUBSTRING(RES_ZIP, 1,5), '99999') as 'dreszip'" + 
"FROM [wa_vrvweb_events].[VRV_DEATH_TBL]" +
"WHERE ((DATE_DEATH_YEAR = 2016) OR (DATE_DEATH_YEAR = 2017) OR (DATE_DEATH_YEAR = 2018))" +
	"AND VRV_REGISTERED_FLAG = '1'" +
	"AND FL_CURRENT = '1'" +
	"AND FL_VOIDED = '0'")

dth1618= pd.read_sql_query(querystring, conn)

## SAVE DATA AS CSV FILE

dth1618.to_csv(r'###\Data\dth1618_raw.csv', index=None, header=True)

In [20]:
d1618 = pd.read_csv(r'###\Data\dth1618_raw.csv', low_memory=False)

In [21]:
d1618.shape

(172810, 33)

In [22]:
d1618.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172810 entries, 0 to 172809
Data columns (total 33 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   dsfn             172810 non-null  int64 
 1   dbirsfn          7510 non-null    object
 2   dssn             172810 non-null  object
 3   dfname           172808 non-null  object
 4   dmname           157998 non-null  object
 5   dlname           172803 non-null  object
 6   dmom_fname       172747 non-null  object
 7   dmom_mname       75900 non-null   object
 8   dmom_maiden      172751 non-null  object
 9   ddad_lname       172788 non-null  object
 10  ddad_mname       89674 non-null   object
 11  ddad_fname       172738 non-null  object
 12  dsex             172809 non-null  object
 13  ddob             172810 non-null  object
 14  ddobm            172810 non-null  int64 
 15  ddobd            172810 non-null  int64 
 16  ddoby            172810 non-null  int64 
 17  ddod      

### II. Data cleaning and standardization

In [2]:
# look at data
#d1618.head()

  - Check death record number 'dsfn' and convert to integer if necessary

In [24]:
d1618.dsfn.describe()

count    1.728100e+05
mean     2.017043e+09
std      8.144041e+05
min      2.016000e+09
25%      2.016043e+09
50%      2.017031e+09
75%      2.018015e+09
max      2.018091e+09
Name: dsfn, dtype: float64

In [25]:
dth1618['dsfn'] = dth1618['dsfn'].astype(int)

In [26]:
dth1618.dsfn.dtype

dtype('int32')

**RESTRICT TO WA RESIDENTS**

In [27]:
d1618['dresstatefips'].value_counts(dropna=False)

WA    168703
OR      1424
ID       605
CA       311
AK       308
MT       187
XX       139
ZZ       131
AZ       117
TX        82
FL        73
BC        52
HI        46
NV        45
NY        42
IL        38
MI        36
CO        30
UT        30
PA        24
MN        23
OH        19
VA        18
OK        18
NJ        18
IN        17
TN        15
MO        15
AL        14
SC        13
IA        13
MD        12
SD        12
LA        12
NC        12
NM        11
MA        11
WI        11
KS        11
AR        10
GA        10
ND         9
MS         9
WY         9
CT         8
AB         7
NE         7
ME         5
WV         4
KY         4
GU         4
DE         4
ON         4
NH         4
AS         3
RI         2
PR         2
DC         2
SK         1
YT         1
MP         1
VT         1
NS         1
Name: dresstatefips, dtype: int64

In [28]:
# keep only WA residents and check if successful in filtering out other state residents.
d1618 = d1618[(d1618['dresstatefips']=="WA")]
d1618['dresstatefips'].value_counts(dropna=False)

WA    168703
Name: dresstatefips, dtype: int64

#### CHECK FOR NULL VALUES

In [29]:
d1618.isna().sum()

dsfn                    0
dbirsfn            161334
dssn                    0
dfname                  1
dmname              14422
dlname                  6
dmom_fname             62
dmom_mname          94589
dmom_maiden            58
ddad_lname             21
ddad_mname          81126
ddad_fname             70
dsex                    0
ddob                    0
ddobm                   0
ddobd                   0
ddoby                   0
ddod                    0
ddodm                   0
ddodd                   0
ddody                   0
dbirplstatefips         0
dbircountryl            0
ddthcityl               3
ddthcountyl             0
ddthstatel              0
ddthzip                 0
drescity                5
drescityfips            0
drescountyl             0
drescntyfips            0
dresstatefips           0
dreszip                 0
dtype: int64

Probably will not use mothers' and infants' middle name for linking as there are too many missing values.

#### STANDARDIZE ALL STRING VARIABLES

First, middle, and last names of infants and mothers as well as city names will be standardized by converting these columns to upper case text, removing white spaces, removing hyphens and other punctuation marks.

In [30]:
# convert to upper case
d1618 = d1618.apply(lambda x: x.str.upper() if type(x) == str else x)

In [31]:
#remove white spaces, punctuation
d1618 = d1618.apply(lambda x: x.str.strip() if type(x) == str else x)
d1618 = d1618.applymap(lambda x: x.replace(" ", "") if type(x) == str else x)
d1618 = d1618.applymap(lambda x: x.replace("-", "") if type(x) == str else x)
d1618 = d1618.applymap(lambda x: x.replace(".", "") if type(x) == str else x)
d1618 = d1618.applymap(lambda x: x.replace("'", "") if type(x) == str else x)

In [1]:
#d1618.tail(30)

- Verified that all string transformations were successful.

#### CHECK FOR OUT OF RANGE VALUES

In [33]:
#create dictionary of valid values so that each variable can be checked to make sure there is no
# out of range value.

valids = {'sex': ['M', 'F', 'U'],
          'dobm': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 99],
          'dodm': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 99],
          'dobd': np.r_[1:32 ,99],
          'dodd': np.r_[1:32 ,99],
          'doby': list(range(1900,2019)),
          'dody': [2016,2017,2018],
         'rcntyfips': np.r_[list(range(1, 78, 2)), 99],
         'certtype': ['R'],
         'birthstatefips': ['WA'], 
         'rstatefips': ['WA']}

In [34]:
# check for out of range values for 'dsex'

chksex = d1618['dsex'].isin(valids['sex'])
len(d1618[~chksex])

0

In [35]:
# check for out of range values for 'ddobm'

chkdobm = d1618['ddobm'].isin(valids['dobm'])
len(d1618[~chkdobm])

0

In [36]:
# check for out of range values for 'ddody'

chkdody = d1618['ddody'].isin(valids['dody'])
len(d1618[~chkdody])

0

In [37]:
d1618['ddody'].value_counts(dropna=False)

2017    57024
2018    56918
2016    54761
Name: ddody, dtype: int64

In [38]:
# check for out of range values for 'ddoby'

chkdoby = d1618['ddoby'].isin(valids['doby'])
len(d1618[~chkdoby])

6

In [39]:
d1618.loc[~chkdoby].ddoby.value_counts(dropna=False)

9999    6
Name: ddoby, dtype: int64

In [40]:

dobyerrors = d1618[~chkdoby][['ddob']]

dobyerrors

Unnamed: 0,ddob
4897,99/99/9999
8190,99/99/9999
20713,99/99/9999
40131,99/99/9999
75586,99/99/9999
94603,99/99/9999


- unable to correct dob year without additional information on date of birth.

In [41]:
# check for out of range values for 'dobd'

chkdobd = d1618['ddobd'].isin(valids['dobd'])
len(d1618[~chkdobd])


0

In [42]:
# check for out of range values for 'drescntyfips'

chkrcounty = d1618['drescntyfips'].isin(valids['rcntyfips'])
len(d1618['drescntyfips'][~chkrcounty])

235

In [43]:
rcntyerrors = d1618[~chkrcounty][['dsfn','drescntyfips', 'drescountyl','drescity']]

rcntyerrors

Unnamed: 0,dsfn,drescntyfips,drescountyl,drescity
2815,2016091650,999,,
3140,2016051562,999,,
3796,2016002323,999,UNKNOWN,UNKNOWN
4262,2016091147,999,,
4366,2016092112,999,,
6662,2016091178,999,,
7027,2016091112,999,,
7285,2016003874,999,,
7694,2016016523,999,,
7882,2016045492,999,,


 - cannot replace with correct values without any information on residence city or county.

###### Format date variables as dates

In [46]:
d1618.ddod = pd.to_datetime(d1618.ddod)


In [None]:
d1618.to_csv(r'###\Data\d1618_clean.csv', index=None, header=True)

Next step - clean/prepare the manually linked labeled data for infant birth and death linked file.