#### Clean death data in preparation for linkage

###### MBG December 5, 2019

In [1]:
import pandas as pd

In [2]:
#Read in 2018 death data

dth18 = pd.read_csv(r'..\infdth2018_raw.csv', 
                    index_col = None,
                    low_memory = False)

#### GENERAL INFO ABOUT INFANT DEATH DATA FRAME

In [3]:
# variable info - name and type

pd.set_option('display.max_rows', 140)

dth18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58282 entries, 0 to 58281
Data columns (total 35 columns):
Unnamed: 0      58282 non-null int64
sfn             58282 non-null int64
sex             58282 non-null object
agetype         58282 non-null float64
age             58282 non-null int64
ageyrs          58282 non-null float64
dob             58282 non-null object
dobm            58282 non-null int64
dobd            58282 non-null int64
doby            58282 non-null int64
dod             58282 non-null object
dodm            58282 non-null int64
dodd            58282 non-null int64
dody            58282 non-null int64
dodmod          58282 non-null object
todhh           58282 non-null int64
todmm           58282 non-null int64
todmod          58282 non-null object
bplstatefips    58282 non-null object
bcountryl       58282 non-null object
dcityl          58280 non-null object
dcountyl        58282 non-null object
dstatel         58282 non-null object
dzip            58282 non-

In [4]:
dth18.head()

Unnamed: 0.1,Unnamed: 0,sfn,sex,agetype,age,ageyrs,dob,dobm,dobd,doby,...,dfaccd,bridgerace,hispno,rcity,rcityfips,rcitylim,rcountyl,rcntyfips,rstatefips,rzip
0,0,2018042418,M,1.0,81,81.0,03/07/1937,3,7,1937,...,1,1,Y,SEATTLE,63000,Y,KING,33,WA,98116
1,1,2018042419,M,1.0,85,85.0,11/30/1932,11,30,1932,...,810,1,Y,RICHLAND,58235,Y,BENTON,5,WA,99352
2,2,2018042420,F,1.0,88,88.0,11/02/1929,11,2,1929,...,810,1,Y,PORT ANGELES,55365,Y,CLALLAM,9,WA,98363
3,3,2018042421,F,1.0,85,85.0,04/07/1933,4,7,1933,...,800,1,Y,SPOKANE VALLEY,67167,Y,SPOKANE,63,WA,99016
4,4,2018042422,M,1.0,84,84.0,02/04/1934,2,4,1934,...,810,1,Y,EVERSON,22745,Y,WHATCOM,73,WA,98247


In [5]:
# drop 'unnamed'column - not sure how that was introduced into dataframe.

dth18.drop(columns="Unnamed: 0", inplace = True)

In [6]:
# number of rows, columns

dth18.head()

Unnamed: 0,sfn,sex,agetype,age,ageyrs,dob,dobm,dobd,doby,dod,...,dfaccd,bridgerace,hispno,rcity,rcityfips,rcitylim,rcountyl,rcntyfips,rstatefips,rzip
0,2018042418,M,1.0,81,81.0,03/07/1937,3,7,1937,09/30/2018,...,1,1,Y,SEATTLE,63000,Y,KING,33,WA,98116
1,2018042419,M,1.0,85,85.0,11/30/1932,11,30,1932,09/30/2018,...,810,1,Y,RICHLAND,58235,Y,BENTON,5,WA,99352
2,2018042420,F,1.0,88,88.0,11/02/1929,11,2,1929,09/29/2018,...,810,1,Y,PORT ANGELES,55365,Y,CLALLAM,9,WA,98363
3,2018042421,F,1.0,85,85.0,04/07/1933,4,7,1933,09/29/2018,...,800,1,Y,SPOKANE VALLEY,67167,Y,SPOKANE,63,WA,99016
4,2018042422,M,1.0,84,84.0,02/04/1934,2,4,1934,09/29/2018,...,810,1,Y,EVERSON,22745,Y,WHATCOM,73,WA,98247


#### Limit to deaths among infants (WA residents only)

In [7]:
## restrict to infants defined as <=365 days old at time of death who were residents of Washington State.
## code selects all 'agetype' values other than 1.  Agetype = 1 indicates age is in years. Infant age would be recorded as
## 'agetype' values 2 through 5 i.e. months, days, hours, or minutes.

infdth18 = dth18[(dth18['rstatefips'] == "WA") & (dth18['agetype'] != 1)]

In [8]:
infdth18.shape

(401, 34)

In [9]:
pd.crosstab(infdth18.agetype, infdth18.rstatefips, dropna=False)

#All decedents are Washington State residents

rstatefips,WA
agetype,Unnamed: 1_level_1
2.0,131
3.0,111
4.0,94
5.0,65


#### check for null values in each column

In [10]:
infdth18.isna().sum()

# no missing values in any variable

sfn             0
sex             0
agetype         0
age             0
ageyrs          0
dob             0
dobm            0
dobd            0
doby            0
dod             0
dodm            0
dodd            0
dody            0
dodmod          0
todhh           0
todmm           0
todmod          0
bplstatefips    0
bcountryl       0
dcityl          0
dcountyl        0
dstatel         0
dzip            0
dplacetype      0
dfaccd          0
bridgerace      0
hispno          0
rcity           0
rcityfips       0
rcitylim        0
rcountyl        0
rcntyfips       0
rstatefips      0
rzip            0
dtype: int64

In [11]:
# create list of column names
dthvars = list(infdth18.columns)
dthvars

['sfn',
 'sex',
 'agetype',
 'age',
 'ageyrs',
 'dob',
 'dobm',
 'dobd',
 'doby',
 'dod',
 'dodm',
 'dodd',
 'dody',
 'dodmod',
 'todhh',
 'todmm',
 'todmod',
 'bplstatefips',
 'bcountryl',
 'dcityl',
 'dcountyl',
 'dstatel',
 'dzip',
 'dplacetype',
 'dfaccd',
 'bridgerace',
 'hispno',
 'rcity',
 'rcityfips',
 'rcitylim',
 'rcountyl',
 'rcntyfips',
 'rstatefips',
 'rzip']

In [12]:
#create dictionary of valid values so that each variable can be checked to make sure there is no
# out of range value.

valids = {'sex': ['M', 'U'],
          'agetype': [range(1,6), 9],
          'dobm': [range(1,13), 99],
          'dobd': [range(1,32),99],
          'doby': 2019,
          'todhh': [range(0,24), 99],
          'todmm': [range(60), 99],
          'dcntycitywacd':[range(100,3912), 0, 9999],
          'dcntywacd': [range(1,40),99],
          'dplacetype': [range(0,10)],
          'education': [range(1,10)],
          'lt8grade': [range(1,10)],
          'bridgerace': [range(1,17), range(21, 25)]}

In [14]:
# check for out of range values for 'sex'

chksex = infdth18['sex'].isin(valids['sex'])
oorsex = infdth18[~chksex]
len(oorsex)

176

In [16]:
print(oorsex.sfn.tolist())

[2018006396, 2018007775, 2018004489, 2018000269, 2018001047, 2018002075, 2018003138, 2018003378, 2018003164, 2018003511, 2018005121, 2018004121, 2018004746, 2018005174, 2018004917, 2018008874, 2018005914, 2018008010, 2018007452, 2018007590, 2018008651, 2018008407, 2018008340, 2018012317, 2018009272, 2018009149, 2018090191, 2018010504, 2018011896, 2018012269, 2018012716, 2018013171, 2018013885, 2018013983, 2018014348, 2018014651, 2018014535, 2018014505, 2018019835, 2018018410, 2018016935, 2018017716, 2018017858, 2018019607, 2018016846, 2018000882, 2018018037, 2018019261, 2018015967, 2018018935, 2018018756, 2018018824, 2018019875, 2018050234, 2018054120, 2018018938, 2018011838, 2018022101, 2018010766, 2018006565, 2018023140, 2018023834, 2018023927, 2018024151, 2018025527, 2018027053, 2018025064, 2018025080, 2018025084, 2018027120, 2018025740, 2018025514, 2018026785, 2018026132, 2018026314, 2018027186, 2018027211, 2018027212, 2018027841, 2018028729, 2018029781, 2018028996, 2018012876, 201

In [17]:
infdth18.sex.value_counts()

M    225
F    176
Name: sex, dtype: int64

In [18]:
sexerrors = [infdth18['sfn'] for i in infdth18['sex'] if i not in valids['sex']]

In [19]:
len(sexerrors)

176

In [21]:
print(sexerrors)

[85       2018039789
232      2018006396
249      2018007775
471      2018004489
1232     2018000269
            ...    
58104    2018053912
58111    2018007796
58117    2018041812
58170    2018036151
58176    2018040135
Name: sfn, Length: 401, dtype: int64, 85       2018039789
232      2018006396
249      2018007775
471      2018004489
1232     2018000269
            ...    
58104    2018053912
58111    2018007796
58117    2018041812
58170    2018036151
58176    2018040135
Name: sfn, Length: 401, dtype: int64, 85       2018039789
232      2018006396
249      2018007775
471      2018004489
1232     2018000269
            ...    
58104    2018053912
58111    2018007796
58117    2018041812
58170    2018036151
58176    2018040135
Name: sfn, Length: 401, dtype: int64, 85       2018039789
232      2018006396
249      2018007775
471      2018004489
1232     2018000269
            ...    
58104    2018053912
58111    2018007796
58117    2018041812
58170    2018036151
58176    2018040135
Name:

#### check for out of range values