#### 2019 DEATH DATA PREP

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import nltk
import spacy
import textblob

In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', None)

In [3]:
d19 = pd.read_csv('Y:/DQSS/Death/MBG/py/QA/data/d19_05-27-2020.csv',
                  low_memory=False,
                  encoding = 'unicode_escape')

In [4]:
d19.dtypes

sfn                  int64
sex                 object
agetype            float64
age                float64
ageyrs             float64
dob                 object
dod                 object
bcertno             object
bmatchcode         float64
dcity               object
dcityFIPS          float64
dcounty             object
dcountycityWA      float64
dcountyWA          float64
dcountyFIPS          int64
dcountyNCHS        float64
dstate              object
dstateNCHS         float64
dcountry            object
dzip                object
dplacetype         float64
dfaccode           float64
dplacelit           object
military            object
marital             object
education          float64
educ8less          float64
white               object
black               object
AIAN                object
asianind            object
chinese             object
filipino            object
japanese            object
korean              object
vietnamese          object
asianoth            object
h

**Keep relevant variables** including underlying cause code ('UCOD'), all multiple cause code fields ('MC1' to 'MC20'), and all cause of death literal fields.  The working data set will contain death records for deaths occurring in Washington State regardless of the decedents' residence states.

In [5]:
d19s = d19.loc[d19['dstate']=='WASHINGTON', ['sex', 'ageyrs', 'dob', 'dod', 'dstate', 'marital','dcounty', 'dzip', 'rcounty', 
                   'rstateFIPS', 'rzip', 'education','dplacetype', 'bridgerace', 'hispno', 'certdesig', 
                   'UCOD', 'MC1', 'MC2', 'MC3','MC4','MC5','MC6','MC7','MC8','MC9','MC10','MC11','MC12',
                   'MC13','MC14','MC15','MC16','MC17', 'MC18','MC19','MC20','codAq', 'codBq', 'codCq','codDq', 
                   'codIIq', 'AllMC', 'codlit', 'pg', 'manner', 'tobac']]


In [6]:
d19s['dod'] = pd.to_datetime(d19s['dod'],
                            format = '%m/%d/%Y')

In [7]:
d19s.dstate.value_counts()

WASHINGTON    58226
Name: dstate, dtype: int64

In [8]:
d19s.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58226 entries, 0 to 59679
Data columns (total 47 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   sex         58226 non-null  object        
 1   ageyrs      58226 non-null  float64       
 2   dob         58226 non-null  object        
 3   dod         58226 non-null  datetime64[ns]
 4   dstate      58226 non-null  object        
 5   marital     58226 non-null  object        
 6   dcounty     58226 non-null  object        
 7   dzip        58203 non-null  object        
 8   rcounty     58154 non-null  object        
 9   rstateFIPS  58226 non-null  object        
 10  rzip        58179 non-null  object        
 11  education   58226 non-null  float64       
 12  dplacetype  58226 non-null  float64       
 13  bridgerace  58212 non-null  float64       
 14  hispno      58226 non-null  object        
 15  certdesig   58064 non-null  float64       
 16  UCOD        58171 non-

**Create lists of ICD-10 codes for each of the 9 garbage code categories.** These lists will be used to flag records where underlying cause code (UCOD) is a member of a list. The nine categories of garbage codes are as follows:

- g1 = Septicemia
- g2 = Heart failure
- g3 = Ill-defined cancer
- g4 = Volume depletion
- g5 = Ill-defined
- g6 = Ill-defined cardiovascular
- g7 = Ill-defined injury
- g8 = Undetermined intent
- g9 = Ill-defined infectious

In [9]:
g1 = ['A40', 'A400', 'A401', 'A402', 'A403', 'A408', 'A409', 'A41', 'A410', 'A411', 'A412', 'A413', 'A414',
      'A415', 'A418', 'A419']

g2 = ['I50', 'I500', 'I501', 'I509']

g3 = ['C759', 'C76', 'C760', 'C761', 'C762', 'C763', 'C764', 'C765', 'C767', 'C768', 'C80', 'D099', 'D489']

g4 = ['E86', 'E87', 'E870', 'E871', 'E872', 'E873', 'E874', 'E875', 'E876', 'E877', 'E878' ]

g5 = ['I46', 'I460', 'I461', 'I469', 'P95', 'R00', 'R000', 'R001', 'R002', 'R008', 'R01', 'R010', 'R011', 
      'R012', 'R02', 'R03', 'R030', 'R031', 'R04', 'R040', 'R041', 'R042', 'R048', 'R049', 'R05', 'R06', 
      'R060', 'R061', 'R062', 'R063', 'R064', 'R065', 'R066', 'R067', 'R068', 'R07', 'R070', 'R071', 
      'R072', 'R073', 'R074', 'R09', 'R090', 'R091', 'R092', 'R093', 'R098', 'R10', 'R100', 'R101', 
      'R102', 'R103', 'R104', 'R11', 'R12', 'R13', 'R14', 'R15', 'R16', 'R160', 'R161', 'R162', 'R17', 
      'R18', 'R19', 'R190', 'R191', 'R192', 'R193', 'R194', 'R195', 'R196', 'R198', 'R20', 'R200', 
      'R201', 'R202', 'R203', 'R208', 'R21', 'R22', 'R220', 'R221', 'R222', 'R223', 'R224', 'R227', 
      'R229', 'R23', 'R230', 'R231', 'R232', 'R233', 'R234', 'R238', 'R25', 'R250', 'R251', 'R252', 
      'R253', 'R258', 'R26', 'R260', 'R261', 'R262', 'R268', 'R27', 'R270', 'R278', 'R29', 'R290', 
      'R291', 'R292', 'R293', 'R294', 'R296', 'R298', 'R30', 'R300', 'R301', 'R309', 'R31', 'R32', 
      'R33', 'R34', 'R35', 'R36', 'R39', 'R390', 'R391', 'R392', 'R398', 'R40', 'R400', 'R401', 'R402', 
      'R41', 'R410', 'R411', 'R412', 'R413', 'R418', 'R42', 'R43', 'R430', 'R431', 'R432', 'R438', 'R44', 
      'R440', 'R441', 'R442', 'R443', 'R448', 'R45', 'R450', 'R451', 'R452', 'R453', 'R454', 'R455', 'R456', 
      'R457', 'R458', 'R46', 'R460', 'R461', 'R462', 'R463', 'R464', 'R465', 'R466', 'R467', 'R468', 'R47', 
      'R470', 'R471', 'R478', 'R48', 'R480', 'R481', 'R482', 'R488', 'R49', 'R490', 'R491', 'R492', 'R498', 
      'R50', 'R500', 'R501', 'R502', 'R508', 'R509', 'R51', 'R52', 'R520', 'R521', 'R522', 'R529', 'R53', 
      'R54', 'R55', 'R56', 'R560', 'R568', 'R57', 'R570', 'R571', 'R578', 'R579', 'R58', 'R59', 'R590', 
      'R591', 'R599', 'R60', 'R600', 'R601', 'R609', 'R61', 'R610', 'R611', 'R619', 'R62', 'R620', 'R628', 
      'R629', 'R63', 'R630', 'R631', 'R632', 'R633', 'R634', 'R635', 'R638', 'R64', 'R68', 'R680', 'R681', 
      'R682', 'R683', 'R688', 'R69', 'R70', 'R700', 'R701', 'R71', 'R72', 'R73', 'R730', 'R739', 'R74', 'R740',
      'R748', 'R749', 'R75', 'R76', 'R760', 'R761', 'R762', 'R768', 'R769', 'R77', 'R770', 'R771', 'R772', 'R778',
      'R779', 'R78', 'R780', 'R781', 'R782', 'R783', 'R784', 'R785', 'R786', 'R787', 'R788', 'R789', 'R79', 'R790',
      'R798', 'R799', 'R80', 'R81', 'R82', 'R820', 'R821', 'R822', 'R823', 'R824', 'R825', 'R826', 'R827', 'R828',
      'R829', 'R83', 'R830', 'R831', 'R832', 'R833', 'R834', 'R835', 'R836', 'R837', 'R838', 'R839', 'R84', 'R840',
      'R841', 'R842', 'R843', 'R844', 'R845', 'R846', 'R847', 'R848', 'R849', 'R85', 'R850', 'R851', 'R852',
      'R853', 'R854', 'R855', 'R856', 'R857', 'R858', 'R859', 'R86', 'R860', 'R861', 'R862', 'R863', 'R864',
      'R865', 'R866', 'R867', 'R868', 'R869', 'R87', 'R870', 'R871', 'R872', 'R873', 'R874', 'R875',
      'R876', 'R877', 'R878', 'R879', 'R89', 'R890', 'R891', 'R892', 'R893', 'R894', 'R895', 'R896', 'R897',
      'R898', 'R899', 'R90', 'R900', 'R908', 'R91', 'R92', 'R93', 'R930', 'R931', 'R932', 'R933', 'R934',
      'R935', 'R936', 'R937', 'R938', 'R94', 'R940', 'R941', 'R942', 'R943', 'R944', 'R945', 'R946', 'R947',
      'R948', 'R95', 'R96', 'R960', 'R961', 'R98', 'R99']

g6 = ['I10', 'I15', 'I150', 'I151', 'I152', 'I158', 'I159', 'I26', 'I260', 'I269', 'I49', 'I490', 'I491', 'I492',
      'I493', 'I494', 'I495', 'I498', 'I499', 'I51', 'I510', 'I511', 'I512', 'I513', 'I514', 'I515', 'I516', 'I517',
      'I518', 'I519', 'I70', 'I700', 'I701', 'I709', 'I74', 'I740', 'I741', 'I742', 'I743', 'I744', 'I745', 'I748',
      'I749', 'I99']

g7 = ['S00', 'S000', 'S001', 'S002', 'S003', 'S004', 'S005', 'S007', 'S008', 'S009', 'S01', 'S010', 'S011', 'S012',
      'S013', 'S014', 'S015', 'S017', 'S018', 'S019', 'S02', 'S020', 'S021', 'S022', 'S023', 'S024', 'S025', 'S026',
      'S027', 'S028', 'S029', 'S03', 'S030', 'S031', 'S032', 'S033', 'S034', 'S035', 'S04', 'S040', 'S041', 'S042',
      'S043', 'S044', 'S045', 'S046', 'S047', 'S048', 'S049', 'S05', 'S050', 'S051', 'S052', 'S053', 'S054', 'S055',
      'S056', 'S057', 'S058', 'S059', 'S06', 'S060', 'S061', 'S062', 'S063', 'S064', 'S065', 'S066', 'S067', 'S068',
      'S069', 'S07', 'S070', 'S071', 'S078', 'S079', 'S08', 'S080', 'S081', 'S088', 'S089', 'S09', 'S090', 'S091',
      'S092', 'S097', 'S098', 'S099', 'S10', 'S100', 'S101', 'S107', 'S108', 'S109', 'S11', 'S110', 'S111', 'S112',
      'S117', 'S118', 'S119', 'S12', 'S120', 'S121', 'S122', 'S127', 'S128', 'S129', 'S13', 'S130', 'S131', 'S132',
      'S133', 'S134', 'S135', 'S136', 'S14', 'S140', 'S141', 'S142', 'S143', 'S144', 'S145', 'S146', 'S15', 'S150',
      'S151', 'S152', 'S153', 'S157', 'S158', 'S159', 'S16', 'S17', 'S170', 'S178', 'S179', 'S18', 'S19', 'S197',
      'S198', 'S199', 'S20', 'S200', 'S201', 'S202', 'S203', 'S204', 'S207', 'S208', 'S21', 'S210', 'S211', 'S212',
      'S217', 'S218', 'S219', 'S22', 'S220', 'S221', 'S222', 'S223', 'S224', 'S225', 'S228', 'S229', 'S23', 'S230',
      'S231', 'S232', 'S233', 'S234', 'S235', 'S24', 'S240', 'S241', 'S242', 'S243', 'S244', 'S245', 'S246', 'S25',
      'S250', 'S251', 'S252', 'S253', 'S254', 'S255', 'S257', 'S258', 'S259', 'S26', 'S260', 'S268', 'S269', 'S27',
      'S270', 'S271', 'S272', 'S273', 'S274', 'S275', 'S276', 'S277', 'S278', 'S279', 'S28', 'S280', 'S281', 'S29',
      'S290', 'S297', 'S298', 'S299', 'S30', 'S300', 'S301', 'S302', 'S307', 'S308', 'S309', 'S31', 'S310', 'S311',
      'S312', 'S313', 'S314', 'S315', 'S317', 'S318', 'S32', 'S320', 'S321', 'S322', 'S323', 'S324', 'S325', 'S327',
      'S328', 'S33', 'S330', 'S331', 'S332', 'S333', 'S334', 'S335', 'S336', 'S337', 'S34', 'S340', 'S341', 'S342',
      'S343', 'S344', 'S345', 'S346', 'S348', 'S35', 'S350', 'S351', 'S352', 'S353', 'S354', 'S355', 'S357', 'S358',
      'S359', 'S36', 'S360', 'S361', 'S362', 'S363', 'S364', 'S365', 'S366', 'S367', 'S368', 'S369', 'S37', 'S370',
      'S371', 'S372', 'S373', 'S374', 'S375', 'S376', 'S377', 'S378', 'S379', 'S38', 'S380', 'S381', 'S382', 'S383',
      'S39', 'S390', 'S396', 'S397', 'S398', 'S399', 'S40', 'S400', 'S407', 'S408', 'S409', 'S41', 'S410', 'S411',
      'S417', 'S418', 'S42', 'S420', 'S421', 'S422', 'S423', 'S424', 'S427', 'S428', 'S429', 'S43', 'S430', 'S431',
      'S432', 'S433', 'S434', 'S435', 'S436', 'S437', 'S44', 'S440', 'S441', 'S442', 'S443', 'S444', 'S445', 'S447',
      'S448', 'S449', 'S45', 'S450', 'S451', 'S452', 'S453', 'S457', 'S458', 'S459', 'S46', 'S460', 'S461', 'S462',
      'S463', 'S467', 'S468', 'S469', 'S47', 'S48', 'S480', 'S481', 'S489', 'S49', 'S497', 'S498', 'S499', 'S50',
      'S500', 'S501', 'S507', 'S508', 'S509', 'S51', 'S510', 'S517', 'S518', 'S519', 'S52', 'S520', 'S521', 'S522',
      'S523', 'S524', 'S525', 'S526', 'S527', 'S528', 'S529', 'S53', 'S530', 'S531', 'S532', 'S533', 'S534', 'S54',
      'S540', 'S541', 'S542', 'S543', 'S547', 'S548', 'S549', 'S55', 'S550', 'S551', 'S552', 'S557', 'S558', 'S559',
      'S56', 'S560', 'S561', 'S562', 'S563', 'S564', 'S565', 'S567', 'S568', 'S57', 'S570', 'S578', 'S579', 'S58',
      'S580', 'S581', 'S589', 'S59', 'S597', 'S598', 'S599', 'S60', 'S600', 'S601', 'S602', 'S607', 'S608', 'S609',
      'S61', 'S610', 'S611', 'S617', 'S618', 'S619', 'S62', 'S620', 'S621', 'S622', 'S623', 'S624', 'S625', 'S626',
      'S627', 'S628', 'S63', 'S630', 'S631', 'S632', 'S633', 'S634', 'S635', 'S636', 'S637', 'S64', 'S640', 'S641',
      'S642', 'S643', 'S644', 'S647', 'S648', 'S649', 'S65', 'S650', 'S651', 'S652', 'S653', 'S654', 'S655', 'S657',
      'S658', 'S659', 'S66', 'S660', 'S661', 'S662', 'S663', 'S664', 'S665', 'S666', 'S667', 'S668', 'S669', 'S67',
      'S670', 'S678', 'S68', 'S680', 'S681', 'S682', 'S683', 'S684', 'S688', 'S689', 'S69', 'S697', 'S698', 'S699',
      'S70', 'S700', 'S701', 'S707', 'S708', 'S709', 'S71', 'S710', 'S711', 'S717', 'S718', 'S72', 'S720', 'S721',
      'S722', 'S723', 'S724', 'S727', 'S728', 'S729', 'S73', 'S730', 'S731', 'S74', 'S740', 'S741', 'S742', 'S747',
      'S748', 'S749', 'S75', 'S750', 'S751', 'S752', 'S757', 'S758', 'S759', 'S76', 'S760', 'S761', 'S762', 'S763',
      'S764', 'S767', 'S77', 'S770', 'S771', 'S772', 'S78', 'S780', 'S781', 'S789', 'S79', 'S797', 'S798', 'S799',
      'S80', 'S800', 'S801', 'S807', 'S808', 'S809', 'S81', 'S810', 'S817', 'S818', 'S819', 'S82', 'S820', 'S821',
      'S822', 'S823', 'S824', 'S825', 'S826', 'S827', 'S828', 'S829', 'S83', 'S830', 'S831', 'S832', 'S833', 'S834',
      'S835', 'S836', 'S837', 'S84', 'S840', 'S841', 'S842', 'S847', 'S848', 'S849', 'S85', 'S850', 'S851', 'S852',
      'S853', 'S854', 'S855', 'S857', 'S858', 'S859', 'S86', 'S860', 'S861', 'S862', 'S863', 'S867', 'S868', 'S869',
      'S87', 'S870', 'S878', 'S88', 'S880', 'S881', 'S889', 'S89', 'S897', 'S898', 'S899', 'S90', 'S900', 'S901',
      'S902', 'S903', 'S907', 'S908', 'S909', 'S91', 'S910', 'S911', 'S912', 'S913', 'S917', 'S92', 'S920', 'S921',
      'S922', 'S923', 'S924', 'S925', 'S927', 'S929', 'S93', 'S930', 'S931', 'S932', 'S933', 'S934', 'S935', 'S936',
      'S94', 'S940', 'S941', 'S942', 'S943', 'S947', 'S948', 'S949', 'S95', 'S950', 'S951', 'S952', 'S957', 'S958',
      'S959', 'S96', 'S960', 'S961', 'S962', 'S967', 'S968', 'S969', 'S97', 'S970', 'S971', 'S978', 'S98', 'S980',
      'S981', 'S982', 'S983', 'S984', 'S99', 'S997', 'S998', 'S999', 'T00', 'T000', 'T001', 'T002', 'T003', 'T006',
      'T008', 'T009', 'T01', 'T010', 'T011', 'T012', 'T013', 'T016', 'T018', 'T019', 'T02', 'T020', 'T021', 'T022',
      'T023', 'T024', 'T025', 'T026', 'T027', 'T028', 'T029', 'T03', 'T030', 'T031', 'T032', 'T033', 'T034', 'T038',
      'T039', 'T04', 'T040', 'T041', 'T042', 'T043', 'T044', 'T047', 'T048', 'T049', 'T05', 'T050', 'T051', 'T052',
      'T053', 'T054', 'T055', 'T056', 'T058', 'T059', 'T06', 'T060', 'T061', 'T062', 'T063', 'T064', 'T065', 'T068',
      'T07', 'T08', 'T09', 'T090', 'T091', 'T092', 'T093', 'T094', 'T095', 'T096', 'T098', 'T099', 'T10', 'T11',
      'T110', 'T111', 'T112', 'T113', 'T114', 'T115', 'T116', 'T118', 'T119', 'T12', 'T13', 'T130', 'T131', 'T132',
      'T133', 'T134', 'T135', 'T136', 'T138', 'T139', 'T14', 'T140', 'T141', 'T142', 'T143', 'T144', 'T145', 'T146',
      'T147', 'T148', 'T149', 'T15', 'T150', 'T151', 'T158', 'T159', 'T16', 'T17', 'T170', 'T171', 'T172', 'T173',
      'T174', 'T175', 'T178', 'T179', 'T18', 'T180', 'T181', 'T182', 'T183', 'T184', 'T185', 'T188', 'T189', 'T19',
      'T190', 'T191', 'T192', 'T193', 'T198', 'T199', 'T20', 'T200', 'T201', 'T202', 'T203', 'T204', 'T205', 'T206',
      'T207', 'T21', 'T210', 'T211', 'T212', 'T213', 'T214', 'T215', 'T216', 'T217', 'T22', 'T220', 'T221', 'T222',
      'T223', 'T224', 'T225', 'T226', 'T227', 'T23', 'T230', 'T231', 'T232', 'T233', 'T234', 'T235', 'T236', 'T237',
      'T24', 'T240', 'T241', 'T242', 'T243', 'T244', 'T245', 'T246', 'T247', 'T25', 'T250', 'T251', 'T252', 'T253',
      'T254', 'T255', 'T256', 'T257', 'T26', 'T260', 'T261', 'T262', 'T263', 'T264', 'T265', 'T266', 'T267', 'T268',
      'T269', 'T27', 'T270', 'T271', 'T272', 'T273', 'T274', 'T275', 'T276', 'T277', 'T28', 'T280', 'T281', 'T282',
      'T283', 'T284', 'T285', 'T286', 'T287', 'T288', 'T289', 'T29', 'T290', 'T291', 'T292', 'T293', 'T294', 'T295',
      'T296', 'T297', 'T30', 'T300', 'T301', 'T302', 'T303', 'T304', 'T305', 'T306', 'T307', 'T31', 'T310', 'T311',
      'T312', 'T313', 'T314', 'T315', 'T316', 'T317', 'T318', 'T319', 'T32', 'T320', 'T321', 'T322', 'T323', 'T324',
      'T325', 'T326', 'T327', 'T328', 'T329', 'T33', 'T330', 'T331', 'T332', 'T333', 'T334', 'T335', 'T336', 'T337',
      'T338', 'T339', 'T34', 'T340', 'T341', 'T342', 'T343', 'T344', 'T345', 'T346', 'T347', 'T348', 'T349', 'T35',
      'T350', 'T351', 'T352', 'T353', 'T354', 'T355', 'T356', 'T357', 'T36', 'T360', 'T361', 'T362', 'T363', 'T364',
      'T365', 'T366', 'T367', 'T368', 'T369', 'T37', 'T370', 'T371', 'T372', 'T373', 'T374', 'T375', 'T378', 'T379',
      'T38', 'T380', 'T381', 'T382', 'T383', 'T384', 'T385', 'T386', 'T387', 'T388', 'T389', 'T39', 'T390', 'T391',
      'T392', 'T393', 'T394', 'T398', 'T399', 'T40', 'T400', 'T401', 'T402', 'T403', 'T404', 'T405', 'T406', 'T407',
      'T408', 'T409', 'T41', 'T410', 'T411', 'T412', 'T413', 'T414', 'T415', 'T42', 'T420', 'T421', 'T422', 'T423',
      'T424', 'T425', 'T426', 'T427', 'T428', 'T43', 'T430', 'T431', 'T432', 'T433', 'T434', 'T435', 'T436', 'T438',
      'T439', 'T44', 'T440', 'T441', 'T442', 'T443', 'T444', 'T445', 'T446', 'T447', 'T448', 'T449', 'T45', 'T450',
      'T451', 'T452', 'T453', 'T454', 'T455', 'T456', 'T457', 'T458', 'T459', 'T46', 'T460', 'T461', 'T462', 'T463',
      'T464', 'T465', 'T466', 'T467', 'T468', 'T469', 'T47', 'T470', 'T471', 'T472', 'T473', 'T474', 'T475', 'T476',
      'T477', 'T478', 'T479', 'T48', 'T480', 'T481', 'T482', 'T483', 'T484', 'T485', 'T486', 'T487', 'T49', 'T490',
      'T491', 'T492', 'T493', 'T494', 'T495', 'T496', 'T497', 'T498', 'T499', 'T50', 'T500', 'T501', 'T502', 'T503',
      'T504', 'T505', 'T506', 'T507', 'T508', 'T509', 'T51', 'T510', 'T511', 'T512', 'T513', 'T518', 'T519', 'T52',
      'T520', 'T521', 'T522', 'T523', 'T524', 'T528', 'T529', 'T53', 'T530', 'T531', 'T532', 'T533', 'T534', 'T535',
      'T536', 'T537', 'T539', 'T54', 'T540', 'T541', 'T542', 'T543', 'T549', 'T55', 'T56', 'T560', 'T561', 'T562',
      'T563', 'T564', 'T565', 'T566', 'T567', 'T568', 'T569', 'T57', 'T570', 'T571', 'T572', 'T573', 'T578', 'T579',
      'T58', 'T59', 'T590', 'T591', 'T592', 'T593', 'T594', 'T595', 'T596', 'T597', 'T598', 'T599', 'T60', 'T600',
      'T601', 'T602', 'T603', 'T604', 'T608', 'T609', 'T61', 'T610', 'T611', 'T612', 'T618', 'T619', 'T62', 'T620',
      'T621', 'T622', 'T628', 'T629', 'T63', 'T630', 'T631', 'T632', 'T633', 'T634', 'T635', 'T636', 'T638', 'T639',
      'T64', 'T65', 'T650', 'T651', 'T652', 'T653', 'T654', 'T655', 'T656', 'T658', 'T659', 'T66', 'T67', 'T670',
      'T671', 'T672', 'T673', 'T674', 'T675', 'T676', 'T677', 'T678', 'T679', 'T68', 'T69', 'T690', 'T691', 'T698',
      'T699', 'T70', 'T700', 'T701', 'T702', 'T703', 'T704', 'T708', 'T709', 'T71', 'T73', 'T730', 'T731', 'T732',
      'T733', 'T738', 'T739', 'T74', 'T740', 'T741', 'T742', 'T743', 'T748', 'T749', 'T75', 'T750', 'T751', 'T752',
      'T753', 'T754', 'T758', 'T78', 'T780', 'T781', 'T782', 'T783', 'T784', 'T788', 'T789', 'T79', 'T790', 'T791',
      'T792', 'T793', 'T794', 'T795', 'T796', 'T797', 'T798', 'T799', 'T80', 'T800', 'T801', 'T802', 'T803', 'T804',
      'T805', 'T806', 'T808', 'T809', 'T81', 'T810', 'T811', 'T812', 'T813', 'T814', 'T815', 'T816', 'T817', 'T818',
      'T819', 'T82', 'T820', 'T821', 'T822', 'T823', 'T824', 'T825', 'T826', 'T827', 'T828', 'T829', 'T83', 'T830',
      'T831', 'T832', 'T833', 'T834', 'T835', 'T836', 'T838', 'T839', 'T84', 'T840', 'T841', 'T842', 'T843', 'T844',
      'T845', 'T846', 'T847', 'T848', 'T849', 'T85', 'T850', 'T851', 'T852', 'T853', 'T854', 'T855', 'T856', 'T857',
      'T858', 'T859', 'T86', 'T860', 'T861', 'T862', 'T863', 'T864', 'T868', 'T869', 'T87', 'T870', 'T871', 'T872',
      'T873', 'T874', 'T875', 'T876', 'T88', 'T880', 'T881', 'T882', 'T883', 'T884', 'T885', 'T886', 'T887', 'T888',
      'T889', 'T90', 'T900', 'T901', 'T902', 'T903', 'T904', 'T905', 'T908', 'T909', 'T91', 'T910', 'T911', 'T912',
      'T913', 'T914', 'T915', 'T918', 'T919', 'T92', 'T920', 'T921', 'T922', 'T923', 'T924', 'T925', 'T926', 'T928',
      'T929', 'T93', 'T930', 'T931', 'T932', 'T933', 'T934', 'T935', 'T936', 'T938', 'T939', 'T94', 'T940', 'T941',
      'T95', 'T950', 'T951', 'T952', 'T953', 'T954', 'T958', 'T959', 'T96', 'T97', 'T98', 'T980', 'T981', 'T982',
      'T983', 'Y89', 'Y899']
      
g8 = ['Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17', 'Y18', 'Y19', 'Y20', 'Y21', 'Y22', 'Y23',
      'Y24', 'Y241', 'Y242', 'Y243', 'Y244', 'Y249', 'Y25', 'Y26', 'Y27', 'Y28', 'Y29', 'Y30', 'Y31',
      'Y32', 'Y33', 'Y34', 'Y87', 'Y872']
      
g9 = ['B99']

**Flag records with any garbage code in UCOD field.** Combine sublists of garbage codes and flag row if underlying cause ICD-10 code (UCOD) is in the combined list.

In [10]:
gc_all = g1 + g2 + g3 + g4 + g5 + g6 + g7 + g8 + g9

In [11]:
d19s['gc_any'] = d19s['UCOD'].isin(gc_all)

In [12]:
gc_table = d19s['gc_any'].value_counts(dropna=False).to_frame('has_garbage_code')
gc_table['Percent'] = (gc_table['has_garbage_code']/gc_table['has_garbage_code'].sum()) * 100

gc_table

Unnamed: 0,has_garbage_code,Percent
False,54140,92.982516
True,4086,7.017484


In 2019, 4,086 (7%) of the 58,226 death records for persons who died in Washington State were assigned a garbage code.

**Flag records with specific category of garbage code.** Label record with '0' if it has a 'valid' i.e. non-garbage code, or 1 through 9 depending on the category of garbage code in the UCOD field as defined above.

In [13]:
gcdict = {'1': g1, '2': g2, '3': g3, '4': g4, '5': g5, '6': g6, '7': g7, '8': g8, '9': g9}

In [14]:
gcdict_rev = {v: k for k in gcdict for v in gcdict[k]}

#the above is functional equivalent of:

#gcdict_rev = {}
#for key in gcdict:
#    for value in gcdict[key]:
#        gcdict_rev[value] = key


In [15]:
d19s['gc_cat'] = d19s['UCOD'].map(gcdict_rev).fillna(0).astype(int)


In [16]:
d19s['gc_cat'].value_counts().sort_index(ascending=True).to_frame()

Unnamed: 0,gc_cat
0,54140
1,547
2,762
3,663
4,125
5,875
6,1025
8,81
9,8


Ill-defined cardiovascular conditions, other ill-defined conditions, and heart failure (categories 1, 5, and 2 in the table above) are the three groups of garbage codes accounting for over half the total number of records with garbage codes.  Ill-defined infectious condtions (category 9) has the fewest records. It is likely that I will need to exclude categories with smaller numbers of records due to the high probability of unreliable results.

**DO MORE DESCRIPTIVE ANALYSIS HERE - BY GENDER, CERTIFIER TYPE, GEOGRAPHY ETC.**

In [None]:
#d19s.to_csv('Y:\DQSS\Death\MBG\py\capstone2\data\d19s.csv')

**Clean text fields** including:

- stripping punctuation
- standardizing case
- removing stop words (including custom stopwords)
- stemming
- tokenizing

In [17]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

In [None]:
#d19s = pd.read_csv('Y:\DQSS\Death\MBG\py\capstone2\data\d19s.csv', low_memory=False)

In [18]:
d19s.codlit.head()

0                             ALZHEIMER'S DEMENTIA    
5    ACUTE METHAMPHETAMINE INTOXICATION    TOXIC US...
6    RESPIRATORY FAILURE SEPSIS AND PNEUMONIA STATU...
7    EMBOLIC STROKE HYPERCOAGULABLE STATE  MANTLE C...
8    CARDIOPULMONARY ARREST HYDROCEPHALUS AND STATU...
Name: codlit, dtype: object

<br>

**Restrict data set** to records with garbage code as underlying cause code and keep only garbage code category ('gc_cat') and cause of death text ('codlit').
<br>


In [19]:
cod_txt = d19s.loc[d19s['gc_cat'] > 0,['gc_cat', 'codlit']]

In [20]:
cod_txt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4086 entries, 14 to 59678
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gc_cat  4086 non-null   int32 
 1   codlit  4086 non-null   object
dtypes: int32(1), object(1)
memory usage: 79.8+ KB


In [21]:
stop_words = set(stopwords.words('english'))
word_lem = WordNetLemmatizer()
porter = PorterStemmer()

In [22]:
def preproc(data):
    cod_tokens = []
    for cod in data:
        cod_lwr = cod.lower()
        cod_words = word_tokenize(cod_lwr)
        cod_alpha = [re.sub('[^a-zA-Z]', '', w) for w in cod_words]
        #cod_alpha = [w for w in cod_words if w.isalpha()]
        cod_nostop = [w for w in cod_alpha if not w in stop_words]
        cod_stems = [porter.stem(w) for w in cod_nostop]
        cod_tokens.append(cod_stems)
    return cod_tokens

In [23]:
cod_txt['preprocd'] = preproc(cod_txt['codlit'])

In [24]:
cod_txt.dtypes

gc_cat       int32
codlit      object
preprocd    object
dtype: object

In [25]:
cod_txt.head()

Unnamed: 0,gc_cat,codlit,preprocd
14,4,"HYPERNATREMIA SEVERE DYSPHAGIA DIABETES, ALZH...","[hypernatremia, sever, dysphagia, diabet, , al..."
40,6,VASCULAR DEMENTIA ESSENTIAL HYPERTENSION,"[vascular, dementia, essenti, hypertens]"
92,3,UPPER GASTROINTESTINAL BLEED TRACHEAL MASS,"[upper, gastrointestin, bleed, tracheal, mass]"
104,2,ACUTE CARDIOPULMONARY ARREST MULTI ORGAN FAILU...,"[acut, cardiopulmonari, arrest, multi, organ, ..."
112,6,PULMONARY EMBOLISM,"[pulmonari, embol]"


**Create bag of words for each garbage code category from 1 through 8**

In [26]:
 for i in range(1, 8):
    cat_tokens = cod_txt.loc[cod_txt['gc_cat']== i, ['preprocd']]
    joined = cat_tokens.preprocd.str.cat(sep = ",")
    print('GC category: ' + i)
    bow = Counter(joined)
    print(bow_simple.most_common(20))
    

TypeError: Cannot use .str.cat with values of inferred dtype 'mixed'.

**ALTERNATE METHOD** - THIS DOESN'T WORK EITHER

In [27]:
def clean_text(df):
    df['cod_stem'] = df['codlit']
    df['cod_stem'].replace(r'[^a-zA-Z]', ' ', regex= True, inplace = True)
    df['cod_stem'] = df['cod_stem'].str.lower()
    df['cod_stem'] = df['cod_stem'].map(nltk.word_tokenize)
    df['cod_stem'] = df['cod_stem'].apply(lambda x: [w for w in x if w not in stop_words])
    df['cod_stem'] = df['cod_stem'].apply(lambda x: [porter.stem(w) for w in x])
    return df['cod_stem']


In [28]:
cod_txt['cod_stem'] = clean_text(cod_txt)

In [29]:
cod_txt.head()

Unnamed: 0,gc_cat,codlit,preprocd,cod_stem
14,4,"HYPERNATREMIA SEVERE DYSPHAGIA DIABETES, ALZH...","[hypernatremia, sever, dysphagia, diabet, , al...","[hypernatremia, sever, dysphagia, diabet, alzh..."
40,6,VASCULAR DEMENTIA ESSENTIAL HYPERTENSION,"[vascular, dementia, essenti, hypertens]","[vascular, dementia, essenti, hypertens]"
92,3,UPPER GASTROINTESTINAL BLEED TRACHEAL MASS,"[upper, gastrointestin, bleed, tracheal, mass]","[upper, gastrointestin, bleed, tracheal, mass]"
104,2,ACUTE CARDIOPULMONARY ARREST MULTI ORGAN FAILU...,"[acut, cardiopulmonari, arrest, multi, organ, ...","[acut, cardiopulmonari, arrest, multi, organ, ..."
112,6,PULMONARY EMBOLISM,"[pulmonari, embol]","[pulmonari, embol]"


In [30]:
def rejoin_words(df):
    temp = df['cod_stem']
    joined = (" ".join(temp))
    return joined

In [31]:
cod_txt['not_list'] =  cod_txt.apply(rejoin_words, axis=1)

In [32]:
cod_txt.head()

Unnamed: 0,gc_cat,codlit,preprocd,cod_stem,not_list
14,4,"HYPERNATREMIA SEVERE DYSPHAGIA DIABETES, ALZH...","[hypernatremia, sever, dysphagia, diabet, , al...","[hypernatremia, sever, dysphagia, diabet, alzh...",hypernatremia sever dysphagia diabet alzheim d...
40,6,VASCULAR DEMENTIA ESSENTIAL HYPERTENSION,"[vascular, dementia, essenti, hypertens]","[vascular, dementia, essenti, hypertens]",vascular dementia essenti hypertens
92,3,UPPER GASTROINTESTINAL BLEED TRACHEAL MASS,"[upper, gastrointestin, bleed, tracheal, mass]","[upper, gastrointestin, bleed, tracheal, mass]",upper gastrointestin bleed tracheal mass
104,2,ACUTE CARDIOPULMONARY ARREST MULTI ORGAN FAILU...,"[acut, cardiopulmonari, arrest, multi, organ, ...","[acut, cardiopulmonari, arrest, multi, organ, ...",acut cardiopulmonari arrest multi organ failur...
112,6,PULMONARY EMBOLISM,"[pulmonari, embol]","[pulmonari, embol]",pulmonari embol


In [35]:
for i in range(1,8):
    x = cod_txt.loc[cod_txt['gc_cat']==i, ['not_list']]
    temp = x['not_list'].str.cat(sep = ',')
    print('GC category: ' + i)
    bow = Counter(joined)
    print(bow_simple.most_common(20))

TypeError: can only concatenate str (not "int") to str