# Import libraries

In [1]:
import os
import re
import boto3
import pandas as pd
import json
import sagemaker

# Read files

In [2]:
# Read the 2005 data CSV over HTTPS from S3. dtype=object disables numeric
# type inference, so every column is loaded with object dtype.
data_2005 = pd.read_csv(
    filepath_or_buffer='https://s3.amazonaws.com/mrinal-ml-sagemaker/Project/2005_data.csv',
    dtype=object,
)
# Preview the first five rows.
data_2005.head(n=5)

Unnamed: 0,resident_status,education_1989_revision,education_2003_revision,education_reporting_flag,month_of_death,sex,detail_age_type,detail_age,age_substitution_flag,age_recode_52,...,record_condition_18,record_condition_19,record_condition_20,race,bridged_race_flag,race_imputation_flag,race_recode_3,race_recode_5,hispanic_origin,hispanic_originrace_recode
0,1,11,,0,1,F,1,45,,35,...,,,,1,,,1,1,100,6
1,1,13,,0,1,M,1,61,,38,...,,,,1,,,1,1,100,6
2,1,12,,0,1,F,1,79,,41,...,,,,1,,,1,1,100,6
3,1,12,,0,1,M,1,50,,36,...,,,,1,,,1,1,100,6
4,1,14,,0,1,F,1,68,,39,...,,,,1,,,1,1,100,6


In [3]:
# Open an S3 resource and point at the JSON object that holds the code
# lookup tables (a dict keyed by column name, see its use further down).
s3 = boto3.resource('s3')
content_object = s3.Object(
    'mrinal-ml-sagemaker',
    'Project/2005_codes.json',
)

# Download the object body, decode it as UTF-8, then parse the JSON text.
raw_json = content_object.get()['Body'].read().decode('utf-8')
code_maps = json.loads(raw_json)

# Cleaning

### Imputing null values

In [4]:
# Add (or overwrite) the lookup entries for the codes that the next cell uses
# as fill values for missing data, so those rows can be decoded as well.
code_maps['place_of_injury_for_causes_w00_y34_except_y06_and_y07_'].update(
    {'10': 'Place of death unknown'}
)
code_maps['130_infant_cause_recode'].update({'000': 'Not infant'})

In [5]:
# Replace missing codes with the sentinel codes registered in `code_maps`.
#
# The result is assigned back to the column instead of calling
# `fillna(inplace=True)` on a column accessor: that form is chained
# assignment, which pandas warns about and which leaves the frame untouched
# when Copy-on-Write is enabled.
data_2005['place_of_injury_for_causes_w00_y34_except_y06_and_y07_'] = (
    data_2005['place_of_injury_for_causes_w00_y34_except_y06_and_y07_'].fillna('10')
)
data_2005['130_infant_cause_recode'] = data_2005['130_infant_cause_recode'].fillna('000')

# Prefix every 39-cause code with a literal '0'.
# NOTE(review): this is not idempotent - re-running the cell prepends another
# '0'. Presumably the prefix aligns the codes with the keys in `code_maps`;
# confirm the raw code width before replacing this with a fixed-width pad.
data_2005['39_cause_recode'] = '0' + data_2005['39_cause_recode']

### Creating new columns for decoded values

In [6]:
# Coded columns that will each receive a decoded counterpart. Selecting them
# from the frame first means a misspelled name fails here with a KeyError.
col_names = data_2005[
    [
        'place_of_injury_for_causes_w00_y34_except_y06_and_y07_',
        '358_cause_recode',
        '113_cause_recode',
        '130_infant_cause_recode',
        '39_cause_recode',
    ]
].columns.values

# Adding the prefix to the array of names produces one 'decoded_<name>'
# entry per coded column.
new_cols = 'decoded_' + col_names
new_cols

array(['decoded_place_of_injury_for_causes_w00_y34_except_y06_and_y07_',
       'decoded_358_cause_recode', 'decoded_113_cause_recode',
       'decoded_130_infant_cause_recode', 'decoded_39_cause_recode'],
      dtype=object)

### Mapping codes in the data frame to values in the `code_maps` dictionary

In [9]:
# Preview the raw (still coded) values of the columns about to be decoded.
data_2005.loc[:, col_names].head()

Unnamed: 0,place_of_injury_for_causes_w00_y34_except_y06_and_y07_,358_cause_recode,113_cause_recode,130_infant_cause_recode,39_cause_recode
0,10,98,28,0,15
1,10,266,84,0,28
2,10,239,70,0,24
3,10,159,46,0,16
4,10,93,27,0,8


In [7]:
# Decode each coded column into its 'decoded_<name>' counterpart using the
# per-column lookup table in `code_maps`.
for source_col, decoded_col in zip(col_names, new_cols):
    # Fetch the column's lookup table once instead of once per row.
    lookup = code_maps[source_col]
    codes = data_2005[source_col]

    # Fail on any code (including NaN) that has no entry in the lookup, as
    # the per-row dict indexing did, but report the column and the codes.
    missing = codes[~codes.isin(list(lookup))].unique()
    if len(missing) > 0:
        raise KeyError(
            '{}: no entry in code_maps for codes {}'.format(source_col, list(missing[:10]))
        )

    # Vectorised dictionary lookup; every code is known to be present here.
    data_2005[decoded_col] = codes.map(lookup)

In [8]:
# Preview the newly created decoded columns.
data_2005.loc[:, new_cols].head()

Unnamed: 0,decoded_place_of_injury_for_causes_w00_y34_except_y06_and_y07_,decoded_358_cause_recode,decoded_113_cause_recode,decoded_130_infant_cause_recode,decoded_39_cause_recode
0,Place of death unknown,Malignant melanoma of skin (C43),Malignant melanoma of skin (C43),Not infant,"Other malignant neoplasms (C00-C15,C17,C22-C24..."
1,Place of death unknown,Emphysema (J43),Emphysema (J43),Not infant,Chronic lower respiratory diseases (J40-J47)
2,Place of death unknown,Other cerebrovascular diseases and their seque...,Cerebrovascular diseases (I60-I69),Not infant,Cerebrovascular diseases (I60-I69)
3,Place of death unknown,Diabetes mellitus (E10-E14),Diabetes mellitus (E10-E14),Not infant,Diabetes mellitus (E10-E14)
4,Place of death unknown,"Of trachea, bronchus and lung (C33-C34)","Malignant neoplasms of trachea, bronchus and l...",Not infant,"Malignant neoplasms of trachea, bronchus and l..."
