This is an exploration of the whole series of mortality data from the CDC.

First, let's configure logging, including clearing any handlers that were preconfigured on our logger.

In [87]:
from logging import Formatter
from logging import getLogger
from logging import INFO
from logging import StreamHandler

# Build the notebook's logger. Any handlers already attached to it are dropped
# first, so running this cell again does not stack a second handler.
logger = getLogger(__name__)
logger.handlers.clear()
# A logger without its own level inherits the root logger's level, whose
# default is WARNING. Set INFO explicitly so the INFO messages used throughout
# the notebook are not discarded before they reach the handler.
logger.setLevel(INFO)
handler = StreamHandler()
handler.setLevel(INFO)
handler.setFormatter(Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)

Let's list all the files. We note that we have data files in CSV format representing the years 2005-2015, inclusive (11 years).

In [88]:
from os import walk
from os.path import join

# Walk the input directory tree and log the joined path of every file found.
for folder, _subfolders, names in walk('../input/mortality/'):
    for path in (join(folder, name) for name in names):
        logger.info(msg=path)

__main__ - 2023-06-15 17:35:32,009 - INFO - ../input/mortality/2014_codes.json
__main__ - 2023-06-15 17:35:32,011 - INFO - ../input/mortality/2007_data.csv
__main__ - 2023-06-15 17:35:32,012 - INFO - ../input/mortality/2007_codes.json
__main__ - 2023-06-15 17:35:32,013 - INFO - ../input/mortality/2012_data.csv
__main__ - 2023-06-15 17:35:32,014 - INFO - ../input/mortality/2010_data.csv
__main__ - 2023-06-15 17:35:32,015 - INFO - ../input/mortality/2009_data.csv
__main__ - 2023-06-15 17:35:32,016 - INFO - ../input/mortality/2011_data.csv
__main__ - 2023-06-15 17:35:32,018 - INFO - ../input/mortality/2015_codes.json
__main__ - 2023-06-15 17:35:32,019 - INFO - ../input/mortality/2011_codes.json
__main__ - 2023-06-15 17:35:32,020 - INFO - ../input/mortality/2006_data.csv
__main__ - 2023-06-15 17:35:32,021 - INFO - ../input/mortality/2010_codes.json
__main__ - 2023-06-15 17:35:32,022 - INFO - ../input/mortality/2005_data.csv
__main__ - 2023-06-15 17:35:32,023 - INFO - ../input/mortality/200

In [89]:
from json import load
# Load the 2013 codes file and log each of its top-level keys.
with open(file='../input/mortality/2013_codes.json', mode='r', encoding='utf-8') as codes_fp:
    codes = load(fp=codes_fp)

for code_name in codes:
    logger.info(msg=code_name)


__main__ - 2023-06-15 17:35:32,048 - INFO - resident_status
__main__ - 2023-06-15 17:35:32,049 - INFO - education_1989_revision
__main__ - 2023-06-15 17:35:32,050 - INFO - education_2003_revision
__main__ - 2023-06-15 17:35:32,051 - INFO - education_reporting_flag
__main__ - 2023-06-15 17:35:32,052 - INFO - month_of_death
__main__ - 2023-06-15 17:35:32,052 - INFO - sex
__main__ - 2023-06-15 17:35:32,054 - INFO - age_substitution_flag
__main__ - 2023-06-15 17:35:32,056 - INFO - age_recode_52
__main__ - 2023-06-15 17:35:32,057 - INFO - age_recode_27
__main__ - 2023-06-15 17:35:32,057 - INFO - age_recode_12
__main__ - 2023-06-15 17:35:32,059 - INFO - infant_age_recode_22
__main__ - 2023-06-15 17:35:32,060 - INFO - place_of_death_and_decedents_status
__main__ - 2023-06-15 17:35:32,060 - INFO - marital_status
__main__ - 2023-06-15 17:35:32,061 - INFO - day_of_week_of_death
__main__ - 2023-06-15 17:35:32,062 - INFO - current_data_year
__main__ - 2023-06-15 17:35:32,063 - INFO - injury_at_wor

If we try to load all of the data we may run out of memory, so let's start by doing one big load of just the columns we initially want across all 11 years. We'll use a list comprehension and concat() to avoid keeping two copies of the data, and we'll tuck the progress logging inside our CSV-reading function. One year (2012) names one column differently (`icd_code_10` instead of `icd_code_10th_revision`), so we have to adjust the requested columns based on the year.

In [None]:
from pandas import concat
from pandas import read_csv
from pandas import DataFrame

# Columns requested from every year's file. The ICD code column is appended
# per year by get_usecols() because its name is not the same for every year.
USECOLS = [
    'activity_code',
    'autopsy',
    'current_data_year',
    'day_of_week_of_death',
    'detail_age',
    'education_reporting_flag',
    'injury_at_work',
    'manner_of_death',
    'marital_status',
    'method_of_disposition',
    'month_of_death',
    'place_of_death_and_decedents_status',
    'race',
    'resident_status',
    'sex',
]

def get_usecols(year: int) -> list:
    """Return the sorted list of column names to request for ``year``.

    The result is ``USECOLS`` plus one ICD code column: ``icd_code_10`` for
    2012 and ``icd_code_10th_revision`` for every other year.
    """
    if year == 2012:
        icd_column = 'icd_code_10'
    else:
        icd_column = 'icd_code_10th_revision'
    return sorted([*USECOLS, icd_column])

def read(filename: str, usecols: list, ) -> DataFrame:
    """Load the requested columns of one CSV file into a DataFrame.

    Logs the file name before reading and the row count afterwards (INFO),
    and the resulting columns (DEBUG). If the loaded frame has an
    ``icd_code_10`` column, it is renamed to ``icd_code_10th_revision``.

    :param filename: passed to ``read_csv`` as ``filepath_or_buffer``.
    :param usecols: column names to load.
    :return: the loaded frame, after the optional rename.
    """
    log = getLogger(name=__name__)
    log.info(msg=f'reading {filename}')
    frame = read_csv(filepath_or_buffer=filename, low_memory=False, usecols=usecols, )
    alternate_name = 'icd_code_10'
    if alternate_name in frame.columns:
        frame = frame.rename(columns={alternate_name: 'icd_code_10th_revision'})
    log.info(msg=f'read {len(frame)} rows.')
    log.debug(msg=frame.columns)
    return frame

# Read one frame per year (2005 through 2015) and stack them into one frame.
# read() does not set an index column, so every yearly frame carries its own
# default 0..n-1 index; ignore_index=True gives the combined frame unique row
# labels instead of repeating each label once per year.
df = concat([read(filename='../input/mortality/{}_data.csv'.format(year),
                  usecols=get_usecols(year), ) for year in range(2005, 2016)],
            ignore_index=True, )

__main__ - 2023-06-15 17:35:32,100 - INFO - reading ../input/mortality/2005_data.csv
__main__ - 2023-06-15 17:35:46,292 - INFO - read 2452506 rows.
__main__ - 2023-06-15 17:35:46,294 - INFO - reading ../input/mortality/2006_data.csv
__main__ - 2023-06-15 17:36:00,153 - INFO - read 2430725 rows.
__main__ - 2023-06-15 17:36:00,154 - INFO - reading ../input/mortality/2007_data.csv


Let's see how much data we have.

In [None]:
# Report how many rows the combined frame holds.
logger.info(msg=f'row count: {len(df)}')

Let's make our first graph: a plot of the total deaths for each year.
And let's use lmplot() to include a trend line.

In [None]:
from seaborn import lmplot
%matplotlib inline
# Count rows per data year, shape the counts into a two-column frame
# ('year', 'deaths'), and plot deaths against year with lmplot.
annual_total_df = (
    df['current_data_year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='deaths')
)
lmplot(x='year', y='deaths', data=annual_total_df)

Let's try splitting out male and female deaths.

In [None]:
# Count rows per (year, sex) pair and draw one lmplot column per sex value.
lmplot(
    data=(
        df.groupby(by=['current_data_year', 'sex'])
        .size()
        .reset_index(name='deaths')
        .rename(columns={'current_data_year': 'year'})
    ),
    x='year',
    y='deaths',
    col='sex',
)

Let's get the race names, add a column for the race name, and split out the deaths by race.

In [None]:
# Keep only the code tables whose key is also a column of df, then add a
# race_name column by replacing each race code with its entry in the race
# table (the table's keys are converted to int before the lookup).
codes = {name: table for name, table in codes.items() if name in df.columns}
race_names = {int(code): label for code, label in codes['race'].items()}
df['race_name'] = df['race'].replace(to_replace=race_names)

In [None]:
# Count rows per (year, race_name) pair and draw one lmplot facet per race
# name, two facets per row, with an independent y axis for each facet.
lmplot(
    data=(
        df.groupby(by=['current_data_year', 'race_name'])
        .size()
        .reset_index(name='deaths')
        .rename(columns={'current_data_year': 'year'})
    ),
    x='year',
    y='deaths',
    col='race_name',
    col_wrap=2,
    seed=1,
    facet_kws={'sharey': False},
)

In [None]:
# Log the keys currently held in `codes` (passed as a dict_keys view, so the
# message is rendered from that object).
logger.info(msg=codes.keys())

In [None]:
# Mark the end of the notebook run in the log.
logger.info(msg='done')