# Exercise 2-2: Write your own code for the Mortality Notebook

In [832]:
import pandas as pd

### Get data

In [833]:
# my example didn't have a pkl file, so I made it  :)
mortality_url = "https://data.cdc.gov/api/views/v6ab-adf5/rows.csv?accessType=DOWNLOAD"
mortality_data = pd.read_csv(mortality_url)

### Clean and prep the data

In [840]:
# remove spacing in column headers > would need to handle camel case
mortality_data.columns = mortality_data.columns.str.replace(" ", "")

#calculate the mean centered value, normalizing the data
mortality_data['MeanCentered'] = \
    mortality_data.DeathRate - mortality_data.DeathRate.mean()

# make all single digit values two-digit for easy printing
mortality_data.AgeGroup.replace(
    {'1-4 Years':'01-04 Years','5-9 Years':'05-09 Years'},
    inplace = True)

# pivot table to get wide table
mortality_wide = mortality_data.pivot(
    index="Year",columns="AgeGroup",values="DeathRate")

# reset the index to allow for mortality_long melting
mortality_wide.reset_index(inplace=True)

# set mortality long based on wide dataset
mortality_long = mortality_wide.melt(
    id_vars='Year',
    var_name='AgeGroup',
    value_vars=['01-04 Years','05-09 Years','10-14 Years','15-19 Years'],
    value_name="DeathRate"
)

mortality_long.reset_index(inplace=True)

mortality_long["MeanCentered"] = \
    mortality_long.DeathRate - mortality_long.mean()

# rename per directions -- is there any way to query the replaced value as a class for aggregation??
# mortality_data.columns = mortality_data.columns.str.replace("DeathRate","DeathRate/100K")
mortality_wide.tail(8)

  mortality_long.DeathRate - mortality_long.mean()


AgeGroup,Year,01-04 Years,05-09 Years,10-14 Years,15-19 Years
111,2011,26.3,12.1,14.2,48.9
112,2012,26.3,11.4,13.9,47.2
113,2013,25.5,11.8,14.1,44.8
114,2014,24.0,11.5,14.0,45.5
115,2015,24.9,11.7,14.6,48.3
116,2016,25.3,12.2,14.6,51.2
117,2017,24.3,11.6,15.5,51.5
118,2018,24.0,11.5,14.9,49.2


# Wide DataFrame

### Review dataframe

In [835]:
# display df head
mortality_wide.head(5)

Unnamed: 0_level_0,Year,DeathRate,DeathRate,DeathRate,DeathRate,MeanCentered,MeanCentered,MeanCentered,MeanCentered
AgeGroup,Unnamed: 1_level_1,01-04 Years,05-09 Years,10-14 Years,15-19 Years,01-04 Years,05-09 Years,10-14 Years,15-19 Years
0,1900,1983.8,466.1,298.3,484.8,1790.87584,273.17584,105.37584,291.87584
1,1901,1695.0,427.6,273.6,454.4,1502.07584,234.67584,80.67584,261.47584
2,1902,1655.7,403.3,252.5,421.5,1462.77584,210.37584,59.57584,228.57584
3,1903,1542.1,414.7,268.2,434.1,1349.17584,221.77584,75.27584,241.17584
4,1904,1591.5,425.0,305.2,471.4,1398.57584,232.07584,112.27584,278.47584


In [846]:
# show first 5 years and MeanCentered
mortality_wide.info

<bound method DataFrame.info of AgeGroup  Year  01-04 Years  05-09 Years  10-14 Years  15-19 Years
0         1900       1983.8        466.1        298.3        484.8
1         1901       1695.0        427.6        273.6        454.4
2         1902       1655.7        403.3        252.5        421.5
3         1903       1542.1        414.7        268.2        434.1
4         1904       1591.5        425.0        305.2        471.4
..         ...          ...          ...          ...          ...
114       2014         24.0         11.5         14.0         45.5
115       2015         24.9         11.7         14.6         48.3
116       2016         25.3         12.2         14.6         51.2
117       2017         24.3         11.6         15.5         51.5
118       2018         24.0         11.5         14.9         49.2

[119 rows x 5 columns]>

In [None]:
# get summary
mortality_wide.describe()

In [None]:
# transpose summary
mortality_wide.describe().T

In [None]:
# isolate columns
mortality_wide.loc[:, ['Year', '01-04 Years']]

In [None]:
# query wide dataframe
mortality_wide.query('Year >= 1915 and  Year <= 1920')

In [None]:
# query dataframe and isolate columns for display/report
mortality_wide.query('Year >= 1915 and  Year <= 1920').loc[:, ['Year', '01-04 Years']]

In [None]:
# Get mean, median and sum for each year

# add new column called TotalDeaths: rolling sum of deaths for each year

# create line plot to shot total death rate by year

# Long Dataframe

In [None]:
# set mean centered
mortality_long["MeanCentered"] = \
    mortality_long.DeathRate - mortality_long.DeathRate.mean()

# set median
death_rate_median = mortality_long.DeathRate.median()

# replace header per spec
# mortality_long.columns = mortality_long.columns.str.replace("DeathRate","DeathRate/100K")

mortality_long

### Review dataframe

In [None]:
# display top 5 w/ mean centered
mortality_long.head(5)

In [None]:
# median calculated in long dataframe setup
print(f"Median death rate: {death_rate_median}")

In [None]:
# last 6 rows of data from 1915 to 1920
mortality_long.query("Year >= 1915 and Year <= 1920").tail(6)

In [None]:
# year and deaths/100k for 01-04 years
mortality_long.set_index('Year', inplace=True)
mortality_long.query("AgeGroup == '01-04 Years'").loc[:, ['AgeGroup', 'DeathRate']]

In [None]:
# show dataframe in descending order
mortality_long.sort_values('DeathRate', ascending=False)

# create smaller dataframe to create a subset of the first 3 and last 3
report_df = mortality_long.head(3)
report_df = report_df.append(mortality_long.tail(3))
# display dataframe
report_df

### Grouped sums

In [None]:
# as-written to "group" by year and sum the rate
mortality_long.groupby('Year').sum(['DeathRate'])

In [None]:
# how I read the problem
mortality_long.DeathRate.cumsum()