# Exercise 2-2: Write your own code for the Mortality Notebook

In [432]:
import pandas as pd

### Get data

In [433]:
# No pickle file was provided with this exercise, so the data is read
# straight from the CSV download URL on data.cdc.gov.
mortality_url = "https://data.cdc.gov/api/views/v6ab-adf5/rows.csv?accessType=DOWNLOAD"
mortality_data = pd.read_csv(
    filepath_or_buffer=mortality_url,
)

### Clean and prep the data

In [434]:
# NOTE: this cell reads the "DeathRate" column and renames it at the end, so
# it cannot be re-run on its own; re-run the data-loading cell first.

# Strip spaces from the column headers so the columns can be used through
# attribute access and inside query() strings (camel-casing of headers is
# not handled here).
mortality_data.columns = mortality_data.columns.str.replace(" ", "", regex=False)

# Mean-center the death rate: each value minus the mean of the whole column.
# (This only shifts the values; it does not rescale them.)
mortality_data["MeanCentered"] = (
    mortality_data["DeathRate"] - mortality_data["DeathRate"].mean()
)

# Zero-pad the single-digit age groups so every label is two-digit for easy
# printing. The result is assigned back to the column instead of calling
# replace(inplace=True) on the attribute-accessed Series, which is a chained
# in-place update that is not guaranteed to reach the parent DataFrame.
mortality_data["AgeGroup"] = mortality_data["AgeGroup"].replace(
    {"1-4 Years": "01-04 Years", "5-9 Years": "05-09 Years"}
)

# Pivot to a wide table: one row per Year, one column per AgeGroup, holding
# the death rate. Year is moved back out of the index into a regular column
# so the wide table can be melted later.
mortality_wide = mortality_data.pivot(
    index="Year", columns="AgeGroup", values="DeathRate"
).reset_index()

# Rename per directions. The "/" makes the new name an invalid Python
# identifier, so from here on the column must be reached with bracket
# indexing (mortality_data["DeathRate/100K"]) rather than attribute access.
mortality_data = mortality_data.rename(columns={"DeathRate": "DeathRate/100K"})
mortality_data.tail(8)

Unnamed: 0,Year,AgeGroup,DeathRate/100K,MeanCentered
468,2011,15-19 Years,48.9,-144.02416
469,2012,15-19 Years,47.2,-145.72416
470,2013,15-19 Years,44.8,-148.12416
471,2014,15-19 Years,45.5,-147.42416
472,2015,15-19 Years,48.3,-144.62416
473,2016,15-19 Years,51.2,-141.72416
474,2017,15-19 Years,51.5,-141.42416
475,2018,15-19 Years,49.2,-143.72416


In [435]:
# Rows for the 01-04 age group, ordered from highest to lowest death rate.
query_df = (
    mortality_data
    .query('AgeGroup == "01-04 Years"')
    .sort_values('DeathRate/100K', ascending=False)
)
# Report subset: the three highest rows followed by the three lowest.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
report_df = pd.concat([query_df.head(3), query_df.tail(3)])
# display dataframe
report_df

Unnamed: 0,Year,AgeGroup,DeathRate/100K,MeanCentered
0,1900,01-04 Years,1983.8,1790.87584
1,1901,01-04 Years,1695.0,1502.07584
2,1902,01-04 Years,1655.7,1462.77584
117,2017,01-04 Years,24.3,-168.62416
114,2014,01-04 Years,24.0,-168.92416
118,2018,01-04 Years,24.0,-168.92416


# Wide DataFrame

### Review dataframe

In [436]:
# show the first five rows of the wide table (head() defaults to n=5)
mortality_wide.head()

AgeGroup,Year,01-04 Years,05-09 Years,10-14 Years,15-19 Years
0,1900,1983.8,466.1,298.3,484.8
1,1901,1695.0,427.6,273.6,454.4
2,1902,1655.7,403.3,252.5,421.5
3,1903,1542.1,414.7,268.2,434.1
4,1904,1591.5,425.0,305.2,471.4


In [437]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
# of the wide table. Year is a regular column here, so it is summarized
# alongside the age-group death rates.
mortality_wide.describe()

AgeGroup,Year,01-04 Years,05-09 Years,10-14 Years,15-19 Years
count,119.0,119.0,119.0,119.0,119.0
mean,1959.0,383.22605,117.32605,93.769748,177.37479
std,34.496377,500.454211,127.474958,88.404363,138.36229
min,1900.0,24.0,11.4,13.9,44.8
25%,1929.5,50.65,24.0,27.2,85.45
50%,1959.0,109.1,48.4,44.6,106.9
75%,1988.5,577.3,198.9,158.9,283.65
max,2018.0,1983.8,466.1,375.1,777.4


In [438]:
# same summary, transposed: one row per column of the wide table,
# one column per statistic
mortality_wide.describe().transpose()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
AgeGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Year,119.0,1959.0,34.496377,1900.0,1929.5,1959.0,1988.5,2018.0
01-04 Years,119.0,383.22605,500.454211,24.0,50.65,109.1,577.3,1983.8
05-09 Years,119.0,117.32605,127.474958,11.4,24.0,48.4,198.9,466.1
10-14 Years,119.0,93.769748,88.404363,13.9,27.2,44.6,158.9,375.1
15-19 Years,119.0,177.37479,138.36229,44.8,85.45,106.9,283.65,777.4


In [439]:
# keep only the Year column and the 01-04 age group column
mortality_wide[['Year', '01-04 Years']]

AgeGroup,Year,01-04 Years
0,1900,1983.8
1,1901,1695.0
2,1902,1655.7
3,1903,1542.1
4,1904,1591.5
...,...,...
114,2014,24.0
115,2015,24.9
116,2016,25.3
117,2017,24.3


In [440]:
# Filter the wide table to the rows whose Year is between 1915 and 1920,
# both ends included; all age-group columns are kept.
mortality_wide.query('Year >= 1915 and  Year <= 1920')

AgeGroup,Year,01-04 Years,05-09 Years,10-14 Years,15-19 Years
15,1915,924.2,260.6,196.7,330.9
16,1916,1111.5,282.4,205.1,355.8
17,1917,1066.0,290.7,218.9,380.3
18,1918,1573.5,447.9,375.1,777.4
19,1919,928.0,300.0,236.4,438.5
20,1920,987.2,295.2,229.9,402.9


In [441]:
# Report view: rows for 1915 through 1920 (inclusive), showing only the
# Year column and the 01-04 age group column.
(
    mortality_wide
    .query('Year >= 1915 and  Year <= 1920')
    [['Year', '01-04 Years']]
)

AgeGroup,Year,01-04 Years
15,1915,924.2
16,1916,1111.5
17,1917,1066.0
18,1918,1573.5
19,1919,928.0
20,1920,987.2


# Long DataFrame

### Set long dataframe

In [442]:
# Melt the wide table back to long form: Year stays as the identifier, the
# four age-group columns are stacked into an AgeGroup / DeathRate pair.
mortality_long = mortality_wide.melt(
    id_vars='Year',
    value_vars=[
        '01-04 Years',
        '05-09 Years',
        '10-14 Years',
        '15-19 Years',
    ],
    var_name='AgeGroup',
    value_name='DeathRate',
)
mortality_long

Unnamed: 0,Year,AgeGroup,DeathRate
0,1900,01-04 Years,1983.8
1,1901,01-04 Years,1695.0
2,1902,01-04 Years,1655.7
3,1903,01-04 Years,1542.1
4,1904,01-04 Years,1591.5
...,...,...,...
471,2014,15-19 Years,45.5
472,2015,15-19 Years,48.3
473,2016,15-19 Years,51.2
474,2017,15-19 Years,51.5


### Review dataframe

### Calculate median

In [443]:
# median of the DeathRate column across every year and age group
mortality_long['DeathRate'].median()

89.5

### Grouped sums

In [444]:
# Death rate summed across all age groups for each year. The column is
# selected before aggregating because the first positional parameter of
# sum() is numeric_only, not a column name. The double brackets keep the
# result a one-column DataFrame indexed by Year.
mortality_long.groupby('Year')[['DeathRate']].sum()

Unnamed: 0_level_0,DeathRate
Year,Unnamed: 1_level_1
1900,3233.0
1901,2850.6
1902,2733.0
1903,2659.1
1904,2793.1
...,...
2014,95.0
2015,99.5
2016,103.3
2017,102.9
