In [1]:
#
#    IMPORTS
#

#    pandas is a third-party package (not part of the Python standard library). If it is not installed, run `%pip install pandas` in its own cell.
import pandas as pd
#

In [2]:
# Load the tab-separated mortality export (1999-2020) into a DataFrame.
# The file also carries free-text footnote rows at the end (visible in the
# `Notes` column of the output); they are removed in later cells.
mortality_data = pd.read_csv(
    filepath_or_buffer="../Raw Data/1999-2020_Mortality Data.txt",
    delimiter="\t",
)
mortality_data

Unnamed: 0,Notes,Year,Year Code,Ten-Year Age Groups,Ten-Year Age Groups Code,Gender,Gender Code,Deaths,Population,Crude Rate
0,,1999.0,1999.0,< 1 year,1,Male,M,12.0,25147.0,Unreliable
1,,1999.0,1999.0,35-44 years,35-44,Female,F,15.0,231425.0,Unreliable
2,,1999.0,1999.0,35-44 years,35-44,Male,M,17.0,239273.0,Unreliable
3,,1999.0,1999.0,45-54 years,45-54,Female,F,36.0,180536.0,19.9
4,,1999.0,1999.0,45-54 years,45-54,Male,M,43.0,173070.0,24.8
...,...,...,...,...,...,...,...,...,...,...
327,July 1 estimates. Population figures for Infan...,,,,,,,,,
328,figures for years 2001 - 2009 differ slightly ...,,,,,,,,,
329,were available at the time of release.,,,,,,,,,
330,6. The population figures used in the calculat...,,,,,,,,,


In [3]:
# Keep only Year, age group, Gender, Deaths and Population; discard the
# code/rate/notes columns. `columns=` already targets the column axis.
mortality_data = mortality_data.drop(
    columns=['Crude Rate', 'Gender Code', 'Ten-Year Age Groups Code', 'Year Code', 'Notes'],
)

In [4]:
# Inspect the frame after the column drop. The trailing rows (327-330 in the
# output) are now entirely NaN because their only content was in `Notes`.
mortality_data

Unnamed: 0,Year,Ten-Year Age Groups,Gender,Deaths,Population
0,1999.0,< 1 year,Male,12.0,25147.0
1,1999.0,35-44 years,Female,15.0,231425.0
2,1999.0,35-44 years,Male,17.0,239273.0
3,1999.0,45-54 years,Female,36.0,180536.0
4,1999.0,45-54 years,Male,43.0,173070.0
...,...,...,...,...,...
327,,,,,
328,,,,,
329,,,,,
330,,,,,


In [5]:
# Remove every row that has no Year. Judging by the displayed output, these are
# the trailing footnote rows of the export, which carry no data values.
# Note: this mutates `mortality_data` in place.
mortality_data.dropna(subset=['Year'], inplace=True)
mortality_data

Unnamed: 0,Year,Ten-Year Age Groups,Gender,Deaths,Population
0,1999.0,< 1 year,Male,12.0,25147.0
1,1999.0,35-44 years,Female,15.0,231425.0
2,1999.0,35-44 years,Male,17.0,239273.0
3,1999.0,45-54 years,Female,36.0,180536.0
4,1999.0,45-54 years,Male,43.0,173070.0
...,...,...,...,...,...
268,2020.0,65-74 years,Male,388.0,192741.0
269,2020.0,75-84 years,Female,487.0,124045.0
270,2020.0,75-84 years,Male,535.0,102384.0
271,2020.0,85+ years,Female,511.0,49896.0


In [6]:
# Normalise column names and dtypes.
#
# - 'Ten-Year Age Groups' is shortened to 'Age Groups'.
# - Deaths / Population: anything non-numeric becomes NaN, NaN becomes 0, and
#   the result is stored as int.
# - Year: the column was read as float (1999.0, ...) because the file's footer
#   rows had no Year. Those rows are already gone, so cast straight to int
#   instead of round-tripping through a datetime and reading `.dt.year` back.
#
# Written as one non-mutating chain so re-running the cell yields the same frame.
mortality_data = (
    mortality_data
    .rename(columns={'Ten-Year Age Groups': 'Age Groups'})
    .assign(
        Deaths=lambda frame: pd.to_numeric(frame['Deaths'], errors='coerce').fillna(0).astype(int),
        Population=lambda frame: pd.to_numeric(frame['Population'], errors='coerce').fillna(0).astype(int),
        Year=lambda frame: pd.to_numeric(frame['Year']).astype(int),
    )
)

mortality_data

Unnamed: 0,Year,Age Groups,Gender,Deaths,Population
0,1999,< 1 year,Male,12,25147
1,1999,35-44 years,Female,15,231425
2,1999,35-44 years,Male,17,239273
3,1999,45-54 years,Female,36,180536
4,1999,45-54 years,Male,43,173070
...,...,...,...,...,...
268,2020,65-74 years,Male,388,192741
269,2020,75-84 years,Female,487,124045
270,2020,75-84 years,Male,535,102384
271,2020,85+ years,Female,511,49896


In [7]:
# List the distinct age-group labels present, in order of first appearance.
print(pd.unique(mortality_data['Age Groups']))

['< 1 year' '35-44 years' '45-54 years' '55-64 years' '65-74 years'
 '75-84 years' '85+ years' '25-34 years']


In [8]:
# Collapse the ten-year age groups into three broad bands and total the deaths
# per (Year, Age Groups, Gender).
#
# Each source label is reduced to the first number it contains
# ('< 1 year' -> 1, '25-34 years' -> 25, '85+ years' -> 85), i.e. the lower
# bound of the group. pd.cut is called with right=False, so every bin is closed
# on the left and open on the right; the edges must therefore be the first age
# of each band:
#   [0, 25)  -> '0-24'
#   [25, 65) -> '25-64'
#   [65, inf) -> '65+'
# (The previous edges 24 and 64 would have put a lower bound of 24 into '25-64'
# and 64 into '65+'. For the labels present in this data the result is the same.)
bins = [0, 25, 65, float('inf')]
labels = ['0-24', '25-64', '65+']

# Raw string so that \d is passed to the regex engine rather than being treated
# as a (invalid) Python string escape.
age_groups_numeric = mortality_data['Age Groups'].str.extract(r'(\d+)')[0].astype(float)

aggregated_data = (
    mortality_data
    # Population is not carried into the aggregate; drop() returns a new frame,
    # so `mortality_data` itself is left untouched.
    .drop(columns=['Population'], errors='ignore')
    .assign(**{
        'Age Groups': pd.cut(age_groups_numeric, bins=bins, labels=labels, right=False),
    })
    # 'Age Groups' is categorical after pd.cut. observed=False keeps a row for
    # every Year / band / Gender combination, with Deaths = 0 where the source
    # has no row for that combination (0 here means "no row", not a reported
    # count). It is passed explicitly so the output does not change with the
    # pandas version's default.
    .groupby(['Year', 'Age Groups', 'Gender'], observed=False)
    .agg({'Deaths': 'sum'})
    .reset_index()
)

aggregated_data

Unnamed: 0,Year,Age Groups,Gender,Deaths
0,1999,0-24,Female,0
1,1999,0-24,Male,12
2,1999,25-64,Female,119
3,1999,25-64,Male,144
4,1999,65+,Female,1103
...,...,...,...,...
127,2020,0-24,Male,0
128,2020,25-64,Female,243
129,2020,25-64,Male,314
130,2020,65+,Female,1354


In [9]:
# Total deaths per year, summed over all age bands and both genders.
aggregated_by_year = aggregated_data.groupby('Year')['Deaths'].sum().reset_index()
aggregated_by_year

Unnamed: 0,Year,Deaths
0,1999,2434
1,2000,2457
2,2001,2379
3,2002,2443
4,2003,2364
5,2004,2245
6,2005,2641
7,2006,2487
8,2007,2215
9,2008,2428


In [10]:
# Total deaths per year, split by gender (summed over the age bands).
# Note: this rebinds `aggregated_by_year`, replacing the year-only totals.
aggregated_by_year = aggregated_data.groupby(['Year', 'Gender'])['Deaths'].sum().reset_index()
aggregated_by_year

Unnamed: 0,Year,Gender,Deaths
0,1999,Female,1222
1,1999,Male,1212
2,2000,Female,1250
3,2000,Male,1207
4,2001,Female,1219
5,2001,Male,1160
6,2002,Female,1304
7,2002,Male,1139
8,2003,Female,1197
9,2003,Male,1167


In [11]:
# Persist the per-year / age-band / gender totals, without the row index.
# NOTE(review): the file name says 1999_2000 but the Year column spans
# 1999-2020; confirm what downstream readers expect before renaming.
aggregated_data.to_csv(
    path_or_buf="../Processed Data/1999_2000_Mortality_Data.csv",
    index=False,
)