In [1]:
# Import dependencies for reading the CSV file
import pandas as pd
from pathlib import Path

In [2]:
# Read in the csv file and display the dataframe.
# The path is joined with pathlib's "/" operator rather than written with a
# literal backslash: "\M" is an invalid string escape sequence, and a
# backslash is only treated as a path separator on Windows.
file_path = Path("data") / "Multiple_Cause_of_Death,_1999-2014_v1.1.csv"
df = pd.read_csv(file_path)

df

Unnamed: 0,index,State,Year,Deaths,Population,Crude Rate,Crude Rate Lower 95% Confidence Interval,Crude Rate Upper 95% Confidence Interval,Prescriptions Dispensed by US Retailers in that year (millions)
0,0,Alabama,1999,39,4430141,0.9,0.6,1.2,116
1,1,Alabama,2000,46,4447100,1,0.8,1.4,126
2,2,Alabama,2001,67,4467634,1.5,1.2,1.9,138
3,3,Alabama,2002,75,4480089,1.7,1.3,2.1,142
4,4,Alabama,2003,54,4503491,1.2,0.9,1.6,149
...,...,...,...,...,...,...,...,...,...
811,811,Wyoming,2010,49,563626,8.7,6.4,11.5,210
812,812,Wyoming,2011,47,568158,8.3,6.1,11,219
813,813,Wyoming,2012,47,576412,8.2,6,10.8,217
814,814,Wyoming,2013,52,582658,8.9,6.7,11.7,207


In [3]:
# Clean up the data
def clean_data(file_path):
    """Load the CSV at ``file_path`` and return a cleaned DataFrame.

    Steps, in order:
      1. Read the CSV with ``pd.read_csv``.
      2. Drop the column named "index" when it is present.
      3. Normalise every column label: strip surrounding whitespace,
         lower-case it, and replace spaces with underscores.
      4. Cast the "year" column to ``int``.
      5. Convert the measurement columns to numeric; values that cannot be
         parsed become NaN.
      6. Fill missing values in numeric columns with that column's median.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with normalised column labels.

    Raises
    ------
    KeyError
        If "year" or any of the measurement columns is absent after the
        labels have been normalised.
    """
    measurement_columns = (
        "deaths",
        "population",
        "crude_rate",
        "crude_rate_lower_95%_confidence_interval",
        "crude_rate_upper_95%_confidence_interval",
        "prescriptions_dispensed_by_us_retailers_in_that_year_(millions)",
    )

    def _normalise(label):
        # "Crude Rate " -> "crude_rate"
        return label.strip().lower().replace(" ", "_")

    raw = pd.read_csv(file_path)
    without_index = raw.drop(columns=["index"]) if "index" in raw.columns else raw
    renamed = without_index.rename(columns=_normalise)

    typed = renamed.assign(year=renamed["year"].astype(int))

    # errors="coerce" turns unparseable entries into NaN so they can be
    # filled by the median step below.
    coerced = {
        name: pd.to_numeric(typed[name], errors="coerce")
        for name in measurement_columns
    }
    typed = typed.assign(**coerced)

    return typed.fillna(typed.median(numeric_only=True))

# Build the input path with pathlib's "/" operator instead of a literal
# backslash: "\M" is an invalid string escape sequence, and a backslash is
# only treated as a path separator on Windows.
cleaned_df = clean_data(Path("data") / "Multiple_Cause_of_Death,_1999-2014_v1.1.csv")
cleaned_df



Unnamed: 0,state,year,deaths,population,crude_rate,crude_rate_lower_95%_confidence_interval,crude_rate_upper_95%_confidence_interval,prescriptions_dispensed_by_us_retailers_in_that_year_(millions)
0,Alabama,1999,39.0,4430141,0.9,0.6,1.2,116
1,Alabama,2000,46.0,4447100,1.0,0.8,1.4,126
2,Alabama,2001,67.0,4467634,1.5,1.2,1.9,138
3,Alabama,2002,75.0,4480089,1.7,1.3,2.1,142
4,Alabama,2003,54.0,4503491,1.2,0.9,1.6,149
...,...,...,...,...,...,...,...,...
811,Wyoming,2010,49.0,563626,8.7,6.4,11.5,210
812,Wyoming,2011,47.0,568158,8.3,6.1,11.0,219
813,Wyoming,2012,47.0,576412,8.2,6.0,10.8,217
814,Wyoming,2013,52.0,582658,8.9,6.7,11.7,207


In [4]:
# Write the cleaned frame to disk as CSV; index=False leaves the row index out of the file.
cleaned_df.to_csv(path_or_buf="data/cleaned_mortality_data.csv", index=False)