In [1]:
"""
Esmé Middaugh
2019APRIL05

This notebook merges the combined CDC opioid prescribing rate data with the NCHS county-level drug poisoning mortality data,
joining on county FIPS code and year, then computes the annual change in both rates and writes the result to one csv.
"""


'\nEsmé Middaugh\n2019APRIL05\n\nThis notebook merges the combined CDC opioid prescribing rate data with the NCHS county-level drug poisoning mortality data,\njoining on county FIPS code and year, then computes the annual change in both rates and writes the result to one csv.\n'

In [2]:
import os
import pandas as pd 
import statistics

In [3]:
## Loading the raw Prescription and Mortality csvs into DataFrames
## Paths are relative: run this from a folder that sits next to 'raw_data'
pres_fname = "../raw_data/cdc_opioid_prescribing_rate.csv"
mort_fname = "../raw_data/NCHS_-_Drug_Poisoning_Mortality_by_County__United_States.csv"

#An en dash in the prescription file is read in as a missing value
with open(pres_fname) as pres_file:
    pres_df = pd.read_csv(pres_file, na_values="–")

with open(mort_fname) as mort_file:
    mort_df = pd.read_csv(mort_file)

In [4]:
#Data Cleaning -- Prescription
#Keep only rows with no missing values, and only the years from 2005 onward
pres_df = pres_df.dropna().loc[lambda frame: frame['Year'] >= 2005]
pres_df.head()


Unnamed: 0,County,State,FIPS County Code,Prescribing Rate,Year
2,"Anchorage, AK",AK,2020,71.5,2006
7,"Fairbanks North Star, AK",AK,2090,54.7,2006
10,"Juneau, AK",AK,2110,95.3,2006
11,"Kenai Peninsula, AK",AK,2122,89.1,2006
12,"Ketchikan Gateway, AK",AK,2130,144.4,2006


In [5]:
#Data Cleaning -- Mortality 
def avg_mort(mort_range):
    """Convert an 'Estimated Age-adjusted Death Rate' range label into one representative number.

    Parameters:
    mort_range - the range label as a string, e.g. '<2', '4-5.9' or '30+' (or a missing value)

    Return:
    float - for a closed range the midpoint ('4-5.9' -> 4.95); for a '<' label the midpoint
            between 0 and the stated bound ('<2' -> 1.0); for an open-ended '+' label the
            stated bound itself ('30+' -> 30.0); NaN when the label is missing
    """
    #A missing label (None / NaN) is not a string, so the 'in' tests below would raise on it
    if pd.isna(mort_range):
        return float('nan')

    label = mort_range.strip()
    if '+' in label:
        #Open-ended top category: there is no upper bound to average with, so use the lower bound
        return float(label.replace('+', ''))
    elif '<' in label:
        #Bottom category: average of 0 and the stated upper bound
        upper = float(label.replace('<', ''))
        return float(statistics.mean([0.0, upper]))
    else:
        separated = label.split('-')
        lo = float(separated[0])
        hi = float(separated[1])
        return statistics.mean([lo, hi])

In [6]:
#Turn each death rate range label into a single number, then keep the years from 2005 onward
mort_df = (
    mort_df
    .assign(mean_death_rate=lambda frame: frame['Estimated Age-adjusted Death Rate, 16 Categories (in ranges)'].apply(avg_mort))
    .loc[lambda frame: frame['Year'] >= 2005]
)
mort_df.head()

Unnamed: 0,FIPS,Year,State,FIPS State,County,Population,"Estimated Age-adjusted Death Rate, 16 Categories (in ranges)",mean_death_rate
6,1001,2005,Alabama,1,"Autauga County, AL",49676,4-5.9,4.95
7,1001,2006,Alabama,1,"Autauga County, AL",51328,6-7.9,6.95
8,1001,2007,Alabama,1,"Autauga County, AL",52405,6-7.9,6.95
9,1001,2008,Alabama,1,"Autauga County, AL",53277,6-7.9,6.95
10,1001,2009,Alabama,1,"Autauga County, AL",54135,6-7.9,6.95


In [7]:
## Creating Merged Data Frame
#Merging based on County Code and Year. 'Year' is a key with the same name on both sides, so it comes through once;
#'County' and 'State' exist in both inputs and get the default '_x' (prescription) / '_y' (mortality) suffixes
full_county_df = pd.merge(pres_df, mort_df, left_on = ['FIPS County Code', 'Year'], right_on = ['FIPS', 'Year'])

#Dropping Duplicates: 'FIPS County Code' repeats 'FIPS', and the prescription file's county name is the one kept
full_county_df = full_county_df.drop(['FIPS County Code', 'County_y'], axis=1)

#Renaming the Columns by name rather than by position, so a change in column order cannot mislabel a column
full_county_df = full_county_df.rename(columns={
    'County_x': 'County',
    'State_x': 'State_ABBRV',
    'State_y': 'State',
    'mean_death_rate': 'mean_mort_rate',
})

#Reordering the columns (raises a KeyError if any expected column is missing)
full_county_df = full_county_df[['County', 'FIPS', 'State_ABBRV', 'State', 'FIPS State', 'Year', 'Population', 'Prescribing Rate', 'Estimated Age-adjusted Death Rate, 16 Categories (in ranges)', 'mean_mort_rate']]

full_county_df.head()



Unnamed: 0,County,FIPS,State_ABBRV,State,FIPS State,Year,Population,Prescribing Rate,"Estimated Age-adjusted Death Rate, 16 Categories (in ranges)",mean_mort_rate
0,"Anchorage, AK",2020,AK,Alaska,2,2006,280085,71.5,12-13.9,12.95
1,"Fairbanks North Star, AK",2090,AK,Alaska,2,2006,90545,54.7,8-9.9,8.95
2,"Juneau, AK",2110,AK,Alaska,2,2006,30808,95.3,8-9.9,8.95
3,"Kenai Peninsula, AK",2122,AK,Alaska,2,2006,52253,89.1,12-13.9,12.95
4,"Ketchikan Gateway, AK",2130,AK,Alaska,2,2006,13492,144.4,8-9.9,8.95


In [8]:
#Spot check of the merge: show every row for one county (FIPS 1001)
full_county_df.loc[full_county_df['FIPS'] == 1001]

Unnamed: 0,County,FIPS,State_ABBRV,State,FIPS State,Year,Population,Prescribing Rate,"Estimated Age-adjusted Death Rate, 16 Categories (in ranges)",mean_mort_rate
8,"Autauga, AL",1001,AL,Alabama,1,2006,51328,134.8,6-7.9,6.95
2757,"Autauga, AL",1001,AL,Alabama,1,2007,52405,135.8,6-7.9,6.95
5497,"Autauga, AL",1001,AL,Alabama,1,2008,53277,144.9,6-7.9,6.95
8249,"Autauga, AL",1001,AL,Alabama,1,2009,54135,147.5,6-7.9,6.95
10993,"Autauga, AL",1001,AL,Alabama,1,2010,54660,151.7,8-9.9,8.95
13728,"Autauga, AL",1001,AL,Alabama,1,2011,55253,144.1,8-9.9,8.95
16468,"Autauga, AL",1001,AL,Alabama,1,2012,55175,157.8,8-9.9,8.95
19198,"Autauga, AL",1001,AL,Alabama,1,2013,55038,166.7,8-9.9,8.95
21946,"Autauga, AL",1001,AL,Alabama,1,2014,55290,145.3,10-11.9,10.95
24899,"Autauga, AL",1001,AL,Alabama,1,2015,55347,129.9,10-11.9,10.95


In [9]:
def yearly_change(source_df, source_col, change_col):
    """ Calculates the change from the previous recorded year for the given column, county by county

    Parameters:
    source_df - the pd.DataFrame to read from; it needs 'FIPS' and 'Year' columns and is not modified
    source_col - the numeric column you want to calculate annual change for
    change_col - the name of the column to hold these calculations

    Return:
    df - a copy of source_df sorted by FIPS then Year, with change_col added as a numeric column.
         A row's change is its source_col value minus that of the row directly before it, provided
         that row has the same FIPS and an earlier Year; otherwise (including each county's first
         row) the change is NaN.

    Note: the previous row is used even when a county skips a year, so in that case the change
    spans the gap rather than a single year.
    """
    #Sorting first so each county's rows are adjacent and in year order
    df = source_df.sort_values(['FIPS','Year'],ascending=[True,True])

    #Compare every row with the one directly above it, all rows at once.
    #shift() leaves the first row with nothing to compare against, so it can never match the last row
    same_county = df['FIPS'].eq(df['FIPS'].shift())
    later_year = df['Year'].gt(df['Year'].shift())

    #diff() is current row minus previous row; where() blanks out rows that fail either check
    df[change_col] = df[source_col].diff().where(same_county & later_year)
    return df

In [None]:
#Adding the year-over-year change columns for the mortality rate and the prescribing rate
full_county_df = (
    full_county_df
    .pipe(yearly_change, 'mean_mort_rate', 'change_mort_rate')
    .pipe(yearly_change, 'Prescribing Rate', 'change_pres_rate')
)

In [None]:
#Displaying the merged frame to inspect it before it is written out
full_county_df

In [None]:
## Saving the merged dataframe, annual change columns included, as a csv
full_county_df.to_csv(path_or_buf='../clean_data/county_cdc_full_annual_changes.csv')

In [None]:
#Printing the column dtypes and non-null counts of the final frame
full_county_df.info()

In [None]:
#Summary statistics of the final frame (the method is 'describe'; the misspelt 'descibe' raises AttributeError)
full_county_df.describe()