In [28]:
import pandas as pd
import numpy as np

In [29]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = (12,12)

In [30]:
drugoverdose = pd.read_csv("drugoverdose.2.clean.csv")

In [31]:
drugoverdose.head()

Unnamed: 0,state,year,month,deaths
0,AK,2015,January,4034.0
1,AK,2015,February,4084.0
2,AK,2015,March,4101.0
3,AK,2015,April,4133.0
4,AK,2015,May,4196.0


In [32]:
def cardinality_categorical(df):
    n_records = len(df)
    for column in df.select_dtypes([object]):
        print("{} | uniques/records: {:.3f} | Minimum observations: {:.3f}".format(
            column, 
            len(df[column].unique())/n_records,
            df[column].value_counts().min()
        ))        

cardinality_categorical(drugoverdose)

state | uniques/records: 0.014 | Minimum observations: 35.000
month | uniques/records: 0.003 | Minimum observations: 220.000


In [33]:
#Make States into regions

In [34]:
drugoverdose.state.unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KY', 'KS', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'TN', 'SD', 'TX',
       'US', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'YC'], dtype=object)

In [35]:
midwest = ['ND', 'SD', 'NE', 'KS', 'MO', 'IA', 'MN', 'WI', 'IL', 'IN', 'MI', 'OH']
west = ['WA', 'OR', 'CA', 'NV', 'ID', 'MT', 'WY', 'UT', 'CO', 'AZ', 'NM', 'HI']
south = ['TX', 'OK', 'AR', 'LA', 'MS', 'TN', 'KY', 'AL', 'WV', 'MD', 'DE', 'VA', 'NC', 'SV', 'GA', 'SC', 'FL']
northeast = ['AK','PA', 'NY', 'NJ', 'CT', 'RI', 'MA', 'NH', 'VT', 'ME']

drugoverdose.loc[drugoverdose['state'].isin(midwest),
                 'region'] = 'midwest'

drugoverdose.loc[drugoverdose['state'].isin(south),
                 'region'] = 'south'

drugoverdose.loc[drugoverdose['state'].isin(west),
                 'region'] = 'west'

drugoverdose.loc[drugoverdose['state'].isin(northeast),
                 'region'] = 'northeast'


In [36]:
drugoverdose.region.dtype

dtype('O')

In [37]:
drugoverdose.region = drugoverdose.region.astype("category")
drugoverdose.region.dtype

CategoricalDtype(categories=['midwest', 'northeast', 'south', 'west'], ordered=False)

In [38]:
drugoverdose.region.value_counts()

south        1346
west          716
northeast     701
midwest       665
Name: region, dtype: int64

In [39]:
drugoverdose.head()

Unnamed: 0,state,year,month,deaths,region
0,AK,2015,January,4034.0,northeast
1,AK,2015,February,4084.0,northeast
2,AK,2015,March,4101.0,northeast
3,AK,2015,April,4133.0,northeast
4,AK,2015,May,4196.0,northeast


In [40]:
summer = ['June', 'July', 'August']
fall = ['September', 'October', 'November']
winter = ['December', 'January', 'February']
spring = ['March', 'April', 'May']



In [41]:
drugoverdose.loc[drugoverdose['month'].isin(summer),
                 'season'] = 'summer'

drugoverdose.loc[drugoverdose['month'].isin(spring),
                 'season'] = 'spring'

drugoverdose.loc[drugoverdose['month'].isin(fall),
                 'season'] = 'fall'

drugoverdose.loc[drugoverdose['month'].isin(winter),
                 'season'] = 'winter'

In [42]:
drugoverdose.season = drugoverdose.season.astype("category")
drugoverdose.season.dtype

CategoricalDtype(categories=['fall', 'spring', 'summer', 'winter'], ordered=False)

In [43]:
drugoverdose.head()

Unnamed: 0,state,year,month,deaths,region,season
0,AK,2015,January,4034.0,northeast,winter
1,AK,2015,February,4084.0,northeast,winter
2,AK,2015,March,4101.0,northeast,spring
3,AK,2015,April,4133.0,northeast,spring
4,AK,2015,May,4196.0,northeast,spring


In [44]:
#Created groupings of regions in the US and seasons

In [45]:
deathsizes = ['very little', 'little', 'average', 'large', 'very large']

drugoverdose['death_quantitytype'] = pd.qcut(drugoverdose['deaths'],
                                            5, deathsizes)

In [46]:
drugoverdose.head()

Unnamed: 0,state,year,month,deaths,region,season,death_quantitytype
0,AK,2015,January,4034.0,northeast,winter,average
1,AK,2015,February,4084.0,northeast,winter,average
2,AK,2015,March,4101.0,northeast,spring,average
3,AK,2015,April,4133.0,northeast,spring,average
4,AK,2015,May,4196.0,northeast,spring,average


In [47]:
#created grouping of death amount types

In [50]:
drugoverdose.to_pickle("drugoverdose.3.grouped.pkl")