In [7]:
#Dependencies
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
from us import states
import gmaps
import config as conf
import scipy.stats as stats
# Register the Google Maps API key with gmaps; `config.py` must expose it as `api_key`.
# NOTE(review): the recorded output of this cell is
# "AttributeError: module 'config' has no attribute 'api_key'" - confirm the
# attribute name actually defined in config.py before re-running.
gmaps.configure(conf.api_key)

Populating the interactive namespace from numpy and matplotlib


AttributeError: module 'config' has no attribute 'api_key'

In [None]:
#Set all the figures on same size
# Go through the explicitly imported `plt` alias: the bare `pylab` name is only
# in scope as a side effect of the `%pylab inline` magic, while `plt.rcParams`
# is the same matplotlib settings object and works without that magic.
plt.rcParams['figure.figsize'] = (10, 5)

In [None]:
# Load the causes-of-death CSV into the base frame used by the rest of the
# notebook, then preview its first five rows.
death_pd_base = pd.read_csv("Causes_of_Death.csv")
death_pd_base.head(5)

In [None]:
#Create table with mean of Age-adjusted death rates per cause from 1999-2016
#Drop "All causes" to focus on 10 leading causes
# The rate column is selected BEFORE aggregating: calling .mean() on the whole
# grouped frame tries to average every remaining column and raises TypeError on
# non-numeric ones in pandas >= 2.0.
ppcauses = (
    death_pd_base
    .groupby(["Year", "Cause Name"])["Age-adjusted Death Rate"]
    .mean()
    .unstack(level=-1)   # rows: Year, columns: Cause Name
)

# Average each cause across all years (one value per cause).
means = ppcauses.drop("All causes", axis=1).mean()

# Two-column table sorted ascending by rate; sort_values returns a new frame
# instead of mutating in place, so re-running the cell is safe.
leadcause = means.reset_index()
leadcause.columns = ["Cause Name", "Rate"]
leadcause = leadcause.sort_values("Rate")
leadcause

In [None]:
# Horizontal bar chart of the mean age-adjusted death rate per cause.
# Bars are drawn in the row order of `leadcause`; the figure is also written to a PNG.
x, y = leadcause["Cause Name"], leadcause["Rate"]

plt.barh(x, y, color='green', alpha=1, align="center")

# Labels
plt.title("Top 10 Leading Causes of Death in USA (1999-2016)")
plt.xlabel("Age-Adjusted Death Rate per 100,000 habitants")

# Dotted black grid behind the bars
plt.grid(linestyle='--', dashes=(1, 4), linewidth=0.5, color='black')

# Save before show()
plt.savefig("Leading Causes of Death USA.png")
plt.show()

In [None]:
#Group the years and plot the trend of the diseases in each group of years
# pd.cut uses right-closed intervals by default, so these edges produce
# (1998, 2004], (2004, 2010] and (2010, 2016].
bins = [1998, 2004, 2010, 2016]
group_names = ["1999-2004", "2005-2010", "2011-2016"]

# Adds a "Year Range" column to the base frame.
death_pd_base["Year Range"] = pd.cut(death_pd_base["Year"], bins, labels=group_names)

# Select the rate column before aggregating: averaging the whole grouped frame
# raises TypeError on its non-numeric columns in pandas >= 2.0.
deaths_year = (
    death_pd_base
    .groupby(["Year Range", "Cause Name"])["Age-adjusted Death Rate"]
    .mean()
    .unstack(level=-1)            # rows: Year Range, columns: Cause Name
    .drop("All causes", axis=1)   # keep only the individual causes
)
deaths_year

In [None]:
# Mean age-adjusted death rate per year (rows) and cause (columns), without the
# "All causes" aggregate. The rate column is selected before .mean() because
# averaging the whole grouped frame raises TypeError on non-numeric columns
# in pandas >= 2.0.
deaths_year2 = (
    death_pd_base
    .groupby(["Year", "Cause Name"])["Age-adjusted Death Rate"]
    .mean()
    .unstack(level=-1)
    .drop("All causes", axis=1)
)
deaths_year2.head()

In [None]:
# Keep only the two leading causes from the per-year table.
deaths_Heart_Cancer2 = deaths_year2.loc[:, ["Heart disease", "Cancer"]]
deaths_Heart_Cancer2.head(5)

In [None]:
# Yearly trend lines for heart disease and cancer; the figure is also saved as a PNG.
deaths_Heart_Cancer2.plot.line(marker='o', alpha=1, linewidth=4)

# Labels
plt.title("Top 2 Leading Causes of Death in USA (1999-2016)")
plt.xlabel("Year")
plt.ylabel("Age-adjusted Death Rate per 100,000 habitants")

# Opaque legend with a black frame, plus a dotted black grid
plt.legend(loc='best', framealpha=1, edgecolor='k')
plt.grid(linestyle='--', dashes=(1, 4), linewidth=0.5, color='black')

plt.savefig("Top 2 Leading Causes of Death USA.png")
plt.show()

In [None]:
# The remaining eight causes (everything except heart disease and cancer).
deaths_others2 = deaths_year2[[
    "Alzheimer's disease",
    "CLRD",
    "Diabetes",
    "Influenza and pneumonia",
    "Kidney disease",
    "Stroke",
    "Suicide",
    "Unintentional injuries",
]]
deaths_others2.head(5)

In [None]:
# Yearly trend lines for the eight remaining causes on a larger canvas;
# the figure is also saved as a PNG.
deaths_others2.plot.line(marker='o', alpha=1, linewidth=4, figsize=(20, 10))

# Labels
plt.title("Leading Causes of Death in USA")
plt.xlabel("Year")
plt.ylabel("Age-adjusted Death Rate per 100,000 habitants")

# Opaque legend with a black frame, plus a dotted black grid
plt.legend(loc='best', framealpha=1, edgecolor='k')
plt.grid(linestyle='--', dashes=(1, 4), linewidth=0.5, color='black')

plt.savefig("Bottom 8 Leading Causes of Death USA.png")
plt.show()

# Null Hypothesis #1

-The observed downward trend in the death rate for Heart disease and Cancer (that is, the decrease in the age-adjusted death rate per 100,000 inhabitants) does not represent a significant improvement over the years.

In [None]:
#Test the null hypothesis "Heart" with ANOVA test
# Isolate the heart-disease column and move "Year" from the index into a
# regular column so it can be binned in the next step.
deaths_Heart = deaths_year2.loc[:, ["Heart disease"]]
dH = deaths_Heart.reset_index()
dH.head(5)

In [None]:
# Label every year with its six-year period; pd.cut's default right-closed
# intervals give (1998, 2004], (2004, 2010], (2010, 2016].
bins = [1998, 2004, 2010, 2016]
group_names = ["1999-2004", "2005-2010", "2011-2016"]

dH["Year Range"] = pd.cut(x=dH["Year"], bins=bins, labels=group_names)
dH.head(5)

In [None]:
# One box per period showing the spread of the yearly heart-disease rates.
dH.boxplot(column="Heart disease", by="Year Range", figsize=(10, 5))

In [None]:
# Split the yearly heart-disease rates into the three periods and run a
# one-way ANOVA across them.
group1hd = dH.loc[dH["Year Range"] == "1999-2004", "Heart disease"]
group2hd = dH.loc[dH["Year Range"] == "2005-2010", "Heart disease"]
group3hd = dH.loc[dH["Year Range"] == "2011-2016", "Heart disease"]

stats.f_oneway(group1hd, group2hd, group3hd)

In [None]:
#Test the null hypothesis "Cancer" with ANOVA test
# Isolate the cancer column and move "Year" from the index into a regular
# column so it can be binned in the next step.
deaths_Cancer = deaths_year2.loc[:, ["Cancer"]]
dC = deaths_Cancer.reset_index()
dC.head(5)

In [None]:
# Label every year with its six-year period; pd.cut's default right-closed
# intervals give (1998, 2004], (2004, 2010], (2010, 2016].
bins = [1998, 2004, 2010, 2016]
group_names = ["1999-2004", "2005-2010", "2011-2016"]

dC["Year Range"] = pd.cut(x=dC["Year"], bins=bins, labels=group_names)
dC.head(5)

In [None]:
# One box per period showing the spread of the yearly cancer rates.
dC.boxplot(column="Cancer", by="Year Range", figsize=(10, 5))

In [None]:
# Split the yearly cancer rates into the three periods and run a one-way
# ANOVA across them.
group1c = dC.loc[dC["Year Range"] == "1999-2004", "Cancer"]
group2c = dC.loc[dC["Year Range"] == "2005-2010", "Cancer"]
group3c = dC.loc[dC["Year Range"] == "2011-2016", "Cancer"]

stats.f_oneway(group1c, group2c, group3c)

In [None]:
# Narrow the base frame to State / Cause Name / Year / Deaths and group it by
# (State, Cause Name). The displayed table holds the first non-null Year and
# Deaths value found in each group, not the full per-year data.
by_state = death_pd_base.loc[:, ['State', 'Cause Name', 'Year', 'Deaths']]
state_grouped = by_state.groupby(by=['State', 'Cause Name'])
state_grouped.first()

In [None]:
#Create a table with mean of age-adjusted rate per State and disease during 1999-2016 period
# Exclude the "All causes" aggregate so only individual causes remain.
death_pd = death_pd_base[death_pd_base['Cause Name'] != 'All causes']

# Only the columns needed for the per-state average.
by_state_adr = death_pd[['State', 'Cause Name', 'Age-adjusted Death Rate']]

# Group the frame built in THIS cell. The previous version grouped `by_state`
# from the cell above, which still contains "All causes" and the Deaths column,
# and then called .first() without using the result.
grouped = by_state_adr.groupby(['State', 'Cause Name'])

# Select the rate column before averaging: .mean() on a whole grouped frame
# raises TypeError on non-numeric columns in pandas >= 2.0.
death_by_state_adr = grouped["Age-adjusted Death Rate"].mean().unstack(level=-1)

# Drop the national "United States" row so only the per-state rows remain.
death_by_state_adr2 = death_by_state_adr.drop("United States")
death_by_state_adr2.head()

In [None]:
# Per-state mean heart-disease rate, with "State" as a regular column.
state_Heart = death_by_state_adr2.loc[:, ["Heart disease"]]
state_HD = state_Heart.reset_index()
state_HD.head(5)

In [None]:
# Rank states from highest to lowest heart-disease rate and keep the top ten.
state_HD = state_HD.sort_values(by="Heart disease", ascending=False)
state_HD2 = state_HD.head(10)
state_HD2

In [None]:
# Hard-coded (latitude, longitude) markers for the heart-disease map. Each pair
# matches the entry of the named state in the notebook's `coord_states` table.
# NOTE(review): presumably these are the ten states shown in `state_HD2`;
# the list is not derived from it, so verify after any data change.
coordinatesHD = [
    (32.741646, -89.678696),   # Mississippi
    (35.565342, -96.928917),   # Oklahoma
    (32.806671, -86.791130),   # Alabama
    (38.897438, -77.026817),   # District of Columbia
    (31.169546, -91.867805),   # Louisiana
    (34.969704, -92.373123),   # Arkansas
    (38.491226, -80.954453),   # West Virginia
    (37.668140, -84.670067),   # Kentucky
    (35.747845, -86.692345),   # Tennessee
    (42.165726, -74.948051),   # New York
]

# CSS-style layout passed to the gmaps figure widget.
figure_layout = dict(
    width='1000px',
    height='500px',
    border='1px solid black',
    padding='1px',
    margin='0 auto 0 auto',
)

# Build the map, drop one marker per coordinate, and display it.
fig = gmaps.figure(layout=figure_layout)
markers = gmaps.marker_layer(coordinatesHD)
fig.add_layer(markers)
fig

In [None]:
# Per-state mean cancer rate, with "State" as a regular column.
state_Cancer = death_by_state_adr2.loc[:, ["Cancer"]]
state_C = state_Cancer.reset_index()
state_C.head(5)

In [None]:
# Rank states from highest to lowest cancer rate and keep the top ten.
state_C = state_C.sort_values(by="Cancer", ascending=False)
state_C2 = state_C.head(10)
state_C2

In [None]:
# Hard-coded (latitude, longitude) markers for the cancer map. Each pair
# matches the entry of the named state in the notebook's `coord_states` table.
# NOTE(review): presumably these are the ten states shown in `state_C2`;
# the list is not derived from it, so verify after any data change.
coordinatesC = [
    (32.741646, -89.678696),   # Mississippi
    (35.565342, -96.928917),   # Oklahoma
    (32.806671, -86.791130),   # Alabama
    (38.897438, -77.026817),   # District of Columbia
    (31.169546, -91.867805),   # Louisiana
    (34.969704, -92.373123),   # Arkansas
    (38.491226, -80.954453),   # West Virginia
    (37.668140, -84.670067),   # Kentucky
    (35.747845, -86.692345),   # Tennessee
    (39.849426, -86.258278),   # Indiana
]

# CSS-style layout passed to the gmaps figure widget.
figure_layout = dict(
    width='1000px',
    height='500px',
    border='1px solid black',
    padding='1px',
    margin='0 auto 0 auto',
)

# Build the map, drop one marker per coordinate, and display it.
fig = gmaps.figure(layout=figure_layout)
markers = gmaps.marker_layer(coordinatesC)
fig.add_layer(markers)
fig

In [None]:
# Per-state mean Alzheimer's rate, with "State" as a regular column.
state_Alz = death_by_state_adr2[["Alzheimer's disease"]].reset_index()
state_Alz.head(5)

In [None]:
# Rank states from highest to lowest Alzheimer's rate and keep the top ten.
state_Alz = state_Alz.sort_values(by="Alzheimer's disease", ascending=False)
state_Alz2 = state_Alz.head(10)
state_Alz2

In [None]:
# Hard-coded (latitude, longitude) markers for the Alzheimer's map. Each pair
# matches the entry of the named state in the notebook's `coord_states` table.
# NOTE(review): presumably these are the ten states shown in `state_Alz2`;
# the list is not derived from it, so verify after any data change.
coordinatesAlz = [
    (47.400902, -121.490494),  # Washington
    (47.528912, -99.784012),   # North Dakota
    (35.747845, -86.692345),   # Tennessee
    (33.856892, -80.945007),   # South Carolina
    (31.169546, -91.867805),   # Louisiana
    (33.729759, -111.431221),  # Arizona
    (32.806671, -86.791130),   # Alabama
    (44.299782, -99.438828),   # South Dakota
    (37.668140, -84.670067),   # Kentucky
    (44.572021, -122.070938),  # Oregon
]

# CSS-style layout passed to the gmaps figure widget.
figure_layout = dict(
    width='1000px',
    height='500px',
    border='1px solid black',
    padding='1px',
    margin='0 auto 0 auto',
)

# Build the map, drop one marker per coordinate, and display it.
fig = gmaps.figure(layout=figure_layout)
markers = gmaps.marker_layer(coordinatesAlz)
fig.add_layer(markers)
fig

# Null Hypothesis #2

-The suicide death rate is not significantly related to whether a state has lower average temperatures throughout the year.

In [None]:
#Create a table with mean of age-adjusted rate per State and suicide
# Select the rate column before averaging: .mean() on the whole grouped frame
# raises TypeError on non-numeric columns in pandas >= 2.0.
death_group_df = (
    death_pd_base
    .groupby(["State", "Cause Name"])["Age-adjusted Death Rate"]
    .mean()
    .unstack(level=-1)   # rows: State, columns: Cause Name
)

# Suicide column only, without the national "United States" row, returned as
# a two-column frame: State, Suicide.
death_suicide_df = (
    death_group_df["Suicide"]
    .drop("United States")
    .reset_index()
)
death_suicide_df.head()

In [None]:
# Taken from https://inkplant.com/code/state-latitudes-longitudes, nearly state centroid
# Maps state name -> [latitude, longitude].
coord_states = {"Alabama": [32.806671, -86.791130], "Alaska": [61.370716, -152.404419],
"Arizona": [33.729759, -111.431221], "Arkansas": [34.969704, -92.373123],
"California": [36.116203, -119.681564], "Colorado": [39.059811, -105.311104],
"Connecticut": [41.597782, -72.755371], "Delaware": [39.318523, -75.507141],
"District of Columbia": [38.897438, -77.026817], "Florida": [27.766279, -81.686783],
"Georgia": [33.040619, -83.643074], "Hawaii": [21.094318, -157.498337],
"Idaho": [44.240459, -114.478828], "Illinois": [40.349457, -88.986137],
"Indiana": [39.849426, -86.258278], "Iowa": [42.011539, -93.210526],
"Kansas": [38.526600, -96.726486], "Kentucky": [37.668140, -84.670067],
"Louisiana": [31.169546, -91.867805], "Maine": [44.693947, -69.381927],
"Maryland": [39.063946, -76.802101], "Massachusetts": [42.230171, -71.530106],
"Michigan": [43.326618, -84.536095], "Minnesota": [45.694454, -93.900192],
"Mississippi": [32.741646, -89.678696], "Missouri": [38.456085, -92.288368],
"Montana": [46.921925, -110.454353], "Nebraska": [41.125370, -98.268082],
"Nevada": [38.313515, -117.055374], "New Hampshire": [43.452492, -71.563896],
"New Jersey": [40.298904, -74.521011], "New Mexico": [34.840515, -106.248482],
"New York": [42.165726, -74.948051], "North Carolina": [35.630066, -79.806419],
"North Dakota": [47.528912, -99.784012], "Ohio": [40.388783, -82.764915],
"Oklahoma": [35.565342, -96.928917], "Oregon": [44.572021, -122.070938],
"Pennsylvania": [40.590752, -77.209755], "Rhode Island": [41.680893, -71.511780],
"South Carolina": [33.856892, -80.945007], "South Dakota": [44.299782, -99.438828],
"Tennessee": [35.747845, -86.692345], "Texas": [31.054487, -97.563461],
"Utah": [40.150032, -111.862434], "Vermont": [44.045876, -72.710686],
"Virginia": [37.769337, -78.169968], "Washington": [47.400902, -121.490494],
"West Virginia": [38.491226, -80.954453], "Wisconsin": [44.268543, -89.616508],
"Wyoming": [42.755966, -107.302490]}

# One row per state: State, Lat, Lon (rows follow the dict's order).
coord_states_df = pd.DataFrame(
    [[name, lat, lon] for name, (lat, lon) in coord_states.items()],
    columns=["State", "Lat", "Lon"],
)

# Attach coordinates to the per-state suicide rates (inner join on State),
# then pull out the two inputs used by the map cells as floats.
suicide_df = death_suicide_df.merge(coord_states_df, on='State')
locations = suicide_df[["Lat", "Lon"]].astype(float)
suicides = suicide_df["Suicide"].astype(float)

suicide_df.head()

In [None]:
# Inputs for the symbol layer: one [lat, lon] pair and one integer size per state.
# The coordinates are taken from `locations` (the merged frame) rather than
# from the raw `coord_states` dict, so every coordinate is guaranteed to line
# up with the rate at the same position even if the merge dropped or
# reordered a state.
coord_states1 = locations.values.tolist()

# Plain Python list of the rates, plus the same values truncated to integers
# by int() for use as symbol sizes.
list_suicides = list(suicides)
int_suicides = [int(rate) for rate in list_suicides]

In [None]:
#Create a heat map showing death rates for suicide
# Two layers are drawn on one gmaps figure: a heatmap weighted by the suicide
# rate, and red symbols sized by the integer-truncated rate.
fig = gmaps.figure()

# Heatmap points are the state coordinates; each point is weighted by its rate.
suicide_layer = gmaps.heatmap_layer(locations, weights=suicides)

#Adjust suicide_layer setting to help with heatmap dissipating on zoom
suicide_layer.dissipating = False
suicide_layer.max_intensity = 40
suicide_layer.point_radius = 5
suicide_layer.gradient = ['white','yellow','green']

# NOTE(review): `suicides` is rebound to a plain list here but is not used
# again in this cell; the symbol layer below is sized by `int_suicides`.
suicides = list(suicides)
# One red symbol per entry of `coord_states1`, with per-symbol sizes from
# `int_suicides` - assumes both lists have the same length and order; TODO confirm.
suicide1_layer = gmaps.symbol_layer(coord_states1, fill_color='red',
                                    stroke_color='red', scale=int_suicides)


# Heatmap first, symbols second.
fig.add_layer(suicide_layer)
fig.add_layer(suicide1_layer)

fig

In [None]:
#Test the null hypothesis with ANOVA test?
# Taken from http://www.usa.com/rank/us--average-temperature--state-rank.htm, cleaned in excel
# Average temperature per state, sorted by state name with a clean 0..n-1 index.
avg_temp_df = (
    pd.read_csv("Average_Temperature_States.csv")
    .sort_values('State')
    .reset_index(drop=True)
)
avg_temp_df = avg_temp_df[["State", "Avg Temp"]]

# Join the temperatures onto the suicide rates BY STATE NAME. The previous
# version copied the "Avg Temp" column over by row position, which silently
# pairs a state with another state's temperature whenever the two tables do
# not contain exactly the same states in the same order. A left join keeps
# every state of `death_suicide_df`; states missing from the temperature file
# get NaN and fall out of both groups below, as before.
cold_hot_df = pd.merge(
    death_suicide_df[['State', 'Suicide']],
    avg_temp_df,
    on='State',
    how='left',
)

# Split at an average temperature of 55 (presumably degrees Fahrenheit -
# confirm against the source table). .assign() builds new frames, so no
# SettingWithCopyWarning is raised by writing into a filtered slice.
cold_df = cold_hot_df[cold_hot_df['Avg Temp'] <= 55].assign(**{'Type State': 'Cold State'})
hot_df = cold_hot_df[cold_hot_df['Avg Temp'] > 55].assign(**{'Type State': 'Hot State'})

# Recombine both groups and restore the original state order.
suicide_sign_df = pd.concat([cold_df, hot_df]).sort_index()

suicide_sign_df.head(10)

In [None]:
# Compare suicide rates between cold and hot states: box plot first, then a
# one-way ANOVA on the two groups.
suicide_sign_df1 = suicide_sign_df.loc[:, ["Type State", "Suicide"]]
suicide_sign_df1.boxplot(column="Suicide", by="Type State", figsize=(20, 10))

#Individual groups
Cold = suicide_sign_df1.loc[suicide_sign_df1["Type State"] == "Cold State", "Suicide"]
Hot = suicide_sign_df1.loc[suicide_sign_df1["Type State"] == "Hot State", "Suicide"]

# Turns off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

#Perform the ANOVA
stats.f_oneway(Cold, Hot)

In [None]:
# Per-state mean age-adjusted rates for the two most common causes (heart
# disease and cancer), joined with each state's coordinates for mapping.
most_common_causes_df = (
    death_group_df[["Heart disease", "Cancer"]]
    .drop("United States")   # remove the national aggregate row
    .reset_index()           # State becomes a regular column
)
most_common_causes_df1 = coord_states_df.merge(most_common_causes_df, on='State')
most_common_causes_df1.head(5)

In [None]:
# Bucket every state's heart-disease and cancer rate into three labelled
# ranges. pd.cut's default intervals are right-closed, e.g. (130, 180],
# (180, 230], (230, 280]; values outside the outer edges become NaN.
bins_HD = [130, 180, 230, 280]
group_names2 = ["130-180", "181-230", "231-280"]
most_common_causes_df1["Heart disease Range"] = pd.cut(
    x=most_common_causes_df1["Heart disease"],
    bins=bins_HD,
    labels=group_names2,
)

bins_cancer = [130, 160, 190, 220]
group_names1 = ["130-160", "161-190", "191-220"]
most_common_causes_df1["Cancer Range"] = pd.cut(
    x=most_common_causes_df1["Cancer"],
    bins=bins_cancer,
    labels=group_names1,
)

most_common_causes_df1.head(5)

In [None]:
# Split the states into three frames per cause, from the highest range
# (group1) down to the lowest (group3).

# Heart disease
group1_hd_df1 = most_common_causes_df1.loc[most_common_causes_df1['Heart disease Range'] == '231-280']
group2_hd_df1 = most_common_causes_df1.loc[most_common_causes_df1['Heart disease Range'] == '181-230']
group3_hd_df1 = most_common_causes_df1.loc[most_common_causes_df1['Heart disease Range'] == '130-180']

# Cancer
group1_c_df1 = most_common_causes_df1.loc[most_common_causes_df1['Cancer Range'] == '191-220']
group2_c_df1 = most_common_causes_df1.loc[most_common_causes_df1['Cancer Range'] == '161-190']
group3_c_df1 = most_common_causes_df1.loc[most_common_causes_df1['Cancer Range'] == '130-160']

In [None]:
# Heart-disease map: one symbol per state. The highest range is drawn as the
# largest red symbol, the middle range in RGB (255, 165, 20), and the lowest
# range as the smallest yellow symbol.
fig4 = gmaps.figure()

heart_group1_layer1 = gmaps.symbol_layer(
    group1_hd_df1[['Lat', 'Lon']],
    fill_color='red', stroke_color='red', scale=10,
)
heart_group2_layer1 = gmaps.symbol_layer(
    group2_hd_df1[['Lat', 'Lon']],
    fill_color=(255,165,20), stroke_color=(255,165,20), scale=8,
)
heart_group3_layer1 = gmaps.symbol_layer(
    group3_hd_df1[['Lat', 'Lon']],
    fill_color='yellow', stroke_color="yellow", scale=6,
)

# Layers are added from the highest range to the lowest.
for layer in (heart_group1_layer1, heart_group2_layer1, heart_group3_layer1):
    fig4.add_layer(layer)
fig4

In [None]:
# Cancer map: one symbol per state. The highest range is drawn as the largest
# red symbol, the middle range in RGB (255, 165, 20), and the lowest range as
# the smallest yellow symbol.
fig5 = gmaps.figure()

cancer_group1_layer1 = gmaps.symbol_layer(
    group1_c_df1[['Lat', 'Lon']],
    fill_color='red', stroke_color='red', scale=10,
)
cancer_group2_layer1 = gmaps.symbol_layer(
    group2_c_df1[['Lat', 'Lon']],
    fill_color=(255,165,20), stroke_color=(255,165,20), scale=8,
)
cancer_group3_layer1 = gmaps.symbol_layer(
    group3_c_df1[['Lat', 'Lon']],
    fill_color='yellow', stroke_color='yellow', scale=6,
)

# Layers are added from the highest range to the lowest.
for layer in (cancer_group1_layer1, cancer_group2_layer1, cancer_group3_layer1):
    fig5.add_layer(layer)
fig5