# Split Data

In [1]:
import numpy as np
import pandas as pd
import copy as cp
import matplotlib.pyplot as plt

In [2]:
%matplotlib notebook

In [3]:
# Read the raw table from data.csv (path is relative to the working directory)
# and display its (rows, columns) shape.
df0 = pd.read_csv("data.csv")
df0.shape

(2703, 18)

In [4]:
# Preview the first rows of the raw table to see which columns are present.
df0.head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Crude Death Rate,Standard Error for Crude Rate,Low Confidence Limit for Crude Rate,Upper Confidence Limit for Crude Rate,Age-adjusted Rate,Standard Error Age-adjusted Rate,Lower Confidence Limit for Age-adjusted rate,Upper Confidence Limit for Age-adjusted Rate,State Crude Rate in Range,US Crude Rate,US Age-adjusted Rate
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821,8.6,0.1,8.4,8.9,,,,,,14.8,14.7
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283,16.6,0.4,15.9,17.3,,,,,,13.2,13.1
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984,0.3,0.1,0.2,0.5,,,,,,12.0,11.9
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056,21.6,0.3,21.1,22.1,,,,,,8.9,8.9
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571,0.2,0.0,0.1,0.2,,,,,,12.4,12.3


In [5]:
# Discard the age-adjusted statistics and the state-range flag; the crude-rate
# columns are still kept at this stage. Five of these columns are blank in the
# previewed rows; 'US Age-adjusted Rate' is populated but is dropped as well.
df1 = df0.drop(
    labels=[
        'Age-adjusted Rate',
        'Standard Error Age-adjusted Rate',
        'Lower Confidence Limit for Age-adjusted rate',
        'Upper Confidence Limit for Age-adjusted Rate',
        'State Crude Rate in Range',
        'US Age-adjusted Rate',
    ],
    axis='columns',
)
df1.head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Crude Death Rate,Standard Error for Crude Rate,Low Confidence Limit for Crude Rate,Upper Confidence Limit for Crude Rate,US Crude Rate
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821,8.6,0.1,8.4,8.9,14.8
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283,16.6,0.4,15.9,17.3,13.2
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984,0.3,0.1,0.2,0.5,12.0
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056,21.6,0.3,21.1,22.1,8.9
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571,0.2,0.0,0.1,0.2,12.4


In [6]:
# Strip every precomputed crude-rate column, leaving only the identifying
# fields plus the raw 'Deaths' and 'Population' counts.
df2 = df1.drop(
    labels=[
        'Crude Death Rate',
        'Standard Error for Crude Rate',
        'Low Confidence Limit for Crude Rate',
        'Upper Confidence Limit for Crude Rate',
        'US Crude Rate',
    ],
    axis='columns',
)
df2.head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571


In [7]:
# Build the working frame: df2 plus a 'Rate' column holding deaths per 100,000
# population, computed from the raw counts at full precision.
# DataFrame.assign returns a new frame, so df2 is left untouched and no
# explicit deep copy is needed.
df = df2.assign(Rate=100000 * df2['Deaths'] / df2['Population'])

df.head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Rate
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821,8.635779
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283,16.62093
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984,0.319967
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056,21.582385
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571,0.170103


In [8]:
# Distinct years present in the data, in ascending order.
years = sorted(df['Year'].unique())

# One sub-frame per year, stored in the same order as `years`.
yrDFs = [df.loc[df['Year'] == year] for year in years]

# Read the year back out of each sub-frame to confirm the ordering of yrDFs.
lsYRs = [aDF['Year'].unique()[0] for aDF in yrDFs]

print(lsYRs)

[1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]


In [9]:
# Number of per-year frames; should equal the number of distinct years.
len(yrDFs)

17

In [10]:
# Arithmetic cross-check against the total row count reported by df0.shape.
# NOTE(review): 17 is the number of yearly frames; 159 is presumably the number
# of rows per year, but it is typed in by hand rather than computed -- confirm.
17*159

2703

In [11]:
# Peek at the third per-year frame (index 2) to check the split kept all columns.
yrDFs[2].head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Rate
5,2001,Both Sexes,45-54 years,Non-Hispanic White,United States,3850,29733531,12.948344
9,2001,Male,55-64 years,Non-Hispanic Black,United States,129,1100021,11.727049
15,2001,Both Sexes,75+ years,Non-Hispanic White,United States,418,14549012,2.873047
50,2001,Female,75+ years,All Races-All Origins,United States,295,10641163,2.772253
55,2001,Both Sexes,Less than 15 years,Non-Hispanic Black,United States,24,9438333,0.254282


In [12]:
# Scatter of deaths against population for the third per-year frame
# (yrDFs[2]), with each point coloured by its computed 'Rate'.
yrDFs[2].plot.scatter(x='Population', y='Deaths', c='Rate', colormap='viridis')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x11dc92390>

In [13]:
# Same deaths-vs-population scatter over every row of df, coloured by 'Year'.
df.plot.scatter(x='Population', y='Deaths', c='Year', colormap='viridis')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x11fdcbe10>

In [14]:
# Deaths against population, drawn as one marker series per 'Sex' value.
groups = df.groupby('Sex')

fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    # linestyle='' suppresses connecting lines so only the markers are drawn.
    ax.plot(group['Population'], group['Deaths'], marker='o', linestyle='', ms=5, label=name)
# Label the axes and title the figure so it can be read without the code.
ax.set(xlabel='Population', ylabel='Deaths', title='Deaths vs. population by sex')
ax.legend(title='Sex')

plt.show()

<IPython.core.display.Javascript object>

In [15]:
# Deaths against population, drawn as one marker series per 'Age' bracket.
groups = df.groupby('Age')

fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    # linestyle='' suppresses connecting lines so only the markers are drawn.
    ax.plot(group['Population'], group['Deaths'], marker='o', linestyle='', ms=5, label=name)
# Label the axes and title the figure so it can be read without the code.
ax.set(xlabel='Population', ylabel='Deaths', title='Deaths vs. population by age group')
ax.legend(title='Age')

plt.show()

<IPython.core.display.Javascript object>

In [16]:
# Deaths against population, drawn as one marker series per
# 'Race and Hispanic Origin' value.
groups = df.groupby('Race and Hispanic Origin')

fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    # linestyle='' suppresses connecting lines so only the markers are drawn.
    ax.plot(group['Population'], group['Deaths'], marker='o', linestyle='', ms=5, label=name)
# Label the axes and title the figure so it can be read without the code.
ax.set(xlabel='Population', ylabel='Deaths',
       title='Deaths vs. population by race and Hispanic origin')
ax.legend(title='Race and Hispanic Origin')

plt.show()

<IPython.core.display.Javascript object>

In [17]:
# Deaths against population, drawn as one marker series per 'State' value.
groups = df.groupby('State')

fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    # linestyle='' suppresses connecting lines so only the markers are drawn.
    ax.plot(group['Population'], group['Deaths'], marker='o', linestyle='', ms=5, label=name)
# Label the axes and title the figure so it can be read without the code.
ax.set(xlabel='Population', ylabel='Deaths', title='Deaths vs. population by state')
# 'State' has dozens of distinct values, so a single-column default legend
# would cover the plot; use a small multi-column legend instead.
ax.legend(title='State', ncol=3, fontsize='x-small')

plt.show()

<IPython.core.display.Javascript object>

In [18]:
# Number of rows for each distinct 'State' value, most frequent first.
df['State'].value_counts()

United States           1836
Vermont                   17
Alaska                    17
Iowa                      17
West Virginia             17
Wyoming                   17
Michigan                  17
Missouri                  17
South Dakota              17
Montana                   17
Hawaii                    17
Mississippi               17
Illinois                  17
Delaware                  17
North Dakota              17
Kentucky                  17
South Carolina            17
Alabama                   17
Idaho                     17
Louisiana                 17
Colorado                  17
New York                  17
District of Columbia      17
Nevada                    17
Wisconsin                 17
Rhode Island              17
Maryland                  17
Oklahoma                  17
New Jersey                17
Tennessee                 17
Washington                17
Connecticut               17
Florida                   17
North Carolina            17
Arizona       

In [19]:
# Number of rows for each distinct 'Sex' value, most frequent first.
df['Sex'].value_counts()

Both Sexes    1479
Female         612
Male           612
Name: Sex, dtype: int64

In [20]:
# Number of rows for each distinct 'Age' bracket, most frequent first.
df['Age'].value_counts()

All Ages              1071
65-74 years            204
55-64 years            204
45-54 years            204
25-34 years            204
35-44 years            204
15-24 years            204
Less than 15 years     204
75+ years              204
Name: Age, dtype: int64

In [21]:
# Number of rows for each distinct 'Race and Hispanic Origin' value,
# most frequent first.
df['Race and Hispanic Origin'].value_counts()

All Races-All Origins    1326
Hispanic                  459
Non-Hispanic White        459
Non-Hispanic Black        459
Name: Race and Hispanic Origin, dtype: int64