In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import pickle
import os

In [2]:
%matplotlib inline

In [3]:
# Create folder to store csv files:
dir_name = './DATA'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

# Creating Database

## Vehicle Data

In [4]:
years = [2016, 2017, 2018, 2019, 2020]

In [5]:
def clean_state(x):
    if type(x) != str:
        print(x)
    for char in '(1234567890)':
        x = x.replace(char,'')
    x = x.strip(' ')
    return x

# Read in dataframe
vehicle_dfs = {}
for year in years:
    url = "https://www.fhwa.dot.gov/policyinformation/statistics/{}/mv1.cfm".format(year)
    html = requests.get(url).content
    tempdf = pd.read_html(html)[0]

    # Select correct columns to use
    tempdf.columns = list(range(16))
    tempdf = tempdf[[0,3,6,9,12]]
    # Rename columns
    tempdf.columns = ['State', 'Automobiles', 'Buses', 'Trucks', 'Motorcycles']
    # add 'Year' Column
    tempdf['Year'] = year
    # Reorder Column
    tempdf = tempdf[['State', 'Year', 'Automobiles', 'Buses', 'Trucks', 'Motorcycles']]
    # Drop Columns with NaNs
    tempdf.dropna(inplace=True)
    # Drop 'Total' Row:
    tempdf = tempdf[tempdf['State'] != 'Total']
    # Clean 'State' Column
    tempdf['State'] = tempdf['State'].map(clean_state)
    vehicle_dfs[year] = tempdf

In [6]:
# Combine dfs into one
dfs = [df for df in vehicle_dfs.values()]
vehicle_df = pd.concat(dfs, axis=0, ignore_index=True)
vehicle_df[['Automobiles', 'Buses', 'Trucks', 'Motorcycles']] = vehicle_df[['Automobiles', 'Buses', 'Trucks', 'Motorcycles']].astype(int)
vehicle_df

Unnamed: 0,State,Year,Automobiles,Buses,Trucks,Motorcycles
0,Alabama,2016,2284443,6238,3067917,109703
1,Alaska,2016,183259,8119,571287,31949
2,Arizona,2016,2377962,9016,3233330,166583
3,Arkansas,2016,942604,11931,1762765,90838
4,California,2016,14768392,98622,14511913,842106
...,...,...,...,...,...,...
250,Virginia,2020,3057254,35463,4329294,184441
251,Washington,2020,2800370,24172,4211411,221448
252,West Virginia,2020,508663,3155,1102015,43529
253,Wisconsin,2020,1901497,14941,3423523,276310


In [7]:
vehicle_df.to_csv(os.path.join(dir_name, 'vehicle_data.csv'), index=False)

## Population data

Before running this, download [*popest-annual.xls*](https://www.icip.iastate.edu/tables/population/states-estimates) and place into **./DATA**

Also, install xlrd

In [8]:
years = list(range(2010, 2020))

In [9]:
# Install xlrd to run this!
pop_df = pd.read_excel(os.path.join(dir_name, 'popest-annual.xls'), sheet_name='States', skiprows=[0,1,2,3,4,5,7,8,9,10,11])
pop_df.drop(columns = ['Fips', 'Estimates Base (4/1/2010)'], inplace=True)

In [10]:
pop_df.head()

Unnamed: 0,Area,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Alabama,4785514.0,4799642.0,4816632.0,4831586.0,4843737.0,4854803.0,4866824.0,4877989.0,4891628.0,4907965.0
1,Alaska,713982.0,722349.0,730810.0,737626.0,737075.0,738430.0,742575.0,740983.0,736624.0,733603.0
2,Arizona,6407342.0,6473416.0,6556344.0,6634690.0,6732873.0,6832810.0,6944767.0,7048088.0,7164228.0,7291843.0
3,Arkansas,2921998.0,2941038.0,2952876.0,2960459.0,2968759.0,2979732.0,2991815.0,3003855.0,3012161.0,3020985.0
4,California,37319550.0,37636311.0,37944551.0,38253768.0,38586706.0,38904296.0,39149186.0,39337785.0,39437463.0,39437610.0


In [11]:
year_dfs = {}
for year in years:
    new_df = pop_df[['Area', year]]
    new_df = new_df.rename(columns={'Area': 'State', year:'Population'})
    # Remove all not-state rows:
    new_df = new_df.iloc[:51]
    new_df['Population'] = new_df['Population'].astype(int)
    new_df['Year'] = year
    new_df = new_df[['State', 'Year', 'Population']]

    year_dfs[year] = new_df

In [12]:
dfs = [df for df in year_dfs.values()]
pop_df = pd.concat(dfs, axis=0, ignore_index=True)
pop_df

Unnamed: 0,State,Year,Population
0,Alabama,2010,4785514
1,Alaska,2010,713982
2,Arizona,2010,6407342
3,Arkansas,2010,2921998
4,California,2010,37319550
...,...,...,...
505,Virginia,2019,8556642
506,Washington,2019,7614024
507,West Virginia,2019,1795263
508,Wisconsin,2019,5824581


In [13]:
pop_df.to_csv(os.path.join(dir_name, 'pop_data.csv'), index=False)

## Drivers License Minimum Age

In [14]:
url = "https://www.verywellfamily.com/driving-age-by-state-2611172#toc-learners-permits"
html = requests.get(url).content
minage_df = pd.read_html(html)[0]
minage_df.columns = minage_df.iloc[0]
minage_df.drop(index=[0], inplace=True)
minage_df.set_index('State', inplace=True)
minage_df.rename(columns = {"Learner's Permit":'Learners_Permit', "Restricted License":"Restricted_License", "Full License":"Full_License"}, inplace=True)
minage_df.head()

Unnamed: 0_level_0,Learners_Permit,Restricted_License,Full_License
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,15,16,17
Alaska,14,16,"16, 6 mos."
Arizona,"15, 6 mos.",16,"16, 6 mos."
Arkansas,14,16,18
California,"15, 6 mos.",16,17


In [15]:
def clean_age(val):
    if type(val) == float:
        return val
    age_split = val.split(', ')
    age = float(age_split[0])
    if len(age_split) > 1:
        age += float(age_split[1][0])/12
    return age

minage_df['Learners_Permit'] = minage_df['Learners_Permit'].map(clean_age)
minage_df['Restricted_License'] = minage_df['Restricted_License'].map(clean_age)
minage_df['Full_License'] = minage_df['Full_License'].map(clean_age)

In [16]:
minage_df.head()

Unnamed: 0_level_0,Learners_Permit,Restricted_License,Full_License
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,15.0,16.0,17.0
Alaska,14.0,16.0,16.5
Arizona,15.5,16.0,16.5
Arkansas,14.0,16.0,18.0
California,15.5,16.0,17.0


In [17]:
minage_df.to_csv(os.path.join(dir_name, 'min_age_data.csv'))

## Speed Limit data

In [18]:
url = "https://www.iihs.org/topics/speed/speed-limit-laws"
html = requests.get(url).content
limit_df = pd.read_html(html)[0]
limit_df.columns = ['State', 'Rural_Interstate', 'Urban_Interstate', 'Other_Limited_Access', 'Other']
limit_df.head()

Unnamed: 0,State,Rural_Interstate,Urban_Interstate,Other_Limited_Access,Other
0,Alabama,70,65,65,65
1,Alaska,65,55,65,55
2,Arizona,75,65,65,65 trucks: 65
3,Arkansas,75 trucks: 70,65,75 trucks: 70,65
4,California,70; trucks: 55,65 trucks: 55,70 trucks: 55,65 trucks: 55


In [19]:
def clean_speed(val):
    if type(val) == float:
        return val
    else:
        try:
            return int(str(val)[:2])
        except:
            return int(str(val)[5:7])

for col in ['Rural_Interstate', 'Urban_Interstate', 'Other_Limited_Access', 'Other']:
    limit_df[col] = limit_df[col].map(clean_speed)

In [20]:
limit_df.head()

Unnamed: 0,State,Rural_Interstate,Urban_Interstate,Other_Limited_Access,Other
0,Alabama,70.0,65,65.0,65
1,Alaska,65.0,55,65.0,55
2,Arizona,75.0,65,65.0,65
3,Arkansas,75.0,65,75.0,65
4,California,70.0,65,70.0,65


In [21]:
limit_df.to_csv(os.path.join(dir_name, 'speedlimit_data.csv'), index=False)

## Licensed Drivers

Download csv from [here](https://datahub.transportation.gov/Roadways-and-Bridges/Licensed-Drivers-by-state-gender-and-age-group/xfkb-3bxx)  
Place it in **./DATA**, renamed to *licensed_drivers.csv*


In [22]:
license_df = pd.read_csv(os.path.join(dir_name, 'licensed_drivers.csv'))
license_df.rename(columns={'Cohort':'Age'}, inplace=True)
# Reorder columns
license_df = license_df[['State', 'Year', 'Gender', 'Age', 'Drivers']]
# Only keep 2010 and above
license_df = license_df[license_df['Year'] >= 2010]
# Cleaning
license_df.fillna(0, inplace=True)
license_df['Drivers'] = license_df['Drivers'].astype(int)
license_df.replace({'Under 16':'15-'}, inplace=True)
license_df.head()

Unnamed: 0,State,Year,Gender,Age,Drivers
0,Alabama,2017,Male,15-,0
1,Alaska,2017,Male,15-,0
2,Arizona,2017,Male,15-,0
3,Arkansas,2017,Male,15-,0
4,California,2017,Male,15-,0


In [23]:
early_20s = license_df[license_df['Age'].isin(['19', '20', '21', '22', '23', '24'])]
early_20s = early_20s.groupby(['State', 'Year', 'Gender'], as_index=False).sum(numeric_only=True)
early_20s['Age'] = '19-24'
early_20s = early_20s[['State', 'Year', 'Gender', 'Age', 'Drivers']]

In [24]:
license_df = license_df[~license_df['Age'].isin(['19', '20', '21', '22', '23', '24'])]

In [25]:
license_df = pd.concat([license_df, early_20s])

In [26]:
# Sort
license_df.sort_values(['Year', 'State', 'Gender', 'Age'], inplace=True)

In [27]:
def get_age(val):
    bounds = val.strip('+').split('-')
    return np.mean([int(x) for x in bounds if len(x) > 0])
    

license_df['Age_Numeric'] = license_df['Age'].map(get_age)

In [28]:
license_df

Unnamed: 0,State,Year,Gender,Age,Drivers,Age_Numeric
17595,Alabama,2010,Female,15-,15050,15.0
17646,Alabama,2010,Female,16,22814,16.0
17697,Alabama,2010,Female,17,25723,17.0
17748,Alabama,2010,Female,18,27903,18.0
0,Alabama,2010,Female,19-24,192651,21.5
...,...,...,...,...,...,...
57806,Wyoming,2018,Male,65-69,17043,67.0
57807,Wyoming,2018,Male,70-74,12434,72.0
57808,Wyoming,2018,Male,75-79,7936,77.0
57809,Wyoming,2018,Male,80-84,4579,82.0


In [29]:
license_df.to_csv(os.path.join(dir_name, 'license_data.csv'), index=False)

## Temperature Data

In [30]:
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']

In [31]:
all = {}
for month in months:
    url = "https://www.extremeweatherwatch.com/us-state-averages/month-{}".format(month)
    html = requests.get(url).content
    tempdf = pd.read_html(html)[0]
    tempdf.columns = ['State', 'High_F', 'Low_F', 'Precipitation_in']
    tempdf.set_index('State', inplace=True)
    tempdf['Month'] = month[0].upper() + month[1:]
    tempdf['Avg_F'] = (tempdf['High_F'] + tempdf['Low_F'])/2
    tempdf= tempdf[['Month', 'Low_F', 'Avg_F', 'High_F', 'Precipitation_in']]
    tempdf.replace({'Hawaii[1]':'Hawaii'}, inplace=True)
    all[month] = tempdf
    
temperature_df = pd.concat(all.values(), axis=0)

In [32]:
temperature_df.to_csv(os.path.join(dir_name, 'temperature_data.csv'), index=True)

## Precipitation and accidents per year and state

#### Weather ds

In [33]:
weatherdf = pd.read_csv('DATA/WeatherEvents_Jan2016-Dec2021.csv')

In [34]:
weatherdf.head(2)

Unnamed: 0,EventId,Type,Severity,StartTime(UTC),EndTime(UTC),Precipitation(in),TimeZone,AirportCode,LocationLat,LocationLng,City,County,State,ZipCode
0,W-1,Snow,Light,2016-01-06 23:14:00,2016-01-07 00:34:00,0.0,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
1,W-2,Snow,Light,2016-01-07 04:14:00,2016-01-07 04:54:00,0.0,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0


In [35]:
weatherdf.rename(columns={
    'StartTime(UTC)': 'StartTime_UTC', 
    'EndTime(UTC)': 'EndTime_UTC',
    'Precipitation(in)': 'Precipitation_in'
}, inplace=True)

In [36]:
weatherdf['StartTime_UTC'] = pd.to_datetime(weatherdf['StartTime_UTC'], format='%Y/%m/%d %H:%M:%S')
weatherdf['EndTime_UTC'] = pd.to_datetime(weatherdf['EndTime_UTC'], format='%Y/%m/%d %H:%M:%S')
weatherdf['StartTime_UTC'] = pd.DatetimeIndex(weatherdf['StartTime_UTC']).year
weatherdf['EndTime_UTC'] = pd.DatetimeIndex(weatherdf['EndTime_UTC']).year

In [37]:
weatherdf = weatherdf.groupby(['State', 'StartTime_UTC'], as_index=False)['Precipitation_in'].sum()
weatherdf = weatherdf.rename(columns = {'StartTime_UTC':'Year', 'size':'Precipitation_in'})
weatherdf.head(10)

Unnamed: 0,State,Year,Precipitation_in
0,AL,2016,3115.76
1,AL,2017,4177.86
2,AL,2018,3912.48
3,AL,2019,3608.45
4,AL,2020,4229.7
5,AL,2021,4200.09
6,AR,2016,9124.29
7,AR,2017,2751.58
8,AR,2018,3409.64
9,AR,2019,3797.28


In [38]:
weatherdf = weatherdf[weatherdf.Year != 2021]
weatherdf.head(10)

Unnamed: 0,State,Year,Precipitation_in
0,AL,2016,3115.76
1,AL,2017,4177.86
2,AL,2018,3912.48
3,AL,2019,3608.45
4,AL,2020,4229.7
6,AR,2016,9124.29
7,AR,2017,2751.58
8,AR,2018,3409.64
9,AR,2019,3797.28
10,AR,2020,3351.14


#### Accidents ds

In [39]:
accidentsdf = pd.read_pickle('DATA/US_Accidents_June20_Cleaned.pkl')

In [40]:
accidentsdf.head(2)

Unnamed: 0,ID,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance_mi,Description,Street,...,Traffic_Signal,Sunrise_Sunset,Rain_Bool,Snow_Bool,Thunder_Bool,Wind_Bool,Fog_Bool,Hail_Bool,Sand_Dust_Bool,StateFull
0,A-1,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,Right lane blocked due to accident on I-70 Eas...,I-70 E,...,False,1.0,True,False,False,False,False,False,False,Ohio
1,A-2,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.01,Accident on Brice Rd at Tussing Rd. Expect del...,Brice Rd,...,False,1.0,True,False,False,False,False,False,False,Ohio


In [41]:
accidentsdf['Start_Time'] = pd.to_datetime(accidentsdf['Start_Time'], format='%Y/%m/%d %H:%M:%S')
accidentsdf['End_Time'] = pd.to_datetime(accidentsdf['End_Time'], format='%Y/%m/%d %H:%M:%S')
accidentsdf['Start_Time'] = pd.DatetimeIndex(accidentsdf['Start_Time']).year
accidentsdf['End_Time'] = pd.DatetimeIndex(accidentsdf['End_Time']).year

In [42]:
accidentsdf = accidentsdf.groupby(['State', 'Start_Time'], as_index=False)['Severity'].count()
accidentsdf = accidentsdf.rename(columns = {'Start_Time':'Year', 'Severity':'No_accidents'})
accidentsdf.head(10)

Unnamed: 0,State,Year,No_accidents
0,AL,2016,135
1,AL,2017,2904
2,AL,2018,14096
3,AL,2019,19238
4,AL,2020,8251
5,AZ,2016,2768
6,AZ,2017,12328
7,AZ,2018,22699
8,AZ,2019,24526
9,AZ,2020,16257


#### Merge

In [43]:
totaldf = pd.merge(weatherdf, accidentsdf, on=['State', 'Year'])
totaldf.head(10)

Unnamed: 0,State,Year,Precipitation_in,No_accidents
0,AL,2016,3115.76,135
1,AL,2017,4177.86,2904
2,AL,2018,3912.48,14096
3,AL,2019,3608.45,19238
4,AL,2020,4229.7,8251
5,AZ,2016,1118.57,2768
6,AZ,2017,531.17,12328
7,AZ,2018,536.22,22699
8,AZ,2019,451.01,24526
9,AZ,2020,204.26,16257


#### Add full state name

In [44]:
url = 'https://www.50states.com/abbreviations.htm'
html = requests.get(url).content
df_list = pd.read_html(html)
websitedf = df_list[0]

In [45]:
websitedf.columns =['stateFull', 'State', 'skip']

In [46]:
websitedf.head()

Unnamed: 0,stateFull,State,skip
0,Alabama,AL,Ala.
1,Alaska,AK,Alaska
2,Arizona,AZ,Ariz.
3,Arkansas,AR,Ark.
4,California,CA,Calif.


In [47]:
mergedf = pd.merge(totaldf, websitedf, on = 'State')

In [48]:
mergedf.head()

Unnamed: 0,State,Year,Precipitation_in,No_accidents,stateFull,skip
0,AL,2016,3115.76,135,Alabama,Ala.
1,AL,2017,4177.86,2904,Alabama,Ala.
2,AL,2018,3912.48,14096,Alabama,Ala.
3,AL,2019,3608.45,19238,Alabama,Ala.
4,AL,2020,4229.7,8251,Alabama,Ala.


In [49]:
mergedf = mergedf[['State', 'Year', 'Precipitation_in', 'No_accidents', 'stateFull']]

In [50]:
mergedf.head()

Unnamed: 0,State,Year,Precipitation_in,No_accidents,stateFull
0,AL,2016,3115.76,135,Alabama
1,AL,2017,4177.86,2904,Alabama
2,AL,2018,3912.48,14096,Alabama
3,AL,2019,3608.45,19238,Alabama
4,AL,2020,4229.7,8251,Alabama


In [51]:
mergedf.to_csv('DATA/pre_acc_state_year.csv')

## Accidents per speed limit per state

#### Speed limit

In [52]:
speeddf = pd.read_csv('DATA/speedlimit_data.csv')
speeddf.head()

Unnamed: 0,State,Rural_Interstate,Urban_Interstate,Other_Limited_Access,Other
0,Alabama,70.0,65,65.0,65
1,Alaska,65.0,55,65.0,55
2,Arizona,75.0,65,65.0,65
3,Arkansas,75.0,65,75.0,65
4,California,70.0,65,70.0,65


#### Accidents

In [53]:
accidentsdf = pd.read_pickle('DATA/US_Accidents_June20_Cleaned.pkl')

In [54]:
accidentsdf['Start_Time'] = pd.DatetimeIndex(accidentsdf['Start_Time']).year
accidentsdf['End_Time'] = pd.DatetimeIndex(accidentsdf['End_Time']).year
accidentsdf = accidentsdf[(accidentsdf.Start_Time == 2016) | (accidentsdf.Start_Time == 2017) | (accidentsdf.Start_Time == 2018)]

In [55]:
accidentsdf = accidentsdf.groupby('StateFull', as_index=False)['Start_Time'].count()
accidentsdf = accidentsdf.rename(columns = {'StateFull':'State', 'Start_Time':'No_accidents'})
accidentsdf.head()

Unnamed: 0,State,No_accidents
0,Alabama,17135
1,Arizona,37795
2,California,449832
3,Colorado,22539
4,Connecticut,17349


#### Licensed

In [56]:
licensedf = pd.read_csv('DATA/license_data.csv')

In [57]:
keep_years = [2016, 2017, 2018]
licensedf = licensedf[(licensedf.Year == keep_years[0]) | (licensedf.Year == keep_years[1]) | (licensedf.Year == keep_years[2])]

In [58]:
licensedf = licensedf.groupby('State', as_index=False)['Drivers'].sum()
licensedf['Drivers'] = licensedf['Drivers']/3
licensedf.head()

Unnamed: 0,State,Drivers
0,Alabama,3965506.0
1,Alaska,535067.7
2,Arizona,5177414.0
3,Arkansas,2317967.0
4,California,26671990.0


#### Merge

In [59]:
tempdf = pd.merge(speeddf, accidentsdf, on='State')
mergedf = pd.merge(tempdf, licensedf, on='State')

In [60]:
mergedf['acc_per_driver'] = mergedf['No_accidents'] / mergedf['Drivers']*100
mergedf.head()

Unnamed: 0,State,Rural_Interstate,Urban_Interstate,Other_Limited_Access,Other,No_accidents,Drivers,acc_per_driver
0,Alabama,70.0,65,65.0,65,17135,3965506.0,0.432101
1,Arizona,75.0,65,65.0,65,37795,5177414.0,0.729998
2,California,70.0,65,70.0,65,449832,26671990.0,1.686533
3,Colorado,75.0,65,65.0,65,22539,4155810.0,0.542349
4,Connecticut,65.0,55,65.0,55,17349,2601204.0,0.66696


In [61]:
mergedf.to_csv('DATA/acc_sl_state.csv')

## Accidents per driver per month

#### Weather ds

In [62]:
start_date = '2016'
end_date = '2018'

In [63]:
weatherdf = pd.read_csv('DATA/WeatherEvents_Jan2016-Dec2021.csv')

In [64]:
weatherdf.head(2)

Unnamed: 0,EventId,Type,Severity,StartTime(UTC),EndTime(UTC),Precipitation(in),TimeZone,AirportCode,LocationLat,LocationLng,City,County,State,ZipCode
0,W-1,Snow,Light,2016-01-06 23:14:00,2016-01-07 00:34:00,0.0,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
1,W-2,Snow,Light,2016-01-07 04:14:00,2016-01-07 04:54:00,0.0,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0


In [65]:
mask = (weatherdf['StartTime(UTC)'] > start_date) & (weatherdf['StartTime(UTC)'] <= end_date)
weatherdf = weatherdf.loc[mask]

In [66]:
weatherdf['Month'] = pd.DatetimeIndex(weatherdf['StartTime(UTC)']).month_name()

In [67]:
weatherdf = weatherdf.groupby('Month', as_index = False)['Precipitation(in)'].sum()

In [68]:
weatherdf=weatherdf.rename(columns = {'Precipitation(in)':'Precipitation'})

In [69]:
weatherdf.head(12)

Unnamed: 0,Month,Precipitation
0,April,25595.16
1,August,28942.84
2,December,18506.39
3,February,14756.89
4,January,23398.14
5,July,26699.05
6,June,26542.57
7,March,21972.4
8,May,27319.81
9,November,14435.62


#### Accidents ds

In [70]:
accidentsdf = pd.read_pickle('DATA/US_Accidents_June20_Cleaned.pkl')

In [71]:
accidentsdf.head(2)

Unnamed: 0,ID,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance_mi,Description,Street,...,Traffic_Signal,Sunrise_Sunset,Rain_Bool,Snow_Bool,Thunder_Bool,Wind_Bool,Fog_Bool,Hail_Bool,Sand_Dust_Bool,StateFull
0,A-1,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,Right lane blocked due to accident on I-70 Eas...,I-70 E,...,False,1.0,True,False,False,False,False,False,False,Ohio
1,A-2,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.01,Accident on Brice Rd at Tussing Rd. Expect del...,Brice Rd,...,False,1.0,True,False,False,False,False,False,False,Ohio


In [72]:
mask = (accidentsdf['Start_Time'] > start_date) & (accidentsdf['Start_Time'] <= end_date)
accidentsdf = accidentsdf.loc[mask]

In [73]:
accidentsdf['Month'] = pd.DatetimeIndex(accidentsdf['Start_Time']).month_name()

In [74]:
accidentsdf = accidentsdf.groupby('Month', as_index = False)['ID'].size()

In [75]:
accidentsdf = accidentsdf.rename(columns = {'size':'No_accidents'})

In [76]:
accidentsdf.head(12)

Unnamed: 0,Month,No_accidents
0,April,65981
1,August,136937
2,December,130710
3,February,52334
4,January,55449
5,July,88514
6,June,76473
7,March,63367
8,May,58554
9,November,134750


#### Sum license

In [77]:
licensedf = pd.read_csv('DATA/license_data.csv')

In [78]:
licensedf.head()

Unnamed: 0,State,Year,Gender,Age,Drivers,Age_Numeric
0,Alabama,2010,Female,15-,15050,15.0
1,Alabama,2010,Female,16,22814,16.0
2,Alabama,2010,Female,17,25723,17.0
3,Alabama,2010,Female,18,27903,18.0
4,Alabama,2010,Female,19-24,192651,21.5


In [79]:
licensedf = licensedf[(licensedf.Year.isin([2016, 2017, 2018]))]

In [80]:
tempdf = licensedf.groupby('Year', as_index = False)['Drivers'].sum()

In [81]:
tempdf.head()

Unnamed: 0,Year,Drivers
0,2016,221994424
1,2017,225346257
2,2018,226065619


In [82]:
meanDrivers = tempdf['Drivers'].sum()

In [83]:
meanDrivers

673406300

#### Merge

In [84]:
acc_wea_dri = pd.merge(weatherdf, accidentsdf, on='Month')

In [85]:
acc_wea_dri.head(12)

Unnamed: 0,Month,Precipitation,No_accidents
0,April,25595.16,65981
1,August,28942.84,136937
2,December,18506.39,130710
3,February,14756.89,52334
4,January,23398.14,55449
5,July,26699.05,88514
6,June,26542.57,76473
7,March,21972.4,63367
8,May,27319.81,58554
9,November,14435.62,134750


In [86]:
acc_wea_dri['acc_per_driver'] = acc_wea_dri['No_accidents']/meanDrivers*100

In [87]:
acc_wea_dri.head()

Unnamed: 0,Month,Precipitation,No_accidents,acc_per_driver
0,April,25595.16,65981,0.009798
1,August,28942.84,136937,0.020335
2,December,18506.39,130710,0.01941
3,February,14756.89,52334,0.007772
4,January,23398.14,55449,0.008234


In [88]:
acc_wea_dri.to_csv('DATA/acc_pre_month.csv')

## Age Distribution per State

In [91]:
license_df

Unnamed: 0,State,Year,Gender,Age,Drivers,Age_Numeric
17595,Alabama,2010,Female,15-,15050,15.0
17646,Alabama,2010,Female,16,22814,16.0
17697,Alabama,2010,Female,17,25723,17.0
17748,Alabama,2010,Female,18,27903,18.0
0,Alabama,2010,Female,19-24,192651,21.5
...,...,...,...,...,...,...
57806,Wyoming,2018,Male,65-69,17043,67.0
57807,Wyoming,2018,Male,70-74,12434,72.0
57808,Wyoming,2018,Male,75-79,7936,77.0
57809,Wyoming,2018,Male,80-84,4579,82.0


In [139]:
license_df = license_df[license_df['Year'].isin([2016,2017,2018])]

In [140]:
age_dist_df = license_df[['State', 'Year', 'Drivers']].groupby(['State', 'Year']).sum()

In [141]:
lower_ages = {'18-':18, '24-':24}
for group, val in lower_ages.items():
    t_df = license_df[license_df['Age_Numeric'] <= val][['State', 'Year', 'Drivers']]
    t_df = t_df.groupby(['State', 'Year']).sum()
    age_dist_df[group] = t_df['Drivers']
    age_dist_df[group] = age_dist_df[group] / age_dist_df['Drivers']

In [142]:
upper_ages = {'65+':65, '70+':70, '75+':75, '80+':80}
for group, val in upper_ages.items():
    t_df = license_df[license_df['Age_Numeric'] >= val][['State', 'Year', 'Drivers']]
    t_df = t_df.groupby(['State', 'Year']).sum()
    age_dist_df[group] = t_df['Drivers']
    age_dist_df[group] = age_dist_df[group] / age_dist_df['Drivers']

In [143]:
age_dist_df.reset_index(inplace=True)

In [144]:
age_dist_df

Unnamed: 0,State,Year,Drivers,18-,24-,65+,70+,75+,80+
0,Alabama,2016,3943082,0.040398,0.138590,0.212362,0.142414,0.089654,0.052121
1,Alabama,2017,3954378,0.040412,0.138195,0.219277,0.149124,0.093009,0.053611
2,Alabama,2018,3999057,0.039710,0.136538,0.222956,0.152636,0.094853,0.054544
3,Alaska,2016,534585,0.024563,0.122005,0.137971,0.076156,0.038059,0.020334
4,Alaska,2017,534585,0.024563,0.122005,0.137971,0.076156,0.038059,0.020334
...,...,...,...,...,...,...,...,...,...
148,Wisconsin,2017,4234793,0.032935,0.121994,0.199623,0.127641,0.077863,0.043613
149,Wisconsin,2018,4288171,0.031507,0.118169,0.211459,0.136852,0.081652,0.044137
150,Wyoming,2016,421098,0.037124,0.131352,0.193874,0.119269,0.068473,0.035153
151,Wyoming,2017,422465,0.036687,0.129350,0.201271,0.124943,0.070590,0.035343


In [145]:
age_dist_df.to_csv('DATA/age_distribution.csv', index=False)