In [48]:
from io import BytesIO

import numpy as np
import pandas as pd
import requests

In [50]:
# Load the accidents table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file if it
# was produced by a trusted step of this project.
df = pd.read_pickle('US_Accidents_June20_Cleaned_new.pkl')

## Obtain N_Accidents Data

In [51]:
# Tag every accident with its monthly period and calendar year (both derived
# from Start_Time), then keep only the 2016-2018 rows.
start_times = df['Start_Time']
df = df.assign(
    Year_Month=start_times.dt.to_period('M'),
    Year=start_times.dt.year,
)
in_study_years = df['Year'].isin([2016, 2017, 2018])
df = df[in_study_years]
# Number of distinct states remaining after the year filter.
df['StateFull'].nunique()

49

In [52]:
# Count accident rows for every (Year_Month, StateFull) pair that occurs in the data.
per_month = (
    df
    .groupby(['Year_Month', 'StateFull'])
    .size()
)

In [53]:
# Display the accident counts indexed by (Year_Month, StateFull).
per_month

Year_Month  StateFull    
2016-02     Indiana            90
            Kentucky           41
            Michigan           10
            Ohio              770
            Pennsylvania       42
                             ... 
2018-12     Virginia         2678
            Washington       1551
            West Virginia      75
            Wisconsin         464
            Wyoming            16
Length: 1550, dtype: int64

In [54]:
# Build every (month, state) combination from the two index levels so that
# pairs with no recorded accident appear explicitly with a count of 0.
new_index = pd.MultiIndex.from_product(per_month.index.levels)
full_df = (
    per_month
    .reindex(new_index)   # missing pairs become NaN
    .fillna(0)
    .astype(int)          # back to integer counts after the NaN round trip
)

In [55]:
# Turn the index levels into ordinary columns; the unnamed count column comes
# out labelled 0 and is renamed to N_Accidents.
full_df = (
    full_df
    .reset_index()
    .rename(columns={'StateFull': 'State', 0: 'N_Accidents'})
)

In [56]:
# Split the monthly period into a numeric year and a full month name
# (e.g. 'February'), then put the columns in a fixed order.
full_df = full_df.assign(
    Year=full_df['Year_Month'].dt.year,
    Month=full_df['Year_Month'].dt.strftime('%B'),
)
full_df = full_df.loc[:, ['State', 'Year_Month', 'Year', 'Month', 'N_Accidents']]
full_df

Unnamed: 0,State,Year_Month,Year,Month,N_Accidents
0,Alabama,2016-02,2016,February,0
1,Arizona,2016-02,2016,February,0
2,Arkansas,2016-02,2016,February,0
3,California,2016-02,2016,February,0
4,Colorado,2016-02,2016,February,0
...,...,...,...,...,...
1710,Virginia,2018-12,2018,December,2678
1711,Washington,2018-12,2018,December,1551
1712,West Virginia,2018-12,2018,December,75
1713,Wisconsin,2018-12,2018,December,464


## Add Licensed-Drivers Data

In [57]:
# Read the license table and expose its 'Drivers' column as 'N_Licenses'.
# NOTE(review): this cell reads from 'DATA/' while other cells read from 'Data/';
# confirm the directory name on case-sensitive file systems.
licensedf = pd.read_csv('DATA/license_data.csv')
licensedf = licensedf.rename(columns={'Drivers': 'N_Licenses'})

In [58]:
# Preview the first rows of the license table.
licensedf.head()

Unnamed: 0,State,Year,Gender,Age,N_Licenses,Age_Numeric
0,Alabama,2010,Female,15-,15050,15.0
1,Alabama,2010,Female,16,22814,16.0
2,Alabama,2010,Female,17,25723,17.0
3,Alabama,2010,Female,18,27903,18.0
4,Alabama,2010,Female,19-24,192651,21.5


In [59]:
# Total licenses per (State, Year): sum N_Licenses over all remaining rows of
# each group (the table is also broken down by Gender and Age).
grouped_licensedf = (
    licensedf
    .groupby(['State', 'Year'], as_index=False)[['N_Licenses']]
    .sum()
)

In [60]:
# Keep only license totals from 2016 onwards.
from_2016_on = grouped_licensedf['Year'].ge(2016)
grouped_licensedf = grouped_licensedf.loc[from_2016_on]
grouped_licensedf

Unnamed: 0,State,Year,N_Licenses
6,Alabama,2016,3943082
7,Alabama,2017,3954378
8,Alabama,2018,3999057
15,Alaska,2016,534585
16,Alaska,2017,534585
...,...,...,...
448,Wisconsin,2017,4234793
449,Wisconsin,2018,4288171
456,Wyoming,2016,421098
457,Wyoming,2017,422465


In [61]:
# Attach the yearly license total to every (State, Year_Month) row.
# - how='inner' (the pandas default, now explicit): rows whose (State, Year)
#   has no license record are dropped.
# - validate='many_to_one': fail loudly if the license table ever contains
#   duplicate (State, Year) keys, which would silently duplicate accident rows.
# The two bare column expressions that used to precede the merge had no effect
# (only a cell's last expression is displayed) and were removed.
full_df = full_df.merge(
    grouped_licensedf,
    on=['State', 'Year'],
    how='inner',
    validate='many_to_one',
)

In [62]:
# Remove a stray 'index' column if one exists (no error when it does not).
full_df = full_df.drop(columns=['index'], errors='ignore')
# Accidents in the month divided by the state's license total for that year.
full_df['Acc_Per_Driver'] = full_df['N_Accidents'].div(full_df['N_Licenses'])
full_df

Unnamed: 0,State,Year_Month,Year,Month,N_Accidents,N_Licenses,Acc_Per_Driver
0,Alabama,2016-02,2016,February,0,3943082,0.000000
1,Alabama,2016-03,2016,March,0,3943082,0.000000
2,Alabama,2016-04,2016,April,0,3943082,0.000000
3,Alabama,2016-05,2016,May,0,3943082,0.000000
4,Alabama,2016-06,2016,June,18,3943082,0.000005
...,...,...,...,...,...,...,...
1710,Wyoming,2018-08,2018,August,7,419256,0.000017
1711,Wyoming,2018-09,2018,September,8,419256,0.000019
1712,Wyoming,2018-10,2018,October,11,419256,0.000026
1713,Wyoming,2018-11,2018,November,71,419256,0.000169


## Weather Data

In [63]:
# Load the raw weather-events table from CSV.
weather_df = pd.read_csv(
    filepath_or_buffer='Data/WeatherEvents_Jan2016-Dec2021.csv',
)

In [64]:
# Parse the event start timestamps into datetime64 values, then show the column.
start_col = 'StartTime(UTC)'
weather_df[start_col] = pd.to_datetime(weather_df[start_col])
weather_df[start_col]

0         2016-01-06 23:14:00
1         2016-01-07 04:14:00
2         2016-01-07 05:54:00
3         2016-01-08 05:34:00
4         2016-01-08 13:54:00
                  ...        
7479160   2021-12-26 18:19:00
7479161   2021-12-26 18:29:00
7479162   2021-12-28 00:53:00
7479163   2021-12-28 02:27:00
7479164   2021-12-28 03:09:00
Name: StartTime(UTC), Length: 7479165, dtype: datetime64[ns]

In [65]:
# Derive the monthly period and the year from the event start time, then
# keep only the events that started in 2016, 2017 or 2018.
event_start = weather_df['StartTime(UTC)']
weather_df['Year_Month'] = event_start.dt.to_period('M')
weather_df['Year'] = event_start.dt.year
keep_years = weather_df['Year'].isin([2016, 2017, 2018])
weather_df = weather_df[keep_years]

In [66]:
# Sum the precipitation of all weather events per (Year_Month, State).
weather_df_pm = (
    weather_df
    .groupby(['Year_Month', 'State'])[['Precipitation(in)']]
    .sum()
)

In [67]:
# Move the group keys back into columns. The weather 'State' column holds
# two-letter codes, so it is renamed to State_Code.
weather_df_pm = (
    weather_df_pm
    .reset_index()
    .rename(columns={'State': 'State_Code', 'Precipitation(in)': 'Precipitation_in'})
)

In [68]:
# Display the precipitation totals per month and state code.
weather_df_pm

Unnamed: 0,Year_Month,State_Code,Precipitation_in
0,2016-01,AL,263.21
1,2016-01,AR,75.38
2,2016-01,AZ,49.83
3,2016-01,CA,757.41
4,2016-01,CO,41.30
...,...,...,...
1723,2018-12,VT,27.35
1724,2018-12,WA,272.89
1725,2018-12,WI,133.08
1726,2018-12,WV,83.38


In [69]:
# Download the lookup table that maps full state names to two-letter codes.
# `requests` and `BytesIO` are imported in the notebook's top import cell.
url = "https://www23.statcan.gc.ca/imdb/p3VD.pl?Function=getVD&TVD=53971"
response = requests.get(url, timeout=30)   # never hang forever on a stalled server
response.raise_for_status()                # fail here instead of parsing an error page
html = response.content
# Wrap the bytes in a file-like object: passing literal HTML to read_html is deprecated.
state_codes = (
    pd.read_html(BytesIO(html))[0]         # first table on the page
    .drop(columns=['Code', 'Abbreviation'])
    .rename(columns={"Alpha code": "State_Code"})
)

In [70]:
# Preview the state name / state code lookup table.
state_codes.head()

Unnamed: 0,State,State_Code
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [71]:
# Add the full state name to each precipitation row via its two-letter code.
# Default inner join: codes missing from the lookup table are dropped.
weather_df_pm = pd.merge(weather_df_pm, state_codes, on='State_Code')

In [72]:
# Join monthly precipitation onto the accident/license table by state and month.
# Default inner join: (State, Year_Month) pairs without weather data are dropped.
full_df = pd.merge(full_df, weather_df_pm, on=['State', 'Year_Month'])

## Temperature Data

In [73]:
# Load the temperature table and discard its own Precipitation_in column
# (precipitation is already taken from the weather-events data).
temp_df = pd.read_csv('Data/temperature_data.csv')
temp_df = temp_df.drop(columns='Precipitation_in')

In [74]:
# Join temperatures by state and month name (the join key has no year, so the
# same temperature row is attached to every year), then fix the column order.
final_df = (
    full_df
    .merge(temp_df, on=['State', 'Month'])
    .loc[:, [
        'State', 'State_Code', 'Year_Month', 'Year', 'Month',
        'N_Accidents', 'N_Licenses', 'Acc_Per_Driver',
        'Precipitation_in', 'Low_F', 'Avg_F', 'High_F',
    ]]
)

In [75]:
# Sanity check after the joins: distinct states, then distinct months, one per line.
print(
    final_df['State'].nunique(),
    final_df['Year_Month'].nunique(),
    sep='\n',
)

48
35


## State Coordinates

In [76]:
# Download the table of state coordinates.
# `requests` and `BytesIO` are imported in the notebook's top import cell.
url = "https://www.latlong.net/category/states-236-14.html"
response = requests.get(url, timeout=30)   # never hang forever on a stalled server
response.raise_for_status()                # fail here instead of parsing an error page
html = response.content
# Wrap the bytes in a file-like object: passing literal HTML to read_html is deprecated.
state_coords = pd.read_html(BytesIO(html))[0]   # first table on the page

In [77]:
# Give the three scraped columns short, fixed names.
state_coords = state_coords.set_axis(['State', 'Lat', 'Lng'], axis=1)

In [78]:
# Keep only the text before the first comma of each State entry.
state_coords['State'] = state_coords['State'].map(
    lambda place: place.partition(',')[0]
)

In [79]:
# Attach Lat/Lng to every row by state name.
# Default inner join: states without a coordinate entry are dropped.
final_df = pd.merge(final_df, state_coords, on=['State'])

## Save

In [80]:
# Write the merged table to CSV without the positional row index.
final_df.to_csv(
    path_or_buf='DATA/per_month_per_driver.csv',
    index=False,
)

In [81]:
# Display the final merged table that was written to disk.
final_df

Unnamed: 0,State,State_Code,Year_Month,Year,Month,N_Accidents,N_Licenses,Acc_Per_Driver,Precipitation_in,Low_F,Avg_F,High_F,Lat,Lng
0,Alabama,AL,2016-02,2016,February,0,3943082,0.000000,392.05,37.9,49.30,60.7,32.31823,-86.902298
1,Alabama,AL,2017-02,2017,February,13,3954378,0.000003,237.33,37.9,49.30,60.7,32.31823,-86.902298
2,Alabama,AL,2018-02,2018,February,728,3999057,0.000182,463.68,37.9,49.30,60.7,32.31823,-86.902298
3,Alabama,AL,2016-03,2016,March,0,3943082,0.000000,357.49,43.9,56.15,68.4,32.31823,-86.902298
4,Alabama,AL,2017-03,2017,March,25,3954378,0.000006,230.74,43.9,56.15,68.4,32.31823,-86.902298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1675,Wyoming,WY,2016-12,2016,December,14,421098,0.000033,24.80,11.3,21.75,32.2,43.07597,-107.290283
1676,Wyoming,WY,2017-12,2017,December,11,422465,0.000026,30.48,11.3,21.75,32.2,43.07597,-107.290283
1677,Wyoming,WY,2018-12,2018,December,16,419256,0.000038,17.59,11.3,21.75,32.2,43.07597,-107.290283
1678,Wyoming,WY,2017-01,2017,January,13,422465,0.000031,26.92,11.1,21.75,32.4,43.07597,-107.290283
