# Import dependencies

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Store CSV into DataFrame

In [2]:
accident_csv_file = "Resources/US_Accidents_June20.csv"
accident_data_df = pd.read_csv(accident_csv_file)
accident_data_df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


# state_df

In [3]:
state_csv_file = "Resources/state_csvData.csv"
state_df = pd.read_csv(state_csv_file)
state_df.head()

Unnamed: 0,State,Abbrev,Code
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


In [4]:
# Rename columns to match accident df for future merging
state_df = state_df.drop(columns={"Abbrev"})
state_df = state_df.rename(columns={"State": "state_name",
                                   "Code": "State"})
state_df.head()

Unnamed: 0,state_name,State
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


# accident_df

In [5]:
# view data before transformation
accident_data_df.nunique()

ID                       3513617
Source                         3
TMC                           21
Severity                       4
Start_Time               3200042
End_Time                 3246120
Start_Lat                1124695
Start_Lng                1113407
End_Lat                   375074
End_Lng                   383569
Distance(mi)               13476
Description              1780092
Number                     40365
Street                    176262
Side                           3
City                       11895
County                      1724
State                         49
Zipcode                   418780
Country                        1
Timezone                       4
Airport_Code                2001
Weather_Timestamp         546086
Temperature(F)               831
Wind_Chill(F)                974
Humidity(%)                  100
Pressure(in)                1022
Visibility(mi)                85
Wind_Direction                24
Wind_Speed(mph)              160
Precipitat

In [6]:
accident_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3513617 entries, 0 to 3513616
Data columns (total 49 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   TMC                    float64
 3   Severity               int64  
 4   Start_Time             object 
 5   End_Time               object 
 6   Start_Lat              float64
 7   Start_Lng              float64
 8   End_Lat                float64
 9   End_Lng                float64
 10  Distance(mi)           float64
 11  Description            object 
 12  Number                 float64
 13  Street                 object 
 14  Side                   object 
 15  City                   object 
 16  County                 object 
 17  State                  object 
 18  Zipcode                object 
 19  Country                object 
 20  Timezone               object 
 21  Airport_Code           object 
 22  Weather_Timestamp 

# Create new accident df

In [7]:
new_accident_data_df = accident_data_df[['ID', 'Start_Time', 'City', 'County', 'State', 'Zipcode', 'Timezone', 'Temperature(F)', 'Visibility(mi)', 'Weather_Condition']].copy()
new_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,US/Eastern,36.9,10.0,Light Rain
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,US/Eastern,37.9,10.0,Light Rain
2,A-3,2016-02-08 06:49:27,Williamsburg,Clermont,OH,45176,US/Eastern,36.0,10.0,Overcast
3,A-4,2016-02-08 07:23:34,Dayton,Montgomery,OH,45417,US/Eastern,35.1,9.0,Mostly Cloudy
4,A-5,2016-02-08 07:39:07,Dayton,Montgomery,OH,45459,US/Eastern,36.0,6.0,Mostly Cloudy


# merge state and accident df

In [8]:
# Merge accident_df with state_df to obtain state_name for future merging with population df
clean_accident_data_df = pd.merge(new_accident_data_df, state_df, on= "State")
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,US/Eastern,36.9,10.0,Light Rain,Ohio
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,US/Eastern,37.9,10.0,Light Rain,Ohio
2,A-3,2016-02-08 06:49:27,Williamsburg,Clermont,OH,45176,US/Eastern,36.0,10.0,Overcast,Ohio
3,A-4,2016-02-08 07:23:34,Dayton,Montgomery,OH,45417,US/Eastern,35.1,9.0,Mostly Cloudy,Ohio
4,A-5,2016-02-08 07:39:07,Dayton,Montgomery,OH,45459,US/Eastern,36.0,6.0,Mostly Cloudy,Ohio


In [9]:
# convert start_time from object to datetime 
clean_accident_data_df['Start_Time'] = pd.to_datetime(clean_accident_data_df['Start_Time'])
clean_accident_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3513617 entries, 0 to 3513616
Data columns (total 11 columns):
 #   Column             Dtype         
---  ------             -----         
 0   ID                 object        
 1   Start_Time         datetime64[ns]
 2   City               object        
 3   County             object        
 4   State              object        
 5   Zipcode            object        
 6   Timezone           object        
 7   Temperature(F)     float64       
 8   Visibility(mi)     float64       
 9   Weather_Condition  object        
 10  state_name         object        
dtypes: datetime64[ns](1), float64(2), object(8)
memory usage: 321.7+ MB


In [10]:
clean_accident_data_df.nunique()

ID                   3513617
Start_Time           3200042
City                   11895
County                  1724
State                     49
Zipcode               418780
Timezone                   4
Temperature(F)           831
Visibility(mi)            85
Weather_Condition        127
state_name                49
dtype: int64

In [11]:
# Limit data from 2016-2020 to 2016 only
clean_accident_data_df = clean_accident_data_df[(clean_accident_data_df['Start_Time']>= "2016-01-01") & 
                                            (clean_accident_data_df['Start_Time']<= "2017-01-01")]
clean_accident_data_df.nunique()

ID                   410593
Start_Time           374315
City                   6862
County                 1326
State                    49
Zipcode               69431
Timezone                  4
Temperature(F)          690
Visibility(mi)           47
Weather_Condition        62
state_name               49
dtype: int64

In [12]:
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,US/Eastern,36.9,10.0,Light Rain,Ohio
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,US/Eastern,37.9,10.0,Light Rain,Ohio
2,A-3,2016-02-08 06:49:27,Williamsburg,Clermont,OH,45176,US/Eastern,36.0,10.0,Overcast,Ohio
3,A-4,2016-02-08 07:23:34,Dayton,Montgomery,OH,45417,US/Eastern,35.1,9.0,Mostly Cloudy,Ohio
4,A-5,2016-02-08 07:39:07,Dayton,Montgomery,OH,45459,US/Eastern,36.0,6.0,Mostly Cloudy,Ohio


In [13]:
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!! Didn't set ID as Index, ID is acting weird (ask team members) !!!!!!!!!!!!!!!!!!!!!!!!!!
clean_accident_data_df = clean_accident_data_df.sort_values(by='ID')
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,US/Eastern,36.9,10.0,Light Rain,Ohio
9,A-10,2016-02-08 08:10:04,Westerville,Franklin,OH,43081,US/Eastern,37.4,3.0,Light Rain,Ohio
99,A-100,2016-02-11 08:13:24,Dayton,Montgomery,OH,45410,US/Eastern,7.5,10.0,Scattered Clouds,Ohio
68791,A-1000,2016-06-23 10:31:12,El Dorado Hills,El Dorado,CA,95762,US/Pacific,77.0,10.0,Clear,California
167790,A-100000,2016-04-05 15:47:30,Brea,Orange,CA,92821,US/Pacific,84.9,10.0,Clear,California


# Check to see if all 2016 data is present

In [14]:
# Sort by datetime
clean_accident_data_df = clean_accident_data_df.sort_values('Start_Time')
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
45629,A-2478859,2016-02-08 00:37:08,Dublin,Franklin,OH,43017,US/Eastern,42.1,10.0,Light Rain,Ohio
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,US/Eastern,36.9,10.0,Light Rain,Ohio
45630,A-2478860,2016-02-08 05:56:20,Dayton,Montgomery,OH,45424,US/Eastern,36.9,10.0,Light Rain,Ohio
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,US/Eastern,37.9,10.0,Light Rain,Ohio
45631,A-2478861,2016-02-08 06:15:39,Cincinnati,Hamilton,OH,45203,US/Eastern,36.0,10.0,Overcast,Ohio


In [15]:
clean_accident_data_df.tail()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
1591619,A-2521660,2016-12-31 23:42:02,Janesville,Rock,WI,53546-9128,US/Central,21.2,10.0,Clear,Wisconsin
1591618,A-2521658,2016-12-31 23:42:02,Janesville,Rock,WI,53546,US/Central,21.2,10.0,Clear,Wisconsin
3215485,A-2494106,2016-12-31 23:48:21,Yukon,Canadian,OK,73099,US/Central,32.0,10.0,,Oklahoma
3500069,A-2521653,2016-12-31 23:49:59,Saco,York,ME,04072,US/Eastern,34.0,7.0,Light Rain,Maine
1443015,A-2494107,2016-12-31 23:58:25,Johnston,Polk,IA,50131,US/Central,19.4,10.0,Clear,Iowa


In [16]:
clean_accident_data_df.count()

ID                   410593
Start_Time           410593
City                 410566
County               410593
State                410593
Zipcode              410477
Timezone             410464
Temperature(F)       404137
Visibility(mi)       401810
Weather_Condition    402033
state_name           410593
dtype: int64

In [17]:
clean_accident_data_df.nunique()

ID                   410593
Start_Time           374315
City                   6862
County                 1326
State                    49
Zipcode               69431
Timezone                  4
Temperature(F)          690
Visibility(mi)           47
Weather_Condition        62
state_name               49
dtype: int64

# Test to see if correct data is included

In [18]:
clean_accident_data_df.loc[clean_accident_data_df['ID']=='A-9152']

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
76943,A-9152,2016-12-30 23:42:34,Hollister,Santa Clara,CA,95023,US/Pacific,48.2,10.0,Overcast,California


In [19]:
clean_accident_data_df.loc[clean_accident_data_df['ID']=='A-59918']

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
127709,A-59918,2016-12-30 23:53:59,San Bernardino,San Bernardino,CA,92411,US/Pacific,53.6,2.0,Drizzle,California


In [20]:
clean_accident_data_df.loc[clean_accident_data_df['ID']=='A-9153']

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
76944,A-9153,2016-12-31 01:23:30,San Jose,Santa Clara,CA,95112,US/Pacific,50.0,10.0,Overcast,California


In [21]:
clean_accident_data_df.loc[clean_accident_data_df['Start_Time']=='2016-12-31 23:42:02']

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
1591619,A-2521660,2016-12-31 23:42:02,Janesville,Rock,WI,53546-9128,US/Central,21.2,10.0,Clear,Wisconsin
1591618,A-2521658,2016-12-31 23:42:02,Janesville,Rock,WI,53546,US/Central,21.2,10.0,Clear,Wisconsin


In [22]:
clean_accident_data_df['normalised_date'] = clean_accident_data_df['Start_Time'].dt.normalize()
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name,normalised_date
45629,A-2478859,2016-02-08 00:37:08,Dublin,Franklin,OH,43017,US/Eastern,42.1,10.0,Light Rain,Ohio,2016-02-08
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,US/Eastern,36.9,10.0,Light Rain,Ohio,2016-02-08
45630,A-2478860,2016-02-08 05:56:20,Dayton,Montgomery,OH,45424,US/Eastern,36.9,10.0,Light Rain,Ohio,2016-02-08
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,US/Eastern,37.9,10.0,Light Rain,Ohio,2016-02-08
45631,A-2478861,2016-02-08 06:15:39,Cincinnati,Hamilton,OH,45203,US/Eastern,36.0,10.0,Overcast,Ohio,2016-02-08


In [23]:
clean_accident_data_df.loc[clean_accident_data_df['normalised_date']=='2016-01-01']

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name,normalised_date


# Reorganize df to look pretty

In [24]:
clean_accident_data_df = clean_accident_data_df[['ID', 'Start_Time', 'normalised_date', 'City', 'County', 'State', 'state_name', 'Zipcode', 'Timezone', 'Temperature(F)', 'Visibility(mi)', 'Weather_Condition']]
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,normalised_date,City,County,State,state_name,Zipcode,Timezone,Temperature(F),Visibility(mi),Weather_Condition
45629,A-2478859,2016-02-08 00:37:08,2016-02-08,Dublin,Franklin,OH,Ohio,43017,US/Eastern,42.1,10.0,Light Rain
0,A-1,2016-02-08 05:46:00,2016-02-08,Dayton,Montgomery,OH,Ohio,45424,US/Eastern,36.9,10.0,Light Rain
45630,A-2478860,2016-02-08 05:56:20,2016-02-08,Dayton,Montgomery,OH,Ohio,45424,US/Eastern,36.9,10.0,Light Rain
1,A-2,2016-02-08 06:07:59,2016-02-08,Reynoldsburg,Franklin,OH,Ohio,43068-3402,US/Eastern,37.9,10.0,Light Rain
45631,A-2478861,2016-02-08 06:15:39,2016-02-08,Cincinnati,Hamilton,OH,Ohio,45203,US/Eastern,36.0,10.0,Overcast


In [None]:
# clean_accident_data_df.set_index("ID", inplace=True)

In [None]:
# clean_accident_data_df = clean_accident_data_df.rename(columns={"":""})