In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [2]:
# Set to True to run the exploratory cells guarded by `if plots:` (plots and
# NaN summaries); leave False to skip them.
plots = False

In [3]:
# Load the raw accident records; the path is relative to the working directory.
df = pd.read_csv('US_Accidents_June20.csv')

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


# Data Cleaning

## Renaming Columns

In [5]:
# Replace the parenthesised unit in each measurement column name with a plain
# `_unit` suffix.
unit_column_names = {
    'Distance(mi)': 'Distance_mi',
    'Temperature(F)': 'Temperature_F',
    'Wind_Chill(F)': 'Wind_Chill_F',
    'Humidity(%)': 'Humidity_%',
    'Pressure(in)': 'Pressure_in',
    'Visibility(mi)': 'Visibility_mi',
    'Wind_Speed(mph)': 'Wind_Speed_mph',
    'Precipitation(in)': 'Precipitation_in',
}
df.rename(columns=unit_column_names, inplace=True)

## Type changes

In [6]:
# List the column dtypes to see which columns still need conversion.
df.dtypes

ID                        object
Source                    object
TMC                      float64
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
End_Lat                  float64
End_Lng                  float64
Distance_mi              float64
Description               object
Number                   float64
Street                    object
Side                      object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Weather_Timestamp         object
Temperature_F            float64
Wind_Chill_F             float64
Humidity_%               float64
Pressure_in              float64
Visibility_mi            float64
Wind_Direction            object
Wind_Speed_mph           float64
Precipitat

In [7]:
# Look at the raw timestamp values to see the string format they use.
df['Weather_Timestamp'].unique()

array(['2016-02-08 05:58:00', '2016-02-08 05:51:00',
       '2016-02-08 06:56:00', ..., '2019-08-23 12:35:00',
       '2019-08-23 15:18:00', '2019-08-23 01:20:00'], dtype=object)

In [8]:
def clean_times(val):
    """Parse a ``'%Y-%m-%d %H:%M:%S'`` string into a ``datetime.datetime``.

    Parameters
    ----------
    val : str, datetime.datetime, or missing value
        The cell value to convert.

    Returns
    -------
    The parsed ``datetime.datetime`` for string input. Values that are already
    datetimes (``pd.Timestamp`` is a ``datetime`` subclass) and missing values
    (NaN, None, NaT) are returned unchanged, so mapping an already-converted
    column a second time is a no-op.

    Raises
    ------
    ValueError
        If a string does not match the expected format.
    """
    # isinstance on the public datetime type replaces the comparison against
    # the private `pd._libs.tslibs.timestamps.Timestamp` path; pd.isna also
    # covers None, which the `val != val` NaN test let through to strptime.
    if isinstance(val, datetime.datetime) or pd.isna(val):
        return val
    return datetime.datetime.strptime(val, '%Y-%m-%d %H:%M:%S')
# Run the same converter over every timestamp column.
for time_col in ('Start_Time', 'End_Time', 'Weather_Timestamp'):
    df[time_col] = df[time_col].map(clean_times)

In [9]:
# Check which labels the day/night column uses before encoding it.
df['Sunrise_Sunset'].unique()

array(['Night', 'Day', nan], dtype=object)

In [10]:
# Encode the day/night indicators numerically: Night -> 1, Day -> 0.
# Values that are not in the mapping (including NaN) become NaN.
night_code = {'Night':1, 'Day':0}
day_night_cols = ['Sunrise_Sunset', 'Nautical_Twilight', 'Civil_Twilight', 'Astronomical_Twilight']
for colname in day_night_cols:
    df[colname] = df[colname].map(night_code)

## Replacing NaN Precipitation Values

In [11]:
# Sequence in which the keys of `rep_dict` are applied by the imputation loop
# in the next cell.
# NOTE(review): 'Rain' is a substring of 'Light Rain' and 'Heavy Rain', so its
# position in this list affects which value those conditions receive; confirm
# the intended precedence.
order = ['Rain', 'Light Rain', 'Light Drizzle', 'Heavy Rain']
# Value written to Precipitation_in for rows whose precipitation is missing and
# whose Weather_Condition contains the key.
rep_dict = {
    "Light Rain":0.1,
    "Light Drizzle":0.1,
    "Rain":0.2,
    "Heavy Rain":1.2,
}

In [12]:
# Fill missing precipitation from the textual weather condition and report the
# number of missing values before and after.
print(df['Precipitation_in'].isna().sum())
# Take the missing-value mask once, before anything is filled. Re-deriving it
# inside the loop meant the first key in `order` ('Rain', a substring of
# 'Light Rain' and 'Heavy Rain') consumed every rainy row, so the later, more
# specific keys never matched anything. With a fixed mask, later keys overwrite
# earlier ones, and only on rows that were missing to begin with.
missing_precip = df['Precipitation_in'].isna()
for k in order:
    # Plain substring match; rows without a weather condition never match.
    mentions_k = df['Weather_Condition'].str.contains(k, regex=False, na=False)
    df.loc[missing_precip & mentions_k, 'Precipitation_in'] = rep_dict[k]
df['Precipitation_in'].isna().sum()


2025874


2002291

In [13]:
# Inspect the frame after the type changes and the precipitation imputation.
df

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,1.0,1.0,1.0,1.0
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,1.0,1.0,1.0,0.0
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,1.0,1.0,0.0,0.0
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,1.0,0.0,0.0,0.0
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513612,A-3513776,Bing,,2,2019-08-23 18:03:25,2019-08-23 18:32:01,34.002480,-117.379360,33.99888,-117.37094,...,False,False,False,False,False,False,0.0,0.0,0.0,0.0
3513613,A-3513777,Bing,,2,2019-08-23 19:11:30,2019-08-23 19:38:23,32.766960,-117.148060,32.76555,-117.15363,...,False,False,False,False,False,False,0.0,0.0,0.0,0.0
3513614,A-3513778,Bing,,2,2019-08-23 19:00:21,2019-08-23 19:28:49,33.775450,-117.847790,33.77740,-117.85727,...,False,False,False,False,False,False,0.0,0.0,0.0,0.0
3513615,A-3513779,Bing,,2,2019-08-23 19:00:21,2019-08-23 19:29:42,33.992460,-118.403020,33.98311,-118.39565,...,False,False,False,False,False,False,0.0,0.0,0.0,0.0


## Boolean Weather Columns

In [14]:
def _condition_mentions(*keywords, case=True):
    """Flag rows whose Weather_Condition contains any of `keywords`.

    Returns a boolean Series aligned with `df`. Rows with a missing
    Weather_Condition are False. Keywords are plain words joined into a regex
    alternation; `case=False` makes the match case-insensitive.
    """
    return df['Weather_Condition'].str.contains('|'.join(keywords), case=case, na=False)

# One boolean column per weather phenomenon mentioned in the condition text.
# 'storm' is matched case-insensitively; every other keyword is case-sensitive.
df['Rain_Bool'] =      _condition_mentions('Rain', 'Drizzle', 'Sleet') | _condition_mentions('storm', case=False)
df['Snow_Bool'] =      _condition_mentions('Snow', 'Sleet')
df['Thunder_Bool'] =   _condition_mentions('Thunder')
df['Wind_Bool'] =      _condition_mentions('Windy', 'Squalls')
df['Fog_Bool'] =       _condition_mentions('Fog')
# The original test read `'Hail' in x in x`, a typo'd chained comparison that
# only worked because `x in x` is always True for a string.
df['Hail_Bool'] =      _condition_mentions('Hail')
df['Sand_Dust_Bool'] = _condition_mentions('Sand', 'Dust')

## Dropping Columns

This list will contain the columns to drop

In [15]:
# Collected across the cells below and removed in a single df.drop call.
drop_columns = []

In [16]:
if plots:
    print('- - Percentages of nans per column - -')
    # An expression nested inside `if` is not echoed by the notebook, so the
    # result has to be printed explicitly.
    print(df.isna().sum() / len(df) * 100)

In [17]:
if plots:
    # Heatmap of absolute pairwise correlations between the numeric columns,
    # leaving out the columns listed in ignore_cols.
    ignore_cols = ['TMC', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Number']
    plt.figure(figsize = (10,10))
    sns.set(font_scale=0.7)
    abs_corr = df.drop(columns=ignore_cols).corr(numeric_only=True).abs()
    sns.heatmap(abs_corr, annot=True, fmt='.2f')

*Turning Loop* has only one value (False), meaning no incident in the database occurred near a turning loop.

In [18]:
# Queue the column for removal and show its distinct values as evidence.
drop_columns += ['Turning_Loop']
df['Turning_Loop'].unique()

array([False])

*Wind_Chill* has extremely high correlation with temperature, and also over 50% nans:

In [19]:
# Queue the wind-chill column for removal.
drop_columns += ['Wind_Chill_F']

*End_Lat* and *End_Lng* contain over 70% nans.  
They also have very high correlation with *Start_Lat* and *Start_Lng*.  

In [20]:
if plots:
    # Absolute correlations between the start and end coordinates.
    coord_cols = ['Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng']
    coord_corr = df[coord_cols].corr(numeric_only=True).abs()
    sns.heatmap(coord_corr, annot=True, fmt='.2f')

Furthermore, in almost 40% of the rows where *End_Lat* and *End_Lng* are present, they are exactly the same as *Start_Lat* and *Start_Lng*:

In [21]:
# Share of rows whose end coordinate equals the start coordinate, taken
# relative to the rows where the end coordinate is present at all.
for start_col, end_col in (('Start_Lat', 'End_Lat'), ('Start_Lng', 'End_Lng')):
    same_share = (df[start_col] == df[end_col]).mean()
    present_share = 1-df[end_col].isna().mean()
    print(same_share / present_share)

0.39341456650035417
0.3906188544828512


In [22]:
# Queue both end-coordinate columns for removal.
drop_columns += ['End_Lat', 'End_Lng']

In [23]:
if plots:
    # Line plot of precipitation against the row index.
    df['Precipitation_in'].plot()

In [24]:
if plots:
    # Mean precipitation per weather condition, with missing values counted
    # as 0, sorted ascending for the bar plot in the next cell.
    rain_per_cond = (
        df[['Weather_Condition', 'Precipitation_in']]
        .fillna(0)
        .groupby('Weather_Condition')
        .mean()
        .reset_index()
        .sort_values(by='Precipitation_in')
    )

In [25]:
if plots:
    # Bar chart of mean precipitation per weather condition on a log scale;
    # the tick labels are rotated so the condition names stay readable.
    sns.set(rc={"figure.figsize":(12, 4), "figure.dpi":300})
    sns.set(font_scale = 0.5)
    plt.xticks(rotation=90)
    ax = sns.barplot(data=rain_per_cond, x='Weather_Condition', y='Precipitation_in')
    ax.set_yscale("log")

In [26]:
# Fill the remaining gaps: 0 for precipitation, empty string for the text columns.
# The result is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates an intermediate Series,
# which warns on pandas 2.x and has no effect on `df` under Copy-on-Write.
for col, default in {'Precipitation_in': 0, 'Airport_Code': "", 'City': ""}.items():
    df[col] = df[col].fillna(default)

Drop the twilight columns because they correlate highly with sunset and time

In [27]:
# Queue the three twilight indicators for removal; Sunrise_Sunset is kept.
drop_columns += ['Nautical_Twilight', 'Civil_Twilight', 'Astronomical_Twilight']

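Other columns to drop: *Source* and *Number*, plus *Weather_Condition*, which has been replaced by the boolean weather columns above.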

In [28]:
# Queue the remaining columns for removal.
drop_columns += ['Source', 'Number', 'Weather_Condition']

In [29]:
# Remove every queued column in one pass. Re-running this cell without
# reloading `df` raises KeyError because the columns are already gone.
df.drop(columns=drop_columns, inplace=True)

## Dropping Rows

In [30]:
# Count fully duplicated rows.
df.duplicated().sum()

0

In [31]:
# Keep only the rows with at most `max_missing_per_row` missing values.
max_missing_per_row = 9
min_present = len(df.columns) - max_missing_per_row
print('Before dropping: {} rows'.format(len(df)))
df = df.dropna(thresh=min_present)
print('After dropping: {} rows'.format(len(df)))

Before dropping: 3513617 rows
After dropping: 3512858 rows


In [32]:
if plots:
    print('- - Percentages of nans per column - -')
    # An expression nested inside `if` is not echoed by the notebook, so the
    # result has to be printed explicitly.
    print(df.isna().sum() / len(df) * 100)

In [33]:
# joejoe = df.groupby('State', as_index = False)['Severity'].count().sort_values('Severity', ascending=True)

# states_to_go = list(joejoe[joejoe['Severity'] < 3000]['State'])
# for state in states_to_go:
#     df = df[df['State'] != state]

## State abbreviations to full name

In [34]:
import requests

# Build a lookup from postal abbreviation to full state name out of the first
# HTML table on the page below.
url = 'https://www.50states.com/abbreviations.htm'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
html = response.content
df_list = pd.read_html(html)
websitedf = df_list[0].drop(columns=['STANDARD ABBREVIATION'])
new_row = {'US STATE':'District of Columbia', "POSTAL ABBREVIATION": "DC"}
# Add DC only if the scraped table does not already list it; a duplicated key
# would duplicate the matching accident rows in the later merge on State.
if 'DC' not in set(websitedf['POSTAL ABBREVIATION']):
    # DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the
    # supported replacement.
    websitedf = pd.concat([websitedf, pd.DataFrame([new_row])], ignore_index=True)

  websitedf = websitedf.append(new_row, ignore_index=True)


In [35]:
# Rename the lookup columns so the abbreviation column matches `df`, then
# attach the full state name to each accident. This is an inner join, so rows
# whose State is missing from the lookup are dropped.
websitedf.columns = ['StateFull', 'State']
df = df.merge(websitedf, on='State')
df['StateFull'].unique()

array(['Ohio', 'West Virginia', 'California', 'Florida', 'Georgia',
       'South Carolina', 'Nebraska', 'Iowa', 'Illinois', 'Missouri',
       'Wisconsin', 'Indiana', 'Michigan', 'New Jersey', 'New York',
       'Connecticut', 'Massachusetts', 'Rhode Island', 'New Hampshire',
       'Pennsylvania', 'Kentucky', 'Maryland', 'Virginia',
       'District of Columbia', 'Delaware', 'Texas', 'Washington',
       'Oregon', 'Alabama', 'Tennessee', 'North Carolina', 'Kansas',
       'Louisiana', 'Oklahoma', 'Colorado', 'Utah', 'Arizona',
       'Minnesota', 'Mississippi', 'Nevada', 'Maine', 'Arkansas', 'Idaho',
       'Vermont', 'New Mexico', 'North Dakota', 'Wyoming', 'South Dakota',
       'Montana'], dtype=object)

# Save to Pickle and CSV

In [36]:
# Preview the cleaned frame before saving it.
df.head(5)

Unnamed: 0,ID,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance_mi,Description,Street,...,Traffic_Signal,Sunrise_Sunset,Rain_Bool,Snow_Bool,Thunder_Bool,Wind_Bool,Fog_Bool,Hail_Bool,Sand_Dust_Bool,StateFull
0,A-1,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,Right lane blocked due to accident on I-70 Eas...,I-70 E,...,False,1.0,True,False,False,False,False,False,False,Ohio
1,A-2,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.01,Accident on Brice Rd at Tussing Rd. Expect del...,Brice Rd,...,False,1.0,True,False,False,False,False,False,False,Ohio
2,A-3,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,...,True,1.0,False,False,False,False,False,False,False,Ohio
3,A-4,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,...,False,1.0,False,False,False,False,False,False,False,Ohio
4,A-5,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,...,True,0.0,False,False,False,False,False,False,False,Ohio


In [37]:
# Persist the cleaned frame in both formats, into the working directory.
df.to_pickle('US_Accidents_June20_Cleaned_new.pkl')
# NOTE(review): to_csv writes the index as an extra unnamed first column by
# default; confirm downstream readers expect it (otherwise pass index=False).
df.to_csv('US_Accidents_June20_Cleaned_new.csv')

In [38]:
# Sanity check: missing City values were filled with "" earlier, so this should be 0.
df['City'].isna().sum()

0