![Add a relevant banner image here](path_to_image)

# Project Title

## Overview

Short project description. Your bottom line up front (BLUF) insights.

## Business Understanding

Text here

## Data Understanding

Text here

In [1]:
# Load relevant imports here
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import gc
import csv
from datetime import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

np.random.seed(42)

In [2]:
# Read the full accident dataset, then print its numeric summary, first rows and column names.
df_all_data = pd.read_csv('Data/US_Accidents_March23.csv')
for overview in (df_all_data.describe(), df_all_data.head(), df_all_data.columns):
    print(overview)

           Severity     Start_Lat     Start_Lng       End_Lat       End_Lng  \
count  7.728394e+06  7.728394e+06  7.728394e+06  4.325632e+06  4.325632e+06   
mean   2.212384e+00  3.620119e+01 -9.470255e+01  3.626183e+01 -9.572557e+01   
std    4.875313e-01  5.076079e+00  1.739176e+01  5.272905e+00  1.810793e+01   
min    1.000000e+00  2.455480e+01 -1.246238e+02  2.456601e+01 -1.245457e+02   
25%    2.000000e+00  3.339963e+01 -1.172194e+02  3.346207e+01 -1.177543e+02   
50%    2.000000e+00  3.582397e+01 -8.776662e+01  3.618349e+01 -8.802789e+01   
75%    2.000000e+00  4.008496e+01 -8.035368e+01  4.017892e+01 -8.024709e+01   
max    4.000000e+00  4.900220e+01 -6.711317e+01  4.907500e+01 -6.710924e+01   

       Distance(mi)  Temperature(F)  Wind_Chill(F)   Humidity(%)  \
count  7.728394e+06    7.564541e+06   5.729375e+06  7.554250e+06   
mean   5.618423e-01    6.166329e+01   5.825105e+01  6.483104e+01   
std    1.776811e+00    1.901365e+01   2.238983e+01  2.282097e+01   
min    0.000000e

In [None]:
# Contingency table of accident counts: one row per severity level, one column per state.
df_severity_by_state = pd.crosstab(index=df_all_data['Severity'], columns=df_all_data['State'])
df_severity_by_state.head()

In [None]:
# Stacked bar chart of accident counts per state, one layer per severity level.
states = df_severity_by_state.columns
# Rows are picked by position; assumes the crosstab rows are severities 1-4 in order - TODO confirm.
sev1, sev2, sev3 = (df_severity_by_state.iloc[pos] for pos in range(3))
sev4 = ct_sev4_by_state = df_severity_by_state.iloc[3]

plt.figure(figsize=(12, 7))
plt.bar(states, sev1)
stack_base = sev1
for layer in (sev2, sev3, sev4):
    # each layer starts where the running total of the layers below it ends
    plt.bar(states, layer, bottom=stack_base)
    stack_base = stack_base + layer
plt.title("Accident Severity by State (2016 - 2023)")
plt.xlabel("State")
plt.ylabel("Number of Accidents")
plt.xticks(rotation=60)
plt.legend(["Severity 1", "Severity 2", "Severity 3", "Severity 4"])
plt.show()

In [None]:
# chart of accident severity by state without sev2 (sev 2 >> than the others so obscures sev 1, 3, 4 above)
states = df_severity_by_state.columns
sev1 = df_severity_by_state.iloc[0]
sev3 = df_severity_by_state.iloc[2]
# ct_sev4_by_state is a second name for the same row; kept in case other cells rely on it
sev4 = ct_sev4_by_state = df_severity_by_state.iloc[3]

plt.figure(figsize=(12, 7))
# Stack the three series. Drawing them all from a zero baseline at the same x
# positions paints each bar over the previous one, so the shorter series in a
# state was hidden behind the taller ones.
plt.bar(states, sev1)
plt.bar(states, sev3, bottom=sev1)
plt.bar(states, sev4, bottom=sev1 + sev3)
plt.xlabel("State")
plt.xticks(rotation=60)
plt.ylabel("Number of Accidents")
plt.legend(["Severity 1", "Severity 3", "Severity 4"])
plt.title("Accident Severity by State, Excluding Severity 2 (2016 - 2023)")
plt.show()

In [None]:
# Severity-by-crossing contingency table, with the boolean column labels renamed to No / Yes.
df_sev_by_crossing = pd.crosstab(
    index=df_all_data['Severity'],
    columns=df_all_data['Crossing'],
).rename(columns={False: "No", True: "Yes"})
df_sev_by_crossing.head()

In [None]:
# chart of accident severity by proximity to a crossing
crossing = df_sev_by_crossing.columns
cr1 = df_sev_by_crossing.iloc[0]
cr2 = df_sev_by_crossing.iloc[1]
cr3 = df_sev_by_crossing.iloc[2]
cr4 = df_sev_by_crossing.iloc[3]

plt.figure(figsize=(10, 7))
# Stack the severities. Drawn from a shared zero baseline, each bar was painted
# over the previous one, so the smaller counts were hidden behind the larger ones.
plt.bar(crossing, cr1)
plt.bar(crossing, cr2, bottom=cr1)
plt.bar(crossing, cr3, bottom=cr1 + cr2)
plt.bar(crossing, cr4, bottom=cr1 + cr2 + cr3)
plt.xticks(rotation=60)
plt.xlabel("Nearby Crossing")
plt.ylabel("Number of Accidents")
plt.legend(["Severity 1", "Severity 2", "Severity 3", "Severity 4"])
plt.title("Accident Severity by Proximity to Crossing, (2016 - 2023)")
plt.show()

In [None]:
# number of distinct Wind_Direction labels in the raw data
print(f"Count of Wind Direction Entries: {df_all_data['Wind_Direction'].nunique()}")

## Data Preparation

### Data Selection

Based on my exploration of the data, I'm dropping the following fields from the dataset for the following reasons:

- Source: contains information that has no relationship to causes and effects of accidents
- Start_Lat, Start_Lng, End_Lat, End_Lng, Street, City, County, State, Zipcode, Country: I'm going to model factors that influence accident severity, independent of the part of the country the accident occurs in.
    - I'll retain Zipcode and State until after data cleaning and construction since they'll be used for imputation and creating a new field
- Description: unstructured text that will not give meaningful results with the planned modeling
- Timezone: duplicates zip/state with less precision
- Street: this field is not standardized and will introduce noise to the modeling
- Country: all data is from the United States so this field is redundant
- Airport_Code: doesn't provide germane information — the exact location where weather conditions are reported is not a variable that can be adjusted
- Weather_Timestamp: not related to the conditions of the accidents in any way
- Wind_Direction: too many unique values; values are also not related to travel directions so it's unlikely they'll produce clear/actionable conclusions


In [3]:
# Drop the fields ruled out above. State and Zipcode stay for now because the
# cleaning steps below group on them.
df_refined = df_all_data.drop(columns=['ID', 'Source', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Description',
                                       'Timezone', 'Street', 'City', 'County', 'Country', 'Airport_Code', 'Weather_Timestamp', 'Wind_Direction'])

print(df_refined.describe())
# info() prints its report itself and returns None, so wrapping it in print()
# added a stray "None" line to the output
df_refined.info()
print(df_refined.columns)

           Severity  Distance(mi)  Temperature(F)  Wind_Chill(F)  \
count  7.728394e+06  7.728394e+06    7.564541e+06   5.729375e+06   
mean   2.212384e+00  5.618423e-01    6.166329e+01   5.825105e+01   
std    4.875313e-01  1.776811e+00    1.901365e+01   2.238983e+01   
min    1.000000e+00  0.000000e+00   -8.900000e+01  -8.900000e+01   
25%    2.000000e+00  0.000000e+00    4.900000e+01   4.300000e+01   
50%    2.000000e+00  3.000000e-02    6.400000e+01   6.200000e+01   
75%    2.000000e+00  4.640000e-01    7.600000e+01   7.500000e+01   
max    4.000000e+00  4.417500e+02    2.070000e+02   2.070000e+02   

        Humidity(%)  Pressure(in)  Visibility(mi)  Wind_Speed(mph)  \
count  7.554250e+06  7.587715e+06    7.551296e+06     7.157161e+06   
mean   6.483104e+01  2.953899e+01    9.090376e+00     7.685490e+00   
std    2.282097e+01  1.006190e+00    2.688316e+00     5.424983e+00   
min    1.000000e+00  0.000000e+00    0.000000e+00     0.000000e+00   
25%    4.800000e+01  2.937000e+01    

In [4]:
# df_refined was built from the raw frame above; drop the raw frame and reclaim its memory
del df_all_data
gc.collect()

16

### Data Cleaning

##### Missing Values

I've managed missing values in the code blocks below. Here's a brief explanation of my approach to each column:

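- Temperature, Wind_Chill, Humidity, Pressure, Visibility: each imputed with the mean of that same field across other accident entries sharing the same day and zip code; if there are none in the zip code, the mean for the same day and state is used, and the overall column mean is the last resort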
- Precipitation, Wind_Speed: assumed NaN indicates no precipitation/wind and replaced NaN with zero
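- Sunrise_Sunset: imputed based on the accident start time: 'Day' from 06:00 up to (but not including) 18:00, 'Night' otherwise
- Civil_Twilight: imputed based on the same start-time rule as Sunrise_Sunset
- Nautical_Twilight: imputed based on the same start-time rule as Sunrise_Sunset
- Astronomical_Twilight: imputed based on the same start-time rule as Sunrise_Sunset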

In [5]:
# Parse both timestamp columns, then split the start timestamp into a calendar
# date and a time of day for the grouping and day/night steps that follow.
for ts_col in ('Start_Time', 'End_Time'):
    df_refined[ts_col] = pd.to_datetime(df_refined[ts_col], yearfirst=True, format='mixed')

start_parts = df_refined['Start_Time'].dt
df_refined['Acc_date'] = start_parts.date
df_refined['Acc_time'] = start_parts.time

In [6]:
# replacing NaN values in the temperature column

def fill_missing_temp(df, date='Acc_date', zip='Zipcode', state='State', temp='Temperature(F)'):
    """
    Return a copy of ``df`` with NaN temperature values imputed.

    Values are filled in up to three passes, each applied only to the entries
    still missing after the previous one:

    1. mean temperature of the rows sharing the same date and zip code
    2. mean temperature of the rows sharing the same date and state
    3. mean temperature of the whole column (last resort)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``date``, ``zip``, ``state`` and ``temp`` columns.
        It is not modified.
    date, zip, state, temp : str
        Names of the date, zip code, state and temperature columns.
        (``zip`` shadows the built-in of the same name inside this function;
        the parameter name is kept so existing keyword calls still work.)

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``temp`` column filled.
    """
    # Work on a copy of the frame that was passed in. This used to copy the
    # notebook-level df_refined, which silently ignored the ``df`` argument.
    df_temp_fill = df.copy()

    # Step 1: mean temperature for each date-zip combination, aligned to the rows
    temp_means_zip = df_temp_fill.groupby([date, zip])[temp].transform('mean')

    # fill NaN values with the date-zip group mean
    df_temp_fill[temp] = df_temp_fill[temp].fillna(temp_means_zip)

    # Step 2: date-state combination for remaining NaNs
    if df_temp_fill[temp].isna().any():
        remaining = df_temp_fill[temp].isna().sum()
        print(f"Info: {remaining} temperature entries still missing after date-zip fill.")
        print("Filling remaining with date-state mean.")

        temp_means_state = df_temp_fill.groupby([date, state])[temp].transform('mean')
        df_temp_fill[temp] = df_temp_fill[temp].fillna(temp_means_state)

    # Step 3: if any NaNs still remain, fill with overall mean as last resort
    if df_temp_fill[temp].isna().any():
        remaining = df_temp_fill[temp].isna().sum()
        print(f"Warning: {remaining} temperature entries still missing after date-state fill.")
        print("Filling remaining with overall mean as last resort.")
        df_temp_fill[temp] = df_temp_fill[temp].fillna(df_temp_fill[temp].mean())

    return df_temp_fill

# impute into a working frame; the filled column is copied back to df_refined in a later cell
df_temp_fill = fill_missing_temp(df_refined)

Info: 158386 temperature entries still missing after date-zip fill.
Filling remaining with date-state mean.
Filling remaining with overall mean as last resort.


In [7]:
# verifying NaNs have been replaced in the working df
# (the cell displays the number of temperature values that are still NaN)
df_temp_fill['Temperature(F)'].isna().sum()

0

In [8]:
# writing NaN-free column back to the original dataframe
# (only the temperature column is taken from the working frame)
df_refined['Temperature(F)'] = df_temp_fill['Temperature(F)']
print(f"Entries with NaN Temperature: {df_refined['Temperature(F)'].isna().sum()}")

Entries with NaN Temperature: 0


In [9]:
# deleting working df to free up memory
# (its temperature column has already been copied into df_refined)
del df_temp_fill
gc.collect()

65

In [10]:
# replacing NaN values in the wind chill column

def fill_missing_windchill(df, date='Acc_date', zip='Zipcode', state='State', windchill='Wind_Chill(F)'):
    """
    Return a copy of ``df`` with NaN wind chill values imputed.

    Values are filled in up to three passes, each applied only to the entries
    still missing after the previous one:

    1. mean wind chill of the rows sharing the same date and zip code
    2. mean wind chill of the rows sharing the same date and state
    3. mean wind chill of the whole column (last resort)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``date``, ``zip``, ``state`` and ``windchill``
        columns. It is not modified.
    date, zip, state, windchill : str
        Names of the date, zip code, state and wind chill columns.
        (``zip`` shadows the built-in of the same name inside this function;
        the parameter name is kept so existing keyword calls still work.)

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``windchill`` column filled.
    """
    # Work on a copy of the frame that was passed in. This used to copy the
    # notebook-level df_refined, which silently ignored the ``df`` argument.
    df_windchill_fill = df.copy()

    # Step 1: mean wind chill for each date-zip combination, aligned to the rows
    windchill_means_zip = df_windchill_fill.groupby([date, zip])[windchill].transform('mean')

    # fill NaN values with the date-zip group mean
    df_windchill_fill[windchill] = df_windchill_fill[windchill].fillna(windchill_means_zip)

    # Step 2: date-state combination for remaining NaNs
    if df_windchill_fill[windchill].isna().any():
        remaining = df_windchill_fill[windchill].isna().sum()
        print(f"Info: {remaining} wind chills still missing after date-zip fill.")
        print("Filling remaining with date-state mean.")

        windchill_means_state = df_windchill_fill.groupby([date, state])[windchill].transform('mean')
        df_windchill_fill[windchill] = df_windchill_fill[windchill].fillna(windchill_means_state)

    # Step 3: if any NaNs still remain, fill with overall mean as last resort
    if df_windchill_fill[windchill].isna().any():
        remaining = df_windchill_fill[windchill].isna().sum()
        print(f"Warning: {remaining} wind chills still missing after date-state fill.")
        print("Filling remaining with overall mean as last resort.")
        df_windchill_fill[windchill] = df_windchill_fill[windchill].fillna(df_windchill_fill[windchill].mean())

    return df_windchill_fill

# impute into a working frame; the filled column is copied back to df_refined in a later cell
df_windchill_fill = fill_missing_windchill(df_refined)

Info: 1949795 wind chills still missing after date-zip fill.
Filling remaining with date-state mean.
Filling remaining with overall mean as last resort.


In [11]:
# verifying NaNs have been replaced in the working df
# (the cell displays the number of wind chill values that are still NaN)
df_windchill_fill['Wind_Chill(F)'].isna().sum()

0

In [12]:
# writing NaN-free column back to the original dataframe
# (only the wind chill column is taken from the working frame)
df_refined['Wind_Chill(F)'] = df_windchill_fill['Wind_Chill(F)']
print(f"Entries with NaN Wind Chill: {df_refined['Wind_Chill(F)'].isna().sum()}")

Entries with NaN Wind Chill: 0


In [13]:
# deleting working df to free up memory
# (its wind chill column has already been copied into df_refined)
del df_windchill_fill
gc.collect()

48

In [14]:
# replacing NaN values in the humidity column

def fill_missing_hum(df, date='Acc_date', zip='Zipcode', state='State', hum='Humidity(%)'):
    """
    Return a copy of ``df`` with NaN humidity values imputed.

    Values are filled in up to three passes, each applied only to the entries
    still missing after the previous one:

    1. mean humidity of the rows sharing the same date and zip code
    2. mean humidity of the rows sharing the same date and state
    3. mean humidity of the whole column (last resort)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``date``, ``zip``, ``state`` and ``hum`` columns.
        It is not modified.
    date, zip, state, hum : str
        Names of the date, zip code, state and humidity columns.
        (``zip`` shadows the built-in of the same name inside this function;
        the parameter name is kept so existing keyword calls still work.)

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``hum`` column filled.
    """
    # Work on a copy of the frame that was passed in. This used to copy the
    # notebook-level df_refined, which silently ignored the ``df`` argument.
    df_hum_fill = df.copy()

    # Step 1: mean humidity for each date-zip combination, aligned to the rows
    hum_means_zip = df_hum_fill.groupby([date, zip])[hum].transform('mean')

    # fill NaN values with the date-zip group mean
    df_hum_fill[hum] = df_hum_fill[hum].fillna(hum_means_zip)

    # Step 2: date-state combination for remaining NaNs
    if df_hum_fill[hum].isna().any():
        remaining = df_hum_fill[hum].isna().sum()
        print(f"Info: {remaining} humidity entries still missing after date-zip fill.")
        print("Filling remaining with date-state mean.")

        hum_means_state = df_hum_fill.groupby([date, state])[hum].transform('mean')
        df_hum_fill[hum] = df_hum_fill[hum].fillna(hum_means_state)

    # Step 3: if any NaNs still remain, fill with overall mean as last resort
    if df_hum_fill[hum].isna().any():
        remaining = df_hum_fill[hum].isna().sum()
        print(f"Warning: {remaining} humidity entries still missing after date-state fill.")
        print("Filling remaining with overall mean as last resort.")
        df_hum_fill[hum] = df_hum_fill[hum].fillna(df_hum_fill[hum].mean())

    return df_hum_fill

# impute into a working frame; the filled column is copied back to df_refined in a later cell
df_hum_fill = fill_missing_hum(df_refined)

Info: 166329 humidity entries still missing after date-zip fill.
Filling remaining with date-state mean.
Filling remaining with overall mean as last resort.


In [15]:
# verifying NaNs have been replaced in the working df
# (the cell displays the number of humidity values that are still NaN)
df_hum_fill['Humidity(%)'].isna().sum()

0

In [16]:
# writing NaN-free column back to the original dataframe
# (only the humidity column is taken from the working frame)
df_refined['Humidity(%)'] = df_hum_fill['Humidity(%)']
print(f"Entries with NaN humidity: {df_refined['Humidity(%)'].isna().sum()}")

Entries with NaN humidity: 0


In [17]:
# deleting working df to free up memory
# (its humidity column has already been copied into df_refined)
del df_hum_fill
gc.collect()

48

In [18]:
# replacing NaN values in the pressure column

def fill_missing_press(df, date='Acc_date', zip='Zipcode', state='State', press='Pressure(in)'):
    """
    Return a copy of ``df`` with NaN pressure values imputed.

    Values are filled in up to three passes, each applied only to the entries
    still missing after the previous one:

    1. mean pressure of the rows sharing the same date and zip code
    2. mean pressure of the rows sharing the same date and state
    3. mean pressure of the whole column (last resort)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``date``, ``zip``, ``state`` and ``press``
        columns. It is not modified.
    date, zip, state, press : str
        Names of the date, zip code, state and pressure columns.
        (``zip`` shadows the built-in of the same name inside this function;
        the parameter name is kept so existing keyword calls still work.)

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``press`` column filled.
    """
    # Work on a copy of the frame that was passed in. This used to copy the
    # notebook-level df_refined, which silently ignored the ``df`` argument.
    df_press_fill = df.copy()

    # Step 1: mean pressure for each date-zip combination, aligned to the rows
    press_means_zip = df_press_fill.groupby([date, zip])[press].transform('mean')

    # fill NaN values with the date-zip group mean
    df_press_fill[press] = df_press_fill[press].fillna(press_means_zip)

    # Step 2: date-state combination for remaining NaNs
    if df_press_fill[press].isna().any():
        remaining = df_press_fill[press].isna().sum()
        print(f"Info: {remaining} pressure entries still missing after date-zip fill.")
        print("Filling remaining with date-state mean.")

        press_means_state = df_press_fill.groupby([date, state])[press].transform('mean')
        df_press_fill[press] = df_press_fill[press].fillna(press_means_state)

    # Step 3: if any NaNs still remain, fill with overall mean as last resort
    if df_press_fill[press].isna().any():
        remaining = df_press_fill[press].isna().sum()
        print(f"Warning: {remaining} pressure entries still missing after date-state fill.")
        print("Filling remaining with overall mean as last resort.")
        df_press_fill[press] = df_press_fill[press].fillna(df_press_fill[press].mean())

    return df_press_fill

# impute into a working frame; the filled column is copied back to df_refined in a later cell
df_press_fill = fill_missing_press(df_refined)

Info: 137561 pressure entries still missing after date-zip fill.
Filling remaining with date-state mean.
Filling remaining with overall mean as last resort.


In [19]:
# verifying NaNs have been replaced in the working df
# (the cell displays the number of pressure values that are still NaN)
df_press_fill['Pressure(in)'].isna().sum()

0

In [20]:
# writing NaN-free column back to the original dataframe
# (only the pressure column is taken from the working frame)
df_refined['Pressure(in)'] = df_press_fill['Pressure(in)']
print(f"Entries with NaN pressure: {df_refined['Pressure(in)'].isna().sum()}")

Entries with NaN pressure: 0


In [21]:
# deleting working df to free up memory
# (its pressure column has already been copied into df_refined)
del df_press_fill
gc.collect()

48

In [22]:
# replacing NaN values in the visibility column

def fill_missing_vis(df, date='Acc_date', zip='Zipcode', state='State', vis='Visibility(mi)'):
    """
    Return a copy of ``df`` with NaN visibility values imputed.

    Values are filled in up to three passes, each applied only to the entries
    still missing after the previous one:

    1. mean visibility of the rows sharing the same date and zip code
    2. mean visibility of the rows sharing the same date and state
    3. mean visibility of the whole column (last resort)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``date``, ``zip``, ``state`` and ``vis`` columns.
        It is not modified.
    date, zip, state, vis : str
        Names of the date, zip code, state and visibility columns.
        (``zip`` shadows the built-in of the same name inside this function;
        the parameter name is kept so existing keyword calls still work.)

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``vis`` column filled.
    """
    # Work on a copy of the frame that was passed in. This used to copy the
    # notebook-level df_refined, which silently ignored the ``df`` argument.
    df_vis_fill = df.copy()

    # Step 1: mean visibility for each date-zip combination, aligned to the rows
    vis_means_zip = df_vis_fill.groupby([date, zip])[vis].transform('mean')

    # fill NaN values with the date-zip group mean
    df_vis_fill[vis] = df_vis_fill[vis].fillna(vis_means_zip)

    # Step 2: date-state combination for remaining NaNs
    if df_vis_fill[vis].isna().any():
        remaining = df_vis_fill[vis].isna().sum()
        print(f"Info: {remaining} visibility entries still missing after date-zip fill.")
        print("Filling remaining with date-state mean.")

        vis_means_state = df_vis_fill.groupby([date, state])[vis].transform('mean')
        df_vis_fill[vis] = df_vis_fill[vis].fillna(vis_means_state)

    # Step 3: if any NaNs still remain, fill with overall mean as last resort
    if df_vis_fill[vis].isna().any():
        remaining = df_vis_fill[vis].isna().sum()
        print(f"Warning: {remaining} visibility entries still missing after date-state fill.")
        print("Filling remaining with overall mean as last resort.")
        df_vis_fill[vis] = df_vis_fill[vis].fillna(df_vis_fill[vis].mean())

    return df_vis_fill

# impute into a working frame; the filled column is copied back to df_refined in a later cell
df_vis_fill = fill_missing_vis(df_refined)

Info: 171980 visibility entries still missing after date-zip fill.
Filling remaining with date-state mean.
Filling remaining with overall mean as last resort.


In [23]:
# verifying NaNs have been replaced in the working df
# (the cell displays the number of visibility values that are still NaN)
df_vis_fill['Visibility(mi)'].isna().sum()

0

In [24]:
# writing NaN-free column back to the original dataframe
# (only the visibility column is taken from the working frame)
df_refined['Visibility(mi)'] = df_vis_fill['Visibility(mi)']
print(f"Entries with NaN visibility: {df_refined['Visibility(mi)'].isna().sum()}")

Entries with NaN visibility: 0


In [25]:
# deleting working df to free up memory
# (its visibility column has already been copied into df_refined)
del df_vis_fill
gc.collect()

48

In [26]:
# A missing wind speed or precipitation reading is treated as "none", so both
# columns get 0 in place of NaN; the prints confirm nothing is left missing.
for zero_fill_col in ('Wind_Speed(mph)', 'Precipitation(in)'):
    df_refined[zero_fill_col] = df_refined[zero_fill_col].fillna(0)

print(f"Entries with NaN Wind Speed: {df_refined['Wind_Speed(mph)'].isna().sum()}")
print(f"Entries with NaN Precipitation: {df_refined['Precipitation(in)'].isna().sum()}")

Entries with NaN Wind Speed: 0
Entries with NaN Precipitation: 0


In [27]:
# list the columns still in the frame before the remaining cleaning steps
df_refined.columns

Index(['Severity', 'Start_Time', 'End_Time', 'Distance(mi)', 'State',
       'Zipcode', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'Acc_date', 'Acc_time'],
      dtype='object')

In [28]:
# Road-feature flags are stored as booleans; convert each one to 0/1 integers.
road_feature_flags = [
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
    'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
]
for flag_col in road_feature_flags:
    df_refined[flag_col] = df_refined[flag_col].astype(int)

In [29]:
# checking to see if the day/night data points are missing for the same entries
day_night_cols =['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
# per row: how many of the four day/night fields are NaN (0 to 4)
blank_counts = df_refined[day_night_cols].isna().sum(axis=1)
# number of rows at each missing-field count; only 0 and 4 appearing means the
# four fields are always missing together
results = blank_counts.value_counts().sort_index()
results

0    7705148
4      23246
Name: count, dtype: int64

In [30]:
# the missingness check is done; release its intermediate objects
del blank_counts
del results
gc.collect()

32

In [31]:
# creating column displaying Day or Night based on a day with equal length days and nights
def day_or_night(t):
    """Label a time of day: 'Day' from 06:00:00 up to (not including) 18:00:00, else 'Night'."""
    in_daytime_window = time(6, 0, 0) <= t < time(18, 0, 0)
    return 'Day' if in_daytime_window else 'Night'

# label every accident from its start time of day, then show how many fall in each label
df_refined['Day_Night_Calc'] = df_refined['Acc_time'].apply(day_or_night)
df_refined['Day_Night_Calc'].value_counts()

Day_Night_Calc
Day      5515796
Night    2212598
Name: count, dtype: int64

In [32]:
# Work on a copy so the fill can be checked before it is written back.
df_refined_dup2 = df_refined.copy()

# Wherever one of the day/night fields is missing, take the clock-based label instead.
clock_label = df_refined_dup2['Day_Night_Calc']
for col in day_night_cols:
    recorded = df_refined_dup2[col]
    df_refined_dup2[col] = np.where(recorded.isna(), clock_label, recorded)

In [33]:
# verifying that no empty entries remain
# per row: how many of the four day/night fields are still NaN in the working copy
blank_counts2 = df_refined_dup2[day_night_cols].isna().sum(axis=1)
# every row should now fall under a count of 0
results2 = blank_counts2.value_counts().sort_index()
results2

0    7728394
Name: count, dtype: int64

In [34]:
# Copy the filled day/night columns from the working copy back into df_refined.
for filled_col in ('Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'):
    df_refined[filled_col] = df_refined_dup2[filled_col]

In [35]:
# the filled columns are back in df_refined; release the working copy and the check result
del df_refined_dup2
del results2
gc.collect()

80

#### Cleaning Weather Condition

The Weather_Condition field has a large number of values across the dataset. Many of these values are used rarely and will complicate modeling, so I'll consolidate them into a smaller number of value options. This will be a manual process — I'll review and create a mapping table by hand.

In [None]:
# frequency of each raw weather condition label, rarest first
# (the earlier unique-count print read from df_all_data, which has been deleted by this point)
df_refined['Weather_Condition'].value_counts().sort_values(ascending=True)

In [None]:
# printing a list of weather condition unique values to csv
# (starting point for the hand-built consolidation mapping; to_csv is called with
# its defaults, so the file also gets an index column and a header row)
unique_wthr_cond = pd.Series(df_refined['Weather_Condition'].unique())
unique_wthr_cond.to_csv('conditions.csv')

In [36]:
def csv_to_dict_reader(filename, encoding=None):
    """
    Read a CSV file into a dict mapping each row's first field to its second.

    Rows with fewer than two fields (including blank lines) are skipped, and
    any fields after the second are ignored. No row is treated as a header.
    If a key appears more than once, the last occurrence wins.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform default, as before.

    Returns
    -------
    dict
        ``{first_field: second_field}`` with both stored as strings.
    """
    result = {}
    # newline='' is what the csv module asks for: it lets the reader handle line
    # endings itself so newlines inside quoted fields are read back unchanged
    with open(filename, 'r', newline='', encoding=encoding) as file:
        for row in csv.reader(file):
            if len(row) >= 2:
                result[row[0]] = row[1]
    return result

In [37]:
# Load the hand-built mapping from raw weather labels to consolidated categories.
weather_mapper = csv_to_dict_reader('conditions_consolidated.csv')

# Apply it on a copy so the result can be inspected before df_refined is changed.
# NOTE(review): labels absent from the mapping come out of .map() as NaN - confirm the mapping is complete.
df_refined_dup1 = df_refined.copy()
df_refined_dup1['Weather_Condition'] = df_refined_dup1['Weather_Condition'].map(weather_mapper)

consolidated_weather = df_refined_dup1['Weather_Condition']
print(consolidated_weather.value_counts().sort_values(ascending=True))
print(consolidated_weather.nunique())

Weather_Condition
Hail                             133
Freezing Rain                    262
Sleet                            319
Blowing Ash / Dust / Sand        769
Light Freezing Fog              1001
Blowing Snow                    1673
Mist                            3554
Light Sleet                     5319
Heavy Snow                      5922
Light Fog                       7271
Smoke                          12764
Wintry  Mix                    12768
Heavy Thunderstorm             13259
Snow                           16850
Tornado                        17664
Thunderstorm                   21501
Light Thunderstorm             22545
Heavy Rain                     33843
Hazy                           77828
Rain                           87340
Fog                            99981
Light Snow                    135662
Light Rain                    402997
Partly Cloudy                 915556
Cloudy                       2249686
Fair                         3408468
Name: count, dtype: 

In [38]:
# write the consolidated weather categories back to df_refined and confirm the category count
df_refined['Weather_Condition'] = df_refined_dup1['Weather_Condition']
df_refined['Weather_Condition'].nunique()

26

In [39]:
# the consolidated column is back in df_refined; release the working copy and the mapping
del df_refined_dup1
del weather_mapper
gc.collect()

64

### Constructing Data

I'm adding a feature that categorizes the location of the accidents as urban / suburban / rural. I'm using Rural-Urban Commuting Area Codes from the USDA Economic Research Service as the data source. https://www.ers.usda.gov/data-products/rural-urban-commuting-area-codes

In [40]:
# lookup table read from category_by_zip.csv: first CSV field -> second CSV field
# (presumably zip code -> Urban / Suburban / Rural; verify against the file)
loc_type_mapper = csv_to_dict_reader('category_by_zip.csv')

In [41]:
# Keep only the first five characters of each zip code so it matches the
# 5-digit keys of the lookup. Note astype(str) turns a missing zip code into
# the literal string 'nan'.
df_refined['Zipcode'] = df_refined['Zipcode'].astype(str).str[:5]

# Look up the location type for each zip code, then count the rows that found no match.
df_refined['Location_Type'] = df_refined['Zipcode'].map(loc_type_mapper)
df_refined['Location_Type'].isna().sum()

1919

In [42]:
# checking for clear pattern(s) in the missing location type rows
# (selects the zip codes of the rows whose location type lookup found no match)
nan_zip_codes = df_refined.loc[df_refined['Location_Type'].isna(), 'Zipcode']
nan_zip_codes

7794       nan
13889      nan
14515      nan
16208      nan
23539      nan
          ... 
7722065    nan
7723723    nan
7724048    nan
7724049    nan
7726723    nan
Name: Zipcode, Length: 1919, dtype: object

In [43]:
# determining proportions of location types in the whole dataset; will impute the NaNs with the same proportion
# (value_counts leaves NaN out, so these are counts over the rows that did get a location type)
df_refined['Location_Type'].value_counts()

Location_Type
Urban       6426638
Suburban     781803
Rural        518034
Name: count, dtype: int64

In [47]:
# filling missing values in line with value distribution in the rest of the dataset
# The shares are measured from the rows that already have a location type rather
# than typed in by hand, so they cannot drift out of step with the data.
observed_shares = df_refined['Location_Type'].value_counts(normalize=True)
location_types = observed_shares.index.to_list()
proportions = observed_shares.to_numpy()

missing_location = df_refined['Location_Type'].isna()
num_missing = missing_location.sum()
# draws come from NumPy's global random state
random_values = np.random.choice(location_types, size=num_missing, p=proportions)

df_refined.loc[missing_location, 'Location_Type'] = random_values

In [48]:
# verifying all rows have been filled
# Recount from the column itself: num_missing was computed before the fill, so
# on a fresh top-to-bottom run it still holds the pre-fill count.
df_refined['Location_Type'].isna().sum()

0

In [None]:
# list the current columns to pick out the helper fields that can now be removed
df_refined.columns

In [52]:
# State and Zipcode were only kept for imputation and the location lookup;
# Acc_date, Acc_time and Day_Night_Calc were created during cleaning. Drop all five.
cleaning_only_cols = ['State', 'Zipcode', 'Acc_date', 'Acc_time', 'Day_Night_Calc']
df_cleaned = df_refined.drop(columns=cleaning_only_cols)
df_cleaned.columns

Index(['Severity', 'Start_Time', 'End_Time', 'Distance(mi)', 'Temperature(F)',
       'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'Location_Type'],
      dtype='object')

In [53]:
# one-hot encode the categorical fields; drop_first=True omits the first level of
# each field so the remaining indicator columns are not redundant
# NOTE(review): Start_Time and End_Time are still datetime columns in df_cleaned and
# pass through unchanged - confirm they are handled before modeling
df_encoded = pd.get_dummies(df_cleaned, columns=['Weather_Condition', 'Sunrise_Sunset', 'Civil_Twilight', 
                                                 'Nautical_Twilight', 'Astronomical_Twilight', 'Location_Type'], drop_first=True)

In [55]:
# df_encoded was built from df_cleaned above; release the un-encoded frame
del df_cleaned
gc.collect()

77

## Analysis

Text here

## Evaluation

### Business Insight/Recommendation 1

### Business Insight/Recommendation 2

### Business Insight/Recommendation 3

### Tableau Dashboard link

## Conclusion and Next Steps
Text here