In [41]:
# Dependencies
# Data Science Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Regression/Modelling Libraries
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned crash data, discarding the leftover "index" column that
# merely duplicates the row index
df = pd.read_csv("../Data/cleaned_car_crash_data.csv").drop(columns="index")

# Preview the first 5 rows
df.head()

Unnamed: 0,year,month,day,hour,collision_type,injury_type,primary_factor,reported_location,latitude,longitude
0,2015,1,5,0.0,2-Car,No injury/unknown,OTHER,1ST & FESS,39.159207,-86.525874
1,2015,1,6,1500.0,2-Car,No injury/unknown,FOLLOWING TOO CLOSELY,2ND & COLLEGE,39.16144,-86.534848
2,2015,1,6,2300.0,2-Car,Non-incapacitating,DISREGARD SIGNAL/REG SIGN,BASSWOOD & BLOOMFIELD,39.14978,-86.56889
3,2015,1,7,900.0,2-Car,Non-incapacitating,FAILURE TO YIELD RIGHT OF WAY,GATES & JACOBS,39.165655,-86.575956
4,2015,1,7,1100.0,2-Car,No injury/unknown,FAILURE TO YIELD RIGHT OF WAY,W 3RD,39.164848,-86.579625


### One-Hot Encoding for Categorical Columns (Collision Type, Injury Type, Primary Factor)
---
Source: https://www.geeksforgeeks.org/ml-one-hot-encoding/

In [18]:
# Adapted from: https://www.geeksforgeeks.org/ml-one-hot-encoding/

# Object-dtype columns are the categorical ones. reported_location is excluded
# by name (rather than by position) so the selection keeps working if the column
# order of the CSV changes; it is left out because of the number of locations.
categorical_columns = [
    col
    for col in df.select_dtypes(include=["object"]).columns
    if col != "reported_location"
]

# drop="first" omits the first category of every encoded column;
# sparse_output=False makes the encoder return a dense array.
encoder = OneHotEncoder(drop="first", sparse_output=False)

# One-hot encode all of the selected categorical columns
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

# get_feature_names_out() supplies the column names for the encoded data.
# Reusing df.index guarantees the encoded rows line up with df in the concat
# below, even if df does not carry a default 0..n-1 index.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

# Replace the original categorical columns with their encoded counterparts
df_encoded = pd.concat([df.drop(columns=categorical_columns), one_hot_df], axis=1)

df_encoded

Unnamed: 0,year,month,day,hour,reported_location,latitude,longitude,collision_type_1-Car,collision_type_2-Car,collision_type_3+ Cars,...,primary_factor_PEDESTRIAN ACTION,primary_factor_RAN OFF ROAD,primary_factor_ROAD CONDITIONS,primary_factor_SPEED TOO FAST FOR WEATHER CONDITIONS,primary_factor_UNSAFE BACKING,primary_factor_UNSAFE LANE MOVEMENT,primary_factor_UNSAFE SPEED,primary_factor_VEHICLE DEFECT,primary_factor_VIEW OBSTRUCTED,primary_factor_WRONG WAY ON ONE WAY
0,2015,1,5,0.0,1ST & FESS,39.159207,-86.525874,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015,1,6,1500.0,2ND & COLLEGE,39.161440,-86.534848,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2015,1,6,2300.0,BASSWOOD & BLOOMFIELD,39.149780,-86.568890,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2015,1,7,900.0,GATES & JACOBS,39.165655,-86.575956,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2015,1,7,1100.0,W 3RD,39.164848,-86.579625,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52577,2003,10,6,1700.0,DUNN & WHITE LOT WEST,0.000000,0.000000,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52578,2003,11,3,800.0,RED OAK & SR446,0.000000,0.000000,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
52579,2003,12,5,1200.0,2ND ST & WALNUT,0.000000,0.000000,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
52580,2003,12,1,700.0,NINETH & NORTH,0.000000,0.000000,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Random Forest Regressor
---

In [25]:
# reported_location is unlikely to be a relevant feature, so leave it out.
# drop() returns a new frame, so df_encoded itself is not modified.
newdf = df_encoded.drop(columns=["reported_location"])
newdf.head()

Unnamed: 0,year,month,day,hour,latitude,longitude,collision_type_1-Car,collision_type_2-Car,collision_type_3+ Cars,collision_type_Bus,...,primary_factor_PEDESTRIAN ACTION,primary_factor_RAN OFF ROAD,primary_factor_ROAD CONDITIONS,primary_factor_SPEED TOO FAST FOR WEATHER CONDITIONS,primary_factor_UNSAFE BACKING,primary_factor_UNSAFE LANE MOVEMENT,primary_factor_UNSAFE SPEED,primary_factor_VEHICLE DEFECT,primary_factor_VIEW OBSTRUCTED,primary_factor_WRONG WAY ON ONE WAY
0,2015,1,5,0.0,39.159207,-86.525874,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015,1,6,1500.0,39.16144,-86.534848,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2015,1,6,2300.0,39.14978,-86.56889,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2015,1,7,900.0,39.165655,-86.575956,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2015,1,7,1100.0,39.164848,-86.579625,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Keep only rows with plausible coordinates. From the maps, valid crashes have
# a latitude above 10 and a negative longitude; everything else (for example
# the 0.0 / 0.0 entries) is bad location data and is filtered out.
has_valid_latitude = newdf["latitude"] > 10
has_valid_longitude = newdf["longitude"] < 0
ohe_df = newdf[has_valid_latitude & has_valid_longitude]

In [58]:
# The `day` column is the target; every remaining column is used as a feature.
# Reference: https://stackoverflow.com/questions/65749305/labelencoder-vs-onehot-encoding-in-random-forest-regressor
X = ohe_df.drop(columns=["day"])
y = ohe_df["day"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Random forest with 100 trees; n_jobs=-1 lets scikit-learn parallelise the work
rf_reg = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rf_reg.fit(X_train, y_train)

# Evaluate on the held-out rows
test_pred_y = rf_reg.predict(X_test)
test_mse = mean_squared_error(y_test, test_pred_y)
test_r2 = r2_score(y_test, test_pred_y)

print(f"test_MSE = {test_mse}")
print(f"The r-squared is: {test_r2}")

test_MSE = 3.782609656013799
The r-squared is: -0.03986010021098374
