In [3]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle, geodesic
from shapely.geometry import MultiPoint
import folium
import seaborn as sns
from scipy.stats import gaussian_kde
from scipy.spatial.distance import cdist
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lit
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, ArrayType, DoubleType

In [5]:
# Suppress scientific notations for large float values
pd.options.display.float_format = '{:.4f}'.format
pd.set_option('display.max_columns', None)

### This is the dataframe from the previous DBSCAN processing

In [6]:
sd_collision = pd.read_csv(
    'pre_rf_data.csv', header='infer', engine='pyarrow')

### Selecting only the relevant columns

In [7]:
sd_collision_selected = sd_collision[["Severity", "Start_Time", "Start_Lat", "Start_Lng", "Temperature_F", "Wind_Chill_F", "Humidity_pct", "Pressure_in", "Visibility_mi", "Wind_Speed_mph", "Precipitation_in", "Weather_Condition",
                                      "Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway", "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop", "is_hotspot"]]

In [8]:
sd_collision_selected

Unnamed: 0,Severity,Start_Time,Start_Lat,Start_Lng,Temperature_F,Wind_Chill_F,Humidity_pct,Pressure_in,Visibility_mi,Wind_Speed_mph,Precipitation_in,Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,is_hotspot
0,2,2016-06-21 10:39:52,33.1707,-117.2078,69.1000,,84.0000,29.9700,9.0000,6.9000,,Partly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,1
1,3,2016-06-21 10:57:39,32.7660,-117.1283,69.1000,,75.0000,29.9800,9.0000,8.1000,,Scattered Clouds,False,False,False,False,False,False,False,False,False,False,False,False,False,1
2,3,2016-06-21 10:50:48,32.9323,-117.1127,73.0000,,66.0000,29.9700,10.0000,8.1000,,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,1
3,3,2016-06-21 10:50:13,33.0938,-117.0844,69.1000,,84.0000,29.9700,9.0000,6.9000,,Partly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,1
4,2,2016-06-21 11:12:25,32.7029,-117.0130,75.2000,,61.0000,29.9700,10.0000,11.5000,,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104160,2,2019-08-22 18:14:31,33.0701,-117.0693,79.0000,79.0000,47.0000,28.3300,10.0000,8.0000,0.0000,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,1
104161,2,2019-08-22 18:35:31,33.1811,-117.3357,68.0000,68.0000,76.0000,29.4300,8.0000,10.0000,0.0000,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,1
104162,2,2019-08-23 15:33:46,32.7735,-117.1612,74.0000,74.0000,62.0000,29.3900,10.0000,7.0000,0.0000,Fair,False,False,False,False,True,False,False,False,False,False,False,False,False,1
104163,2,2019-08-23 16:43:59,32.7293,-117.1068,75.0000,75.0000,64.0000,29.8000,10.0000,7.0000,0.0000,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,1


## Data Cleanup

### Fill nulls with the mean values for numerical features

In [None]:
# Replace NaNs with column mean for numerical data
sd_collision_selected.loc[:, 'Wind_Chill_F'] = sd_collision_selected['Wind_Chill_F'].fillna(
    sd_collision_selected['Wind_Chill_F'].mean())
sd_collision_selected.loc[:, 'Temperature_F'] = sd_collision_selected['Temperature_F'].fillna(
    sd_collision_selected['Temperature_F'].mean())
sd_collision_selected.loc[:, 'Humidity_pct'] = sd_collision_selected['Humidity_pct'].fillna(
    sd_collision_selected['Humidity_pct'].mean())
sd_collision_selected.loc[:, 'Pressure_in'] = sd_collision_selected['Pressure_in'].fillna(
    sd_collision_selected['Pressure_in'].mean())
sd_collision_selected.loc[:, 'Visibility_mi'] = sd_collision_selected['Visibility_mi'].fillna(
    sd_collision_selected['Visibility_mi'].mean())
sd_collision_selected.loc[:, 'Wind_Speed_mph'] = sd_collision_selected['Wind_Speed_mph'].fillna(
    sd_collision_selected['Wind_Speed_mph'].mean())
sd_collision_selected.loc[:, 'Precipitation_in'] = sd_collision_selected['Precipitation_in'].fillna(
    sd_collision_selected['Precipitation_in'].mean())


# Fill the categorical features with unknown
sd_collision_selected.loc[:, 'Weather_Condition'] = sd_collision_selected['Weather_Condition'].fillna(
    'Unknown')

### Fill all boolean nulls with FALSE

In [10]:
# Automatically identify boolean columns
boolean_columns = sd_collision_selected.select_dtypes(include='bool').columns

# Convert all boolean columns to 0/1 and change the datatype to integer
for col in boolean_columns:
    # First, ensure that NaNs are replaced with False (or True, depending on your case)
    sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
        False).astype(int)

  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(
  sd_collision_selected.loc[:, col] = sd_collision_selected[col].fillna(


In [12]:
ditinct_Weather_Condition = sd_collision_selected['Weather_Condition'].unique()
print((ditinct_Weather_Condition))

['Partly Cloudy' 'Scattered Clouds' 'Mostly Cloudy' 'Clear' 'Overcast'
 'Unknown' 'Mist' 'Haze' 'Volcanic Ash' 'Fog' 'Drizzle' 'Light Rain'
 'Rain' 'Heavy Rain' 'Light Drizzle' 'Thunderstorms and Rain' 'Hail'
 'Light Thunderstorms and Rain' 'Fair' 'Cloudy' 'Mostly Cloudy / Windy'
 'Fair / Windy' 'Heavy Rain / Windy' 'Light Rain / Windy' 'T-Storm'
 'Thunder' 'Light Snow' 'Thunder in the Vicinity' 'N/A Precipitation'
 'Rain / Windy' 'Cloudy / Windy' 'Partly Cloudy / Windy' 'Shallow Fog'
 'Smoke' 'Patches of Fog' 'Blowing Dust' 'Light Rain with Thunder'
 'Heavy T-Storm' 'Heavy Thunderstorms and Rain' 'Heavy Snow']


### Convert the time in UTC format to minute of the day, so that we can use it as a numerical parameter for our predictive algorithm

In [13]:
sd_collision_selected.loc[:, 'Minute_of_Day'] = sd_collision_selected['Start_Time'].dt.hour * \
    60 + sd_collision_selected['Start_Time'].dt.minute

sd_collision_selected = sd_collision_selected.drop(
    columns=['Start_Time'])
sd_collision_selected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sd_collision_selected.loc[:, 'Minute_of_Day'] = sd_collision_selected['Start_Time'].dt.hour * \


Unnamed: 0,Severity,Start_Lat,Start_Lng,Temperature_F,Wind_Chill_F,Humidity_pct,Pressure_in,Visibility_mi,Wind_Speed_mph,Precipitation_in,Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,is_hotspot,Minute_of_Day
0,2,33.1707,-117.2078,69.1000,63.1353,84.0000,29.9700,9.0000,6.9000,0.0056,Partly Cloudy,0,0,0,0,0,0,0,0,0,0,0,0,0,1,639
1,3,32.7660,-117.1283,69.1000,63.1353,75.0000,29.9800,9.0000,8.1000,0.0056,Scattered Clouds,0,0,0,0,0,0,0,0,0,0,0,0,0,1,657
2,3,32.9323,-117.1127,73.0000,63.1353,66.0000,29.9700,10.0000,8.1000,0.0056,Mostly Cloudy,0,0,0,0,0,0,0,0,0,0,0,0,0,1,650
3,3,33.0938,-117.0844,69.1000,63.1353,84.0000,29.9700,9.0000,6.9000,0.0056,Partly Cloudy,0,0,0,0,0,0,0,0,0,0,0,0,0,1,650
4,2,32.7029,-117.0130,75.2000,63.1353,61.0000,29.9700,10.0000,11.5000,0.0056,Clear,0,0,0,0,0,0,0,0,0,0,0,0,0,1,672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104160,2,33.0701,-117.0693,79.0000,79.0000,47.0000,28.3300,10.0000,8.0000,0.0000,Fair,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1094
104161,2,33.1811,-117.3357,68.0000,68.0000,76.0000,29.4300,8.0000,10.0000,0.0000,Fair,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1115
104162,2,32.7735,-117.1612,74.0000,74.0000,62.0000,29.3900,10.0000,7.0000,0.0000,Fair,0,0,0,0,1,0,0,0,0,0,0,0,0,1,933
104163,2,32.7293,-117.1068,75.0000,75.0000,64.0000,29.8000,10.0000,7.0000,0.0000,Fair,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1003


### Reducing the categorical features weather conditions column to logical groups

In [14]:
# Define mapping of weather conditions to groups
weather_mapping = {
    'Clear': 'Clear',
    'Fair': 'Clear',
    'Fair / Windy': 'Clear',

    'Partly Cloudy': 'Cloudy',
    'Mostly Cloudy': 'Cloudy',
    'Overcast': 'Cloudy',
    'Cloudy': 'Cloudy',
    'Scattered Clouds': 'Cloudy',
    'Cloudy / Windy': 'Cloudy',
    'Mostly Cloudy / Windy': 'Cloudy',
    'Partly Cloudy / Windy': 'Cloudy',

    'Light Rain': 'Rainy',
    'Rain': 'Rainy',
    'Heavy Rain': 'Rainy',
    'Drizzle': 'Rainy',
    'Light Drizzle': 'Rainy',
    'Rain / Windy': 'Rainy',
    'Light Rain / Windy': 'Rainy',
    'Heavy Rain / Windy': 'Rainy',

    'T-Storm': 'Thunderstorm',
    'Thunder': 'Thunderstorm',
    'Thunder in the Vicinity': 'Thunderstorm',
    'Thunderstorms and Rain': 'Thunderstorm',
    'Heavy T-Storm': 'Thunderstorm',
    'Light Thunderstorms and Rain': 'Thunderstorm',
    'Heavy Thunderstorms and Rain': 'Thunderstorm',
    'Light Rain with Thunder': 'Thunderstorm',

    'Light Snow': 'Snowy',
    'Heavy Snow': 'Snowy',

    'Mist': 'Hazy',
    'Haze': 'Hazy',
    'Fog': 'Hazy',
    'Shallow Fog': 'Hazy',
    'Patches of Fog': 'Hazy',
    'Smoke': 'Hazy',

    'Volcanic Ash': 'Dusty',
    'Blowing Dust': 'Dusty',

    'Unknown': 'Unknown',
    'N/A Precipitation': 'Unknown',
    'Hail': 'Unknown'
}

# Apply the mapping to the Weather_Condition column
sd_collision_selected['Weather_Condition'] = sd_collision_selected['Weather_Condition'].replace(
    weather_mapping)

# Verify the updated DataFrame
sd_collision_selected

Unnamed: 0,Severity,Start_Lat,Start_Lng,Temperature_F,Wind_Chill_F,Humidity_pct,Pressure_in,Visibility_mi,Wind_Speed_mph,Precipitation_in,Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,is_hotspot,Minute_of_Day
0,2,33.1707,-117.2078,69.1000,63.1353,84.0000,29.9700,9.0000,6.9000,0.0056,Cloudy,0,0,0,0,0,0,0,0,0,0,0,0,0,1,639
1,3,32.7660,-117.1283,69.1000,63.1353,75.0000,29.9800,9.0000,8.1000,0.0056,Cloudy,0,0,0,0,0,0,0,0,0,0,0,0,0,1,657
2,3,32.9323,-117.1127,73.0000,63.1353,66.0000,29.9700,10.0000,8.1000,0.0056,Cloudy,0,0,0,0,0,0,0,0,0,0,0,0,0,1,650
3,3,33.0938,-117.0844,69.1000,63.1353,84.0000,29.9700,9.0000,6.9000,0.0056,Cloudy,0,0,0,0,0,0,0,0,0,0,0,0,0,1,650
4,2,32.7029,-117.0130,75.2000,63.1353,61.0000,29.9700,10.0000,11.5000,0.0056,Clear,0,0,0,0,0,0,0,0,0,0,0,0,0,1,672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104160,2,33.0701,-117.0693,79.0000,79.0000,47.0000,28.3300,10.0000,8.0000,0.0000,Clear,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1094
104161,2,33.1811,-117.3357,68.0000,68.0000,76.0000,29.4300,8.0000,10.0000,0.0000,Clear,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1115
104162,2,32.7735,-117.1612,74.0000,74.0000,62.0000,29.3900,10.0000,7.0000,0.0000,Clear,0,0,0,0,1,0,0,0,0,0,0,0,0,1,933
104163,2,32.7293,-117.1068,75.0000,75.0000,64.0000,29.8000,10.0000,7.0000,0.0000,Clear,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1003


### Random Forest

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

### One Hot encoding to convert the categorical column to boolean features

In [16]:
# One-hot encode the categorical column
df_encoded = pd.get_dummies(sd_collision_selected, columns=[
                            'Weather_Condition'], drop_first=True)

# Separate features (X) and target variable (y)
X = df_encoded.drop(columns=['Severity'])
y = df_encoded['Severity']

### Train Test Split

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

### Create the Random Forest Model

In [18]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

### Make prediction Sets

In [19]:
# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy Score: 0.86512

Classification Report:
               precision    recall  f1-score   support

           1       0.25      0.02      0.03        60
           2       0.89      0.95      0.92     24670
           3       0.73      0.58      0.64      6313
           4       0.80      0.25      0.38       207

    accuracy                           0.87     31250
   macro avg       0.67      0.45      0.49     31250
weighted avg       0.86      0.87      0.86     31250



### Feature Importance

In [20]:
# Display feature importance
importances = rf_model.feature_importances_
feature_names = X.columns

# Create a DataFrame for better readability
feature_importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importances.head(10))  # Show top 10 features

             Feature  Importance
1          Start_Lng      0.1612
0          Start_Lat      0.1554
23     Minute_of_Day      0.1159
5        Pressure_in      0.1016
8   Precipitation_in      0.1000
3       Wind_Chill_F      0.0771
4       Humidity_pct      0.0768
2      Temperature_F      0.0707
7     Wind_Speed_mph      0.0651
6      Visibility_mi      0.0241


## Working Model predictor

In [None]:
# # Example input for a single row
# new_data = {
#     'Start_Lat': 32.7660,
#     'Start_Lng': -117.1283,
#     'Temperature_F': 75,
#     'Wind_Chill_F': 70,
#     'Humidity_pct': 50,
#     'Pressure_in': 29.92,
#     'Visibility_mi': 10,
#     'Wind_Direction': 'North',  # Will be converted during preprocessing
#     'Wind_Speed_mph': 5,
#     'Precipitation_in': 0.0056,
#     'Amenity': 0,
#     'Bump': 0,
#     'Crossing': 1,
#     'Give_Way': 0,
#     'Junction': 0,
#     'No_Exit': 0,
#     'Railway': 0,
#     'Roundabout': 0,
#     'Station': 0,
#     'Stop': 0,
#     'Traffic_Calming': 0,
#     'Traffic_Signal': 1,
#     'Turning_Loop': 0,
#     'is_hotspot': 1,
#     'Minute_of_Day': 657,
#     'Weather_Condition_Overcast': 1,  # Example one-hot encoded column
#     'Weather_Condition_Clear': 0,
#     # Add other one-hot encoded weather conditions as needed
# }

In [22]:
# Example data as a dictionary
new_data = {
    'Start_Lat': 32.7660,
    'Start_Lng': -117.1283,
    'Temperature_F': 69.1,
    'Wind_Chill_F': 63.1353,
    'Humidity_pct': 75.0,
    'Pressure_in': 29.98,
    'Visibility_mi': 9.0,
    'Wind_Speed_mph': 8.1,
    'Precipitation_in': 0.0056,
    'Amenity': 0,
    'Bump': 0,
    'Crossing': 0,
    'Give_Way': 0,
    'Junction': 0,
    'No_Exit': 0,
    'Railway': 0,
    'Roundabout': 0,
    'Station': 0,
    'Stop': 0,
    'Traffic_Calming': 0,
    'Traffic_Signal': 0,
    'Turning_Loop': 0,
    'is_hotspot': 1,
    'Minute_of_Day': 657,
    'Weather_Condition_Cloudy': 1,  # Add one-hot encoding for 'Cloudy'
    'Weather_Condition_Clear': 0,
    'Weather_Condition_Rain': 0,
    # Add other one-hot encoded weather conditions if applicable
}

In [23]:
# Convert to DataFrame
new_data_df = pd.DataFrame([new_data])

In [30]:
X.columns

Index(['Start_Lat', 'Start_Lng', 'Temperature_F', 'Wind_Chill_F',
       'Humidity_pct', 'Pressure_in', 'Visibility_mi', 'Wind_Speed_mph',
       'Precipitation_in', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'is_hotspot',
       'Minute_of_Day', 'Weather_Condition_Cloudy', 'Weather_Condition_Dusty',
       'Weather_Condition_Hazy', 'Weather_Condition_Rainy',
       'Weather_Condition_Snowy', 'Weather_Condition_Thunderstorm',
       'Weather_Condition_Unknown'],
      dtype='object')

### Fill unknown column values with 0, in case it is not provided for prediction

In [24]:
# Ensure the new data has the same columns as the training data
for col in X.columns:  # X is the training feature set
    if col not in new_data_df:
        new_data_df[col] = 0  # Add missing columns with default value 0
new_data_df = new_data_df[X.columns]  # Align column order

In [25]:
# Predict severity using the Random Forest model
predicted_severity = rf_model.predict(new_data_df)

# Print the prediction
print("Predicted Severity:", predicted_severity[0])  # Predicted class

# Predict probabilities
probabilities = rf_model.predict_proba(new_data_df)

# Print the probabilities
# [prob_class_0, prob_class_1, ...]
print("Prediction Probabilities:", probabilities[0])

Predicted Severity: 3
Prediction Probabilities: [0.   0.11 0.89 0.  ]


### Save the model using Pickel for frontend

In [26]:
# Step 2: Save the trained model using pickle
with open('collision_rf_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)