In [1]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

In [3]:
# Load the collisions dataset from the mounted Drive folder into a DataFrame.
data = pd.read_csv("/content/drive/My Drive/CS439 DS project/collisions_data.csv")

In [4]:
# Let's take a sneak peek at the data.
# Drop the stray 'Unnamed: 0' column. Rebinding `data` with errors='ignore'
# (instead of an in-place drop) keeps this cell safe to re-run: a second
# execution no longer raises KeyError once the column is already gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')
data.head()

Unnamed: 0,CRASH DATE,BOROUGH,LATITUDE,LONGITUDE,CRASH FACTOR,VEHICLE TYPE,VEHICLES INVOLVED,DEGREE OF EMERGENCY,CRASH HOUR
0,09/11/2021,,,,aggressive driving/road rage,sedan,2,1,2
1,03/26/2022,,,,pavement slippery,sedan,1,1,11
2,11/01/2023,BROOKLYN,40.62179,-73.970024,unspecified,moped,3,1,1
3,06/29/2022,,,,following too closely,sedan,2,0,6
4,09/21/2022,,,,passing too closely,st wagon/suv,1,0,13


In [5]:
# Let's drop unnecessary columns (date and coordinates are not used as features).
# A non-mutating drop with errors='ignore' makes the cell idempotent, so
# re-running it after the columns are gone does not raise KeyError.
data = data.drop(columns=['CRASH DATE', 'LATITUDE', 'LONGITUDE'], errors='ignore')
data.head()

Unnamed: 0,BOROUGH,CRASH FACTOR,VEHICLE TYPE,VEHICLES INVOLVED,DEGREE OF EMERGENCY,CRASH HOUR
0,,aggressive driving/road rage,sedan,2,1,2
1,,pavement slippery,sedan,1,1,11
2,BROOKLYN,unspecified,moped,3,1,1
3,,following too closely,sedan,2,0,6
4,,passing too closely,st wagon/suv,1,0,13


In [6]:
# Feature types

# Categorical columns; these are one-hot encoded later in the notebook.
nominal_cols = ["BOROUGH", "CRASH FACTOR", "VEHICLE TYPE"]
# Numeric columns; these are standardized with StandardScaler later in the notebook.
ordinal_cols = ["VEHICLES INVOLVED", "CRASH HOUR"]

In [7]:
# missing value handling
# (categoricals: "MISSING")

# Replace NaN in the nominal columns with an explicit "MISSING" category so it
# is encoded as its own level. The numeric columns are not filled here.
data[nominal_cols] = data[nominal_cols].fillna("MISSING")

In [8]:
# Separate the predictors (every remaining column) from the target label.
X = data.drop(columns = ["DEGREE OF EMERGENCY"])
y = data["DEGREE OF EMERGENCY"]

In [9]:
# Stratified Train–Test Split
# 75/25 split. stratify=y keeps the class proportions of the target the same
# in both partitions; random_state fixes the shuffle so the split is repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

## Performing One Hot Encoding

In [10]:
# Perform one hot encoding

# handle_unknown="ignore": a category not seen during fit is encoded as all
# zeros instead of raising. sparse_output=False returns a dense NumPy array.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Fit on the training split only, then reuse the fitted encoder on the test split.
X_train_cat = ohe.fit_transform(X_train[nominal_cols])
X_test_cat = ohe.transform(X_test[nominal_cols])

In [11]:
# Perform scaling of numerical feature
scaler = StandardScaler()

# Scaling statistics are estimated on the training split only and then applied
# to the test split, so no test-set statistics leak into training.
X_train_num_scaled = scaler.fit_transform(X_train[ordinal_cols])
X_test_num_scaled = scaler.transform(X_test[ordinal_cols])

## Performing Scaling of numerical features

In [12]:
# Stacking together the final sets

# Column layout of the prepared matrices: the one-hot columns come first,
# followed by the scaled numeric columns.
X_train_prepared = np.hstack([X_train_cat, X_train_num_scaled])
X_test_prepared = np.hstack([X_test_cat, X_test_num_scaled])

# Function to train Logistic Regression model

In [None]:
# Implementing Logistic Regression

def train_logistic_regression(X_train, y_train, C=1.0):
    """Fit a class-balanced logistic regression and return the fitted estimator.

    Args:
        X_train: Prepared training feature matrix.
        y_train: Training labels aligned with the rows of ``X_train``.
        C: Value forwarded to ``LogisticRegression(C=...)``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    print("Training Logistic Regression...")

    # Everything except C is fixed for every run in this notebook.
    hyperparams = dict(
        C=C,
        max_iter=1000,
        solver="lbfgs",
        class_weight="balanced",
        n_jobs=-1,
    )
    # fit() returns the estimator itself, so the chained call yields the fitted model.
    classifier = LogisticRegression(**hyperparams).fit(X_train, y_train)

    print("Logistic Regression trained")
    return classifier

# Function to evaluate models

In [13]:
# Function that will automatically evaluate the model against different evaluation metrics

def evaluate(model, X_test, y_test, model_name="Model"):
    """Print accuracy, weighted F1/precision/recall and a per-class report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to score.
        y_test: True labels aligned with the rows of ``X_test``.
        model_name: Label used in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    predictions = model.predict(X_test)

    print(f"\n===== {model_name} Evaluation =====")
    print("Accuracy:", accuracy_score(y_test, predictions))

    # The three support-weighted scores share the same call shape.
    weighted_metrics = (
        ("Weighted F1:", f1_score),
        ("Weighted Precision:", precision_score),
        ("Weighted Recall:", recall_score),
    )
    for label, metric in weighted_metrics:
        print(label, metric(y_test, predictions, average="weighted"))

    print("\nClassification Report:\n", classification_report(y_test, predictions))

# Training and evaluating Logistic Regression models with different 'C' values

In [None]:
# Baseline logistic regression using the helper's default C=1.0.
logistic_regression_model1 = train_logistic_regression(X_train_prepared, y_train)

Training Logistic Regression...
Logistic Regression trained


In [None]:
# Test-set metrics for logistic_regression_model1 (C=1.0).
evaluate(logistic_regression_model1, X_test_prepared, y_test, "Logistic Regression")


===== Logistic Regression Evaluation =====
Accuracy: 0.541281097104962
Weighted F1: 0.6062642311590095
Weighted Precision: 0.7015629871924708
Weighted Recall: 0.541281097104962

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.60      0.69    419834
           1       0.31      0.37      0.34    134238
           2       0.01      0.62      0.01       843

    accuracy                           0.54    554915
   macro avg       0.38      0.53      0.35    554915
weighted avg       0.70      0.54      0.61    554915



In [None]:
# C is scikit-learn's inverse regularization strength, so C=0.1 regularizes
# more strongly than the default 1.0.
logistic_regression_model2 = train_logistic_regression(X_train_prepared, y_train, C=0.1)

Training Logistic Regression...
Logistic Regression trained


In [None]:
# Test-set metrics for logistic_regression_model2 (C=0.1).
evaluate(logistic_regression_model2, X_test_prepared, y_test, "Logistic Regression")


===== Logistic Regression Evaluation =====
Accuracy: 0.5422217817143166
Weighted F1: 0.6069109844476311
Weighted Precision: 0.7015659253090994
Weighted Recall: 0.5422217817143166

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.60      0.69    419834
           1       0.31      0.37      0.34    134238
           2       0.01      0.61      0.01       843

    accuracy                           0.54    554915
   macro avg       0.38      0.53      0.35    554915
weighted avg       0.70      0.54      0.61    554915



In [None]:
# Weaker regularization than the default (C=5).
logistic_regression_model3 = train_logistic_regression(X_train_prepared, y_train, C=5)

Training Logistic Regression...
Logistic Regression trained


In [None]:
# Test-set metrics for logistic_regression_model3 (C=5).
evaluate(logistic_regression_model3, X_test_prepared, y_test, "Logistic Regression")


===== Logistic Regression Evaluation =====
Accuracy: 0.542248812881252
Weighted F1: 0.6069904733157212
Weighted Precision: 0.701641159951496
Weighted Recall: 0.542248812881252

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.60      0.69    419834
           1       0.31      0.37      0.34    134238
           2       0.01      0.62      0.01       843

    accuracy                           0.54    554915
   macro avg       0.38      0.53      0.35    554915
weighted avg       0.70      0.54      0.61    554915



In [None]:
# Weakest regularization tried in this notebook (C=10).
logistic_regression_model4 = train_logistic_regression(X_train_prepared, y_train, C=10)

Training Logistic Regression...
Logistic Regression trained


In [None]:
# Test-set metrics for logistic_regression_model4 (C=10).
evaluate(logistic_regression_model4, X_test_prepared, y_test, "Logistic Regression")


===== Logistic Regression Evaluation =====
Accuracy: 0.5420055323788328
Weighted F1: 0.6067853583158686
Weighted Precision: 0.7016068880664698
Weighted Recall: 0.5420055323788328

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.60      0.69    419834
           1       0.31      0.37      0.34    134238
           2       0.01      0.62      0.01       843

    accuracy                           0.54    554915
   macro avg       0.38      0.53      0.35    554915
weighted avg       0.70      0.54      0.61    554915



In [None]:
# Strongest regularization tried in this notebook (C=0.01).
logistic_regression_model5 = train_logistic_regression(X_train_prepared, y_train, C=0.01)

Training Logistic Regression...
Logistic Regression trained


In [None]:
# Test-set metrics for logistic_regression_model5 (C=0.01).
evaluate(logistic_regression_model5, X_test_prepared, y_test, "Logistic Regression")


===== Logistic Regression Evaluation =====
Accuracy: 0.5413027220385104
Weighted F1: 0.6062279050150159
Weighted Precision: 0.701545886497156
Weighted Recall: 0.5413027220385104

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.60      0.69    419834
           1       0.31      0.37      0.34    134238
           2       0.01      0.61      0.01       843

    accuracy                           0.54    554915
   macro avg       0.38      0.53      0.35    554915
weighted avg       0.70      0.54      0.61    554915



# <font color='red'> No logistic regression model stands out: all five C values give a weighted F1 score of about 0.61, which is acceptable but not good.</font>

# Function to train Random Forest model

In [14]:
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=None, min_samples_leaf=2, n_jobs=-1):
    """Fit a class-balanced random forest and return the fitted estimator.

    Args:
        X_train: Prepared training feature matrix.
        y_train: Training labels aligned with the rows of ``X_train``.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree (``None`` leaves it unlimited).
        min_samples_leaf: Minimum number of samples required at a leaf.
        n_jobs: Number of parallel workers used for fitting and predicting.
            Defaults to -1 (all cores), matching ``train_logistic_regression``.
            Pass ``None`` or ``1`` to train on a single core as before.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    print("Training Random Forest...")
    rf = RandomForestClassifier(
        random_state=42,  # fixed seed so repeated runs build the same forest
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        class_weight="balanced",
        # Trees are independent, so building them in parallel only changes how
        # the work is scheduled, not the hyperparameters of the model.
        n_jobs=n_jobs,
    )
    rf.fit(X_train, y_train)
    print("Random Forest Trained")
    return rf

# Training and evaluating different Random Forest models with varying <font color='red'> n_estimators, max_depth, and min_samples_leaf</font>

In [None]:
# Baseline forest using the helper's defaults
# (n_estimators=100, max_depth=None, min_samples_leaf=2).
rf_model1 = train_random_forest(X_train_prepared, y_train)

Training Random Forest...
Random Forest Trained


In [None]:
# Test-set metrics for rf_model1 (n_estimators=100, max_depth=None, min_samples_leaf=2).
evaluate(rf_model1, X_test_prepared, y_test, "Random Forest")


===== Random Forest Evaluation =====
Accuracy: 0.6589711937864358
Weighted F1: 0.6925580690096268
Weighted Precision: 0.7392973929333471
Weighted Recall: 0.6589711937864358

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.71      0.77    419834
           1       0.41      0.52      0.46    134238
           2       0.01      0.23      0.01       843

    accuracy                           0.66    554915
   macro avg       0.42      0.48      0.41    554915
weighted avg       0.74      0.66      0.69    554915



In [None]:
# More trees, with depth capped at 20 and larger leaves than the baseline.
rf_model2 = train_random_forest(X_train_prepared, y_train, n_estimators=200, max_depth=20, min_samples_leaf=5)

Training Random Forest...
Random Forest Trained


In [None]:
# Test-set metrics for rf_model2 (n_estimators=200, max_depth=20, min_samples_leaf=5).
evaluate(rf_model2, X_test_prepared, y_test, "Random Forest")


===== Random Forest Evaluation =====
Accuracy: 0.663636773199499
Weighted F1: 0.6989386532155598
Weighted Precision: 0.7415086533201101
Weighted Recall: 0.663636773199499

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.73      0.78    419834
           1       0.42      0.45      0.44    134238
           2       0.01      0.46      0.02       843

    accuracy                           0.66    554915
   macro avg       0.42      0.55      0.41    554915
weighted avg       0.74      0.66      0.70    554915



In [None]:
# Same as rf_model2 but with 300 trees, to see whether more estimators help.
rf_model3 = train_random_forest(X_train_prepared, y_train, n_estimators=300, max_depth=20, min_samples_leaf=5)

Training Random Forest...
Random Forest Trained


In [None]:
# Test-set metrics for rf_model3 (n_estimators=300, max_depth=20, min_samples_leaf=5).
evaluate(rf_model3, X_test_prepared, y_test, "Random Forest")


===== Random Forest Evaluation =====
Accuracy: 0.6645324058639611
Weighted F1: 0.6995235683043155
Weighted Precision: 0.7415069016209286
Weighted Recall: 0.6645324058639611

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.73      0.79    419834
           1       0.42      0.45      0.44    134238
           2       0.01      0.46      0.02       843

    accuracy                           0.66    554915
   macro avg       0.42      0.55      0.41    554915
weighted avg       0.74      0.66      0.70    554915



In [15]:
# Shallower trees (max_depth=10) with 300 estimators. This is the model used
# for the threshold analysis, the feature-importance step and the pickle export below.
rf_model4 = train_random_forest(X_train_prepared, y_train, n_estimators=300, max_depth=10, min_samples_leaf=5)

Training Random Forest...
Random Forest Trained


In [None]:
# Test-set metrics for rf_model4 (n_estimators=300, max_depth=10, min_samples_leaf=5).
evaluate(rf_model4, X_test_prepared, y_test, "Random Forest")


===== Random Forest Evaluation =====
Accuracy: 0.6355640052981087
Weighted F1: 0.6748248532619723
Weighted Precision: 0.729768986819146
Weighted Recall: 0.6355640052981087

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.80    419834
           1       0.41      0.23      0.30    134238
           2       0.01      0.71      0.01       843

    accuracy                           0.64    554915
   macro avg       0.42      0.57      0.37    554915
weighted avg       0.73      0.64      0.67    554915



In [None]:
# Deeper trees (max_depth=30) with the same tree count and leaf size as rf_model4.
rf_model5 = train_random_forest(X_train_prepared, y_train, n_estimators=300, max_depth=30, min_samples_leaf=5)

Training Random Forest...
Random Forest Trained


In [None]:
# Test-set metrics for rf_model5 (n_estimators=300, max_depth=30, min_samples_leaf=5).
evaluate(rf_model5, X_test_prepared, y_test, "Random Forest")


===== Random Forest Evaluation =====
Accuracy: 0.6618274870926177
Weighted F1: 0.6954320229827232
Weighted Precision: 0.7420649997104561
Weighted Recall: 0.6618274870926177

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.71      0.77    419834
           1       0.41      0.52      0.46    134238
           2       0.01      0.31      0.01       843

    accuracy                           0.66    554915
   macro avg       0.42      0.51      0.41    554915
weighted avg       0.74      0.66      0.70    554915



In [None]:
# Shallowest trees tried in this notebook (max_depth=5).
rf_model6 = train_random_forest(X_train_prepared, y_train, n_estimators=300, max_depth=5, min_samples_leaf=5)

Training Random Forest...
Random Forest Trained


In [None]:
# Test-set metrics for rf_model6 (n_estimators=300, max_depth=5, min_samples_leaf=5).
evaluate(rf_model6, X_test_prepared, y_test, "Random Forest")


===== Random Forest Evaluation =====
Accuracy: 0.6155447230656947
Weighted F1: 0.6494743094697317
Weighted Precision: 0.716608157144928
Weighted Recall: 0.6155447230656947

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.77      0.80    419834
           1       0.39      0.13      0.20    134238
           2       0.01      0.75      0.01       843

    accuracy                           0.62    554915
   macro avg       0.41      0.55      0.33    554915
weighted avg       0.72      0.62      0.65    554915



In [None]:
# Same as rf_model4 but with smaller leaves (min_samples_leaf=2).
rf_model7 = train_random_forest(X_train_prepared, y_train, n_estimators=300, max_depth=10, min_samples_leaf=2)

Training Random Forest...
Random Forest Trained


In [None]:
# Test-set metrics for rf_model7 (n_estimators=300, max_depth=10, min_samples_leaf=2).
evaluate(rf_model7, X_test_prepared, y_test, "Random Forest")


===== Random Forest Evaluation =====
Accuracy: 0.626090482326122
Weighted F1: 0.6703351837561984
Weighted Precision: 0.7271751817656066
Weighted Recall: 0.626090482326122

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.75      0.79    419834
           1       0.39      0.25      0.31    134238
           2       0.01      0.71      0.01       843

    accuracy                           0.63    554915
   macro avg       0.41      0.57      0.37    554915
weighted avg       0.73      0.63      0.67    554915



# Random Forest model with <font color='red'> n_estimators=300, max_depth=10, and min_samples_leaf=5</font> stands out.
* We will try to:
> * Play around with threshold probability to improve recall and precision of label=2
> * Try to train model on top 20 features based on feature importance and evaluate

# <font color='red'> Trying different threshold probabilities to find one that gives improved recall and precision for the RF model with the above-specified hyperparameters</font>

In [None]:
# Per-class probabilities for the test set from rf_model4; the columns follow
# the order of rf_model4.classes_.
proba = rf_model4.predict_proba(X_test_prepared)

In [None]:
# Class labels in the column order used by predict_proba.
classes = rf_model4.classes_
classes

array([0, 1, 2])

In [None]:
def evaluate_thresholds_for_class2(
    proba,
    y_true,
    thresholds=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
    model_name="Random Forest",
    classes=None
):
    """Print metrics for several probability cut-offs that force a class-2 prediction.

    The argmax prediction is used as the baseline. For each threshold, every
    row whose class-2 probability is at or above the threshold is relabelled
    as 2, and weighted metrics, class-2 metrics and a full classification
    report are printed.

    Args:
        proba: 2-D array of class probabilities, one column per class.
        y_true: True labels aligned with the rows of ``proba``.
        thresholds: Iterable of class-2 probability cut-offs to try.
        model_name: Label used in the baseline header.
        classes: Class labels in the column order of ``proba`` (normally the
            ``classes_`` attribute of the model that produced ``proba``).
            When omitted, ``rf_model4.classes_`` from the notebook namespace
            is used, which is what this function originally hard-coded.

    Returns:
        None. All results are written to stdout.
    """
    if classes is None:
        # Backward-compatible fallback to the notebook-level model.
        classes = rf_model4.classes_
    classes = np.asarray(classes)

    # find index of class 2 in proba columns
    class2_idx = np.where(classes == 2)[0][0]

    # baseline argmax predictions
    y_pred_argmax = classes[np.argmax(proba, axis=1)]

    print(f"Baseline ({model_name}) with argmax decision rule:")
    print(classification_report(y_true, y_pred_argmax, zero_division=0))
    print("=" * 70)

    # probability of class 2 (does not depend on the threshold, so computed once)
    p_class2 = proba[:, class2_idx]

    for thresh in thresholds:
        # start from argmax predictions
        y_pred_custom = y_pred_argmax.copy()

        # override: if P(class 2) >= threshold, set label to 2
        y_pred_custom[p_class2 >= thresh] = 2

        # metrics
        prec_weighted = precision_score(y_true, y_pred_custom, average="weighted", zero_division=0)
        rec_weighted = recall_score(y_true, y_pred_custom, average="weighted", zero_division=0)
        f1_weighted = f1_score(y_true, y_pred_custom, average="weighted", zero_division=0)

        # metrics for class 2 specifically (macro average over the single label 2)
        prec_2 = precision_score(y_true, y_pred_custom, labels=[2], average="macro", zero_division=0)
        rec_2 = recall_score(y_true, y_pred_custom, labels=[2], average="macro", zero_division=0)
        f1_2 = f1_score(y_true, y_pred_custom, labels=[2], average="macro", zero_division=0)

        print(f"\n===== Threshold for class 2: {thresh:.2f} =====")
        print(f"Weighted  - Precision: {prec_weighted:.4f}, Recall: {rec_weighted:.4f}, F1: {f1_weighted:.4f}")
        print(f"Class 2   - Precision: {prec_2:.4f}, Recall: {rec_2:.4f}, F1: {f1_2:.4f}")
        # Full per-class report for this threshold.
        print("\nClassification Report:\n", classification_report(y_true, y_pred_custom, zero_division=0))


In [None]:
# Sweep the class-2 override threshold from 0.10 to 0.90 using the test-set
# probabilities computed from rf_model4.
thresholds_to_test = [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90]

evaluate_thresholds_for_class2(
    proba=proba,
    y_true=y_test,
    thresholds=thresholds_to_test,
    model_name="Random Forest (final)"
)

Baseline (Random Forest (final)) with argmax decision rule:
              precision    recall  f1-score   support

           0       0.83      0.76      0.80    419834
           1       0.41      0.23      0.30    134238
           2       0.01      0.71      0.01       843

    accuracy                           0.64    554915
   macro avg       0.42      0.57      0.37    554915
weighted avg       0.73      0.64      0.67    554915


===== Threshold for class 2: 0.10 =====
Weighted  - Precision: 0.6642, Recall: 0.0148, F1: 0.0261
Class 2   - Precision: 0.0015, Recall: 1.0000, F1: 0.0031

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.02      0.03    419834
           1       0.25      0.00      0.00    134238
           2       0.00      1.00      0.00       843

    accuracy                           0.01    554915
   macro avg       0.35      0.34      0.01    554915
weighted avg       0.66      0.01      0.03    5549

# Baseline RF model with the above-specified hyperparameters still stands out

In [None]:
# Hard test-set predictions from rf_model4, used below to inspect how the
# predicted labels are distributed.
predictions_y = rf_model4.predict(X_test_prepared)

In [None]:
# Get unique values and their counts
# (distribution of the labels predicted by rf_model4 on the test set)
unique_values, counts_values = np.unique(predictions_y, return_counts=True)

# Print the results
print("Original Array:", predictions_y)
print("Unique Values:", unique_values)
print("Counts:", counts_values)

Original Array: [0 0 2 ... 0 2 0]
Unique Values: [0 1 2]
Counts: [385676  75012  94227]


In [None]:
# Get unique values and their counts
# (distribution of the true test labels, for comparison with the predicted
# distribution; note that unique_values/counts_values are overwritten here)
unique_values, counts_values = np.unique(y_test, return_counts=True)

# Print the results
print("Original Array:", y_test)
print("Unique Values:", unique_values)
print("Counts:", counts_values)

Original Array: 621240     0
465481     1
553134     1
635215     0
1424873    0
          ..
647566     0
626824     0
2085248    0
449280     0
1199584    0
Name: DEGREE OF EMERGENCY, Length: 554915, dtype: int64
Unique Values: [0 1 2]
Counts: [419834 134238    843]


**This model stands out because it predicts label 1 and label 2 emergencies better when they are considered together, and it draws a clear distinction between those emergencies (labels 1 and 2) and label 0 cases**, letting us focus on actual emergencies rather than ordinary fender benders with no injuries, casualties, or fatalities.

# <font color='red'> Training an RF model on the top 20 features by feature importance, and trying different threshold probabilities for it</font>

In [None]:
# Importance scores from rf_model4, one per column of the prepared feature matrix.
importances = rf_model4.feature_importances_

In [None]:
# np.argsort sorts ascending, so the last 20 positions are the column indices
# of the 20 largest importances (ordered from least to most important).
# The indices refer to columns of X_train_prepared / X_test_prepared.
top_20_features = np.argsort(importances)[-20:]
top_20_features

array([76, 11,  9, 58, 42, 27,  2, 69, 15, 21,  3, 39, 71, 24, 73, 54, 57,
       64, 81, 80])

In [None]:
# Retrain with the same hyperparameters as rf_model4, restricted to the 20 selected columns.
rf_model_top_features = train_random_forest(X_train_prepared[:, top_20_features], y_train, n_estimators=300, max_depth=10, min_samples_leaf=5)

Training Random Forest...
Random Forest Trained


In [None]:
# Test-set metrics for the top-20-feature forest; the test matrix is sliced
# with the same column indices the model was trained on.
evaluate(rf_model_top_features, X_test_prepared[:, top_20_features], y_test, "Random Forest")


===== Random Forest Evaluation =====
Accuracy: 0.646354847138751
Weighted F1: 0.6830609482149096
Weighted Precision: 0.7322245322347111
Weighted Recall: 0.646354847138751

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.77      0.80    419834
           1       0.42      0.26      0.32    134238
           2       0.01      0.66      0.01       843

    accuracy                           0.65    554915
   macro avg       0.42      0.56      0.38    554915
weighted avg       0.73      0.65      0.68    554915



In [None]:
# Test-set class probabilities from the top-20-feature forest, used for the threshold sweep below.
rf_top_features_proba = rf_model_top_features.predict_proba(X_test_prepared[:, top_20_features])

In [None]:
def evaluate_thresholds_for_class2(
    proba,
    y_true,
    thresholds=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
    model_name="Random Forest",
    classes=None
):
    """Print metrics for several probability cut-offs that force a class-2 prediction.

    The argmax prediction is used as the baseline. For each threshold, every
    row whose class-2 probability is at or above the threshold is relabelled
    as 2, and weighted metrics, class-2 metrics and a full classification
    report are printed.

    NOTE: a function with this name is also defined earlier in the notebook;
    this definition replaces it from this cell onward. Passing ``classes``
    explicitly avoids depending on which definition is active.

    Args:
        proba: 2-D array of class probabilities, one column per class.
        y_true: True labels aligned with the rows of ``proba``.
        thresholds: Iterable of class-2 probability cut-offs to try.
        model_name: Label used in the baseline header.
        classes: Class labels in the column order of ``proba`` (normally the
            ``classes_`` attribute of the model that produced ``proba``).
            When omitted, ``rf_model_top_features.classes_`` from the notebook
            namespace is used, which is what this definition originally
            hard-coded.

    Returns:
        None. All results are written to stdout.
    """
    if classes is None:
        # Backward-compatible fallback to the notebook-level model.
        classes = rf_model_top_features.classes_
    classes = np.asarray(classes)

    # find index of class 2 in proba columns
    class2_idx = np.where(classes == 2)[0][0]

    # baseline argmax predictions
    y_pred_argmax = classes[np.argmax(proba, axis=1)]

    print(f"Baseline ({model_name}) with argmax decision rule:")
    print(classification_report(y_true, y_pred_argmax, zero_division=0))
    print("=" * 70)

    # probability of class 2 (does not depend on the threshold, so computed once)
    p_class2 = proba[:, class2_idx]

    for thresh in thresholds:
        # start from argmax predictions
        y_pred_custom = y_pred_argmax.copy()

        # override: if P(class 2) >= threshold, set label to 2
        y_pred_custom[p_class2 >= thresh] = 2

        # metrics
        prec_weighted = precision_score(y_true, y_pred_custom, average="weighted", zero_division=0)
        rec_weighted = recall_score(y_true, y_pred_custom, average="weighted", zero_division=0)
        f1_weighted = f1_score(y_true, y_pred_custom, average="weighted", zero_division=0)

        # metrics for class 2 specifically (macro average over the single label 2)
        prec_2 = precision_score(y_true, y_pred_custom, labels=[2], average="macro", zero_division=0)
        rec_2 = recall_score(y_true, y_pred_custom, labels=[2], average="macro", zero_division=0)
        f1_2 = f1_score(y_true, y_pred_custom, labels=[2], average="macro", zero_division=0)

        print(f"\n===== Threshold for class 2: {thresh:.2f} =====")
        print(f"Weighted  - Precision: {prec_weighted:.4f}, Recall: {rec_weighted:.4f}, F1: {f1_weighted:.4f}")
        print(f"Class 2   - Precision: {prec_2:.4f}, Recall: {rec_2:.4f}, F1: {f1_2:.4f}")
        # Full per-class report for this threshold.
        print("\nClassification Report:\n", classification_report(y_true, y_pred_custom, zero_division=0))


In [None]:
# Sweep the class-2 override threshold from 0.10 to 0.90 using the test-set
# probabilities from the top-20-feature forest.
thresholds_to_test = [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90]

evaluate_thresholds_for_class2(
    proba=rf_top_features_proba,
    y_true=y_test,
    thresholds=thresholds_to_test,
    model_name="Random Forest (final)"
)

Baseline (Random Forest (final)) with argmax decision rule:
              precision    recall  f1-score   support

           0       0.83      0.77      0.80    419834
           1       0.42      0.26      0.32    134238
           2       0.01      0.66      0.01       843

    accuracy                           0.65    554915
   macro avg       0.42      0.56      0.38    554915
weighted avg       0.73      0.65      0.68    554915


===== Threshold for class 2: 0.10 =====
Weighted  - Precision: 0.7285, Recall: 0.1614, F1: 0.2575
Class 2   - Precision: 0.0019, Recall: 0.9941, F1: 0.0037

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.21      0.34    419834
           1       0.35      0.01      0.02    134238
           2       0.00      0.99      0.00       843

    accuracy                           0.16    554915
   macro avg       0.40      0.40      0.12    554915
weighted avg       0.73      0.16      0.26    5549

# <font color='blue'> Conclusion:</font>

* <font color='red'> RF model with following hyperparameter values stands out:</font>
> * <font color='red'> n_estimators = 300</font>
> * <font color='red'> max_depth = 10</font>
> * <font color='red'> min_samples_leaf = 5</font>
* <font color='red'> The above-mentioned model is less complex than the other RF models, with a training time of 10 minutes on Google Colab Pro.</font>
* <font color='red'> Key findings for the abovementioned RF model:</font>
> * <font color='red'> Label 0</font>
>> * <font color='red'> Precision = 0.83</font>
>> * <font color='red'> Recall = 0.76</font>
>> * <font color='red'> This means it captures most of the non-emergency cases, which tells us where not to invest additional resources, because non-emergency cases make up the majority of crashes (419834).</font>
> * <font color='red'> Label 1</font>
>> * <font color='red'> Precision = 0.41</font>
>> * <font color='red'> Recall = 0.23</font>
> * <font color='red'> Label 2</font>
>> * <font color='red'> Precision = 0.01</font>
>> * <font color='red'> Recall = 0.71</font>
> * <font color='red'> This means it correctly flags a decent share of medium-emergency crashes (which still require attention from the NYPD and ambulances), while others get flagged as high-emergency cases.</font>
> * <font color='red'> The high recall for label 2 shows that the model captures 71% of the true label 2 cases; on closer analysis, it also flags a large number of label 1 emergencies as label 2. This is desired behavior because people with injuries sometimes also require immediate attention, depending on the degree of injury.</font>

# **Saving model with Pickle for future use**

In [16]:
import pickle

In [17]:
# Persist the selected forest. It stays a bare estimator in the same file so
# anything that already loads 'finalized_model.pkl' keeps working.
filename = 'finalized_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(rf_model4, file)

# rf_model4 was trained on one-hot-encoded and standardized features, so the
# fitted encoder and scaler (plus the column lists they were fitted on) are
# needed to score raw rows later. They go into a separate file so the model
# file format above is unchanged.
preprocessing_filename = 'finalized_preprocessing.pkl'
with open(preprocessing_filename, 'wb') as file:
    pickle.dump(
        {
            "ohe": ohe,
            "scaler": scaler,
            "nominal_cols": nominal_cols,
            "ordinal_cols": ordinal_cols,
        },
        file,
    )