In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [14]:
# Import and read the charity_data.csv.

import pandas as pd 
motor_df = pd.read_csv('Resources/Motor_Vehicle_Crashes_-_Vehicle_Information__Three_Year_Window.csv')
motor_df.head()

Unnamed: 0,Year,Case Vehicle ID,Vehicle Body Type,Registration Class,Action Prior to Accident,Type / Axles of Truck or Bus,Direction of Travel,Fuel Type,Vehicle Year,State of Registration,Number of Occupants,Engine Cylinders,Vehicle Make,Contributing Factor 1,Contributing Factor 1 Description,Contributing Factor 2,Contributing Factor 2 Description,Event Type,Partial VIN
0,2018,15232120,VAN TRUCK,AGRICULTURAL TRUCK,Parked,Not Entered,North,Gas,2015.0,NY,0.0,4.0,NISSA,HUMAN,Not Entered,HUMAN,Not Entered,Not Entered,3N6CM0KN2FK713460
1,2018,15232535,SUBURBAN,PASSENGER OR SUBURBAN,Making U Turn,Not Entered,South,Gas,2015.0,NY,1.0,6.0,HONDA,HUMAN,Not Entered,HUMAN,Unknown,Not Entered,5J6TF2H55FL001556
2,2018,15232536,SEDAN,Not Entered,Going Straight Ahead,Not Entered,North,Not Entered,,PA,1.0,,,HUMAN,Not Entered,HUMAN,Unknown,Not Entered,
3,2018,15232557,4 DOOR SEDAN,PASSENGER OR SUBURBAN,Parked,Not Entered,South,Gas,2013.0,NY,5.0,4.0,NISSA,HUMAN,Passenger Distraction,VEHICLE,Other*,Not Entered,1N4AL3AP8DC229156
4,2018,15232556,SUBURBAN,PASSENGER OR SUBURBAN,Going Straight Ahead,Not Entered,South,Gas,2009.0,NY,2.0,8.0,CADIL,HUMAN,Not Applicable,HUMAN,Not Applicable,Not Entered,3GYFK22209G213337


Cleaning the Data for processing


In [15]:
# Drop all NA from rows

motor_df = motor_df.dropna()

In [16]:
# Get Names for indexs for which Contributing Factor  1 Description 1 is Not Entered
# Drop these rows

not_entered = motor_df[motor_df['Contributing Factor 1 Description'] == 'Not Entered'].index
motor_df.drop(not_entered, inplace = True)

In [17]:
# Get Names for indexs for which Contributing Factor  1 Description 1 is Not applicable
# Drop these rows

not_app = motor_df[motor_df['Contributing Factor 1 Description'] == 'Not Applicable'].index
motor_df.drop(not_app, inplace = True)

In [18]:
# Get Names for index for which Contributing Factor 1 Description is Unknown
# Drop these rows

unknown = motor_df[motor_df['Contributing Factor 1 Description'] == 'Unknown'].index
motor_df.drop(unknown, inplace = True)

In [19]:
# Get unknown values in direction of travel
# Drop these rows

unknown2 = motor_df[motor_df['Direction of Travel'] == 'Unknown'].index
motor_df.drop(unknown2, inplace = True)

In [20]:
# Get Not Applicable in Direction of Travel and drop these rows

not_app = motor_df[motor_df['Direction of Travel'] == 'Not Applicable'].index
motor_df.drop(not_app, inplace = True)

In [21]:
# Looking at the Vehicle Make column
motor_count = motor_df['Vehicle Make'].value_counts()

# Determine which values to replace if counts are less than 5000
replace_Vehicle = list(motor_count[motor_count < 3000].index)
replace_Vehicle

# Replace in dataframe
for car in replace_Vehicle:
    motor_df["Vehicle Make"] = motor_df["Vehicle Make"].replace(car,"Other")
    
# Check to make sure binning was successful
motor_df['Vehicle Make'].value_counts()

TOYOT    57239
HONDA    46095
CHEVR    44133
FORD     44126
NISSA    35334
Other    33098
JEEP     21089
HYUND    18128
DODGE    16350
SUBAR    13317
KIA       9343
GMC       8263
BMW       8175
ME/BE     7529
VOLKS     7428
CHRYS     7369
LEXUS     5752
ACURA     5745
MAZDA     5685
RAM       4640
BUICK     4562
INFIN     4461
MITSU     3586
AUDI      3495
CADIL     3133
Name: Vehicle Make, dtype: int64

In [22]:
# Look at Regristration Class value counts for binning
class_count = motor_df['Registration Class'].value_counts()


# Determine which values to replace if counts are less than 4000
replace_class = list(class_count[class_count < 2000].index)

# Replace in dataframe
for Class in replace_class:
    motor_df['Registration Class'] = motor_df['Registration Class'].replace(Class,"Other")
    
# Check to make sure binning was successful
motor_df['Registration Class'].value_counts()

PASSENGER OR SUBURBAN         347712
OMNIBUS - TAXI                 20924
AGRICULTURAL TRUCK             14251
Other                          10599
COMMERCIAL                      9818
SPECIAL PASSENGER               5611
POLITICAL SUBDIVISION           4352
MOTORCYCLE                      2771
INTERNATIONAL REGISTRATION      2037
Name: Registration Class, dtype: int64

In [23]:
# Action Prior to accident
actions = motor_df['Action Prior to Accident'].value_counts()

# Determine which values to replace if counts are less than 4000
replace_action = list(actions[actions < 2000].index)

# Replace in dataframe
for action in replace_action:
    motor_df['Action Prior to Accident'] = motor_df['Action Prior to Accident'].replace(action,"Other")
    
# Check to make sure binning was successful
motor_df['Action Prior to Accident'].value_counts()

Going Straight Ahead        243029
Making Left Turn             46153
Backing                      23642
Slowing or Stopping          22816
Making Right Turn            21186
Changing Lanes               20355
Starting in Traffic           7221
Merging                       5984
Other                         5926
Starting from Parking         5644
Overtaking/Passing            4366
Making U Turn                 4236
Entering Parked Position      3154
Stopped in Traffic            2309
Parked                        2054
Name: Action Prior to Accident, dtype: int64

In [24]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME','Contributing Factor 1 Description'
# and 'Contributing Factor 2 Description'.

motor_df = motor_df.drop(['Case Vehicle ID', 'Partial VIN', 'Contributing Factor 2','Contributing Factor 2 Description'], axis=1)

Selecting features for the model

In [13]:
# Create our target
y = motor_df.Year 

# Create our features
X = motor_df.drop(columns = 'Year')
#X = pd.get_dummies(X)

In [25]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [26]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Training The Model

In [27]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [28]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [29]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [None]:
Showing the results

In [30]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6455,18836
Actual 1,9634,69594


In [31]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [32]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6455,18836
Actual 1,9634,69594


Accuracy Score : 0.7276093341880424
Classification Report
              precision    recall  f1-score   support

        2018       0.40      0.26      0.31     25291
        2019       0.79      0.88      0.83     79228

    accuracy                           0.73    104519
   macro avg       0.59      0.57      0.57    104519
weighted avg       0.69      0.73      0.70    104519

