In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import os
import sqlite3


In [2]:
# Open database
conn = sqlite3.connect('practice.db')
print ("Opened database successfully")


Opened database successfully


In [3]:
# Define a variable by querying database
data = conn.execute("SELECT * FROM ML_MOCK_DATA").fetchall()

In [4]:
# Name the columns
cols = ['Index','Crash Death Count','Crash Severity','Crash Time','Crash Total Injury Count','Crash Year','Day of Week','Highway Number','Highway System','Latitude','Light Condition','Longitude','Surface Condition','Surface Type','Weather Condition','Vehicle Body Style','Vehicle Damage Rating 1 - Severity','Vehicle Damage Rating 2 - Severity','Vehicle Make','Vehicle Model Name','Vehicle Model Year','Citation','Person Age','Person Alcohol Result','Person Drug Test Result','Person Gender','Person Injury Severity','Person Type']


In [5]:
# Convert the queried data into a dataframe
mock_df = pd.DataFrame(data, columns=cols)
# Drop the extra index column
mock_df.drop(columns=['Index'], inplace=True)
print(mock_df.head())


   Crash Death Count   Crash Severity  Crash Time  Crash Total Injury Count  \
0                  0  N - NOT INJURED        1111                         0   
1                  0  N - NOT INJURED          36                         0   
2                  0  N - NOT INJURED          36                         0   
3                  0  N - NOT INJURED          36                         0   
4                  0  N - NOT INJURED         728                         0   

   Crash Year Day of Week Highway Number  Highway System     Latitude  \
0        2018      MONDAY        No Data         No Data  30.20098162   
1        2018      MONDAY            734  FARM TO MARKET  30.36856269   
2        2018      MONDAY            734  FARM TO MARKET  30.36856269   
3        2018      MONDAY            734  FARM TO MARKET  30.36856269   
4        2018      MONDAY            734  FARM TO MARKET  30.40533278   

         Light Condition  ... Vehicle Make  Vehicle Model Name  \
0           1 - DAYL

In [6]:
# Preprocess the data

print(mock_df.columns)

print(mock_df['Crash Severity'].value_counts())

print(mock_df['Person Type'].value_counts())

print(mock_df['Vehicle Body Style'].value_counts())


Index(['Crash Death Count', 'Crash Severity', 'Crash Time',
       'Crash Total Injury Count', 'Crash Year', 'Day of Week',
       'Highway Number', 'Highway System', 'Latitude', 'Light Condition',
       'Longitude', 'Surface Condition', 'Surface Type', 'Weather Condition',
       'Vehicle Body Style', 'Vehicle Damage Rating 1 - Severity',
       'Vehicle Damage Rating 2 - Severity', 'Vehicle Make',
       'Vehicle Model Name', 'Vehicle Model Year', 'Citation', 'Person Age',
       'Person Alcohol Result', 'Person Drug Test Result', 'Person Gender',
       'Person Injury Severity', 'Person Type'],
      dtype='object')
N - NOT INJURED                  60327
B - NON-INCAPACITATING INJURY    26841
C - POSSIBLE INJURY              26370
A - SUSPECTED SERIOUS INJURY      3576
99 - UNKNOWN                      1389
K - KILLED                         600
Name: Crash Severity, dtype: int64
1 - DRIVER                                           85538
2 - PASSENGER/OCCUPANT                      

In [7]:
# Select features and output columns
mock_df = mock_df[['Crash Severity', 'Person Age', 'Person Gender', 'Person Type', 'Vehicle Model Year', 'Vehicle Body Style', 'Vehicle Make']]
print(mock_df.head())

print(mock_df.shape)

    Crash Severity Person Age Person Gender             Person Type  \
0  N - NOT INJURED         34    2 - FEMALE              1 - DRIVER   
1  N - NOT INJURED         49    2 - FEMALE              1 - DRIVER   
2  N - NOT INJURED         22    2 - FEMALE              1 - DRIVER   
3  N - NOT INJURED         48    2 - FEMALE  2 - PASSENGER/OCCUPANT   
4  N - NOT INJURED         21      1 - MALE              1 - DRIVER   

  Vehicle Model Year          Vehicle Body Style Vehicle Make  
0               2005                    VN - VAN        HONDA  
1               2013  P4 - PASSENGER CAR, 4-DOOR        HONDA  
2               2011  P4 - PASSENGER CAR, 4-DOOR      HYUNDAI  
3               2011  P4 - PASSENGER CAR, 4-DOOR      HYUNDAI  
4               2006  P4 - PASSENGER CAR, 4-DOOR     CHRYSLER  
(119103, 7)


In [8]:
# Drop rows with Unknown
mock_df = mock_df[mock_df['Crash Severity'] != "99 - UNKNOWN"]
print(mock_df.shape)


(117714, 7)


In [9]:
mock_df = mock_df[(mock_df['Person Type'] != "2 - PASSENGER/OCCUPANT") & (mock_df['Person Type'] != "4 - PEDESTRIAN") & (mock_df['Person Type'] != "99 - UNKNOWN") & (mock_df['Person Type'] != "98 - OTHER (EXPLAIN IN NARRATIVE)") & (mock_df['Person Type'] != "6 - PASSENGER/OCCUPANT ON MOTORCYCLE TYPE VEHICLE")]
print(mock_df.shape)

(86279, 7)


In [10]:
print(mock_df['Vehicle Body Style'].value_counts())

P4 - PASSENGER CAR, 4-DOOR                47653
SV - SPORT UTILITY VEHICLE                13090
PK - PICKUP                               11660
P2 - PASSENGER CAR, 2-DOOR                 4244
VN - VAN                                   2636
TR - TRUCK                                 1952
99 - UNKNOWN                               1059
MC - MOTORCYCLE                             972
No Data                                     831
TT - TRUCK TRACTOR                          738
98 - OTHER  (EXPLAIN IN NARRATIVE)          445
PC - POLICE CAR/TRUCK                       430
BU - BUS                                    396
AM - AMBULANCE                               88
FT - FIRE TRUCK                              33
SB - YELLOW SCHOOL BUS                       26
PM - POLICE MOTORCYCLE                       13
EV - NEV-NEIGHBORHOOD ELECTRIC VEHICLE        9
FE - FARM EQUIPMENT                           4
Name: Vehicle Body Style, dtype: int64


In [11]:
# Clean Vehicle Body Style column
mock_df = mock_df[(mock_df['Vehicle Body Style'] != "99 - UNKNOWN") & (mock_df['Vehicle Body Style'] != "No Data") & (mock_df['Vehicle Body Style'] != "98 - OTHER  (EXPLAIN IN NARRATIVE)") & (mock_df['Vehicle Body Style'] != "EV - NEV-NEIGHBORHOOD ELECTRIC VEHICLE") & (mock_df['Vehicle Body Style'] != "FE - FARM EQUIPMENT")]
print(mock_df.shape)

(83931, 7)


In [12]:
print(mock_df.head())

    Crash Severity Person Age Person Gender Person Type Vehicle Model Year  \
0  N - NOT INJURED         34    2 - FEMALE  1 - DRIVER               2005   
1  N - NOT INJURED         49    2 - FEMALE  1 - DRIVER               2013   
2  N - NOT INJURED         22    2 - FEMALE  1 - DRIVER               2011   
4  N - NOT INJURED         21      1 - MALE  1 - DRIVER               2006   
5  N - NOT INJURED         45      1 - MALE  1 - DRIVER               2001   

           Vehicle Body Style Vehicle Make  
0                    VN - VAN        HONDA  
1  P4 - PASSENGER CAR, 4-DOOR        HONDA  
2  P4 - PASSENGER CAR, 4-DOOR      HYUNDAI  
4  P4 - PASSENGER CAR, 4-DOOR     CHRYSLER  
5  P4 - PASSENGER CAR, 4-DOOR        MAZDA  


In [13]:
# Drop Vehicle Make column
mock_df = mock_df.drop(columns=['Vehicle Make'], index=1)
print(mock_df.head())



                  Crash Severity Person Age Person Gender Person Type  \
0                N - NOT INJURED         34    2 - FEMALE  1 - DRIVER   
2                N - NOT INJURED         22    2 - FEMALE  1 - DRIVER   
4                N - NOT INJURED         21      1 - MALE  1 - DRIVER   
5                N - NOT INJURED         45      1 - MALE  1 - DRIVER   
6  B - NON-INCAPACITATING INJURY         41      1 - MALE  1 - DRIVER   

  Vehicle Model Year          Vehicle Body Style  
0               2005                    VN - VAN  
2               2011  P4 - PASSENGER CAR, 4-DOOR  
4               2006  P4 - PASSENGER CAR, 4-DOOR  
5               2001  P4 - PASSENGER CAR, 4-DOOR  
6               2006  P4 - PASSENGER CAR, 4-DOOR  


In [14]:
# Clean values for Person Gender and Person Type columns
mock_df.loc[mock_df['Person Gender'] == "2 - FEMALE", 'Person Gender'] = "FEMALE"

mock_df.loc[mock_df['Person Gender'] == "1 - MALE", 'Person Gender'] = "MALE"

mock_df.loc[mock_df['Person Type'] == "1 - DRIVER", 'Person Type'] = "DRIVER"
mock_df.loc[mock_df['Person Type'] == "5 - DRIVER OF MOTORCYCLE TYPE VEHICLE", 'Person Type'] = "MOTORCYCLE DRIVER"
mock_df.loc[mock_df['Person Type'] == "3 - PEDALCYCLIST", 'Person Type'] = "PEDALCYCLIST"



In [15]:

print(mock_df.head())


                  Crash Severity Person Age Person Gender Person Type  \
0                N - NOT INJURED         34        FEMALE      DRIVER   
2                N - NOT INJURED         22        FEMALE      DRIVER   
4                N - NOT INJURED         21          MALE      DRIVER   
5                N - NOT INJURED         45          MALE      DRIVER   
6  B - NON-INCAPACITATING INJURY         41          MALE      DRIVER   

  Vehicle Model Year          Vehicle Body Style  
0               2005                    VN - VAN  
2               2011  P4 - PASSENGER CAR, 4-DOOR  
4               2006  P4 - PASSENGER CAR, 4-DOOR  
5               2001  P4 - PASSENGER CAR, 4-DOOR  
6               2006  P4 - PASSENGER CAR, 4-DOOR  


In [16]:
# Group Crash Severity in two classes: 0 for none or light injury and 1 for serious injury or fatality
mock_df.loc[(mock_df['Crash Severity'] == "N - NOT INJURED") | (mock_df['Crash Severity'] == "B - NON-INCAPACITATING INJURY") | (mock_df['Crash Severity'] == "C - POSSIBLE INJURY"), 'Crash Severity'] = 0


mock_df.loc[(mock_df['Crash Severity'] == "A - SUSPECTED SERIOUS INJURY") | (mock_df['Crash Severity'] == "K - KILLED"), 'Crash Severity'] = 1


print(mock_df['Crash Severity'].value_counts())


0    81337
1     2593
Name: Crash Severity, dtype: int64


In [17]:
print(mock_df.head())


  Crash Severity Person Age Person Gender Person Type Vehicle Model Year  \
0              0         34        FEMALE      DRIVER               2005   
2              0         22        FEMALE      DRIVER               2011   
4              0         21          MALE      DRIVER               2006   
5              0         45          MALE      DRIVER               2001   
6              0         41          MALE      DRIVER               2006   

           Vehicle Body Style  
0                    VN - VAN  
2  P4 - PASSENGER CAR, 4-DOOR  
4  P4 - PASSENGER CAR, 4-DOOR  
5  P4 - PASSENGER CAR, 4-DOOR  
6  P4 - PASSENGER CAR, 4-DOOR  


In [18]:
print(mock_df.dtypes)


Crash Severity        object
Person Age            object
Person Gender         object
Person Type           object
Vehicle Model Year    object
Vehicle Body Style    object
dtype: object


In [19]:
mock_df["Crash Severity"] = mock_df["Crash Severity"].astype(int)


In [20]:
print(mock_df['Vehicle Model Year'].value_counts())


2015    6615
2016    6319
2014    5913
2013    5898
2017    5868
        ... 
1940       1
1951       1
1948       1
1968       1
1955       1
Name: Vehicle Model Year, Length: 67, dtype: int64


In [21]:
print(mock_df.shape)


(83930, 6)


In [23]:
mock_df = mock_df[mock_df['Person Age'] != "No Data"]
print(mock_df.shape)


(81698, 6)


In [24]:
mock_df["Person Age"] = mock_df["Person Age"].astype(int)
mock_df["Vehicle Model Year"] = mock_df["Vehicle Model Year"].astype(int)


In [25]:
print(mock_df.dtypes)

print(mock_df.head())


Crash Severity         int64
Person Age             int64
Person Gender         object
Person Type           object
Vehicle Model Year     int64
Vehicle Body Style    object
dtype: object
   Crash Severity  Person Age Person Gender Person Type  Vehicle Model Year  \
0               0          34        FEMALE      DRIVER                2005   
2               0          22        FEMALE      DRIVER                2011   
4               0          21          MALE      DRIVER                2006   
5               0          45          MALE      DRIVER                2001   
6               0          41          MALE      DRIVER                2006   

           Vehicle Body Style  
0                    VN - VAN  
2  P4 - PASSENGER CAR, 4-DOOR  
4  P4 - PASSENGER CAR, 4-DOOR  
5  P4 - PASSENGER CAR, 4-DOOR  
6  P4 - PASSENGER CAR, 4-DOOR  


In [26]:
mock_df = mock_df[mock_df['Person Gender'] != "99 - UNKNOWN"]
print(mock_df.shape)

(81632, 6)


In [27]:
# Generate our categorical variable lists
feat_cat = ["Person Gender", "Person Type", "Vehicle Body Style"]


In [28]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)


In [29]:
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(mock_df[feat_cat]))



In [30]:
# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(feat_cat)
print(encode_df.head())


   Person Gender_FEMALE  Person Gender_MALE  Person Type_DRIVER  \
0                   1.0                 0.0                 1.0   
1                   1.0                 0.0                 1.0   
2                   0.0                 1.0                 1.0   
3                   0.0                 1.0                 1.0   
4                   0.0                 1.0                 1.0   

   Person Type_MOTORCYCLE DRIVER  Vehicle Body Style_AM - AMBULANCE  \
0                            0.0                                0.0   
1                            0.0                                0.0   
2                            0.0                                0.0   
3                            0.0                                0.0   
4                            0.0                                0.0   

   Vehicle Body Style_BU - BUS  Vehicle Body Style_FT - FIRE TRUCK  \
0                          0.0                                 0.0   
1                          0.0



In [31]:
# Merge one-hot encoded features and drop the originals
mock_df = mock_df.merge(encode_df, left_index=True, right_index=True).drop(columns=feat_cat, axis=1)
print(mock_df.head())


   Crash Severity  Person Age  Vehicle Model Year  Person Gender_FEMALE  \
0               0          34                2005                   1.0   
2               0          22                2011                   0.0   
4               0          21                2006                   0.0   
5               0          45                2001                   1.0   
6               0          41                2006                   1.0   

   Person Gender_MALE  Person Type_DRIVER  Person Type_MOTORCYCLE DRIVER  \
0                 0.0                 1.0                            0.0   
2                 1.0                 1.0                            0.0   
4                 1.0                 1.0                            0.0   
5                 0.0                 1.0                            0.0   
6                 0.0                 1.0                            0.0   

   Vehicle Body Style_AM - AMBULANCE  Vehicle Body Style_BU - BUS  \
0                      

In [32]:
# Split our preprocessed data into our features and target arrays
# y=0 for no or low injury and y=1 for serious injury or fatality
y = mock_df["Crash Severity"]
X = mock_df.drop("Crash Severity", axis=1)


In [33]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)


In [34]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



In [35]:
# ### Logistic Regression Classifier

# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs", max_iter=200)


In [36]:
# Train the model
log_classifier.fit(X_train_scaled, y_train)

In [37]:

# Evaluate the model
y_pred = log_classifier.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
print(results.head(6))


from sklearn.metrics import confusion_matrix, classification_report

 Logistic regression model accuracy: 0.969
   Prediction  Actual
0           0       0
1           0       0
2           0       0
3           0       0
4           0       0
5           0       1


In [38]:

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13617,0
Actual 1,434,0


In [39]:
print("Classification Report")
print(classification_report(y_test, y_pred))


Classification Report
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     13617
           1       0.00      0.00      0.00       434

    accuracy                           0.97     14051
   macro avg       0.48      0.50      0.49     14051
weighted avg       0.94      0.97      0.95     14051



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:

# ### Oversampling

from collections import Counter

print(Counter(y_train))


# ### RandomOverSampler

# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
# Instantiate the model
ros = RandomOverSampler(random_state=1)
# Resample the targets
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
print(Counter(y_resampled))

# Train the Logistic Regression model using the resampled data
logreg = LogisticRegression(solver='lbfgs', max_iter=200)


Counter({0: 40852, 1: 1300})
Counter({0: 40852, 1: 40852})


In [41]:
# Fit
logreg.fit(X_resampled, y_resampled)

# Evaluate the model
y_pred = logreg.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

print("Classification Report")
print(classification_report(y_test, y_pred))


# ### SMOTEENN

# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X_train_scaled, y_train)
print(Counter(y_resampled))


# Train the Logistic Regression model using the resampled data
logreg.fit(X_resampled, y_resampled)
# Calculated the balanced accuracy score
y_pred = logreg.predict(X_test_scaled)
from sklearn.metrics import balanced_accuracy_score
print(accuracy_score(y_test, y_pred))

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df


print("Classification Report")
print(classification_report(y_test, y_pred))

 Logistic regression model accuracy: 0.539
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.54      0.69     13617
           1       0.03      0.49      0.06       434

    accuracy                           0.54     14051
   macro avg       0.50      0.52      0.38     14051
weighted avg       0.94      0.54      0.68     14051

Counter({0: 33559, 1: 28742})
0.8137499110383603
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.83      0.90     13617
           1       0.03      0.17      0.05       434

    accuracy                           0.81     14051
   macro avg       0.50      0.50      0.47     14051
weighted avg       0.94      0.81      0.87     14051

