### Import Dependencies for Machine Learning

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

### Import Initial Dataset

In [2]:
# Import the dataset from Google Drive:
url = ('https://drive.google.com/file/d/1t3Z8Blgy2BPmBB4FqrQkC_jie9IwYuQb/view?usp=sharing')
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
crash_1 = pd.read_csv(path,index_col=0)
crash_1.head()

Unnamed: 0,dvcat,weight,dead,airbag,seatbelt,frontal,sex,ageOFocc,yearacc,yearVeh,abcat,occRole,deploy,injSeverity,caseid
1,25-39,25.069,alive,none,belted,1,f,26,1997,1990.0,unavail,driver,0,3.0,2:3:1
2,10-24,25.069,alive,airbag,belted,1,f,72,1997,1995.0,deploy,driver,1,1.0,2:3:2
3,10-24,32.379,alive,none,none,1,f,69,1997,1988.0,unavail,driver,0,4.0,2:5:1
4,25-39,495.444,alive,airbag,belted,1,f,53,1997,1995.0,deploy,driver,1,1.0,2:10:1
5,25-39,25.069,alive,none,belted,1,f,32,1997,1988.0,unavail,driver,0,3.0,2:11:1


### Perform Data Cleaning

In [3]:
# Remove columns not needed for this analysis:

# 'weight': Value of unknown significance or origin
# 'yearacc': Year the accident occurred from 1997-2002
# 'caseid': Not individual accident identifiers, numerous indicents assigned to single id's
# 'airbag' & 'deploy': Values are duplicated in the 'abcat' column

crash_2 = crash_1.drop(['weight','yearacc','caseid','airbag','deploy'], axis=1)

# Rename the column titles for better clarity:
crash_2.rename(columns={'dvcat':'est_impact_kmh',
                        'dead':'ultimate_outcome',
                        'airbag':'airbag_available',
                        'frontal':'front_impact',
                        'ageOFocc':'occupant_age',
                        'yearacc':'accident_year',
                        'yearVeh':'vehicle_year',
                        'abcat':'airbag_deployment',
                        'occRole':'occupant_role',
                        'injSeverity':'injury_severity'},inplace=True)

crash_2.head()

Unnamed: 0,est_impact_kmh,ultimate_outcome,seatbelt,front_impact,sex,occupant_age,vehicle_year,airbag_deployment,occupant_role,injury_severity
1,25-39,alive,belted,1,f,26,1990.0,unavail,driver,3.0
2,10-24,alive,belted,1,f,72,1995.0,deploy,driver,1.0
3,10-24,alive,none,1,f,69,1988.0,unavail,driver,4.0
4,25-39,alive,belted,1,f,53,1995.0,deploy,driver,1.0
5,25-39,alive,belted,1,f,32,1988.0,unavail,driver,3.0


In [4]:
# Drop the all rows with null values:
crash_3 = crash_2.dropna()
for column in crash_3.columns:
    print(f'Column {column} has {crash_3[column].isnull().sum()}\
    null values')

Column est_impact_kmh has 0    null values
Column ultimate_outcome has 0    null values
Column seatbelt has 0    null values
Column front_impact has 0    null values
Column sex has 0    null values
Column occupant_age has 0    null values
Column vehicle_year has 0    null values
Column airbag_deployment has 0    null values
Column occupant_role has 0    null values
Column injury_severity has 0    null values


In [5]:
# Rename values:
crash_3['est_impact_kmh'] = crash_3['est_impact_kmh'].replace({'1-9km/h':'1-9'})
crash_3['seatbelt'] = crash_3['seatbelt'].replace({'none':'not_belted'})
crash_3['front_impact'] = crash_3['front_impact'].replace({1:'yes',0:'no'})
crash_3[crash_3['injury_severity'] < 5.0]
crash_3.index.name = 'index'
crash_3.head()

Unnamed: 0_level_0,est_impact_kmh,ultimate_outcome,seatbelt,front_impact,sex,occupant_age,vehicle_year,airbag_deployment,occupant_role,injury_severity
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,25-39,alive,belted,yes,f,26,1990.0,unavail,driver,3.0
2,10-24,alive,belted,yes,f,72,1995.0,deploy,driver,1.0
3,10-24,alive,not_belted,yes,f,69,1988.0,unavail,driver,4.0
4,25-39,alive,belted,yes,f,53,1995.0,deploy,driver,1.0
5,25-39,alive,belted,yes,f,32,1988.0,unavail,driver,3.0


In [6]:
# Rename the database that has been cleaned:
crash_cleaned = crash_3

### Export Cleaned Data to SQLite Database

In [7]:
import sqlite3 as sq
crash_cleaned_copy = crash_cleaned.copy()

# Export the accidents data:
accidents_data = crash_cleaned
accidents_data = crash_cleaned.drop(['ultimate_outcome','sex','occupant_age','occupant_role','injury_severity'], axis=1)

# Export the occupants data:
occupants_data = crash_cleaned
occupants_data = crash_cleaned.drop(['est_impact_kmh','front_impact','vehicle_year','seatbelt','airbag_deployment'], axis=1)

sql_data = 'crash2.sqlite'


In [8]:
# Create connection & push the data:

conn = sq.connect(sql_data)
cur = conn.cursor()

cur.executescript('''
DROP TABLE IF EXISTS "ACCIDENTS";
CREATE TABLE "ACCIDENTS" (
	"index" INTEGER PRIMARY KEY AUTOINCREMENT,
	"est_impact_kmh" TEXT NOT NULL,
	"front_impact" TEXT NOT NULL,
	"vehicle_year" INTEGER NOT NULL,
	"seatbelt" TEXT NOT NULL,
	"airbag_deployment" TEXT NOT NULL
);

DROP TABLE IF EXISTS "OCCUPANTS";
CREATE TABLE "OCCUPANTS" (
	"index" INTEGER PRIMARY KEY AUTOINCREMENT,
	"ultimate_outcome" TEXT NOT NULL,
	"sex" INTEGER NOT NULL,
	"occupant_age" TEXT NOT NULL,
	"occupant_role" TEXT NOT NULL,
	"injury_severity" INTEGER NOT NULL
);

''')
#conn.commit()
accidents_data.to_sql("ACCIDENTS", conn, if_exists='append', index=True)
#conn.commit()
occupants_data.to_sql("OCCUPANTS", conn, if_exists='append', index=True)

conn.commit()
conn.close()

### Reflect the Tables into SQLAlchemy ORM

In [9]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

In [10]:
engine = create_engine("sqlite:///crash2.sqlite")

In [11]:
# Reflect an existing database into a new model:
Base = automap_base()

# Reflect the tables:
Base.prepare(engine, reflect=True)

In [12]:
# All of the classes that automap found:
Base.classes.keys()

['ACCIDENTS', 'CRASH_COMBINED2', 'OCCUPANTS']

In [13]:
# Save reference to the combined table:
Crash = Base.classes.CRASH_COMBINED2

In [14]:
# Create our session (link) from Python to the DB:
session = Session(engine)

### Import the Combined Databases Back to Python

In [15]:
# Perform a query to retrieve the data from the CRASH_COMBINED2 table:
results = []
results = session.query(Crash.occupant_role,
                        Crash.sex,
                        Crash.occupant_age,
                        Crash.ultimate_outcome,
                        Crash.injury_severity,
                        Crash.vehicle_year,
                        Crash.est_impact_kmh,
                        Crash.airbag_deployment,
                        Crash.front_impact,
                        Crash.seatbelt).all()

# Save the query results as a Pandas DataFrame
crash_4 = pd.DataFrame(results, columns=['occupant_role',
                                         'sex',
                                         'occupant_age',
                                         'ultimate_outcome',
                                         'injury_severity',
                                         'seatbelt',
                                         'est_impact_kmh',
                                         'airbag_deployment',
                                         'vehicle_year',
                                         'front_impact',])
crash_4.head()


Unnamed: 0,occupant_role,sex,occupant_age,ultimate_outcome,injury_severity,seatbelt,est_impact_kmh,airbag_deployment,vehicle_year,front_impact
0,driver,f,26,alive,3.0,belted,25-39,unavail,1990,yes
1,driver,f,72,alive,1.0,belted,10-24,deploy,1995,yes
2,driver,f,69,alive,4.0,not_belted,10-24,unavail,1988,yes
3,driver,f,53,alive,1.0,belted,25-39,deploy,1995,yes
4,driver,f,32,alive,3.0,belted,25-39,unavail,1988,yes


In [51]:
ultimate_outcome = crash_4.ultimate_outcome.value_counts()
ultimate_outcome

alive    24883
dead      1180
Name: ultimate_outcome, dtype: int64

### Conduct Integer Encoding to Transform Text to Numbers

In [19]:
le = LabelEncoder()

crash_5 = crash_4.copy()
crash_5['occupant_role'] = le.fit_transform(crash_5['occupant_role']) 
crash_5['sex'] = le.fit_transform(crash_5['sex'])
crash_5['ultimate_outcome'] = le.fit_transform(crash_5['ultimate_outcome'])
crash_5['seatbelt'] = le.fit_transform(crash_5['seatbelt'])
crash_5['est_impact_kmh'] = le.fit_transform(crash_5['est_impact_kmh'])
crash_5['airbag_deployment'] = le.fit_transform(crash_5['airbag_deployment'])
crash_5['front_impact'] = le.fit_transform(crash_5['front_impact'])

crash_5.head()

Unnamed: 0,occupant_role,sex,occupant_age,ultimate_outcome,injury_severity,seatbelt,est_impact_kmh,airbag_deployment,vehicle_year,front_impact
0,0,0,26,0,3.0,0,2,2,1990,1
1,0,0,72,0,1.0,0,1,0,1995,1
2,0,0,69,0,4.0,1,1,2,1988,1
3,0,0,53,0,1.0,0,2,0,1995,1
4,0,0,32,0,3.0,0,2,2,1988,1


### Split the Data into Training & Testing Sets

In [20]:
# Separate the features (X) from the target (y):
y = crash_5['ultimate_outcome']
X = crash_5.drop(columns='ultimate_outcome')

# Split data into training & testing:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(19547, 9)

### Balanced Random Forest Classifier:

In [21]:
# Resample the training data with the BalancedRandomForestClassifier:
BRFC = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
BRFC.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [22]:
# Calculate the balanced accuracy score:
BFRC_pred = BRFC.predict(X_test)
balanced_accuracy_score(y_test, BFRC_pred)

0.9545279929380801

In [23]:
# Display the confusion matrix:
confusion_matrix(y_test, BFRC_pred)

array([[6077,  144],
       [  20,  275]])

In [24]:
# Print the imbalanced classification report:
print(classification_report_imbalanced(y_test, BFRC_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.98      0.93      0.99      0.95      0.91      6221
          1       0.66      0.93      0.98      0.77      0.95      0.91       295

avg / total       0.98      0.97      0.93      0.98      0.95      0.91      6516



In [25]:
# List the features sorted in descending order by feature importance:
importances = BRFC.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.6882189727380738, 'injury_severity'),
 (0.12779829263470893, 'est_impact_kmh'),
 (0.07420672975580013, 'occupant_age'),
 (0.03690981047880008, 'vehicle_year'),
 (0.034854471324103374, 'seatbelt'),
 (0.012725779281565815, 'front_impact'),
 (0.009435762295645904, 'airbag_deployment'),
 (0.007997001819800814, 'occupant_role'),
 (0.007853179671501235, 'sex')]

### Easy Ensemble AdaBoost Classifier

In [26]:
# Train the classifier:
EEC = EasyEnsembleClassifier(n_estimators=100, random_state=1)
EEC.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [27]:
# Calculate the balanced accuracy score:
EEC_pred = EEC.predict(X_test)
balanced_accuracy_score(y_test, EEC_pred)

0.961877348183708

In [28]:
# Display the confusion matrix:
confusion_matrix(y_test, EEC_pred)

array([[6063,  158],
       [  15,  280]])

In [29]:
# Print the imbalanced classification report:
print(classification_report_imbalanced(y_test, EEC_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.97      0.95      0.99      0.96      0.93      6221
          1       0.64      0.95      0.97      0.76      0.96      0.92       295

avg / total       0.98      0.97      0.95      0.98      0.96      0.93      6516



### Naive Random Oversampling

In [30]:
# Resample the training data with the RandomOversampler:
ROS = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ROS.fit_resample(X_train, y_train)

In [31]:
# Train the Logistic Regression model using the resampled data:
ROS_model = LogisticRegression(solver='lbfgs', random_state=1)
ROS_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [32]:
# Calculated the balanced accuracy score:
ROS_pred = ROS_model.predict(X_test)
balanced_accuracy_score(y_test, ROS_pred)

0.9597876519933849

In [33]:
# Display the confusion matrix:
confusion_matrix(y_test, ROS_pred)

array([[6037,  184],
       [  15,  280]])

In [34]:
# Print the imbalanced classification report:
print(classification_report_imbalanced(y_test, ROS_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.97      0.95      0.98      0.96      0.92      6221
          1       0.60      0.95      0.97      0.74      0.96      0.92       295

avg / total       0.98      0.97      0.95      0.97      0.96      0.92      6516



### SMOTE Oversampling

In [35]:
# Resample the training data with SMOTE:
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)

In [36]:
# Train the Logistic Regression model using the resampled data:
Smote_model = LogisticRegression(solver='lbfgs', random_state=1)
Smote_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [37]:
# Calculate the balanced accuracy score:
SMOTE_pred = Smote_model.predict(X_test)
balanced_accuracy_score(y_test, SMOTE_pred)

0.9633831827135536

In [38]:
# Display the confusion matrix
confusion_matrix(y_test, SMOTE_pred)

array([[6145,   76],
       [  18,  277]])

In [39]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, SMOTE_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.94      0.99      0.96      0.93      6221
          1       0.78      0.94      0.99      0.85      0.96      0.92       295

avg / total       0.99      0.99      0.94      0.99      0.96      0.93      6516



### Undersampling

In [40]:
# Resample the data using the ClusterCentroids resampler:
CC = ClusterCentroids(random_state=1)
X_resampled, y_resampled = CC.fit_resample(X_train, y_train)

In [41]:
# Train the Logistic Regression model using the resampled data:
CC_model = LogisticRegression(solver='lbfgs', random_state=78)
CC_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [42]:
# Calculate the balanced accuracy score:
CC_pred = CC_model.predict(X_test)
balanced_accuracy_score(y_test, CC_pred)

0.8663918548165181

In [43]:
# Display the confusion matrix:
confusion_matrix(y_test, CC_pred)

array([[4643, 1578],
       [   4,  291]])

In [44]:
# Print the imbalanced classification report:
print(classification_report_imbalanced(y_test, CC_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.75      0.99      0.85      0.86      0.72      6221
          1       0.16      0.99      0.75      0.27      0.86      0.75       295

avg / total       0.96      0.76      0.98      0.83      0.86      0.72      6516



### Combination (Over and Under) Sampling

In [45]:
# Resample the training data with SMOTEENN:
SMTN = SMOTEENN(random_state=0)
X_resampled2, y_resampled2 = SMTN.fit_resample(X, y)

In [46]:
# Train the Logistic Regression model using the resampled data:
SMTN_model = LogisticRegression(solver='lbfgs', random_state=1)
SMTN_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [47]:
# Calculate the balanced accuracy score:
SMTN_pred = SMTN_model.predict(X_test)
balanced_accuracy_score(y_test, SMTN_pred)

0.8663918548165181

In [48]:
# Display the confusion matrix:
confusion_matrix(y_test, SMTN_pred)

array([[4643, 1578],
       [   4,  291]])

In [49]:
# Print the imbalanced classification report:
print(classification_report_imbalanced(y_test, SMTN_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.75      0.99      0.85      0.86      0.72      6221
          1       0.16      0.99      0.75      0.27      0.86      0.75       295

avg / total       0.96      0.76      0.98      0.83      0.86      0.72      6516

