### 1. Importing libraries and data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import gc

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression

In [None]:
%matplotlib inline

In [None]:
path = r'C:\Users\nukis\Documents\Projects\08. Road Safety'

In [None]:
df = pd.read_pickle(os.path.join(path, '01. Data', 'Prepared data', 'road_safety_cleaned.pkl'))

In [None]:
# Let pandas print every row and every column of the loaded dataframe

pd.options.display.max_columns = df.shape[1]
pd.options.display.max_rows = None

In [None]:
df.shape

In [None]:
df.info()

#### Pre-Processing

In [None]:
# Remove the columns that are not carried into the analysis below
df.drop(columns=['Suburb', 'LGA Name', 'Postcode', 'Total Units', 'Total Cas', 'Total Fats', 'Total SI', 'Total MI', 'Lat', 'Lon'], inplace = True)

### 2. Accident Severity Analysis

In [None]:
# Display name and colour for each raw severity code. Keying them by class
# means every wedge is labelled by the class it really represents instead of
# by an assumed frequency order.
severity_names = {'Pdo': 'Property Damage Only', 'Mi': 'Minor Injury',
                  'Si': 'Serious Injury', 'Fatal': 'Fatality'}
severity_colors = {'Pdo': '#FEF9A7', 'Mi': '#FAC213',
                   'Si': '#F77E21', 'Fatal': '#D61C4E'}

# value_counts() returns the classes most frequent first; the wedges follow it
severity_counts = df['Target'].value_counts()

# Unknown codes fall back to their raw value and a neutral grey
labels = [severity_names.get(code, code) for code in severity_counts.index]
colors = [severity_colors.get(code, '#CCCCCC') for code in severity_counts.index]
# Pull the fatality wedge further out than the others
myexplode = [0.2 if code == 'Fatal' else 0.05 for code in severity_counts.index]

fig, ax = plt.subplots()

ax.pie(severity_counts, explode = myexplode, labels = labels, autopct='%1.1f%%',
        wedgeprops={'linewidth': 3.0, 'edgecolor': 'white'}, startangle = 90, colors = colors)

ax.set_title('Accident Severity', fontsize = 14)
plt.show()

In [None]:
# Function for drawing countplot

def countplot(x, data=None, figsize=(4, 6)):
    """Draw a bar chart of category counts for one column, most frequent first.

    Parameters
    ----------
    x : str
        Name of the column whose values are counted.
    data : pandas.DataFrame, optional
        Frame that holds the column. Defaults to the notebook-level ``df``.
    figsize : tuple, optional
        Width and height of the new figure, passed to ``plt.figure``.
    """
    if data is None:
        # Looked up at call time, so the current state of ``df`` is plotted
        data = df
    plt.figure(figsize = figsize)
    sns.countplot(data = data, x = x, palette = 'mako_r',
                  order = data[x].value_counts().index)

In [None]:
countplot('Target')

#### As we can see, there are 4 classes of severity, and their distribution is heavily imbalanced: 'Property Damage Only (PDO)' is the majority class while 'Fatality (Fatal)' is the minority class.
#### Because the dataset is imbalanced, binary classification will be performed. For this purpose, the Mi, Si and Fatal classes will be grouped together as Injury/Death.

In [None]:
# Collapse minor injury, serious injury and fatality into a single class and
# give the property-damage-only class its final label

is_injury = df['Target'].isin(['Mi', 'Si', 'Fatal'])
is_pdo = df['Target'] == 'Pdo'

df.loc[is_injury, 'Target'] = 'Injury/Death'
df.loc[is_pdo, 'Target'] = 'PDO'

In [None]:
# Colour per class, keyed by class name so a wedge keeps the right label and
# colour whichever class turns out to be the more frequent one
class_colors = {'PDO': '#FEF9A7', 'Injury/Death': '#D61C4E'}

# value_counts() returns the classes most frequent first; the wedges follow it
class_counts = df['Target'].value_counts()

labels = list(class_counts.index)
# Unknown class names fall back to a neutral grey
colors = [class_colors.get(name, '#CCCCCC') for name in labels]
myexplode = [0.05] * len(labels)

fig, ax = plt.subplots()

ax.pie(class_counts, explode = myexplode, labels = labels, autopct='%1.1f%%',
        wedgeprops={'linewidth': 3.0, 'edgecolor': 'white'}, startangle = 90, colors = colors)

ax.set_title('Accident Severity', fontsize = 14)
plt.show()

In [None]:
countplot('Target')

In [None]:
df.describe()

In [None]:
# There are outliers in the 'Veh Year' column: rows with a vehicle year before
# 1940 are removed (rows with a missing year fail the comparison and are
# removed as well).
# .copy() makes the result an independent frame, so the column assignments in
# the encoding cells below do not raise SettingWithCopyWarning.

df = df.loc[df['Veh Year'] >= 1940].copy()

### 3. Label-Encoding for Binary Features

In [None]:
labelencoder = LabelEncoder()

In [None]:
# Replace the Target column by integer class codes
df['Target'] = labelencoder.fit_transform(df['Target']).astype('int64')

# Show the class -> code mapping, built from every class the encoder has seen
# (works for any number of classes, not only for exactly two)
dict(zip(labelencoder.classes_, range(len(labelencoder.classes_))))

In [None]:
# Replace the DayNight column by integer class codes
df['DayNight'] = labelencoder.fit_transform(df['DayNight']).astype('int64')

# Show the class -> code mapping, built from every class the encoder has seen
# (works for any number of classes, not only for exactly two)
dict(zip(labelencoder.classes_, range(len(labelencoder.classes_))))

In [None]:
# Replace the Road Surface column by integer class codes
df['Road Surface'] = labelencoder.fit_transform(df['Road Surface']).astype('int64')

# Show the class -> code mapping, built from every class the encoder has seen
# (works for any number of classes, not only for exactly two)
dict(zip(labelencoder.classes_, range(len(labelencoder.classes_))))

In [None]:
# Replace the Moisture Cond column by integer class codes
df['Moisture Cond'] = labelencoder.fit_transform(df['Moisture Cond']).astype('int64')

# Show the class -> code mapping, built from every class the encoder has seen
# (works for any number of classes, not only for exactly two)
dict(zip(labelencoder.classes_, range(len(labelencoder.classes_))))

In [None]:
# Replace the Weather Cond column by integer class codes
df['Weather Cond'] = labelencoder.fit_transform(df['Weather Cond']).astype('int64')

# Show the class -> code mapping, built from every class the encoder has seen
# (works for any number of classes, not only for exactly two)
dict(zip(labelencoder.classes_, range(len(labelencoder.classes_))))

In [None]:
# Replace the Sex column by integer class codes
df['Sex'] = labelencoder.fit_transform(df['Sex']).astype('int64')

# Show the class -> code mapping, built from every class the encoder has seen
# (works for any number of classes, not only for exactly two)
dict(zip(labelencoder.classes_, range(len(labelencoder.classes_))))

In [None]:
# Replace the Unit Involved column by integer class codes
df['Unit Involved'] = labelencoder.fit_transform(df['Unit Involved']).astype('int64')

# Show the class -> code mapping, built from every class the encoder has seen
# (works for any number of classes, not only for exactly two)
dict(zip(labelencoder.classes_, range(len(labelencoder.classes_))))

In [None]:
# Replace the Day Group column by integer class codes
df['Day Group'] = labelencoder.fit_transform(df['Day Group']).astype('int64')

# Show the class -> code mapping, built from every class the encoder has seen
# (works for any number of classes, not only for exactly two)
dict(zip(labelencoder.classes_, range(len(labelencoder.classes_))))

In [None]:
df.head()

In [None]:
df.info()

### 4. One-Hot-Encoding for categorical data
#### As many of the features are categorical, One-Hot-Encoding is performed.

In [None]:
# Creating categorical features list

catvar = df.select_dtypes(include = ['object']).columns
catvar

In [None]:
# Print the distinct values of every categorical column, in dataframe column order
for column in df.columns:
    if column not in catvar:
        continue
    print(df[column].unique())

In [None]:
# Creating one hot encoder object

# handle_unknown = 'ignore': a category that was not seen while fitting does
# not raise an error when data is transformed later
onehotencoder = OneHotEncoder(handle_unknown = 'ignore')

In [None]:
# Fit and transform the data using the .fit_transform() method
# return the array version of the transformed data using the .toarray() method

df_enc = onehotencoder.fit_transform(df[catvar]).toarray()
df_enc

In [None]:
feature_array = onehotencoder.get_feature_names_out()
feature_array

In [None]:
# Convert to dataframe

df_enc = pd.DataFrame(df_enc, columns = feature_array)
df_enc.head()

In [None]:
df_enc.shape

In [None]:
# Keep the non-categorical columns and renumber the rows from 0, ready to be
# concatenated side by side with the encoded columns.
# drop=True discards the old index directly, so this also works when the
# index has a name (the old reset_index() + drop('index') pair raised KeyError).

df_num = df.drop(columns = catvar).reset_index(drop = True)
df_num.shape

In [None]:
df_new = pd.concat([df_num, df_enc], axis=1)
df_new.shape

In [None]:
df_new.head()

### 5. Preparing and Splitting Dataset

In [None]:
X = df_new.drop(columns = 'Target') # Features
y = df_new['Target'] # Dependent variable (Target)

### 6. Features Selection Embedded Method

In [None]:
# Fit a cross-validated Lasso on all features and keep one coefficient per feature
reg = LassoCV()
reg.fit(X, y)

best_alpha = reg.alpha_
fit_score = reg.score(X, y)
print(f"Best alpha using built-in LassoCV: {best_alpha:f}")
print(f"Best score using built-in LassoCV: {fit_score:f}")

coef = pd.Series(reg.coef_, index = X.columns)

In [None]:
# Count the features Lasso kept (non-zero coefficient) and the ones it zeroed out
n_kept = (coef != 0).sum()
n_eliminated = (coef == 0).sum()
print(f"Lasso picked {n_kept} variables and eliminated the other {n_eliminated} variables")

In [None]:
imp_coef = coef.sort_values()
imp_coef

In [None]:
# Keep only the features with a non-zero Lasso coefficient
# (-0 == 0, so a single comparison covers both signs of zero)
selected_features = imp_coef[imp_coef != 0]
selected_features

In [None]:
plt.rcParams['figure.figsize'] = (20, 10)
selected_features.plot(kind = 'barh')
plt.title('Feature importance using Lasso Model')

### 7. Building Models

In [None]:
# Divide into training-set and test-set: 70% 30% ratio, stratified on the
# target so both parts keep the same class proportions.
# The seed is fixed so that rerunning the notebook reproduces the same split
# and therefore the same scores below.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, stratify = y, random_state = 42)

In [None]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

### A. Logistic Regression

In [None]:
lgr = LogisticRegression()

In [None]:
lgr.fit(X_train, y_train)

In [None]:
y_pred_lgr = lgr.predict(X_test)

In [None]:
print(accuracy_score(y_test, y_pred_lgr))

In [None]:
cf_matrix_lgr = confusion_matrix(y_test, y_pred_lgr)

In [None]:
# Rows of the 2x2 matrix are the true classes, columns the predicted classes
(TN, FP), (FN, TP) = cf_matrix_lgr

for caption, count in (
    ('True Positive(TP)  = ', TP),
    ('False Positive(FP) = ', FP),
    ('True Negative(TN)  = ', TN),
    ('False Negative(FN) = ', FN),
):
    print(caption, count)

In [None]:
# Annotate every cell of the confusion matrix with its name, count and share
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

cell_counts = cf_matrix_lgr.flatten()
cell_shares = cell_counts / np.sum(cf_matrix_lgr)

group_counts = [f'{value:0.0f}' for value in cell_counts]
group_percentages = [f'{value:.2%}' for value in cell_shares]

labels = np.asarray(
    ['\n'.join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2,2)

plt.figure(figsize=(8,4))
sns.heatmap(cf_matrix_lgr, annot=labels, fmt='', cmap='Blues')

In [None]:
logres_result = pd.DataFrame(lgr.coef_, columns=X_test.columns, index=['Coef']).T.sort_values(by='Coef', key=abs)

In [None]:
# Keep only the features with a non-zero coefficient
# (-0 == 0, so a single comparison covers both signs of zero)
logres_nonzero = logres_result.loc[logres_result['Coef'] != 0]

# Display them from largest to smallest absolute coefficient
logres_nonzero.sort_values(by='Coef', ascending=False, key=abs)

In [None]:
plt.rcParams['figure.figsize'] = (20, 40)
logres_nonzero.plot(kind = 'barh')
plt.title('Feature importance using Logistic Regression')

### B. Decision Tree Classifier Model - Entropy
#### A Decision Tree is a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules (if-else) inferred from the data features.

In [None]:
# Create a Decision Tree object that splits on entropy.
# The seed is fixed so that refitting the tree gives the same tree and scores.

dtc = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)

In [None]:
# Fit the DTC object onto the training set.

dtc.fit(X_train, y_train)

In [None]:
y_pred_dtc = dtc.predict(X_test)

In [None]:
print('Test Accuracy:', accuracy_score(y_test, y_pred_dtc))

In [None]:
cf_matrix_dtc = confusion_matrix(y_test, y_pred_dtc)

In [None]:
# Rows of the 2x2 matrix are the true classes, columns the predicted classes
(TN, FP), (FN, TP) = cf_matrix_dtc

for caption, count in (
    ('True Positive(TP)  = ', TP),
    ('False Positive(FP) = ', FP),
    ('True Negative(TN)  = ', TN),
    ('False Negative(FN) = ', FN),
):
    print(caption, count)

In [None]:
# Annotate every cell of the confusion matrix with its name, count and share
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

cell_counts = cf_matrix_dtc.flatten()
cell_shares = cell_counts / np.sum(cf_matrix_dtc)

group_counts = [f'{value:0.0f}' for value in cell_counts]
group_percentages = [f'{value:.2%}' for value in cell_shares]

labels = np.asarray(
    ['\n'.join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2,2)

plt.figure(figsize=(8,4))
sns.heatmap(cf_matrix_dtc, annot=labels, fmt='', cmap='Blues')

In [None]:
pd.Series(y_pred_dtc).value_counts()

In [None]:
pd.Series(y_test).value_counts()

In [None]:
# Extracting the importances by sklearn
importances_dtc = dtc.feature_importances_

# Pair every importance with the name of the training column it belongs to.
# The names come from X_train, the frame the tree was fitted on (the name
# `features` used here before is not defined anywhere in the notebook).
feature_importance_dtc = {
    feature: round(importance, 3)
    for feature, importance in zip(X_train.columns, importances_dtc)
}

print(f"Feature importance by sklearn: {feature_importance_dtc}")

In [None]:
# One row per feature, indexed by feature name, holding its importance
dtc_result = pd.DataFrame(
    {'Importance': list(feature_importance_dtc.values())},
    index=pd.Index(list(feature_importance_dtc.keys()), name='Feature'),
)

# Display from most to least important
dtc_result.sort_values(by='Importance', ascending=False, key=abs)

In [None]:
# Keep only the features with a non-zero importance, smallest first
# (-0 == 0, so a single comparison covers both signs of zero)
dtc_nonzero = dtc_result.loc[dtc_result['Importance'] != 0]
dtc_nonzero = dtc_nonzero.sort_values(by='Importance', key=abs)
dtc_nonzero

In [None]:
plt.rcParams['figure.figsize'] = (10, 20)
dtc_nonzero.plot(kind = 'barh')
plt.title('Feature importance using Decision Tree')

### C. Random Forest

In [None]:
# Create a Random Forest object.
# The seed is fixed so that refitting the forest gives the same model and scores.

rfc = RandomForestClassifier(random_state = 42)

In [None]:
# Fit the Random Forest object onto the training set.

rfc.fit(X_train, y_train)

In [None]:
y_pred_rfc = rfc.predict(X_test)

In [None]:
print('Train Accuracy:', accuracy_score(y_train, rfc.predict(X_train)))

In [None]:
print('Test Accuracy:', '{:.3f}'.format(accuracy_score(y_test, y_pred_rfc)))

In [None]:
cf_matrix_rfc = confusion_matrix(y_test, y_pred_rfc)

In [None]:
# Rows of the 2x2 matrix are the true classes, columns the predicted classes
(TN, FP), (FN, TP) = cf_matrix_rfc

for caption, count in (
    ('True Positive(TP)  = ', TP),
    ('False Positive(FP) = ', FP),
    ('True Negative(TN)  = ', TN),
    ('False Negative(FN) = ', FN),
):
    print(caption, count)

In [None]:
# Annotate every cell of the confusion matrix with its name, count and share
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

cell_counts = cf_matrix_rfc.flatten()
cell_shares = cell_counts / np.sum(cf_matrix_rfc)

group_counts = [f'{value:0.0f}' for value in cell_counts]
group_percentages = [f'{value:.2%}' for value in cell_shares]

labels = np.asarray(
    ['\n'.join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2,2)

plt.figure(figsize=(8,4))
# Keep the returned Axes: its figure is saved to disk at the end of the notebook
conf_mat = sns.heatmap(cf_matrix_rfc, annot=labels, fmt='', cmap='crest')

In [None]:
# Extracting the importances by sklearn
importances_rfc = rfc.feature_importances_

# Pair every importance with the name of the training column it belongs to.
# The names come from X_train, the frame the forest was fitted on (the name
# `features` used here before is not defined anywhere in the notebook).
feature_importance_rfc = {
    feature: round(importance, 3)
    for feature, importance in zip(X_train.columns, importances_rfc)
}

print(f"Feature importance by sklearn: {feature_importance_rfc}")

In [None]:
# One row per feature, indexed by feature name, holding its importance
rfc_result = pd.DataFrame(
    {'Importance': list(feature_importance_rfc.values())},
    index=pd.Index(list(feature_importance_rfc.keys()), name='Feature'),
)

# Display from most to least important
rfc_result.sort_values(by='Importance', ascending=False, key=abs)

In [None]:
# Keep only the features with a non-zero importance, largest first
# (-0 == 0, so a single comparison covers both signs of zero)
rfc_nonzero = rfc_result.loc[rfc_result['Importance'] != 0]
rfc_nonzero = rfc_nonzero.sort_values(by='Importance', key=abs, ascending=False)

# The 20 most important features are the ones plotted below
rfc_nonzeros = rfc_nonzero.head(20)
rfc_nonzeros

In [None]:
values = rfc_nonzeros['Importance']

# Highlight the bar(s) holding the largest importance in red, the rest in green.
# The maximum is computed once instead of once per bar.
top_importance = max(values)
clrs = ['green' if x < top_importance else 'red' for x in values]
plt.rcParams['figure.figsize'] = (18, 8)

# Keep the returned Axes: its figure is saved to disk at the end of the notebook
feat_importance = sns.barplot(y=rfc_nonzeros.index, x=values, palette=clrs)
plt.title('Top 20 Feature Importance Using Random Forest')
plt.show()

In [None]:
conf_mat.figure.savefig(os.path.join(path, '04. Visualizations', 'conf_mat.png'))

In [None]:
feat_importance.figure.savefig(os.path.join(path, '04. Visualizations', 'feature_inportance.png'))

In [None]:
gc.collect()