In [None]:
#!pip install lightgbm

In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, average_precision_score

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier, LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
def create_folder(folder_path):
    """
    Check whether the specified path exists or not
    Create a new directory if it does not exist 
    """
    isExist = os.path.exists(folder_path)
    if not isExist:
        os.makedirs(folder_path)
path = "./results/3_machine_learning"

In [15]:
class Classifier:
    
    def __init__(self, name, clf, X_train, y_train, X_test, y_test):
        self.clf = clf
        self.name = name
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test        
    
    def run_classification(self):
        self.clf.fit(self.X_train, self.y_train)
        self.y_pred = self.clf.predict(self.X_test)
        self.report = pd.DataFrame(classification_report(self.y_test, self.y_pred, output_dict=True))
        self.cf_matrix = confusion_matrix(self.y_test, self.y_pred)
        self.accuracy = accuracy_score(self.y_test, self.y_pred)
        print(self.cf_matrix)
        print(self.report)
        print("Accuracy: %.2f%%" % (self.accuracy * 100.0))
        self.results = {
            "classifier": self.name,
            "average precision": average_precision_score(self.y_test, self.y_pred),
            "precision": precision_score(self.y_test, self.y_pred),
            "recall": recall_score(self.y_test, self.y_pred),
            "accuracy": accuracy_score(self.y_test, self.y_pred),            
            "f1 score": f1_score(self.y_test, self.y_pred),
            "roc_auc": roc_auc_score(self.y_test, self.y_pred)
        }
        self.save_results()
    
    def save_results(self):
        folder_path = "./results/3_machine_learning/{}/{}".format("testing_models", self.name)
        create_folder(folder_path)
        self.report = pd.DataFrame(classification_report(self.y_test, self.y_pred, output_dict=True))
        self.report.to_csv(folder_path + "classification_report.csv")
        #pd.DataFrame(self.y_pred).to_csv(folder_path + "y_pred.csv")
        sns_plot = sns.heatmap(self.cf_matrix/np.sum(self.cf_matrix), annot=True, fmt='.2%', cmap='Blues')
        fig = sns_plot.get_figure()
        fig.savefig("./results/3_machine_learning/{}/{}/cf_matrix.png".format("testing_models", self.name))
        fig.clf()

In [21]:
#df = pd.read_csv("./results/0_data_cleaning/dataset.csv")
df =  pd.read_csv("./results/0_data_cleaning/train_dataset.csv")

X = df.drop('accident_severity',axis=1)
y = df['accident_severity'].replace(['Fatal'], 'Serious')
grp = df.groupby("accident_severity")
df_serious = grp.get_group("Serious")
df_slight = grp.get_group("Slight")
print("Serious", df_serious.shape)
print("Slight", df_slight.shape)
y_encoded = LabelEncoder().fit_transform(y)
X_encoded = X.copy()
for col in X.columns:
    if X[col].dtype == np.dtype('O'):
        X_encoded[col] = LabelEncoder().fit_transform(X[col])
    if X[col].dtype == np.dtype('int64') or X[col].dtype == np.dtype('float64'):
        X_encoded[col] = StandardScaler().fit_transform(X[col].values.reshape(-1,1))

        
#Serious (24693, 14)
#Slight (45000, 14)
#df = pd.concat([df_serious.sample(20000), df_slight.sample(20000)])
print(df.shape)

0        Serious
1        Serious
2        Serious
3        Serious
4        Serious
          ...   
96057    Serious
96058    Serious
96059    Serious
96060    Serious
96061    Serious
Name: accident_severity, Length: 96062, dtype: object
Serious (24693, 14)
Slight (45000, 14)
[0 0 0 ... 0 0 0]
(96062, 14)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=1, stratify=y_encoded)

In [18]:
estimator = RandomForestClassifier(bootstrap=True,
                class_weight="balanced_subsample", 
                criterion='gini',
                max_depth=8, max_features='auto', max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=4, min_samples_split=10,
                min_weight_fraction_leaf=0.0, n_estimators=300,
                oob_score=False,
                random_state=35,
                verbose=0, warm_start=False)

In [19]:
clf = Classifier("RF", RandomForestClassifier(bootstrap=True,
                class_weight="balanced_subsample", 
                criterion='gini',
                max_depth=8, max_features='auto', max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=4, min_samples_split=10,
                min_weight_fraction_leaf=0.0, n_estimators=300,
                oob_score=False,
                random_state=35,
                verbose=0, warm_start=False), X_train, y_train, X_test, y_test)


In [20]:
clf.run_classification()

[[5776 4437]
 [2759 6241]]
                      0            1  accuracy     macro avg  weighted avg
precision      0.676743     0.584473  0.625462      0.630608      0.633520
recall         0.565554     0.693444  0.625462      0.629499      0.625462
f1-score       0.616172     0.634312  0.625462      0.625242      0.624670
support    10213.000000  9000.000000  0.625462  19213.000000  19213.000000
Accuracy: 62.55%


<Figure size 432x288 with 0 Axes>

In [13]:



models = {
    "KNC": KNeighborsClassifier(2),
    "GradBoost": GradientBoostingClassifier(n_estimators=100, learning_rate=1.0),
    "LR": LogisticRegression(),
    "XGBoost": XGBClassifier(),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "DT": DecisionTreeClassifier(),
    "NB": GaussianNB(),
    "SGD": SGDClassifier(loss="hinge", penalty="l2"),
    "LGBM": LGBMClassifier(max_depth=10, num_leaves=246, n_estimators=380, min_data_in_leaf=20),
    "SVM": LinearSVC(),
    "RF": RandomForestClassifier(bootstrap=True,
                class_weight="balanced_subsample", 
                criterion='gini',
                max_depth=8, max_features='auto', max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=4, min_samples_split=10,
                min_weight_fraction_leaf=0.0, n_estimators=300,
                oob_score=False,
                random_state=35,
                verbose=0, warm_start=False),
    "MLP": MLPClassifier(alpha=1, max_iter=1000)
}

        

fitted_classifiers = {}
results = []
for name, clf in models.items():
    if name == "RF":
        clf = Classifier(name, clf, X_train, y_train, X_test, y_test)
        clf.run_classification()
        clf.save_results()
        results.append(clf.results)
        fitted_classifiers[name] = clf

df_results = pd.DataFrame(results)
df_results

[[5776 4437]
 [2759 6241]]
                      0            1  accuracy     macro avg  weighted avg
precision      0.676743     0.584473  0.625462      0.630608      0.633520
recall         0.565554     0.693444  0.625462      0.629499      0.625462
f1-score       0.616172     0.634312  0.625462      0.625242      0.624670
support    10213.000000  9000.000000  0.625462  19213.000000  19213.000000
Accuracy: 62.55%


Unnamed: 0,classifier,average precision,precision,recall,accuracy,f1 score,roc_auc
0,RF,0.5489,0.584473,0.693444,0.625462,0.634312,0.629499


In [None]:
df_results.to_csv(path + "ML_classifiers_comparison.csv")

In [None]:
df_results

In [None]:
feats = 
feature_importances = pd.DataFrame([feats])
feature_importances.T.sort_values(by=[0], ascending=False)

In [None]:
RF = models["RF"]

def get_model():
    return RF

def get_hyperparams():
    return {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

In [None]:
from skopt import BayesSearchCV

search_parameters = {
    "cv": 3,
    "verbose": 0,
    "scoring": 'f1_macro',
    "n_iter": 20,
    "random_state": 42
}
bs = BayesSearchCV(get_model(), get_hyperparams(), **search_parameters)
bs.fit(X_train, y_train)

In [None]:
print(bs.best_params_)
best = bs.best_estimator_
best.fit(X_train, y_train)
y_pred = best.predict(X_test)

In [None]:
classification_report(y_pred, y_test)