In [9]:
import logging, os

import matplotlib.pyplot as plt

# Suppress Python `logging` records of severity WARNING and below for the whole process.
logging.disable(logging.WARNING)
# Set ahead of the `import tensorflow` that follows so the setting is in the
# environment when TensorFlow loads; "1" is meant to hide its native INFO-level messages.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers, optimizers
from sklearn.model_selection import train_test_split
from datasets import data_all as dataframe
from helpers import df_to_dataset

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import plot_tree

In [10]:
# Configure pandas so wide frames are printed across the whole terminal width.
pd.options.display.max_columns = 10
pd.set_option('display.width', 200)

# Columns removed before modelling. 'SumBikerNumber' was listed twice in the
# original drop list (copy-paste slip); each label only needs to appear once.
dropped_cols = ['Date', 'SumBikerNumber', 'SumCars', 'SumPedestrianNumber']

# Shorten feature names just for a convenient output format.
new_cols = {
    "AccidentSeverityCategory": "Severity",
    "AccidentType": "AccType",
    "AvgTemperature": "Temperature",
    "AvgRainDur": "RainDur",
    "AccidentInvolvingPedestrian": "Pedestrian",
    "AccidentInvolvingBicycle": "Bicycle",
    "AccidentInvolvingMotorcycle": "Motorcycle",
    "AccidentLocation_CHLV95_E": "LocationE",
    "AccidentLocation_CHLV95_N": "LocationN",
}

# Drop and rename in one chained expression instead of `inplace=True`, so the
# frame is only ever rebound, never mutated halfway through the cell.
# NOTE: re-running this cell without re-running the import cell raises a
# KeyError, because the dropped columns are already gone from `dataframe`.
dataframe = dataframe.drop(columns=dropped_cols).rename(columns=new_cols)

# Recode RoadType 9 as 5 so the codes form a contiguous range.
# NOTE(review): presumably 9 is the "Other" category -- confirm against the dataset docs.
dataframe.loc[dataframe['RoadType'] == 9, 'RoadType'] = 5

In [13]:
# all features: ['AccType', 'Severity', 'Pedestrian', 'Bicycle', 'Motorcycle', 'RoadType','LocationE', 'LocationN', 'Temperature', 'RainDur']
# The location columns are deliberately left out of this experiment.
df = dataframe[['AccType', 'Severity', 'Pedestrian', 'Bicycle', 'Motorcycle', 'RoadType', 'Temperature', 'RainDur']]

# RoadType is the prediction target; every other selected column is an input.
X = df.drop(columns='RoadType')
y = df['RoadType']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Seed the tree as well as the split: scikit-learn permutes candidate features
# at every split, so an unseeded tree can differ from run to run.
dtc = DecisionTreeClassifier(random_state=1)
dtc.fit(X_train, y_train)

dtc_y_predict = dtc.predict(X_test)

# Keep and show the accuracy; as a bare mid-cell expression it was silently discarded.
dtc_accuracy = accuracy_score(y_test, dtc_y_predict)
print(f"Decision tree test accuracy: {dtc_accuracy:.4f}")

# Human-readable names for the road-type codes, in ascending code order.
# NOTE(review): assumes the sorted RoadType codes map to these names in this order -- confirm.
road_type_names = ['Motorway', 'Expressway', 'Principal road', 'Minor road',
                   'Motorway side installation', 'Other']
road_type_codes = sorted(y.unique())
if len(road_type_codes) != len(road_type_names):
    raise ValueError(
        f"Expected {len(road_type_names)} RoadType classes, found {len(road_type_codes)}: {road_type_codes}"
    )

# Pass `labels` explicitly so the matrix always has one row/column per class,
# even when a rare class is absent from both the test split and the predictions.
confusion = pd.DataFrame(
    confusion_matrix(y_test, dtc_y_predict, labels=road_type_codes),
    columns=[f'Predicted {name}' for name in road_type_names],
    index=[f'True {name}' for name in road_type_names],
)

In [14]:
# Emit the decision tree's confusion matrix as plain text.
print(confusion)

# One row per input column, holding the importance the fitted tree assigned to it.
fi_dtc = pd.DataFrame({'importance': dtc.feature_importances_}, index=X.columns.tolist())

# Rank features from most to least important; the bare name displays the table.
fi_dtc_sorted = fi_dtc.sort_values(by='importance', ascending=False)
fi_dtc_sorted


                                 Predicted Motorway  Predicted Expressway  Predicted Principal road  Predicted Minor road  Predicted Motorway side installation  Predicted Other
True Motorway                                    58                     1                       234                   346                                     0                5
True Expressway                                   6                     0                        23                    19                                     0                0
True Principal road                             138                     6                       941                  1730                                     1               27
True Minor road                                 202                    20                      1558                  5758                                     1              115
True Motorway side installation                   0                     0                         0                

Unnamed: 0,importance
Temperature,0.529859
RainDur,0.221966
AccType,0.175218
Severity,0.03261
Motorcycle,0.018968
Bicycle,0.016089
Pedestrian,0.00529
