In [1]:
def str_to_num_weekday(weekday: str):
    """Translate an English weekday name into its 1-based position in the week.

    'Monday' maps to 1 and 'Sunday' maps to 7. The comparison is exact
    (case-sensitive, no trimming); any value that is not one of the seven
    names yields None.
    """
    ordered_names = (
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    )
    for position, name in enumerate(ordered_names, start=1):
        if weekday == name:
            return position
    return None
        
# Integer code for each traffic-situation label; the position in the tuple
# is the code, so insertion order of the dict follows the codes 0..3.
traffic_situation_map = {
    label: code
    for code, label in enumerate(('low', 'normal', 'heavy', 'high'))
}

In [2]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import datetime

# Load the raw records. 'Date' is read as an integer (it is used below as the
# day of the month) and 'Traffic Situation' as a pandas string column.
data = pd.read_csv('traffic.csv', dtype={'Date': 'int', 'Traffic Situation': 'string'})

# Transform str 'Day of the week' to int to make it ordinal
data['Day of the week'] = data['Day of the week'].map(str_to_num_weekday)


def _row_to_timestamp(row):
    """Build a Timestamp from a row's day-of-month ('Date') and 12-hour 'Time' string.

    The year and month are fixed to January 2023. Only the hour and minute
    of 'Time' are used; the seconds component is parsed but dropped.
    """
    # Parse the clock string once and reuse it for both hour and minute.
    clock = datetime.datetime.strptime(row['Time'], '%I:%M:%S %p')
    return pd.Timestamp(
        year=2023,
        month=1,
        day=row['Date'],
        hour=clock.hour,
        minute=clock.minute,
    )


# transform Date and Time to Timestamp
data['Date'] = data.apply(_row_to_timestamp, axis=1)
data = data.drop(['Time'], axis='columns')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2976 entries, 0 to 2975
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               2976 non-null   datetime64[ns]
 1   Day of the week    2976 non-null   int64         
 2   CarCount           2976 non-null   int64         
 3   BikeCount          2976 non-null   int64         
 4   BusCount           2976 non-null   int64         
 5   TruckCount         2976 non-null   int64         
 6   Total              2976 non-null   int64         
 7   Traffic Situation  2976 non-null   string        
dtypes: datetime64[ns](1), int64(6), string(1)
memory usage: 186.1 KB


In [3]:
# Build the modelling frame as a new DataFrame instead of aliasing `data`,
# so the frame shown by the previous cell is not mutated behind the reader's back.
# 'Date' is converted to its integer representation so the estimators can use it.
X = data.assign(Date=pd.to_numeric(pd.to_datetime(data['Date'])))

# Features are every column except the target; the target is encoded with
# traffic_situation_map. A fixed random_state makes the 80/20 split reproducible
# across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop('Traffic Situation', axis='columns'),
    X['Traffic Situation'].map(traffic_situation_map),
    train_size=0.8,
    random_state=0,
)

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import dtreeviz

def gen_classifiers(tree_depth_zip, tc_type: str):
    """Visualise and evaluate every fitted tree in ``tree_depth_zip``.

    For each ``(clf, max_depth)`` pair this function:

    1. tries to render the tree with dtreeviz and save it to
       ``.<tc_type>/depth_<max_depth>.svg`` (best effort: a failure is
       reported as a warning and the remaining steps still run);
    2. draws the tree with ``plot_tree`` on a new 10x5 figure;
    3. plots the confusion matrix of ``clf`` on the test split, with the
       accuracy in the title.

    The function reads the notebook globals ``X_train``, ``y_train``,
    ``X_test``, ``y_test`` and ``traffic_situation_map``.

    Parameters
    ----------
    tree_depth_zip : iterable of (fitted tree estimator, max_depth) pairs.
    tc_type : label of the classifier family; used in the output path and
        in the plot title.
    """
    import warnings

    # Name only the columns the model was trained on. The full frame also
    # contains the target column, which is not a feature.
    feature_names = list(X_train.columns)
    class_names = list(traffic_situation_map.keys())
    # Codes in the same order as class_names, so matrix rows/columns and
    # their display labels always line up, even if a class is absent.
    class_codes = list(traffic_situation_map.values())

    for clf, max_depth in tree_depth_zip:
        try:
            viz_model = dtreeviz.model(clf,
                                       X_train=X_train, y_train=y_train,
                                       feature_names=feature_names,
                                       target_name='Traffic Situation',
                                       class_names=class_names)

            v = viz_model.view()
            # NOTE(review): the target directory is '.<tc_type>' (no slash after
            # the dot) and is not created here; confirm './<tc_type>/' was not intended.
            v.save(".%s/depth_%d.svg" % (tc_type, max_depth))
        except Exception as err:
            # The dtreeviz export is optional; report the problem instead of hiding it.
            warnings.warn(
                "dtreeviz export skipped for %s (max depth %s): %r" % (tc_type, max_depth, err)
            )

        _, tree_ax = plt.subplots(figsize=(10, 5))
        plot_tree(clf, feature_names=feature_names, class_names=class_names,
                  filled=True, ax=tree_ax)

        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        disp = ConfusionMatrixDisplay(
            confusion_matrix=confusion_matrix(y_test, predictions, labels=class_codes),
            display_labels=class_names
        )
        disp.plot()
        disp.ax_.set_title(
            'Confusion matrix for %s with max depth: %s, and accuracy: %s'
            % (tc_type, max_depth, accuracy)
        )
        
# Tree depths compared for both model families.
MAX_DEPTHS = range(3, 6)

# Single decision trees, one per depth. The seed matches the one used for the
# forests below so the fitted trees are reproducible between runs.
gen_classifiers(
    [
        (DecisionTreeClassifier(max_depth=max_depth, random_state=0).fit(X_train, y_train), max_depth)
        for max_depth in MAX_DEPTHS
    ],
    'DecisionTreeClassifier'
)

# Random forests, one per depth; every individual tree of each forest is
# visualised and evaluated on its own.
rfs = [
    (RandomForestClassifier(n_estimators=100, random_state=0, max_depth=max_depth).fit(X_train, y_train), max_depth)
    for max_depth in MAX_DEPTHS
]
for rfc, max_depth in rfs:
    gen_classifiers(
        [(estimator, max_depth) for estimator in rfc.estimators_],
        'RandomForestClassifier'
    )

