In [0]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
import matplotlib.pyplot as plt
from sklearn.feature_selection import chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import random

def hourToPeriod(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Args:
        hour: Hour in 24-hour form (0-23). Anything ``int()`` accepts
            works, e.g. ``7`` or ``"7"``.

    Returns:
        ``"overnight"`` for 0-5, ``"morning"`` for 6-11, ``"afternoon"`` for
        12-17 and ``"evening"`` for 18-23. ``"error"`` is returned for any
        value that is out of range or cannot be converted to an integer
        (``None``, NaN, empty or non-numeric strings).
    """
    # Convert once. A missing or malformed cell must not abort an entire
    # ``Series.map`` call, so it is reported through the "error" label.
    try:
        hour = int(hour)
    except (TypeError, ValueError, OverflowError):
        return "error"

    if 0 <= hour < 6:
        return "overnight"
    if 6 <= hour < 12:
        return "morning"
    if 12 <= hour < 18:
        return "afternoon"
    if 18 <= hour <= 23:
        return "evening"
    return "error"
    
def dayToDayType(day):
    """Classify a ``CRASH_DAY_OF_WEEK`` value as "weekday" or "weekend".

    The Chicago "Traffic Crashes - Crashes" dataset numbers the days from
    Sunday=1 through Saturday=7 (per the dataset's published column
    description), so the weekend is the two ends of that range.

    Args:
        day: Day-of-week code. Anything ``int()`` accepts works, e.g. ``7``
            or ``"7"``.

    Returns:
        ``"weekend"`` for 1 (Sunday) and 7 (Saturday); ``"weekday"`` for
        every other value.

    Raises:
        TypeError, ValueError: If ``day`` cannot be converted with ``int()``.
    """
    if int(day) in (1, 7):
        return "weekend"
    return "weekday"

# Read the truncated crash extract from DBFS with Spark (header row supplies
# the column names, no schema inference requested) and hand it to pandas.
crash_csv_path = "dbfs:/FileStore/shared_uploads/luke.couture@ucalgary.ca/Traffic_Crashes_Truncated-5.csv"
data = (
    spark.read.format("csv")
    .option("header", "true")
    .load(crash_csv_path)
    .toPandas()
)

# Alternative input: the full export, down-sampled to 10,000 rows.
# data_df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/luke.couture@ucalgary.ca/Traffic_Crashes___Crashes.csv").toPandas()
# data = data_df.sample(10000, replace=False)

# Columns that are one-hot encoded and used as model inputs.
category_column_names = [
    'POSTED_SPEED_LIMIT',
    'TRAFFIC_CONTROL_DEVICE',
    'DEVICE_CONDITION',
    'WEATHER_CONDITION',
    'TRAFFICWAY_TYPE',
    'ALIGNMENT',
    'ROADWAY_SURFACE_COND',
    'ROAD_DEFECT',
    'STREET_DIRECTION',
    'CRASH_DAY_OF_WEEK',
    'CRASH_MONTH',
    'CRASH_HOUR',
]

# Numeric columns for the alternative (currently disabled) feature set.
# number_column_names = [
#     'NUM_UNITS',
#     'INJURIES_TOTAL',
#     'INJURIES_FATAL',
#     'INJURIES_INCAPACITATING',
#     'INJURIES_NON_INCAPACITATING',
#     'INJURIES_REPORTED_NOT_EVIDENT',
#     'INJURIES_NO_INDICATION',
# ]

# Take the categorical predictors and replace the raw hour with its
# time-of-day period; `data` itself is left unmodified.
df = data[category_column_names].assign(
    CRASH_HOUR=data['CRASH_HOUR'].map(hourToPeriod)
)
# df['CRASH_DAY_OF_WEEK'] = data['CRASH_DAY_OF_WEEK'].map(dayToDayType)

# One-hot encode every predictor, dropping the first level of each variable.
category_columns_df = pd.get_dummies(
    df,
    columns=category_column_names,
    drop_first=True,
)

# Disabled alternative: add scaled numeric columns and predict the crash period.
# number_columns_df = data.loc[:,number_column_names].fillna(0)
# joined_df = category_columns_df.join(number_columns_df)
# scaler = StandardScaler()
# joined_df[number_column_names] = scaler.fit_transform(joined_df[number_column_names])
# X = joined_df
# y = data['CRASH_HOUR'].map(hourToPeriod)

# Target is the crash type; features are the encoded categorical columns.
y = data['CRASH_TYPE']
X = category_columns_df

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Multi-layer perceptron with hidden layers of 2 and 4 units, fitted with L-BFGS.
mlp = MLPClassifier(
    hidden_layer_sizes=(2, 4),
    solver='lbfgs',
    alpha=1e-05,
    max_iter=10000,
    random_state=1,
)
mlp.fit(X_train, y_train)

# Score the fitted network on the held-out rows.
y_pred = mlp.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 summary.
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

# scores, pvalues = chi2(X, y)
# for idx, p in enumerate(pvalues):
#     if (p > 0.05):
#         print(f"{X.columns[idx]} {p}")
#     else:
#         print(f"\t\t{X.columns[idx]} {p}")

# Raw (un-normalized) confusion matrix for the current predictions:
# rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)
# NOTE: this changes NumPy's print precision for the rest of the session.
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)

# Baseline k-nearest-neighbours classifier with k fixed at 3.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Evaluate the baseline on the held-out rows.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

# Choose k for KNN (Manhattan distance, p=1) by 5-fold cross-validation on the
# training split only. The held-out test split is deliberately not consulted
# here: selecting k from test-set metrics, or cross-validating over data that
# contains the test rows, would bias the scores reported for the final model.
POSITIVE_CLASS = 'INJURY AND / OR TOW DUE TO CRASH'


def _selection_score(estimator, X_fold, y_fold):
    """Return one fold's blend of accuracy and positive-class recall (weighted 2:1)."""
    fold_pred = estimator.predict(X_fold)
    fold_accuracy = accuracy_score(y_fold, fold_pred)
    fold_recall = recall_score(y_fold, fold_pred, pos_label=POSITIVE_CLASS)
    return (2 * fold_accuracy + fold_recall) / 3


k_values = list(range(1, 31))
scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k, p=1)
    # cross_val_score clones and fits the estimator per fold, then calls the
    # scorer as scorer(estimator, X, y) on that fold's validation rows.
    fold_scores = cross_val_score(knn, X_train, y_train, cv=5, scoring=_selection_score)
    scores.append(np.mean(fold_scores))

# First k with the highest mean cross-validated score.
best_index = np.argmax(scores)
best_k = k_values[best_index]

print(f'Best K: {best_k}')

# Refit KNN with the selected k (Manhattan distance) and score it on the test split.
knn = KNeighborsClassifier(n_neighbors=best_k, p=1).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Precision and recall are reported for the injury/tow class.
injury_label = 'INJURY AND / OR TOW DUE TO CRASH'
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=injury_label)
recall = recall_score(y_test, y_pred, pos_label=injury_label)

print("Accuracy:", accuracy)
# print("Precision:", precision)
# print("Recall:", recall)

# Per-class precision / recall / F1 summary.
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Accuracy: 0.68
Classification Report:
                                   precision    recall  f1-score   support

INJURY AND / OR TOW DUE TO CRASH       0.22      0.03      0.06        60
          NO INJURY / DRIVE AWAY       0.70      0.95      0.80       140

                        accuracy                           0.68       200
                       macro avg       0.46      0.49      0.43       200
                    weighted avg       0.55      0.68      0.58       200

Confusion matrix, without normalization
[[  2  58]
 [  7 133]]
Accuracy: 0.69
Classification Report:
                                   precision    recall  f1-score   support

INJURY AND / OR TOW DUE TO CRASH       0.47      0.32      0.38        60
          NO INJURY / DRIVE AWAY       0.74      0.85      0.79       140

                        accuracy                           0.69       200
                       macro avg       0.61      0.58      0.59       200
                    weighted avg       0