In [71]:
import pandas as pd
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split, cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [72]:
# load data 
df = pd.read_parquet('processed_data.parquet')

In [73]:
input_cols = ['age_ind', 'district_y', 'crash_hour_ind', 'posted_speed_limit_ind', 'street_direction_ind_S', 'street_direction_ind_N',
 'lighting_condition_ind_DAYLIGHT','crash_day_of_week_ind', 'target']

df = df[input_cols]

In [74]:
# Split the data into X and y
X = df.drop(columns=['target']).to_numpy() 
y = df['target'].to_numpy() 

In [75]:
df['target'].value_counts()

target
1    11937
0     7411
2     3265
Name: count, dtype: int64

Naive Bayes can be extended to handle multi-class classification problems. In this context, it calculates the probability of each class given the features and selects the class with the highest probability as the predicted class.

Logistic Regression can be extended to handle multi-class classification tasks. One common approach is the one-vs-rest (OvR) strategy, where separate binary classifiers are trained for each class. Each classifier is trained to distinguish between one class and the rest. Alternatively, the one-vs-one (OvO) strategy trains a binary classifier for each pair of classes.

SVM: SVM can be adapted to handle multi-class classification using either the one-vs-one (OvO) or one-vs-rest (OvR) strategy. In OvO, a binary classifier is trained for each pair of classes, and the class with the most votes is chosen. In OvR, separate binary classifiers are trained for each class, where each classifier distinguishes between one class and the rest.

Random Forest: Ensemble learning method that combines the strengths of decision trees with randomization to achieve high predictive accuracy and generalization performance.

Gradient Boosting: Builds an ensemble of weak learners, optimizing them using gradient descent to minimize a loss function and achieve strong predictive performance.


In [76]:
from sklearn.naive_bayes import GaussianNB

def naive_bayes_accuracy_per_class(X, y, n_splits=5, n_repeats=2):
    nb_classifier = GaussianNB()

    # Perform cross-validation and get predicted labels for each sample
    y_pred = cross_val_predict(nb_classifier, X, y, cv=n_splits, n_jobs=-1)

    # accuracy for each class
    accuracy_per_class = []
    for class_label in np.unique(y):
        class_indices = np.where(y == class_label)[0]
        class_accuracy = accuracy_score(y[class_indices], y_pred[class_indices])
        accuracy_per_class.append(class_accuracy)

    return accuracy_per_class

# Example:
accuracies = naive_bayes_accuracy_per_class(X, y)
for i, acc in enumerate(accuracies):
    print(f"Accuracy for class {i}: {acc:.4f}")


Accuracy for class 0: 0.1753
Accuracy for class 1: 0.9079
Accuracy for class 2: 0.0021


In [79]:
from sklearn.linear_model import LogisticRegression

def logistic_regression_accuracy_per_class(X, y, n_splits=5, n_repeats=2):
    lr_classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs')

    # cross-validation and get predicted labels for each sample
    y_pred = cross_val_predict(lr_classifier, X, y, cv=n_splits, n_jobs=-1)

    # accuracy for each class
    accuracy_per_class = []
    for class_label in np.unique(y):
        class_indices = np.where(y == class_label)[0]
        class_accuracy = accuracy_score(y[class_indices], y_pred[class_indices])
        accuracy_per_class.append(class_accuracy)

    return accuracy_per_class

# Example:
accuracies = logistic_regression_accuracy_per_class(X, y)
for i, acc in enumerate(accuracies):
    print(f"Accuracy for class {i}: {acc:.4f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy for class 0: 0.0734
Accuracy for class 1: 0.9778
Accuracy for class 2: 0.0000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [80]:
from sklearn.svm import SVC

def svm_accuracy_per_class(X, y, n_splits=5, n_repeats=2):
    svm_classifier = SVC()

    # cross-validation and get predicted labels for each sample
    y_pred = cross_val_predict(svm_classifier, X, y, cv=n_splits, n_jobs=-1)

    # accuracy for each class
    accuracy_per_class = []
    for class_label in np.unique(y):
        class_indices = np.where(y == class_label)[0]
        class_accuracy = accuracy_score(y[class_indices], y_pred[class_indices])
        accuracy_per_class.append(class_accuracy)

    return accuracy_per_class

# Example:
accuracies = svm_accuracy_per_class(X, y)
for i, acc in enumerate(accuracies):
    print(f"Accuracy for class {i}: {acc:.4f}")


Accuracy for class 0: 0.2154
Accuracy for class 1: 0.9261
Accuracy for class 2: 0.0000


In [81]:
from sklearn.ensemble import RandomForestClassifier

def random_forest_accuracy_per_class(X, y, n_splits=5, n_repeats=2):
    rf_classifier = RandomForestClassifier()

    # cross-validation and get predicted labels for each sample
    y_pred = cross_val_predict(rf_classifier, X, y, cv=n_splits, n_jobs=-1)

    # accuracy for each class
    accuracy_per_class = []
    for class_label in np.unique(y):
        class_indices = np.where(y == class_label)[0]
        class_accuracy = accuracy_score(y[class_indices], y_pred[class_indices])
        accuracy_per_class.append(class_accuracy)

    return accuracy_per_class

# Example:
accuracies = random_forest_accuracy_per_class(X, y)
for i, acc in enumerate(accuracies):
    print(f"Accuracy for class {i}: {acc:.4f}")


Accuracy for class 0: 0.3330
Accuracy for class 1: 0.6923
Accuracy for class 2: 0.0833


In [82]:
from sklearn.ensemble import GradientBoostingClassifier

def gradient_boosting_accuracy_per_class(X, y, n_splits=5, n_repeats=2):
    gb_classifier = GradientBoostingClassifier()

    # cross-validation and predicted class for each sample
    y_pred = cross_val_predict(gb_classifier, X, y, cv=n_splits, n_jobs=-1)

    # accuracy for each class
    accuracy_per_class = []
    for class_label in np.unique(y):
        class_indices = np.where(y == class_label)[0]
        class_accuracy = accuracy_score(y[class_indices], y_pred[class_indices])
        accuracy_per_class.append(class_accuracy)

    return accuracy_per_class

# Example:
accuracies = gradient_boosting_accuracy_per_class(X, y)
for i, acc in enumerate(accuracies):
    print(f"Accuracy for class {i}: {acc:.4f}")


Accuracy for class 0: 0.2086
Accuracy for class 1: 0.9332
Accuracy for class 2: 0.0089
