<a href="https://colab.research.google.com/github/christinesako-berk/ds_207_final_project/blob/main/baseline_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline Multiclass Logistic Regression Model

## Christine Sako

## Importing Libraries

In [1]:

!pip install keras_tuner -q

import pandas as pd
import math
import numpy as np
import numpy.linalg as nla
import pandas as pd
import seaborn as sns
import re
import six
from os.path import join
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from keras_tuner import HyperParameters
# Apply seaborn's dark-grid style to the figures drawn in this notebook
sns.set(style="darkgrid")

# Silence every warning category for cleaner cell output.
# NOTE(review): this is a blanket filter, so warnings raised by later cells are not shown.
import warnings
warnings.filterwarnings('ignore')

# Setting default plot params: title/label font sizes and padding on axes
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.titlepad'] = 20
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.labelpad'] = 10

# Show all results: lift the pandas display limits on columns, width, and rows
for display_option in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(display_option, None)

## Data Ingestion

In [2]:
# Read the training, validation, and test CSVs from a single data directory
DATA_DIR = '/content/drive/MyDrive/SUMMER 2025/DATASCI 207/Final Project/data'

train_file_path = join(DATA_DIR, 'train_final.csv')
val_file_path = join(DATA_DIR, 'val_final.csv')
test_file_path = join(DATA_DIR, 'test_final.csv')

# The files are comma-separated, which is read_csv's default delimiter
train_df = pd.read_csv(train_file_path)
val_df = pd.read_csv(val_file_path)
test_df = pd.read_csv(test_file_path)

In [3]:
# Report the (rows, columns) shape of each loaded dataset
for split_name, split_df in (('train_df', train_df), ('val_df', val_df), ('test_df', test_df)):
    print(f"The shape of `{split_name}` is: {split_df.shape}")

The shape of `train_df` is: (218338, 458)
The shape of `val_df` is: (54585, 458)
The shape of `test_df` is: (103669, 458)


## Splitting Features of Interest from Outcome Variable in Datasets

In [4]:
# Column holding the outcome; every other column is treated as a feature
target_col = 'ExtentOfInjuryCode'

# Training split: outcome series and feature frame
y_train = train_df[target_col]
x_train = train_df.drop(columns=[target_col])

# Validation split: outcome series and feature frame
y_val = val_df[target_col]
x_val = val_df.drop(columns=[target_col])

# Test split: outcome series and feature frame
y_test = test_df[target_col]
x_test = test_df.drop(columns=[target_col])


## Encode Outcome Variable for Multiclass Classification

In [5]:
# Fixed mapping from injury label text to integer class id
injury_mapping = {
    'No Injury': 0,
    'Minor': 1,
    'Serious': 2,
    'Fatal': 3
}


def _encode_injury(labels, split_name):
    """Map injury labels to class ids using ``injury_mapping``.

    ``Series.map`` with a dict turns every value that is not a key of the
    dict (including missing values) into NaN without raising. This helper
    makes that case an explicit error instead of letting NaN targets flow
    into model fitting and metrics.

    Args:
        labels: pandas Series of injury label values.
        split_name: Name of the split, used only in the error message.

    Returns:
        pandas Series of class ids aligned with ``labels``.

    Raises:
        ValueError: If any value in ``labels`` has no entry in ``injury_mapping``.
    """
    encoded = labels.map(injury_mapping)
    unmapped = labels[encoded.isna()]
    if not unmapped.empty:
        raise ValueError(
            f"`{split_name}` contains labels missing from injury_mapping: "
            f"{unmapped.unique().tolist()}"
        )
    return encoded


# Transforming all datasets with the same fixed mapping
y_train_encoded = _encode_injury(y_train, 'y_train')
y_val_encoded = _encode_injury(y_val, 'y_val')
y_test_encoded = _encode_injury(y_test, 'y_test')

# Printing shapes of all split data:

print("The shape of `x_train` is:", x_train.shape)
print("The shape of `x_val` is:", x_val.shape)
print("The shape of `x_test` is:", x_test.shape)

print("The shape of `y_train_encoded` is:", y_train_encoded.shape)
print("The shape of `y_val_encoded` is:", y_val_encoded.shape)
print("The shape of `y_test_encoded` is:", y_test_encoded.shape)

The shape of `x_train` is: (218338, 457)
The shape of `x_val` is: (54585, 457)
The shape of `x_test` is: (103669, 457)
The shape of `y_train_encoded` is: (218338,)
The shape of `y_val_encoded` is: (54585,)
The shape of `y_test_encoded` is: (103669,)


## Creating Logistic Regression Baseline



In [6]:
# Initializing multiclass logistic regression with balanced class weights.
# `multi_class` is deliberately not passed: the argument is deprecated as of
# scikit-learn 1.5, and with the default solver a multinomial model is already
# what gets fit when the target has three or more classes.
logreg = LogisticRegression(
    max_iter=500,
    class_weight='balanced',
    random_state=42,
)

# Fitting model to training data (last expression, so the fitted estimator is displayed)
logreg.fit(x_train, y_train_encoded)

## Evaluation Results

In [10]:
# Class ids and their display names in matching order, both taken from the
# encoding map, so report rows do not depend on dict insertion order.
class_ids = sorted(injury_mapping.values())
class_names = sorted(injury_mapping, key=injury_mapping.get)


def _report_split(accuracy_headline, report_headline, y_true, y_pred):
    """Print accuracy and the per-class classification report for one split.

    Args:
        accuracy_headline: Text printed before the accuracy value.
        report_headline: Text printed before the classification report.
        y_true: Encoded ground-truth class ids.
        y_pred: Predicted class ids.

    Returns:
        The accuracy score for the split.
    """
    accuracy = accuracy_score(y_true, y_pred)
    print(f"{accuracy_headline} {accuracy:.4f}")
    print(report_headline)
    # Explicit `labels` keeps the report well-defined even if a split lacks a class
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))
    return accuracy


# Creating predictions from training data
y_train_pred = logreg.predict(x_train)

# Creating predictions from validation data
y_val_pred = logreg.predict(x_val)

# Creating predictions from test data
y_test_pred = logreg.predict(x_test)

# Headline text below is kept exactly as previously printed.

# Evaluating training performance
train_accuracy = _report_split(
    "The training acccuracy is:",
    "The full training classification report is:",
    y_train_encoded,
    y_train_pred,
)

# Evaluating validation performance
print("\n-------\n")
val_accuracy = _report_split(
    "The validation acccuracy is:",
    "The full validation classification report is:",
    y_val_encoded,
    y_val_pred,
)

# Evaluating test performance
print("\n-------\n")
test_accuracy = _report_split(
    "The test accuracy is:",
    "The full test classification report is:",
    y_test_encoded,
    y_test_pred,
)

The training acccuracy is: 0.6134
The full training classification report is:
              precision    recall  f1-score   support

   No Injury       0.79      0.72      0.75    134059
       Minor       0.56      0.44      0.49     76242
     Serious       0.11      0.46      0.18      6385
       Fatal       0.12      0.73      0.20      1652

    accuracy                           0.61    218338
   macro avg       0.39      0.59      0.41    218338
weighted avg       0.68      0.61      0.64    218338


-------

The validation acccuracy is: 0.6122
The full validation classification report is:
              precision    recall  f1-score   support

   No Injury       0.79      0.72      0.75     33637
       Minor       0.55      0.44      0.49     18986
     Serious       0.11      0.45      0.18      1562
       Fatal       0.11      0.71      0.19       400

    accuracy                           0.61     54585
   macro avg       0.39      0.58      0.40     54585
weighted avg   