<a href="https://colab.research.google.com/github/christinesako-berk/ds_207_final_project/blob/christine/processing_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-processing on 2024 Datasets (For EDA)
## Christine Sako

## Connecting to GitHub

In [None]:

import os
# from google.colab import userdata

# # Github access
# github_token = userdata.get('GITHUB_TOKEN')

# repo_owner = 'christinesako-berk'
# repo_name_full = 'ds_207_final_project'
# repo_url = f'https://{github_token}@github.com/{repo_owner}/{repo_name_full}.git'

# # Clone repo
# !git clone {repo_url}

# # Change directory
# # os.chdir(repo_name_full)

## Importing Libraries

In [None]:
!pip install keras_tuner -q

import pandas as pd
import math
import numpy as np
import numpy.linalg as nla
import pandas as pd
import seaborn as sns
import re
import six
from os.path import join
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score
from keras_tuner import HyperParameters
# Seaborn theme used by every figure in this notebook
sns.set(style="darkgrid")

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. chained-assignment warnings).
import warnings
warnings.filterwarnings('ignore')

# Setting default plot params (title / axis-label font sizes and padding)
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.titlepad'] = 20
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.labelpad'] = 10

# Show all results: lift the pandas display limits on columns, width and rows
for _display_option in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(_display_option, None)


## Initial Exploration



In [None]:
# Load the three 2024 source tables (to be used for testing and validation data).
# The paths assume Google Drive is already mounted at /content/drive — no mount call
# is visible in this notebook, TODO confirm.
crashes_2024_file_path = '/content/drive/MyDrive/SUMMER 2025/DATASCI 207/Final Project/data/crashes_2024.csv'
parties_2024_file_path = '/content/drive/MyDrive/SUMMER 2025/DATASCI 207/Final Project/data/parties_2024.csv'
injuredwitnesspassengers_2024_file_path = '/content/drive/MyDrive/SUMMER 2025/DATASCI 207/Final Project/data/injuredwitnesspassengers_2024.csv'

crashes_2024 = pd.read_csv(crashes_2024_file_path, sep=',')
# on_bad_lines='skip' silently drops rows the parser cannot split
# (the author notes 2 poorly formatted rows in this file).
parties_2024 = pd.read_csv(parties_2024_file_path, sep=',', on_bad_lines='skip')
injuredwitnesspassengers_2024 = pd.read_csv(injuredwitnesspassengers_2024_file_path, sep=',')

# Strip the spaces and "\t" characters that appear in the column names of every table
for _table in (crashes_2024, parties_2024, injuredwitnesspassengers_2024):
    _table.columns = (
        _table.columns
        .str.replace(' ', '', regex=False)
        .str.replace('\t', '', regex=False)
    )

# Shape of Data: (rows, columns) for each table
print(f"The shape of `crashes_2024` is: {crashes_2024.shape}")
print(f"The shape of `parties_2024` is: {parties_2024.shape}")
print(f"The shape of `injuredwitnesspassengers_2024` is: {injuredwitnesspassengers_2024.shape}")

The shape of `crashes_2024` is: (410348, 73)
The shape of `parties_2024` is: (801856, 38)
The shape of `injuredwitnesspassengers_2024` is: (485031, 21)


In [None]:
# Retrieving and printing, for each table, a per-column profile:
# column name (index), datatype, number of unique values, and amount of missing data.

# Get datatypes
crashes_2024_dtypes = crashes_2024.dtypes
parties_2024_dtypes = parties_2024.dtypes
injuredwitnesspassengers_2024_dtypes = injuredwitnesspassengers_2024.dtypes

# Get unique value counts (nunique() ignores missing values by default)
crashes_2024_unique_counts = crashes_2024.nunique()
parties_2024_unique_counts = parties_2024.nunique()
injuredwitnesspassengers_2024_unique_counts = injuredwitnesspassengers_2024.nunique()

# Get missing value counts
crashes_2024_missing_counts = crashes_2024.isnull().sum()
parties_2024_missing_counts = parties_2024.isnull().sum()
injuredwitnesspassengers_2024_missing_counts = injuredwitnesspassengers_2024.isnull().sum()

# Recomposing into summary DataFrames (one row per source column)
crashes_2024_summary = pd.DataFrame({
    'Dtype': crashes_2024_dtypes,
    'Unique_Values': crashes_2024_unique_counts,
    'Missing_Values': crashes_2024_missing_counts,
})

parties_2024_summary = pd.DataFrame({
    'Dtype': parties_2024_dtypes,
    'Unique_Values': parties_2024_unique_counts,
    'Missing_Values': parties_2024_missing_counts,
})

injuredwitnesspassengers_2024_summary = pd.DataFrame({
    'Dtype': injuredwitnesspassengers_2024_dtypes,
    'Unique_Values': injuredwitnesspassengers_2024_unique_counts,
    'Missing_Values': injuredwitnesspassengers_2024_missing_counts,
})

# Adding 'Missing_Percentage' column.
# NOTE: despite the name, the value is a fraction in [0, 1] (missing count / row count);
# it is not multiplied by 100.
crashes_2024_summary['Missing_Percentage'] = (crashes_2024_summary['Missing_Values'] / len(crashes_2024))
parties_2024_summary['Missing_Percentage'] = (parties_2024_summary['Missing_Values'] / len(parties_2024))
injuredwitnesspassengers_2024_summary['Missing_Percentage'] = (injuredwitnesspassengers_2024_summary['Missing_Values'] / len(injuredwitnesspassengers_2024))

# Printing Results (to_string() renders every row regardless of display limits).
# Each caption names the dataset being summarised, with the same wording for all three.
print(f"The column names, datatypes, unique values, and the percentage of missing values for `crashes_2024` are:\n{crashes_2024_summary.to_string()}")
print(f"\nThe column names, datatypes, unique values, and the percentage of missing values for `parties_2024` are:\n{parties_2024_summary.to_string()}")
print(f"\nThe column names, datatypes, unique values, and the percentage of missing values for `injuredwitnesspassengers_2024` are:\n{injuredwitnesspassengers_2024_summary.to_string()}")


The column names, datatypes, unique values, and the percentage of missing values for `crashes_2024` are:
                                     Dtype  Unique_Values  Missing_Values  Missing_Percentage
CollisionId                          int64         410348               0            0.000000
ReportNumber                        object         392961              76            0.000185
ReportVersion                        int64             34               0            0.000000
IsPreliminary                         bool              2               0            0.000000
NCICCode                            object            700               0            0.000000
CrashDateTime                       object         219227               0            0.000000
CrashTimeDescription                 int64           1442               0            0.000000
Beat                                object           6119           23345            0.056891
CityId                             float64       

## Feature Selection


In [None]:
# Selecting only relevant features.
# NOTE: this cell rebinds the raw table names to their column subsets, so it is meant to
# run once after the load cell (a second run fails because `IsWitnessOnly` is gone).
crashes_2024 = crashes_2024[['CollisionId', 'CollisionTypeDescription', 'IsHighwayRelated', 'Weather1', 'RoadCondition1', 'LightingDescription']]
parties_2024 = parties_2024[['PartyId', 'CollisionId', 'SpeedLimit','MovementPrecCollDescription', 'AirbagDescription', 'SafetyEquipmentDescription', 'SobrietyDrugPhysicalDescription1', 'SpecialInformation']]

# Keep only rows where `IsWitnessOnly` is explicitly False (missing values are excluded),
# then select the relevant features.
# A single `.loc[rows, cols]` selection followed by `.copy()` replaces the chained
# `df[mask][[cols]]` indexing, so the column assignment below writes to an independent
# frame rather than to a slice of the original table.
not_witness_only = (
    (injuredwitnesspassengers_2024['IsWitnessOnly'] == False)
    & injuredwitnesspassengers_2024['IsWitnessOnly'].notna()
)
injuredwitnesspassengers_2024 = injuredwitnesspassengers_2024.loc[
    not_witness_only,
    ['InjuredWitPassId', 'CollisionId', 'ExtentOfInjuryCode'],
].copy()
# Replacing missing values in `ExtentOfInjuryCode` with 'NoInjuryReported' value
injuredwitnesspassengers_2024['ExtentOfInjuryCode'] = injuredwitnesspassengers_2024['ExtentOfInjuryCode'].fillna('NoInjuryReported')

print(f"The new shape of `crashes_2024` is: {crashes_2024.shape}")
print(f"The new shape of `parties_2024` is: {parties_2024.shape}")
print(f"The new shape of `injuredwitnesspassengers_2024` is: {injuredwitnesspassengers_2024.shape}\n")

# Dropping duplicates and filtering NAs.
# dropna() removes a row if ANY selected column is missing; the injured/passenger table
# only has duplicates removed because its missing injury codes were filled above.
crashes_2024_filt = crashes_2024.drop_duplicates().dropna()
parties_2024_filt = parties_2024.drop_duplicates().dropna()
injuredwitnesspassengers_2024_filt = injuredwitnesspassengers_2024.drop_duplicates()

# These shapes belong to the filtered (`*_filt`) frames, so the labels say so.
print(f"The new shape of `crashes_2024_filt` is: {crashes_2024_filt.shape}")
print(f"The new shape of `parties_2024_filt` is: {parties_2024_filt.shape}")
print(f"The new shape of `injuredwitnesspassengers_2024_filt` is: {injuredwitnesspassengers_2024_filt.shape}\n")

# The retained share is printed as a fraction of rows in [0, 1] (filtered rows / selected rows).
print(f"The percentage of data retained for `crashes_2024_filt` after filtering is: {crashes_2024_filt.shape[0]/crashes_2024.shape[0]}")
print(f"The percentage of data retained for `parties_2024_filt` after filtering is: {parties_2024_filt.shape[0]/parties_2024.shape[0]}")
print(f"The percentage of data retained for `injuredwitnesspassengers_2024_filt` after filtering is: {injuredwitnesspassengers_2024_filt.shape[0]/injuredwitnesspassengers_2024.shape[0]}")

The new shape of `crashes_2024` is: (410348, 6)
The new shape of `parties_2024` is: (801856, 8)
The new shape of `injuredwitnesspassengers_2024` is: (305158, 3)

The new shape of `crashes_2024` is: (406874, 6)
The new shape of `parties_2024` is: (494352, 8)
The new shape of `injuredwitnesspassengers_2024` is: (305158, 3)

The percentage of data retained for `crashes_2024_filt` after filtering is: 0.9915340150311442
The percentage of data retained for `parties_2024_filt` after filtering is: 0.6165096975017959
The percentage of data retained for `injuredwitnesspassengers_2024_filt` after filtering is: 1.0


## Data Transformation



In [None]:
# Cast the descriptive text columns of both tables to pandas categoricals
crash_category_columns = [
    'CollisionTypeDescription',
    'Weather1',
    'RoadCondition1',
    'LightingDescription',
]
crashes_2024_filt = crashes_2024_filt.astype(dict.fromkeys(crash_category_columns, 'category'))

party_category_columns = [
    'MovementPrecCollDescription',
    'AirbagDescription',
    'SafetyEquipmentDescription',
    'SobrietyDrugPhysicalDescription1',
    'SpecialInformation',
]
parties_2024_filt = parties_2024_filt.astype(dict.fromkeys(party_category_columns, 'category'))

# Collapse the raw injury codes into four severity groups for the outcome variable.
# NOTE(review): the label 'None' is a string, not Python's None; pandas' read_csv lists
# the text "None" among its default missing-value markers, so confirm how the exported
# CSV is read back downstream.
injury_groups = {
    'NoInjuryReported': 'None',
    'ComplaintOfPainInactive': 'Minor',
    'PossibleInjury': 'Minor',
    'OtherVisibleInactive': 'Minor',
    'SuspectMinor': 'Minor',
    'SevereInactive': 'Serious',
    'SuspectSerious': 'Serious',
    'Fatal': 'Fatal'
}

grouped_severity = injuredwitnesspassengers_2024_filt['ExtentOfInjuryCode'].replace(injury_groups)

# Ordered categorical: None < Minor < Serious < Fatal.
# Any value outside these four categories becomes missing.
injuredwitnesspassengers_2024_filt['ExtentOfInjuryCode'] = pd.Categorical(
    grouped_severity,
    categories = ['None', 'Minor', 'Serious', 'Fatal'],
    ordered = True,
)

# Confirming datatypes after conversions
print("Confirming datatypesafter conversions:\n")
for _converted in (crashes_2024_filt, parties_2024_filt, injuredwitnesspassengers_2024_filt):
    print(f"{_converted.dtypes}\n")

# Last expression of the cell: displays the count of rows per severity group
print("Checking outcome variable after conversion:\n")
injuredwitnesspassengers_2024_filt['ExtentOfInjuryCode'].value_counts()



Confirming datatypesafter conversions:

CollisionId                    int64
CollisionTypeDescription    category
IsHighwayRelated                bool
Weather1                    category
RoadCondition1              category
LightingDescription         category
dtype: object

PartyId                                int64
CollisionId                            int64
SpeedLimit                           float64
MovementPrecCollDescription         category
AirbagDescription                   category
SafetyEquipmentDescription          category
SobrietyDrugPhysicalDescription1    category
SpecialInformation                  category
dtype: object

InjuredWitPassId         int64
CollisionId              int64
ExtentOfInjuryCode    category
dtype: object

Checking outcome variable after conversion:



Unnamed: 0_level_0,count
ExtentOfInjuryCode,Unnamed: 1_level_1
Minor,147490
,144471
Serious,10729
Fatal,2468


## Joining and Aggregating

In [None]:
# Reduce `injuredwitnesspassengers_2024_filt` to only retain highest severity per `CollisionId`.
# errors='ignore' keeps this cell re-runnable: the name is rebound to the frame without
# `InjuredWitPassId`, so a second run would otherwise raise KeyError on the missing column.
injuredwitnesspassengers_2024_filt = injuredwitnesspassengers_2024_filt.drop(columns=['InjuredWitPassId'], errors='ignore')
# `ExtentOfInjuryCode` is an ordered categorical (None < Minor < Serious < Fatal), so
# idxmax() yields, per collision, the row label of the most severe injury.
most_severe_idx = injuredwitnesspassengers_2024_filt.groupby('CollisionId')['ExtentOfInjuryCode'].idxmax()
injuredwitnesspassengers_2024_filt_most_severe = injuredwitnesspassengers_2024_filt.loc[most_severe_idx]

# Collapse `parties_2024_filt` to one row per `CollisionId`:
# each categorical feature becomes a list with one entry per party in that collision,
# and `SpeedLimit` keeps the highest value among the parties.
parties_2024_filt_grouped = parties_2024_filt.groupby('CollisionId').agg({
    'MovementPrecCollDescription' : list,
    'AirbagDescription' : list,
    'SafetyEquipmentDescription' : list,
    'SobrietyDrugPhysicalDescription1' : list,
    'SpecialInformation' : list,
    'SpeedLimit' : 'max'
}).reset_index()


In [None]:
# Attach the per-collision injury severity and the per-collision party aggregates to
# `crashes_2024_filt`. Both joins are left joins on `CollisionId`, so every crash row is
# kept and crashes without a match get missing values in the joined columns.
merged_2024 = pd.merge(
    crashes_2024_filt,
    injuredwitnesspassengers_2024_filt_most_severe,
    how = 'left',
    on = 'CollisionId',
)

final_merged_2024 = pd.merge(
    merged_2024,
    parties_2024_filt_grouped,
    how = 'left',
    on = 'CollisionId',
)

In [None]:
# Checking for uniqueness and structure of the merged table:
# one row per `CollisionId` is expected after the two left joins.

print(f"The uniqueness of `CollisionId` is: {final_merged_2024['CollisionId'].is_unique}\n")
# The shape printed here is that of the whole merged frame, so the label names the frame.
print(f"The shape of `final_merged_2024` is: {final_merged_2024.shape}\n")
# Last expression of the cell: preview the first five rows
final_merged_2024.head(5)



The uniqueness of `CollisionId` is: True

The shape of `CollisionId` is: (406874, 13)



Unnamed: 0,CollisionId,CollisionTypeDescription,IsHighwayRelated,Weather1,RoadCondition1,LightingDescription,ExtentOfInjuryCode,MovementPrecCollDescription,AirbagDescription,SafetyEquipmentDescription,SobrietyDrugPhysicalDescription1,SpecialInformation,SpeedLimit
0,2296102,REAR END,True,CLEAR,NO UNUSUAL CONDITIONS,DUSK-DAWN,Minor,"[STOPPED, PROCEEDING STRAIGHT]","[AIR BAG NOT DEPLOYED, UNKNOWN]","[OCCUPANT LAP SHOULDER HARNESS USED, OCCUPANT ...","[HAD NOT BEEN DRINKING, IMPAIRMENT_NOT_KNOWN]","[CELL PHONE NOT IN USE, CELL PHONE USE UNKNOWN]",65.0
1,2296101,HIT OBJECT,True,RAINING,NO UNUSUAL CONDITIONS,DUSK-DAWN,,[PROCEEDING STRAIGHT],[AIR BAG NOT DEPLOYED],[OCCUPANT LAP SHOULDER HARNESS USED],[HAD NOT BEEN DRINKING],[CELL PHONE NOT IN USE],65.0
2,2296100,SIDE SWIPE,False,CLEAR,NO UNUSUAL CONDITIONS,DARK-NO STREET LIGHTS,,[PROCEEDING STRAIGHT],[AIR BAG NOT DEPLOYED],[OCCUPANT LAP SHOULDER HARNESS USED],[HAD NOT BEEN DRINKING],[CELL PHONE NOT IN USE],55.0
3,2296098,REAR END,True,RAINING,NO UNUSUAL CONDITIONS,DARK-STREET LIGHTS,Minor,"[STOPPED, PROCEEDING STRAIGHT]","[AIR BAG NOT DEPLOYED, AIR BAG NOT DEPLOYED]","[OCCUPANT LAP SHOULDER HARNESS USED, OCCUPANT ...","[HAD NOT BEEN DRINKING, HAD NOT BEEN DRINKING]","[CELL PHONE NOT IN USE, CELL PHONE NOT IN USE]",65.0
4,2296097,SIDE SWIPE,True,CLOUDY,NO UNUSUAL CONDITIONS,DAYLIGHT,,"[PROCEEDING STRAIGHT, CHANGING LANES]","[AIR BAG DEPLOYED, AIR BAG NOT DEPLOYED]","[OCCUPANT LAP SHOULDER HARNESS USED, OCCUPANT ...","[HAD NOT BEEN DRINKING, HAD NOT BEEN DRINKING]","[CELL PHONE NOT IN USE, CELL PHONE NOT IN USE]",65.0


## Exporting data as CSV

In [None]:
# Write `final_merged_2024` to a CSV in the current working directory (without the index),
# then hand the file to Colab's download helper.
from google.colab import files

export_filename = 'final_merged_2024.csv'
final_merged_2024.to_csv(export_filename, index = False)
files.download(export_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>