<a href="https://colab.research.google.com/github/christinesako-berk/ds_207_final_project/blob/christine/processing_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-processing on 2025 Datasets (For EDA)
## Christine Sako

## Connecting to GitHub

In [None]:

# NOTE: every GitHub step in this cell (token lookup, clone, chdir) is commented
# out, so running the cell currently does nothing except import `os`.
import os
# from google.colab import userdata

# # Github access
# github_token = userdata.get('GITHUB_TOKEN')

# repo_owner = 'christinesako-berk'
# repo_name_full = 'ds_207_final_project'
# repo_url = f'https://{github_token}@github.com/{repo_owner}/{repo_name_full}.git'

# # Clone repo
# !git clone {repo_url}

# Change directory
# os.chdir(repo_name_full)

## Importing Libraries

In [None]:
!pip install keras_tuner -q

import pandas as pd
import math
import numpy as np
import numpy.linalg as nla
import pandas as pd
import seaborn as sns
import re
import six
from os.path import join
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score
from keras_tuner import HyperParameters
# Seaborn theme applied to the plots
sns.set(style="darkgrid")

# Silence every Python warning from this point on
import warnings
warnings.filterwarnings('ignore')

# Default matplotlib title/axis-label sizes and padding
default_plot_params = {
    'axes.titlesize': 14,
    'axes.titlepad': 20,
    'axes.labelsize': 12,
    'axes.labelpad': 10,
}
plt.rcParams.update(default_plot_params)

# Remove the pandas display limits so frames are shown in full
for display_option in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(display_option, None)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/129.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## Initial Exploration



In [None]:
# Folder holding the 2025 CSV extracts (to be used for testing data).
# NOTE(review): the path sits under /content/drive and nothing visible in this
# notebook mounts Google Drive — confirm it is mounted before running.
DATA_DIR = '/content/drive/MyDrive/SUMMER 2025/DATASCI 207/Final Project/data'


def _strip_column_whitespace(frame):
    """Remove every space and tab character from `frame`'s column names and return `frame`."""
    frame.columns = (
        frame.columns
        .str.replace(' ', '', regex=False)
        .str.replace('\t', '', regex=False)
    )
    return frame


crashes_2025_file_path = DATA_DIR + '/crashes_2025.csv'
parties_2025_file_path = DATA_DIR + '/parties_2025.csv'
injuredwitnesspassengers_2025_file_path = DATA_DIR + '/injuredwitnesspassengers_2025.csv'

crashes_2025 = _strip_column_whitespace(pd.read_csv(crashes_2025_file_path, sep=','))
# on_bad_lines='skip' drops the 2 poorly formatted rows instead of raising
parties_2025 = _strip_column_whitespace(
    pd.read_csv(parties_2025_file_path, sep=',', on_bad_lines='skip')
)
injuredwitnesspassengers_2025 = _strip_column_whitespace(
    pd.read_csv(injuredwitnesspassengers_2025_file_path, sep=',')
)

# Report (rows, columns) of each loaded table
for table_name, table in (
    ('crashes_2025', crashes_2025),
    ('parties_2025', parties_2025),
    ('injuredwitnesspassengers_2025', injuredwitnesspassengers_2025),
):
    print(f"The shape of `{table_name}` is: {table.shape}")

The shape of `crashes_2025` is: (140311, 73)
The shape of `parties_2025` is: (272242, 38)
The shape of `injuredwitnesspassengers_2025` is: (164580, 21)


In [None]:
# Retrieving and printing column names, datatypes, unique values, and the share of missing values
def summarize_columns(frame):
    """Build a per-column overview of `frame`.

    Returns a DataFrame indexed by column name with:
      - Dtype: the column's dtype
      - Unique_Values: result of `frame.nunique()` (distinct non-null values)
      - Missing_Values: number of null entries
      - Missing_Percentage: Missing_Values / number of rows. This is a fraction
        (0.05 means 5%); it is not multiplied by 100.
    """
    summary = pd.DataFrame({
        'Dtype': frame.dtypes,
        'Unique_Values': frame.nunique(),
        'Missing_Values': frame.isnull().sum(),
    })
    summary['Missing_Percentage'] = summary['Missing_Values'] / len(frame)
    return summary


crashes_2025_summary = summarize_columns(crashes_2025)
parties_2025_summary = summarize_columns(parties_2025)
injuredwitnesspassengers_2025_summary = summarize_columns(injuredwitnesspassengers_2025)

# Printing Results (`.to_string()` keeps every row visible)
print(f"The column names, datatypes, unique values, and the percentage of missing values for `crashes_2025` are:\n{crashes_2025_summary.to_string()}")
print(f"\nThe column names, datatypes, unique values, and the percentage of missing values for `parties_2025` are:\n{parties_2025_summary.to_string()}")
print(f"\nThe column names, datatypes, unique values, and the percentage of missing values for `injuredwitnesspassengers_2025` are:\n{injuredwitnesspassengers_2025_summary.to_string()}")


The column names, datatypes, unique values, and the percentage of missing values for `crashes_2025` are:
                                     Dtype  Unique_Values  Missing_Values  Missing_Percentage
CollisionId                          int64         140311               0            0.000000
ReportNumber                        object         135750              34            0.000242
ReportVersion                        int64             26               0            0.000000
IsPreliminary                         bool              2               0            0.000000
NCICCode                            object            640               0            0.000000
CrashDateTime                       object          78516               0            0.000000
CrashTimeDescription                 int64           1445               0            0.000000
Beat                                object           3974            6422            0.045770
CityId                             float64       

## Feature Selection


In [None]:
# Selecting only relevant features
crashes_2025 = crashes_2025[['CollisionId', 'CollisionTypeDescription', 'IsHighwayRelated', 'Weather1', 'RoadCondition1', 'LightingDescription']]
parties_2025 = parties_2025[['PartyId', 'CollisionId', 'SpeedLimit','MovementPrecCollDescription', 'AirbagDescription', 'SafetyEquipmentDescription', 'SobrietyDrugPhysicalDescription1', 'SpecialInformation']]

# Keep rows where `IsWitnessOnly` is explicitly False, then select relevant features.
# A single `.loc[rows, cols]` plus `.copy()` gives an independent frame, so the
# `fillna` assignment below writes to it directly rather than to a chained slice.
# After this statement the frame no longer has `IsWitnessOnly`, so re-running this
# cell requires re-running the load cell first.
not_witness_only = (
    (injuredwitnesspassengers_2025['IsWitnessOnly'] == False)
    & (injuredwitnesspassengers_2025['IsWitnessOnly'].notna())
)
injuredwitnesspassengers_2025 = injuredwitnesspassengers_2025.loc[
    not_witness_only, ['InjuredWitPassId', 'CollisionId', 'ExtentOfInjuryCode']
].copy()
# Replacing missing values in `ExtentOfInjuryCode` with 'NoInjuryReported' value
injuredwitnesspassengers_2025['ExtentOfInjuryCode'] = injuredwitnesspassengers_2025['ExtentOfInjuryCode'].fillna('NoInjuryReported')

print(f"The new shape of `crashes_2025` is: {crashes_2025.shape}")
print(f"The new shape of `parties_2025` is: {parties_2025.shape}")
print(f"The new shape of `injuredwitnesspassengers_2025` is: {injuredwitnesspassengers_2025.shape}\n")

# Dropping duplicates and filtering NAs.
# The injury table only drops duplicates: its NAs were already filled above.
crashes_2025_filt = crashes_2025.drop_duplicates().dropna()
parties_2025_filt = parties_2025.drop_duplicates().dropna()
injuredwitnesspassengers_2025_filt = injuredwitnesspassengers_2025.drop_duplicates()

print(f"The new shape of `crashes_2025_filt` is: {crashes_2025_filt.shape}")
print(f"The new shape of `parties_2025_filt` is: {parties_2025_filt.shape}")
print(f"The new shape of `injuredwitnesspassengers_2025_filt` is: {injuredwitnesspassengers_2025_filt.shape}\n")

# Share of rows kept by the filtering step, formatted as a real percentage
print(f"The percentage of data retained for `crashes_2025_filt` after filtering is: {crashes_2025_filt.shape[0]/crashes_2025.shape[0]:.2%}")
print(f"The percentage of data retained for `parties_2025_filt` after filtering is: {parties_2025_filt.shape[0]/parties_2025.shape[0]:.2%}")
print(f"The percentage of data retained for `injuredwitnesspassengers_2025_filt` after filtering is: {injuredwitnesspassengers_2025_filt.shape[0]/injuredwitnesspassengers_2025.shape[0]:.2%}")

The new shape of `crashes_2025` is: (140311, 6)
The new shape of `parties_2025` is: (272242, 8)
The new shape of `injuredwitnesspassengers_2025` is: (109375, 3)

The new shape of `crashes_2025` is: (139443, 6)
The new shape of `parties_2025` is: (187591, 8)
The new shape of `injuredwitnesspassengers_2025` is: (109375, 3)

The percentage of data retained for `crashes_2025_filt` after filtering is: 0.9938137423295393
The percentage of data retained for `parties_2025_filt` after filtering is: 0.6890597336193534
The percentage of data retained for `injuredwitnesspassengers_2025_filt` after filtering is: 1.0


## Data Transformation



In [None]:
# Converting string values to categorical
crashes_2025_filt = crashes_2025_filt.astype({
    'CollisionTypeDescription' : 'category',
    'Weather1' : 'category',
    'RoadCondition1' : 'category',
    'LightingDescription' : 'category'
})

parties_2025_filt = parties_2025_filt.astype({
    'MovementPrecCollDescription' : 'category',
    'AirbagDescription' : 'category',
    'SafetyEquipmentDescription' : 'category',
    'SobrietyDrugPhysicalDescription1' : 'category',
    'SpecialInformation' : 'category'
})

# Collapsing the raw injury codes into four ordered severity groups.
# NOTE(review): the label 'None' is a plain string here, but pandas.read_csv lists
# "None" among its default missing-value tokens — confirm that whoever reads the
# exported CSV accounts for this (e.g. keep_default_na=False).
injury_groups = {
    'NoInjuryReported': 'None',
    'ComplaintOfPainInactive': 'Minor',
    'PossibleInjury': 'Minor',
    'OtherVisibleInactive': 'Minor',
    'SuspectMinor': 'Minor',
    'SevereInactive': 'Serious',
    'SuspectSerious': 'Serious',
    'Fatal': 'Fatal'
}
injury_levels = ['None', 'Minor', 'Serious', 'Fatal']  # lowest -> highest severity

grouped_injury = injuredwitnesspassengers_2025_filt['ExtentOfInjuryCode'].replace(injury_groups)

# pd.Categorical turns any value outside `categories` into NaN without complaint,
# so report codes that `injury_groups` did not cover before they disappear.
unmapped_codes = sorted(set(grouped_injury.dropna().unique()) - set(injury_levels), key=str)
if unmapped_codes:
    print(f"WARNING: unmapped `ExtentOfInjuryCode` values will become NaN: {unmapped_codes}\n")

injuredwitnesspassengers_2025_filt['ExtentOfInjuryCode'] = pd.Categorical(
    grouped_injury,
    categories = injury_levels,
    ordered = True
)

# Confirming datatypes after conversions
print("Confirming datatypes after conversions:\n")
print(f"{crashes_2025_filt.dtypes}\n")
print(f"{parties_2025_filt.dtypes}\n")
print(f"{injuredwitnesspassengers_2025_filt.dtypes}\n")

print("Checking outcome variable after conversion:\n")
injuredwitnesspassengers_2025_filt['ExtentOfInjuryCode'].value_counts()



Confirming datatypesafter conversions:

CollisionId                    int64
CollisionTypeDescription    category
IsHighwayRelated                bool
Weather1                    category
RoadCondition1              category
LightingDescription         category
dtype: object

PartyId                                int64
CollisionId                            int64
SpeedLimit                           float64
MovementPrecCollDescription         category
AirbagDescription                   category
SafetyEquipmentDescription          category
SobrietyDrugPhysicalDescription1    category
SpecialInformation                  category
dtype: object

InjuredWitPassId         int64
CollisionId              int64
ExtentOfInjuryCode    category
dtype: object

Checking outcome variable after conversion:



Unnamed: 0_level_0,count
ExtentOfInjuryCode,Unnamed: 1_level_1
Minor,53951
,51548
Serious,3497
Fatal,379


## Joining and Aggregating

In [None]:
# Reduce `injuredwitnesspassengers_2025_filt` to only retain highest severity per `CollisionId`.
# errors='ignore' keeps this cell re-runnable: on a second run the column is already
# gone and a plain drop would raise KeyError.
injuredwitnesspassengers_2025_filt = injuredwitnesspassengers_2025_filt.drop(
    columns=['InjuredWitPassId'], errors='ignore'
)
# `ExtentOfInjuryCode` is an ordered categorical (None < Minor < Serious < Fatal),
# so idxmax returns the index label of the most severe row in each collision.
most_severe_idx = injuredwitnesspassengers_2025_filt.groupby('CollisionId')['ExtentOfInjuryCode'].idxmax()
injuredwitnesspassengers_2025_filt_most_severe = injuredwitnesspassengers_2025_filt.loc[most_severe_idx]

# One row per `CollisionId` in `parties_2025_filt`: each descriptive feature becomes a
# list with one entry per party row in that collision, and `SpeedLimit` keeps the
# highest value among those rows.
parties_2025_filt_grouped = parties_2025_filt.groupby('CollisionId').agg({
    'MovementPrecCollDescription' : list,
    'AirbagDescription' : list,
    'SafetyEquipmentDescription' : list,
    'SobrietyDrugPhysicalDescription1' : list,
    'SpecialInformation' : list,
    'SpeedLimit' : 'max'
}).reset_index()


In [None]:
# Attach the per-collision worst injury to `crashes_2025_filt`, then the per-collision
# party aggregates. Both are left joins on `CollisionId`, so every crash row is kept
# and collisions without a match get missing values in the added columns.
merged_2025 = pd.merge(
    crashes_2025_filt,
    injuredwitnesspassengers_2025_filt_most_severe,
    how='left',
    on='CollisionId',
)

final_merged_2025 = pd.merge(
    merged_2025,
    parties_2025_filt_grouped,
    how='left',
    on='CollisionId',
)

In [None]:
# Checking for uniqueness and structure: `CollisionId` should identify exactly one
# row of the merged table.

print(f"The uniqueness of `CollisionId` is: {final_merged_2025['CollisionId'].is_unique}\n")
print(f"The shape of `final_merged_2025` is: {final_merged_2025.shape}\n")
final_merged_2025.head(5)

The uniqueness of `CollisionId` is: True

The shape of `CollisionId` is: (139443, 13)



Unnamed: 0,CollisionId,CollisionTypeDescription,IsHighwayRelated,Weather1,RoadCondition1,LightingDescription,ExtentOfInjuryCode,MovementPrecCollDescription,AirbagDescription,SafetyEquipmentDescription,SobrietyDrugPhysicalDescription1,SpecialInformation,SpeedLimit
0,4550264,REAR END,False,CLEAR,NO UNUSUAL CONDITIONS,DAYLIGHT,,"[STOPPED, PROCEEDING STRAIGHT]","[AIR BAG NOT DEPLOYED, AIR BAG NOT DEPLOYED]","[OCCUPANT LAP SHOULDER HARNESS USED, OCCUPANT ...","[HAD NOT BEEN DRINKING, HAD NOT BEEN DRINKING]","[CELL PHONE NOT IN USE, CELL PHONE NOT IN USE]",35.0
1,4550263,SIDE SWIPE,True,CLEAR,NO UNUSUAL CONDITIONS,DAYLIGHT,,"[PROCEEDING STRAIGHT, OTHER]","[AIR BAG NOT DEPLOYED, UNKNOWN]","[OCCUPANT LAP SHOULDER HARNESS USED, OCCUPANT ...","[HAD NOT BEEN DRINKING, IMPAIRMENT_NOT_KNOWN]","[CELL PHONE NOT IN USE, CELL PHONE USE UNKNOWN]",65.0
2,4550262,HIT OBJECT,True,WIND,NO UNUSUAL CONDITIONS,DAYLIGHT,,[PROCEEDING STRAIGHT],[AIR BAG NOT DEPLOYED],[OCCUPANT LAP SHOULDER HARNESS USED],[HAD NOT BEEN DRINKING],[CELL PHONE NOT IN USE],65.0
3,4550261,BROADSIDE,False,CLEAR,NO UNUSUAL CONDITIONS,DAYLIGHT,Minor,"[PROCEEDING STRAIGHT, PROCEEDING STRAIGHT]","[AIR BAG NOT DEPLOYED, AIR BAG NOT DEPLOYED]","[OCCUPANT LAP SHOULDER HARNESS USED, OCCUPANT ...","[HAD NOT BEEN DRINKING, HAD NOT BEEN DRINKING]","[CELL PHONE NOT IN USE, CELL PHONE NOT IN USE]",45.0
4,4550260,REAR END,True,CLEAR,NO UNUSUAL CONDITIONS,DAYLIGHT,,"[STOPPED, PROCEEDING STRAIGHT]","[AIR BAG NOT DEPLOYED, AIR BAG NOT DEPLOYED]","[OCCUPANT LAP SHOULDER HARNESS USED, OCCUPANT ...","[HAD NOT BEEN DRINKING, HAD NOT BEEN DRINKING]","[CELL PHONE NOT IN USE, CELL PHONE NOT IN USE]",65.0


## Exporting data as CSV

In [None]:
# Write `final_merged_2025` to a CSV in the working directory (row index omitted),
# then hand that file to the Colab download helper.
from google.colab import files

export_filename = 'final_merged_2025.csv'
final_merged_2025.to_csv(export_filename, index = False)
files.download(export_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>