## Name: Margaret Nguyen

# Exploring the Impact of Infrastructure on Incident Rates for Vulnerable Road Users: A Machine Learning Analysis

**Assignment: Use the 'POPULATION' variable to conduct stepwise regression. This coding file is intended for the dataset without BNA score.**

In [144]:
# Import packages
import numpy as np # v 1.21.5
import sklearn # v 1.0.2
import pandas as pd # v 1.4.4
import ydata_profiling as pp # v 3.6.6
import statsmodels.api as sm # v 0.13.2
from mlxtend.feature_selection import SequentialFeatureSelector # v 0.23.0

# Regression
from sklearn.linear_model import LinearRegression # v 1.0.2
from sklearn.model_selection import train_test_split # v 1.0.2

# PCA
from sklearn.decomposition import PCA # v 1.0.2
from sklearn.preprocessing import StandardScaler # v 1.0.2

from sklearn.metrics import mean_absolute_error # v 1.0.2
from sklearn.metrics import mean_squared_error # v 1.0.2
from sklearn.metrics import mean_absolute_percentage_error # v 1.0.2

# Plotting libraries
import matplotlib
#matplotlib.use('Qt5Agg')  # Use an appropriate backend like 'Qt5Agg' for GUI display
import matplotlib.pyplot as plt # v 3.5.2
import seaborn as sns # v 0.11.2
# Display any generated plots or visualizations directly in the notebook interface
%matplotlib inline 

# I. Data Manipulation

In [145]:
# Folder holding the three input CSV files. Kept in one place so the notebook
# can be repointed by editing a single line.
# NOTE(review): this is still a machine-specific absolute path - consider a
# path relative to the repository root.
DATA_DIR = '/Users/margaret06/Documents/GitHub/Carlisle_Borough_Transportation_Study/data'

# Read the Pennsylvania, Massachusetts and Colorado CSV files
df_pa_acs = pd.read_csv(f'{DATA_DIR}/2017_TO_2021_MUNI_CRASH_DATA.csv')

# low_memory=False: parse the file in one pass instead of in chunks
df_mass_acs = pd.read_csv(f'{DATA_DIR}/df_mass_acs.csv', low_memory=False)

df_col_acs = pd.read_csv(f'{DATA_DIR}/df_col_acs.csv')

## A. Clean the Pennsylvania Crash Dataset

In [146]:
# Remove the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell
# re-runnable: without it a second execution raises KeyError because the
# column has already been dropped in place.
df_pa_acs.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Keep only the columns with a numeric dtype (int or float)
numeric_columns = df_pa_acs.select_dtypes(include=['number'])

# Columns that are not needed for the analysis
columns_to_drop = [
    'PENN_DOT_MUNI_ID', 'state', 'county', 'county_subdivision',
    'LAND_AREA.1', 'LAND_AREA', 'PENN_DOT_COUNTY_NUM', 'FEDERAL_EIN_CODE',
    'HOME_RULE_YEAR', 'INCORPORATION_YEAR', 'MUNICIPALITY',
]

df_pa_crash = (
    numeric_columns
    .drop(columns=columns_to_drop)
    # Replace NaN with 0 throughout, following the experience of cleaning
    # the original dataset.
    .fillna(0)
    # Drop municipalities with POPULATION == 0. Because NaN was filled with 0
    # just above, rows with a missing POPULATION are removed here as well.
    .loc[lambda frame: frame['POPULATION'] != 0]
    .reset_index(drop=True)
)

In [147]:
# Count / estimate columns that get a per-capita counterpart
columns_to_convert = [
    'BIKE_TO_WORK_EST',
    'BIKE_TO_WORK_MARG',
    'WALK_TO_WORK_EST',
    'WALK_TO_WORK_MARG',
    'DRIVE_SOLO_TO_WORK_EST',
    'DRIVE_SOLO_TO_WORK_MARG',
    'CARPOOL_TO_WORK_EST',
    'CARPOOL_TO_WORK_MARG',
    'PUBTRANS_TO_WORK_EST',
    'PUBTRANS_TO_WORK_MARG',
    'EMPLOYEES_FULL_TIME',
    'EMPLOYEES_PART_TIME',
    'AUTOMOBILE_COUNT',
    'BICYCLE_BY_AUTO_COUNT',
    'BICYCLE_DEATH_BY_AUTO_COUNT',
    'BICYCLE_SUSP_SERIOUS_INJ_BY_AUTO_COUNT',
    'PED_BY_AUTO_COUNT',
    'PED_DEATH_BY_AUTO_COUNT',
    'PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT',
    'BICYCLE_SOLO_COUNT',
    'BICYCLE_DEATH_SOLO_COUNT',
    'BICYCLE_SUSP_SERIOUS_INJ_SOLO_COUNT',
    'PED_SOLO_COUNT',
    'PED_DEATH_SOLO_COUNT',
    'PED_SUSP_SERIOUS_INJ_SOLO_COUNT',
]

# Append one "<column>_PER_CAPITA" column per entry: column / POPULATION
df_pa_crash = df_pa_crash.assign(**{
    f'{source_column}_PER_CAPITA': df_pa_crash[source_column] / df_pa_crash['POPULATION']
    for source_column in columns_to_convert
})

## B. Clean the Massachusetts Crash Dataset

In [148]:
# Remove the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell
# re-runnable: without it a second execution raises KeyError because the
# column has already been dropped in place.
df_mass_acs.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# 'VEHC_CONFIG_CL' labels treated as non-automobiles and excluded below.
# NOTE(review): the earlier note on this cell also named Snowmobile, Moped,
# Motorcycle and Other Light Trucks (10,000 lbs. or less) as non-automobiles,
# but only the labels in this list are actually removed - confirm intended.
list_non_automobiles = [
    'V1:(Unknown vehicle configuration)',
    'V1:(Other e.g. farm equipment)',
    'V1:(Unknown vehicle configuration) / V2:(Unknown vehicle configuration)',
]

# Keep rows whose vehicle configuration is present and not in the list above
vehicle_config = df_mass_acs['VEHC_CONFIG_CL']
df_mass_crash = df_mass_acs[vehicle_config.notna() & ~vehicle_config.isin(list_non_automobiles)]

In [None]:
# Fatal - injuries that resulted in death
# Incapacitating - serious injuries that require immediate medical attention
FATAL_INJURY = 'Fatal injury (K)'
INCAPACITATING_INJURY = 'Non-fatal injury - Incapacitating'


def _merge_city_injury_count(crashes, injury_status, road_user, count_column):
    """Attach a per-city count of matching records to every row of ``crashes``.

    Rows whose 'INJY_STAT_DESCR' equals ``injury_status`` and whose
    'NON_MTRST_TYPE_CL' equals ``road_user`` are counted per 'CITY_TOWN_NAME'.
    The counts are left-merged back on 'CITY_TOWN_NAME' as ``count_column``;
    cities without any matching row get 0.

    Parameters
    ----------
    crashes : pandas.DataFrame
        Frame with 'INJY_STAT_DESCR', 'NON_MTRST_TYPE_CL' and 'CITY_TOWN_NAME'.
    injury_status : str
        Value of 'INJY_STAT_DESCR' to count.
    road_user : str
        Value of 'NON_MTRST_TYPE_CL' to count.
    count_column : str
        Name of the new count column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``count_column`` appended (the merge renumbers rows).
    """
    matching_rows = crashes[
        (crashes['INJY_STAT_DESCR'] == injury_status)
        & (crashes['NON_MTRST_TYPE_CL'] == road_user)
    ]
    city_counts = matching_rows.groupby('CITY_TOWN_NAME').size().reset_index(name=count_column)
    merged = crashes.merge(city_counts, on='CITY_TOWN_NAME', how='left')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which is not guaranteed to update the frame.
    merged[count_column] = merged[count_column].fillna(0)
    return merged


## BICYCLE_DEATH_BY_AUTO_COUNT
df_mass_crash = _merge_city_injury_count(
    df_mass_crash, FATAL_INJURY, 'Cyclist', 'BICYCLE_DEATH_BY_AUTO_COUNT')

## BICYCLE_SUSP_SERIOUS_INJ_BY_AUTO_COUNT
df_mass_crash = _merge_city_injury_count(
    df_mass_crash, INCAPACITATING_INJURY, 'Cyclist', 'BICYCLE_SUSP_SERIOUS_INJ_BY_AUTO_COUNT')

## BICYCLE_BY_AUTO_COUNT
df_mass_crash['BICYCLE_BY_AUTO_COUNT'] = (
    df_mass_crash['BICYCLE_DEATH_BY_AUTO_COUNT']
    + df_mass_crash['BICYCLE_SUSP_SERIOUS_INJ_BY_AUTO_COUNT']
)

## AUTOMOBILE_COUNT
# Sum of 'NUMB_VEHC' over all rows of a city, merged back onto every row
auto_count = (
    df_mass_crash.groupby('CITY_TOWN_NAME')['NUMB_VEHC']
    .sum()
    .reset_index()
    .rename(columns={'NUMB_VEHC': 'AUTOMOBILE_COUNT'})
)
df_mass_crash = df_mass_crash.merge(auto_count, on='CITY_TOWN_NAME', how='left')

## PED_DEATH_BY_AUTO_COUNT
# Counted from the pedestrian fatality rows (this was previously counted from
# the cyclist fatality rows by mistake).
df_mass_crash = _merge_city_injury_count(
    df_mass_crash, FATAL_INJURY, 'Pedestrian', 'PED_DEATH_BY_AUTO_COUNT')

## PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT
df_mass_crash = _merge_city_injury_count(
    df_mass_crash, INCAPACITATING_INJURY, 'Pedestrian', 'PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT')

## PED_BY_AUTO_COUNT
df_mass_crash['PED_BY_AUTO_COUNT'] = (
    df_mass_crash['PED_DEATH_BY_AUTO_COUNT']
    + df_mass_crash['PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT']
)

# Every row of a city now carries the same city-level values; keep the first
# row per distinct combination of the columns below.
df_mass_crash = df_mass_crash.drop_duplicates(subset=[
    'CITY_TOWN_NAME', 'POPULATION',
    'BIKE_TO_WORK_EST', 'BICYCLE_BY_AUTO_COUNT',
    'BICYCLE_DEATH_BY_AUTO_COUNT',
    'BICYCLE_SUSP_SERIOUS_INJ_BY_AUTO_COUNT',
    'AUTOMOBILE_COUNT', 'PED_BY_AUTO_COUNT',
    'PED_DEATH_BY_AUTO_COUNT', 'PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT',
])

In [None]:
# Numeric (int/float) view of the Massachusetts frame; its column labels are
# used to restrict df_mass_crash below
numeric_columns = df_mass_crash.select_dtypes(include=['number'])

# Keep only those columns and renumber the rows from 0
df_mass_crash = df_mass_crash.loc[:, numeric_columns.columns].reset_index(drop=True)

In [None]:
# Count / estimate columns that get a per-capita counterpart
columns_to_convert = [
    'BIKE_TO_WORK_EST',
    'BIKE_TO_WORK_MARG',
    'WALK_TO_WORK_EST',
    'WALK_TO_WORK_MARG',
    'DRIVE_SOLO_TO_WORK_EST',
    'DRIVE_SOLO_TO_WORK_MARG',
    'CARPOOL_TO_WORK_EST',
    'CARPOOL_TO_WORK_MARG',
    'PUBTRANS_TO_WORK_EST',
    'PUBTRANS_TO_WORK_MARG',
    'AUTOMOBILE_COUNT',
    'BICYCLE_BY_AUTO_COUNT',
    'BICYCLE_DEATH_BY_AUTO_COUNT',
    'BICYCLE_SUSP_SERIOUS_INJ_BY_AUTO_COUNT',
    'PED_BY_AUTO_COUNT',
    'PED_DEATH_BY_AUTO_COUNT',
    'PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT',
]

# Append one "<column>_PER_CAPITA" column per entry: column / POPULATION
# NOTE(review): no POPULATION != 0 filter is visible for this frame (the
# Pennsylvania cleaning cell has one); a zero population would produce
# inf/NaN here - confirm the data cannot contain it.
df_mass_crash = df_mass_crash.assign(**{
    f'{source_column}_PER_CAPITA': df_mass_crash[source_column] / df_mass_crash['POPULATION']
    for source_column in columns_to_convert
})

## C. Clean the Colorado Crash Dataset

In [None]:
# Remove the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell
# re-runnable: without it a second execution raises KeyError because the
# column has already been dropped in place.
df_col_acs.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
# Work on an independent (deep) copy so df_col_acs itself is left untouched
df_col_crash = df_col_acs.copy(deep=True)

In [None]:
# Count / estimate columns that get a per-capita counterpart
columns_to_convert = [
    'BIKE_TO_WORK_EST',
    'BIKE_TO_WORK_MARG',
    'WALK_TO_WORK_EST',
    'WALK_TO_WORK_MARG',
    'DRIVE_SOLO_TO_WORK_EST',
    'DRIVE_SOLO_TO_WORK_MARG',
    'CARPOOL_TO_WORK_EST',
    'CARPOOL_TO_WORK_MARG',
    'PUBTRANS_TO_WORK_EST',
    'PUBTRANS_TO_WORK_MARG',
    'BICYCLE_BY_AUTO_COUNT',
    'BICYCLE_DEATH_BY_AUTO_COUNT',
    'BICYCLE_SUSP_SERIOUS_INJ_BY_AUTO_COUNT',
    'PED_BY_AUTO_COUNT',
    'PED_DEATH_BY_AUTO_COUNT',
    'PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT',
]

# Append one "<column>_PER_CAPITA" column per entry: column / POPULATION
# NOTE(review): no POPULATION != 0 filter is visible for this frame (the
# Pennsylvania cleaning cell has one); a zero population would produce
# inf/NaN here - confirm the data cannot contain it.
df_col_crash = df_col_crash.assign(**{
    f'{source_column}_PER_CAPITA': df_col_crash[source_column] / df_col_crash['POPULATION']
    for source_column in columns_to_convert
})

In [None]:
# Tag every row with its source state before the frames are combined
for state_frame, state_code in (
    (df_pa_crash, 'PA'),
    (df_mass_crash, 'MA'),
    (df_col_crash, 'CO'),
):
    state_frame['STATE'] = state_code

In [None]:
# Columns present in all three frames, listed in df_pa_crash's column order.
# A set intersection would also find them, but the order of a set of strings
# can change from one kernel session to the next, so the column layout of the
# combined frame would not be reproducible.
common_columns = [
    column for column in df_pa_crash.columns
    if column in df_mass_crash.columns and column in df_col_crash.columns
]

# Restrict every frame to the shared columns; indexing all three with the same
# list also gives them an identical column order.
df_mass_crash = df_mass_crash[common_columns]
df_pa_crash = df_pa_crash[common_columns]
df_col_crash = df_col_crash[common_columns]

In [None]:
# Show the shape of each dataframe, in (MA, PA, CO) order
tuple(frame.shape for frame in (df_mass_crash, df_pa_crash, df_col_crash))

## D. Merge the df_pa_crash, df_mass_crash, and df_col_crash DataFrames

In [None]:
# Stack the three state frames row-wise (PA, then MA, then CO) and renumber
# the rows from 0 in the same step
df_crash_acs = pd.concat(
    [df_pa_crash, df_mass_crash, df_col_crash],
    axis=0,
    ignore_index=True,
)

# Check the shape of the combined dataframe
print(df_crash_acs.shape)

# Show the first rows of the combined dataframe
df_crash_acs.head()

**Create dummy variables for the STATE column**

In [None]:
# Replace 'STATE' with one 0/1 indicator column per state (STATE_CO, STATE_MA,
# STATE_PA). The dtype is pinned to uint8 - the default in the pandas version
# this notebook records (1.4.4) - because newer pandas returns bool columns,
# and a design matrix mixing float and bool columns is cast to object dtype,
# which statsmodels OLS rejects.
# NOTE(review): all three indicators are kept; together they always sum to 1,
# which is collinear with a regression intercept - consider drop_first=True.
df_crash_acs = pd.get_dummies(df_crash_acs, columns=['STATE'], dtype='uint8')

In [None]:
# Count the missing (NaN) values in each column
df_crash_acs.isnull().sum()

**Use Pandas Profiling**

In [None]:
#pp.ProfileReport(df_crash_acs)

In [None]:
# Columns kept for modelling: the per-capita variables plus the state
# indicators. The order here is the column order of the model frame.
selected_columns = [
    'CARPOOL_TO_WORK_MARG_PER_CAPITA',
    'PED_BY_AUTO_COUNT_PER_CAPITA',
    'PED_DEATH_BY_AUTO_COUNT_PER_CAPITA',
    'DRIVE_SOLO_TO_WORK_MARG_PER_CAPITA',
    'BICYCLE_DEATH_BY_AUTO_COUNT_PER_CAPITA',
    'WALK_TO_WORK_MARG_PER_CAPITA',
    'WALK_TO_WORK_EST_PER_CAPITA',
    'CARPOOL_TO_WORK_EST_PER_CAPITA',
    'DRIVE_SOLO_TO_WORK_EST_PER_CAPITA',
    'BICYCLE_BY_AUTO_COUNT_PER_CAPITA',
    'PUBTRANS_TO_WORK_EST_PER_CAPITA',
    'BICYCLE_SUSP_SERIOUS_INJ_BY_AUTO_COUNT_PER_CAPITA',
    'BIKE_TO_WORK_EST_PER_CAPITA',
    'PUBTRANS_TO_WORK_MARG_PER_CAPITA',
    'BIKE_TO_WORK_MARG_PER_CAPITA',
    'PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT_PER_CAPITA',
    'STATE_CO',
    'STATE_MA',
    'STATE_PA',
]

# Restrict the combined frame to exactly these columns
df_crash_acs = df_crash_acs.loc[:, selected_columns]

# II. Stepwise Regression

## A. Separate the Dataset into y (dependent) and X (independent) Variables

In [None]:
# Response: cyclist-by-auto incidents per capita; predictors: every other column.
# NOTE(review): X still contains BICYCLE_DEATH_BY_AUTO_COUNT_PER_CAPITA and
# BICYCLE_SUSP_SERIOUS_INJ_BY_AUTO_COUNT_PER_CAPITA. For the Massachusetts rows
# the response count is built as the sum of those two counts, so they may leak
# the response into the predictors - confirm this is intended.
y = df_crash_acs.loc[:, "BICYCLE_BY_AUTO_COUNT_PER_CAPITA"]
X = df_crash_acs.drop("BICYCLE_BY_AUTO_COUNT_PER_CAPITA", axis=1)

In [None]:
# Response: pedestrian-by-auto incidents per capita; predictors: every other column.
# NOTE(review): X2 still contains PED_DEATH_BY_AUTO_COUNT_PER_CAPITA and
# PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT_PER_CAPITA. For the Massachusetts rows the
# response count is built as the sum of those two counts, so they may leak the
# response into the predictors - confirm this is intended.
y2 = df_crash_acs.loc[:, "PED_BY_AUTO_COUNT_PER_CAPITA"]
X2 = df_crash_acs.drop("PED_BY_AUTO_COUNT_PER_CAPITA", axis=1)

## B. Use the train_test_split Function to Split Data into Training (80%) and Testing Set (20%)

In [None]:
# 80/20 split for the cyclist model; random_state fixes the shuffle so the
# split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
# 80/20 split for the pedestrian model; random_state fixes the shuffle so the
# split is the same on every run
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=5,
)

## C. Fit, Run or Estimate the Regression Model

### 1. BICYCLE_BY_AUTO_COUNT_PER_CAPITA as the Dependent Variable

In [None]:
# Estimator scored inside the selector
OLS = LinearRegression()

# Forward sequential selection: 10-fold cross-validated negative MSE is the
# score, and k_features="best" keeps the best-scoring feature subset
sfs = SequentialFeatureSelector(
    OLS,
    k_features="best",
    forward=True,
    scoring='neg_mean_squared_error',
    cv=10,
)
sfs.fit(X_train, y_train)

# Translate the selected column positions into column names
selected_features = [X_train.columns[position] for position in sfs.k_feature_idx_]
print("Selected Features:", selected_features)

In [None]:
# Use the feature list produced by the cyclist forward-selection cell.
# It is deliberately not re-derived from `sfs` here: the pedestrian section
# assigns a new selector to the same `sfs` name, so re-running this cell after
# that section would apply the pedestrian model's column positions to X_train.
X_train_selected = X_train[selected_features]

# Add the intercept column, then fit ordinary least squares with statsmodels
X_train_selected = sm.add_constant(X_train_selected)
model = sm.OLS(y_train, X_train_selected).fit()

# Display the regression summary
print(model.summary())

In [None]:
# Build the test design matrix for the cyclist model.
# has_constant='add' forces the intercept column: by default add_constant skips
# it whenever some existing column is constant, which can happen by chance in
# a small test split and would make predict() fail on a column-count mismatch.
X_test_selected = sm.add_constant(X_test[selected_features], has_constant='add')

# Keep exactly the columns the fitted model was trained on, in the same order
X_test_selected = X_test_selected[model.model.exog_names]

# Predict on the held-out data
y_pred = model.predict(X_test_selected)

### 2. PED_BY_AUTO_COUNT_PER_CAPITA as the Dependent Variable

In [None]:
# Estimator scored inside the selector
OLS = LinearRegression()

# Forward sequential selection: 10-fold cross-validated negative MSE is the
# score, and k_features="best" keeps the best-scoring feature subset
sfs = SequentialFeatureSelector(
    OLS,
    k_features="best",
    forward=True,
    scoring='neg_mean_squared_error',
    cv=10,
)
sfs.fit(X_train2, y_train2)

# Translate the selected column positions into column names
selected_features2 = [X_train2.columns[position] for position in sfs.k_feature_idx_]
print("Selected Features:", selected_features2)

In [None]:
# Use the feature list produced by the pedestrian forward-selection cell.
# It is deliberately not re-derived from `sfs` here: the cyclist section
# assigns its selector to the same `sfs` name, so if that section is re-run
# later, `sfs` would hold the cyclist model's column positions instead.
X_train_selected2 = X_train2[selected_features2]

# Add the intercept column, then fit ordinary least squares with statsmodels
X_train_selected2 = sm.add_constant(X_train_selected2)
model2 = sm.OLS(y_train2, X_train_selected2).fit()

# Display the regression summary
print(model2.summary())

In [None]:
# Build the test design matrix for the pedestrian model.
# has_constant='add' forces the intercept column: by default add_constant skips
# it whenever some existing column is constant, which can happen by chance in
# a small test split and would make predict() fail on a column-count mismatch.
X_test_selected2 = sm.add_constant(X_test2[selected_features2], has_constant='add')

# Keep exactly the columns the fitted model was trained on, in the same order
X_test_selected2 = X_test_selected2[model2.model.exog_names]

# Predict on the held-out data
y_pred2 = model2.predict(X_test_selected2)

## D. Plot the Residuals

### Graph 1: The residuals for the model with 'BICYCLE_BY_AUTO_COUNT_PER_CAPITA' as the dependent variable

In [None]:
# Residuals of the cyclist model on the data it was fitted to
residuals = model.resid

# Residuals against fitted values, with a reference line at zero
fig, ax = plt.subplots()
ax.scatter(model.fittedvalues, residuals)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("Residuals")
ax.set_title("OLS Residual Plot for Cyclists")
ax.axhline(y=0, color='r', linestyle='-')
plt.show()

### Graph 2: The residuals for the model with 'PED_BY_AUTO_COUNT_PER_CAPITA' as the dependent variable

In [None]:
# Residuals of the pedestrian model on the data it was fitted to
residuals2 = model2.resid

# Residuals against fitted values, with a reference line at zero
plt.scatter(model2.fittedvalues, residuals2)
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("OLS Residual Plot for Pedestrians")  # title spelling corrected
plt.axhline(y=0, color='r', linestyle='-')  # horizontal reference line at y=0
plt.show()

## E. Plot the Actual vs. Predicted Values

### Graph 3: The Actual vs. Predicted Values for the model with 'BICYCLE_BY_AUTO_COUNT_PER_CAPITA' as the dependent variable

In [None]:
# Held-out actual values (x) against the cyclist model's predictions (y)
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values for Cyclists")
plt.show()

### Graph 4: The Actual vs. Predicted Values for the model with 'PED_BY_AUTO_COUNT_PER_CAPITA' as the dependent variable

In [None]:
# Held-out actual values (x) against the pedestrian model's predictions (y)
fig, ax = plt.subplots()
ax.scatter(y_test2, y_pred2)
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values for Pedestrians")
plt.show()