In [1]:
import os
import pandas as pd
import statsmodels.api as sm
import json
import pprint
import glob
import datetime
import random
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, r2_score

warnings.filterwarnings('ignore')

# Importing Data:

In [3]:
# 1975 to 1981:

acc_sect_1 = pd.read_csv('acc_sect_1.csv')
acc_sect_1.head()

Unnamed: 0,YEAR,ST_CASE,PERSONS,HOUR,SCH_BUS,FATALS
0,1975,10001,3,22,0.0,1
1,1975,10002,2,4,0.0,1
2,1975,10003,1,4,0.0,1
3,1975,10004,3,14,0.0,1
4,1975,10005,1,20,0.0,1


In [4]:
# 1982 to 1990:

acc_sect_2 = pd.read_csv('acc_sect_2.csv')
acc_sect_2.head()

Unnamed: 0,YEAR,ST_CASE,PERSONS,HOUR,ROAD_NAME,MILE_MARKER,SCH_BUS,FATALS
0,1982,10001,2,8,65,2131.0,0,2
1,1982,10002,1,23,409,0.0,0,1
2,1982,10003,1,12,234,0.0,0,1
3,1982,10004,4,23,59,28128.0,0,2
4,1982,10005,2,10,310,0.0,0,1


In [5]:
# 1991 to 2000:

acc_sect_3 = pd.read_csv('acc_sect_3.csv')
acc_sect_3.head()

Unnamed: 0,YEAR,ST_CASE,NUM_PEDS,PERSONS,HOUR,ROAD_NAME,MILE_MARKER,SCH_BUS,FATALS
0,1991,10001,0,2,11,I20,1723.0,0,1
1,1991,10002,0,6,7,SR13,25.0,0,2
2,1991,10003,0,1,23,5849,0.0,0,1
3,1991,10004,0,2,16,2208,0.0,0,1
4,1991,10005,0,2,15,69,1882.0,0,1


In [6]:
# 2001 to 2007:

acc_sect_4 = pd.read_csv('acc_sect_4.csv')
acc_sect_4.head()

Unnamed: 0,YEAR,ST_CASE,NUM_PEDS,PERSONS,HOUR,ROAD_NAME,MILE_MARKER,SCH_BUS,FATALS
0,2001,10001,0,1,20,1274,0.0,0,1
1,2001,10002,0,1,5,SR-204,75.0,0,1
2,2001,10003,0,2,22,SR-21,1230.0,0,1
3,2001,10004,0,2,17,I-65,1642.0,0,1
4,2001,10005,0,2,9,US-SR3,2993.0,0,1


In [7]:
# 2008 to 2014:

acc_sect_5 = pd.read_csv('acc_sect_5.csv')
acc_sect_5.head()

Unnamed: 0,YEAR,ST_CASE,NUM_PEDS,NUM_VEHC,PERSONS,HOUR,ROAD_NAME,MILE_MARKER,LATITUDE,LONGITUD,SCH_BUS,FATALS
0,2008,10001,1,1,3,17,SR-69,0.0,34.326947,-86.489567,0,1
1,2008,10002,0,2,2,12,I-20,1230.0,33.520883,-86.847739,0,1
2,2008,10003,0,1,1,2,1027,0.0,32.477511,-86.346628,0,1
3,2008,10004,0,1,2,10,8438,0.0,30.814786,-88.171917,0,1
4,2008,10005,2,1,3,21,US-SR6,0.0,32.326347,-86.320975,0,2


In [8]:
# 2015 to 2021:

acc_sect_6 = pd.read_csv('acc_sect_6.csv')
acc_sect_6.head()

Unnamed: 0,YEAR,STATENAME,ST_CASE,NUM_PEDS,NUM_VEHC,PERSONS,MONTHNAME,DAYNAME,DAY_WEEKNAME,HOUR,...,COLLISION_TYPE,TYP_INTNAME,REL_ROADNAME,WRK_ZONENAME,LGT_CONDNAME,WEATHERNAME,SCH_BUS,FATALS,COUNTYNAME,CITYNAME
0,2015,Alabama,10001,0,1,1,January,1,Thursday,2,...,Not a Collision with Motor Vehicle In-Transport,Not an Intersection,On Roadside,,Dark - Not Lighted,Clear,0,1,NOT AVAILABLE,NOT AVAILABLE
1,2015,Alabama,10002,0,1,1,January,1,Thursday,22,...,Not a Collision with Motor Vehicle In-Transport,Not an Intersection,On Median,,Dark - Not Lighted,Cloudy,0,1,NOT AVAILABLE,NOT AVAILABLE
2,2015,Alabama,10003,0,1,2,January,1,Thursday,1,...,Not a Collision with Motor Vehicle In-Transport,Not an Intersection,On Roadside,,Dark - Not Lighted,Clear,0,1,NOT AVAILABLE,NOT AVAILABLE
3,2015,Alabama,10004,0,1,1,January,4,Sunday,0,...,Not a Collision with Motor Vehicle In-Transport,Not an Intersection,On Roadside,,Dark - Not Lighted,Cloudy,0,1,NOT AVAILABLE,NOT AVAILABLE
4,2015,Alabama,10005,0,2,2,January,7,Wednesday,7,...,Angle,T-Intersection,On Roadway,,Daylight,Clear,0,1,NOT AVAILABLE,NOT AVAILABLE


# Linear Regression Models:

First transform some qualitative variables to dummy variables & remove those that have too many unique values such as ROAD_NAME


In [9]:
# Check to make sure all types inputted into the model are quantitative
acc_sect_1.dtypes

YEAR         int64
ST_CASE      int64
PERSONS      int64
HOUR         int64
SCH_BUS    float64
FATALS       int64
dtype: object

In [10]:
X_1 = acc_sect_1.loc[:, (acc_sect_1.columns != 'FATALS')] # doesn't have ROAD_NAME

y_1 = acc_sect_1.loc[:,'FATALS']

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size = 0.25)

regr_mod_sect_1 = sm.OLS(y_train_1, X_train_1).fit()

print("Summary for Section 1 Model:")
print(regr_mod_sect_1.summary())


Summary for Section 1 Model:
                                 OLS Regression Results                                
Dep. Variable:                 FATALS   R-squared (uncentered):                   0.876
Model:                            OLS   Adj. R-squared (uncentered):              0.876
Method:                 Least Squares   F-statistic:                          3.174e+05
Date:                Thu, 14 Mar 2024   Prob (F-statistic):                        0.00
Time:                        22:37:08   Log-Likelihood:                     -1.2957e+05
No. Observations:              225044   AIC:                                  2.592e+05
Df Residuals:                  225039   BIC:                                  2.592e+05
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
------------

In [11]:
# Predictions
y_preds_1 = regr_mod_sect_1.predict(X_test_1)

# Finding MSE using scikit-learn since no function for it in the statsreg package
mse_1 = mean_squared_error(y_test_1, y_preds_1)

# Getting R^2 Score from the fitted regr_mod_sect_1
r2_1 = regr_mod_sect_1.rsquared

print("Mean Squared Error of Section 1 Model:", mse_1)
print("R^2 Score of Section 1 Model:", r2_1)

Mean Squared Error of Section 1 Model: 0.1780931696705311
R^2 Score of Section 1 Model: 0.8758251130296058


#### Based on the model summary printout, regressors are statistically significant at the alpha = 0.05 significance level because their p-values for the partial t-tests are < 0.05 & a reduced model doesn't need to be created

In [12]:
acc_sect_2.dtypes

YEAR             int64
ST_CASE          int64
PERSONS          int64
HOUR             int64
ROAD_NAME       object
MILE_MARKER    float64
SCH_BUS          int64
FATALS           int64
dtype: object

In [13]:
X_2 = acc_sect_2.drop(columns=['FATALS', 'ROAD_NAME'])  
y_2 = acc_sect_2['FATALS']

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.25)

regr_mod_sect_2 = sm.OLS(y_train_2, X_train_2).fit()

print("Summary for Section 2 Model:")
print(regr_mod_sect_2.summary())



Summary for Section 2 Model:
                                 OLS Regression Results                                
Dep. Variable:                 FATALS   R-squared (uncentered):                   0.884
Model:                            OLS   Adj. R-squared (uncentered):              0.884
Method:                 Least Squares   F-statistic:                          3.444e+05
Date:                Thu, 14 Mar 2024   Prob (F-statistic):                        0.00
Time:                        22:37:08   Log-Likelihood:                     -1.4170e+05
No. Observations:              270847   AIC:                                  2.834e+05
Df Residuals:                  270841   BIC:                                  2.835e+05
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [14]:
y_preds_2 = regr_mod_sect_2.predict(X_test_2)

mse_2 = mean_squared_error(y_test_2, y_preds_2)

r2_2 = regr_mod_sect_2.rsquared

print("Mean Squared Error of Section 2 Model:", mse_2)
print("R^2 Score Section of Section 2 Model:", r2_2)


Mean Squared Error of Section 2 Model: 0.16513053193253865
R^2 Score Section of Section 2 Model: 0.8841152757569224


#### Based on the model summary printout, regressors are statistically significant at the alpha = 0.05 significance level because their p-values for the partial t-tests are < 0.05 & a reduced model doesn't need to be created

In [15]:
acc_sect_3.dtypes

YEAR             int64
ST_CASE          int64
NUM_PEDS         int64
PERSONS          int64
HOUR             int64
ROAD_NAME       object
MILE_MARKER    float64
SCH_BUS          int64
FATALS           int64
dtype: object

In [16]:
X_3 = acc_sect_3.drop(columns=['FATALS', 'ROAD_NAME'])  
y_3 = acc_sect_3['FATALS']

X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, y_3, test_size=0.25)

regr_mod_sect_3 = sm.OLS(y_train_3, X_train_3).fit()

print("Summary for Section 3 Model:")
print(regr_mod_sect_3.summary())


Summary for Section 3 Model:
                                 OLS Regression Results                                
Dep. Variable:                 FATALS   R-squared (uncentered):                   0.883
Model:                            OLS   Adj. R-squared (uncentered):              0.883
Method:                 Least Squares   F-statistic:                          2.988e+05
Date:                Thu, 14 Mar 2024   Prob (F-statistic):                        0.00
Time:                        22:37:08   Log-Likelihood:                     -1.4542e+05
No. Observations:              275808   AIC:                                  2.909e+05
Df Residuals:                  275801   BIC:                                  2.909e+05
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [17]:
y_preds_3 = regr_mod_sect_3.predict(X_test_3)

mse_3 = mean_squared_error(y_test_3, y_preds_3)

r2_3 = regr_mod_sect_3.rsquared

print("Mean Squared Error of Section 3 Model:", mse_3)
print("R^2 Score of Section 3 Model:", r2_3)


Mean Squared Error of Section 3 Model: 0.17157645536245103
R^2 Score of Section 3 Model: 0.8834954195134441


#### Based on the model summary printout, all regressors are statistically significant except HOUR, which has a p-value = 0.099 so a reduced model without it will be created below

In [18]:
X_3 = acc_sect_3.drop(columns=['FATALS', 'ROAD_NAME', 'HOUR'])  
y_3 = acc_sect_3['FATALS']

X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, y_3, test_size=0.25)

regr_mod_reduced_sect_3 = sm.OLS(y_train_3, X_train_3).fit()

print("Summary for Reduced Section 3 Model:")
print(regr_mod_reduced_sect_3.summary())


Summary for Reduced Section 3 Model:
                                 OLS Regression Results                                
Dep. Variable:                 FATALS   R-squared (uncentered):                   0.883
Model:                            OLS   Adj. R-squared (uncentered):              0.883
Method:                 Least Squares   F-statistic:                          3.481e+05
Date:                Thu, 14 Mar 2024   Prob (F-statistic):                        0.00
Time:                        22:37:09   Log-Likelihood:                     -1.4562e+05
No. Observations:              275808   AIC:                                  2.912e+05
Df Residuals:                  275802   BIC:                                  2.913e+05
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
---

In [19]:
y_preds_reduced_3 = regr_mod_reduced_sect_3.predict(X_test_3)

mse_reduced_3 = mean_squared_error(y_test_3, y_preds_reduced_3)

r2_reduced_3 = regr_mod_reduced_sect_3.rsquared

print("Mean Squared Error of Section 3 Reduced Model:", mse_reduced_3)
print("R^2 Score of Section 3 Reduced Model:", r2_reduced_3)


Mean Squared Error of Section 3 Reduced Model: 0.17075224505188155
R^2 Score of Section 3 Reduced Model: 0.88336621709107


#### Looks like the R^2 decreased very slightly, but nonetheless, this model doesn't have a unnecessary regressor.

In [20]:
acc_sect_4.dtypes

YEAR             int64
ST_CASE          int64
NUM_PEDS         int64
PERSONS          int64
HOUR             int64
ROAD_NAME       object
MILE_MARKER    float64
SCH_BUS          int64
FATALS           int64
dtype: object

In [21]:
X_4 = acc_sect_4.drop(columns=['FATALS', 'ROAD_NAME'])  
y_4 = acc_sect_4['FATALS']

X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_4, y_4, test_size=0.25)

regr_mod_sect_4 = sm.OLS(y_train_4, X_train_4).fit()

print("Summary for Section 4 Model:")
print(regr_mod_sect_4.summary())


Summary for Section 4 Model:
                                 OLS Regression Results                                
Dep. Variable:                 FATALS   R-squared (uncentered):                   0.892
Model:                            OLS   Adj. R-squared (uncentered):              0.892
Method:                 Least Squares   F-statistic:                          2.387e+05
Date:                Thu, 14 Mar 2024   Prob (F-statistic):                        0.00
Time:                        22:37:09   Log-Likelihood:                         -95213.
No. Observations:              201456   AIC:                                  1.904e+05
Df Residuals:                  201449   BIC:                                  1.905e+05
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [22]:
y_preds_4 = regr_mod_sect_4.predict(X_test_4)

mse_4 = mean_squared_error(y_test_4, y_preds_4)

r2_4 = regr_mod_sect_4.rsquared

print("Mean Squared Error Section 4 regr_mod_sect_4:", mse_4)
print("R^2 Score Section 4 regr_mod_sect_4:", r2_4)


Mean Squared Error Section 4 regr_mod_sect_4: 0.14714673582036825
R^2 Score Section 4 regr_mod_sect_4: 0.8924268672967229


#### Based on the model summary printout, all regressors are statistically significant except MILE_MARKER, which has a p-value = 0.168 so a reduced model without it will be created below

In [23]:
X_4 = acc_sect_4.drop(columns=['FATALS', 'ROAD_NAME', 'MILE_MARKER'])  
y_4 = acc_sect_4['FATALS']

X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_4, y_4, test_size=0.25)

regr_mod_reduced_sect_4 = sm.OLS(y_train_4, X_train_4).fit()

print("Summary for Reduced Section 4 Model:")
print(regr_mod_reduced_sect_4.summary())


Summary for Reduced Section 4 Model:
                                 OLS Regression Results                                
Dep. Variable:                 FATALS   R-squared (uncentered):                   0.894
Model:                            OLS   Adj. R-squared (uncentered):              0.894
Method:                 Least Squares   F-statistic:                          2.827e+05
Date:                Thu, 14 Mar 2024   Prob (F-statistic):                        0.00
Time:                        22:37:09   Log-Likelihood:                         -93780.
No. Observations:              201456   AIC:                                  1.876e+05
Df Residuals:                  201450   BIC:                                  1.876e+05
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
----

In [24]:
y_preds_reduced_4 = regr_mod_reduced_sect_4.predict(X_test_4)

mse_reduced_4 = mean_squared_error(y_test_4, y_preds_reduced_4)

r2_reduced_4 = regr_mod_reduced_sect_4.rsquared

print("Mean Squared Error of Section 4 Reduced Model:", mse_reduced_4)
print("R^2 Score of Section 4 Reduced Model:", r2_reduced_4)


Mean Squared Error of Section 4 Reduced Model: 0.15354249695212704
R^2 Score of Section 4 Reduced Model: 0.8938295080881132


#### Looks like the R^2 decreased slightly, but all the remaining regressors statistically significantly contribute to the response. Nonetheless, this model doesn't have a unnecessary regressor.

In [25]:
acc_sect_5.dtypes

YEAR             int64
ST_CASE          int64
NUM_PEDS         int64
NUM_VEHC         int64
PERSONS          int64
HOUR             int64
ROAD_NAME       object
MILE_MARKER    float64
LATITUDE       float64
LONGITUD       float64
SCH_BUS          int64
FATALS           int64
dtype: object

In [26]:
X_5 = acc_sect_5.drop(columns=['FATALS', 'ROAD_NAME'])  
y_5 = acc_sect_5['FATALS']

X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X_5, y_5, test_size=0.25)

regr_mod_sect_5 = sm.OLS(y_train_5, X_train_5).fit()

print("Summary for Section 4 Model:")
print(regr_mod_sect_5.summary())

Summary for Section 4 Model:
                                 OLS Regression Results                                
Dep. Variable:                 FATALS   R-squared (uncentered):                   0.908
Model:                            OLS   Adj. R-squared (uncentered):              0.908
Method:                 Least Squares   F-statistic:                          1.605e+05
Date:                Thu, 14 Mar 2024   Prob (F-statistic):                        0.00
Time:                        22:37:09   Log-Likelihood:                         -59407.
No. Observations:              162345   AIC:                                  1.188e+05
Df Residuals:                  162335   BIC:                                  1.189e+05
Df Model:                          10                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [27]:
y_preds_5 = regr_mod_sect_5.predict(X_test_5)

mse_5 = mean_squared_error(y_test_5, y_preds_5)

r2_5 = regr_mod_sect_5.rsquared

print("Mean Squared Error Section 5 regr_mod_sect_5:", mse_5)
print("R^2 Score Section 5 regr_mod_sect_5:", r2_5)


Mean Squared Error Section 5 regr_mod_sect_5: 0.12145742865691549
R^2 Score Section 5 regr_mod_sect_5: 0.9081418938347336


#### Based on the model summary printout, all regressors are statistically significant except MILE_MARKER and LATITUDE, which have p-values of 0.061 and 0.519, respectively. So, a reduced model without it will be created below

In [28]:
X_5 = acc_sect_5.drop(columns=['FATALS', 'ROAD_NAME', 'MILE_MARKER', 'LATITUDE'])  
y_5 = acc_sect_5['FATALS']

X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X_5, y_5, test_size=0.25)

regr_mod_reduced_sect_5 = sm.OLS(y_train_5, X_train_5).fit()

print("Summary for Reduced Section 5 Model:")
print(regr_mod_reduced_sect_5.summary())


Summary for Reduced Section 5 Model:
                                 OLS Regression Results                                
Dep. Variable:                 FATALS   R-squared (uncentered):                   0.908
Model:                            OLS   Adj. R-squared (uncentered):              0.908
Method:                 Least Squares   F-statistic:                          1.998e+05
Date:                Thu, 14 Mar 2024   Prob (F-statistic):                        0.00
Time:                        22:37:09   Log-Likelihood:                         -59953.
No. Observations:              162345   AIC:                                  1.199e+05
Df Residuals:                  162337   BIC:                                  1.200e+05
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
----

In [29]:
y_preds_reduced_5 = regr_mod_reduced_sect_5.predict(X_test_5)

mse_reduced_5 = mean_squared_error(y_test_5, y_preds_reduced_5)

r2_reduced_5 = regr_mod_reduced_sect_5.rsquared

print("Mean Squared Error of Section 4 Reduced Model:", mse_reduced_5)
print("R^2 Score of Section 4 Reduced Model:", r2_reduced_5)


Mean Squared Error of Section 4 Reduced Model: 0.11906044829174185
R^2 Score of Section 4 Reduced Model: 0.9078129146297749


#### Looks like the R^2 increased slightly from 0.9080 to 0.9088, and all regressors of this reduced model are significant.

In [30]:
acc_sect_6.dtypes

YEAR                int64
STATENAME          object
ST_CASE             int64
NUM_PEDS            int64
NUM_VEHC            int64
PERSONS             int64
MONTHNAME          object
DAYNAME             int64
DAY_WEEKNAME       object
HOUR                int64
ROAD_NAME          object
ROUTENAME          object
RUR_URBNAME        object
FUNC_SYSNAME       object
RD_OWNERNAME       object
MILE_MARKER         int64
LATITUDE          float64
LONGITUD          float64
HARM_EVNAME        object
COLLISION_TYPE     object
TYP_INTNAME        object
REL_ROADNAME       object
WRK_ZONENAME       object
LGT_CONDNAME       object
WEATHERNAME        object
SCH_BUS             int64
FATALS              int64
COUNTYNAME         object
CITYNAME           object
dtype: object

In [31]:
for column in acc_sect_6.columns:
    unique_count = acc_sect_6[column].nunique()
    print(f"Number of unique values in column '{column}': {unique_count}")


Number of unique values in column 'YEAR': 7
Number of unique values in column 'STATENAME': 51
Number of unique values in column 'ST_CASE': 40822
Number of unique values in column 'NUM_PEDS': 14
Number of unique values in column 'NUM_VEHC': 29
Number of unique values in column 'PERSONS': 59
Number of unique values in column 'MONTHNAME': 12
Number of unique values in column 'DAYNAME': 31
Number of unique values in column 'DAY_WEEKNAME': 7
Number of unique values in column 'HOUR': 25
Number of unique values in column 'ROAD_NAME': 80591
Number of unique values in column 'ROUTENAME': 9
Number of unique values in column 'RUR_URBNAME': 5
Number of unique values in column 'FUNC_SYSNAME': 10
Number of unique values in column 'RD_OWNERNAME': 29
Number of unique values in column 'MILE_MARKER': 4121
Number of unique values in column 'LATITUDE': 236310
Number of unique values in column 'LONGITUD': 239716
Number of unique values in column 'HARM_EVNAME': 60
Number of unique values in column 'COLLISIO

Will make these variables with fewer levels into dummy variables:

MONTHNAME
DAY_WEEKNAME
ROUTENAME
RUR_URBNAME
FUNC_SYSNAME
COLLISION_TYPE
TYP_INTNAME
REL_ROADNAME
WRK_ZONENAME
LGT_CONDNAME
WEATHERNAME

Will exclude these variables because of too many levels:

STATENAME
ROAD_NAME
RD_OWNERNAME
HARM_EVNAME
COUNTYNAME
CITYNAME

In [32]:
columns_dummy = ['MONTHNAME', 'DAY_WEEKNAME', 'ROUTENAME', 'RUR_URBNAME', 
                    'FUNC_SYSNAME', 'COLLISION_TYPE', 'TYP_INTNAME', 'REL_ROADNAME', 
                    'WRK_ZONENAME', 'LGT_CONDNAME', 'WEATHERNAME']

dummy_df_6 = pd.get_dummies(acc_sect_6, columns=columns_dummy)

dummy_df_6 = dummy_df_6.drop(columns=['STATENAME', 'ROAD_NAME', 'RD_OWNERNAME', 
                                      'HARM_EVNAME', 'COUNTYNAME', 'CITYNAME'])  

dummy_df_6.head()


Unnamed: 0,YEAR,ST_CASE,NUM_PEDS,NUM_VEHC,PERSONS,DAYNAME,HOUR,MILE_MARKER,LATITUDE,LONGITUD,...,"WEATHERNAME_Fog, Smog, Smoke",WEATHERNAME_Freezing Rain or Drizzle,WEATHERNAME_Not Reported,WEATHERNAME_Other,WEATHERNAME_Rain,WEATHERNAME_Reported as Unknown,WEATHERNAME_Severe Crosswinds,WEATHERNAME_Sleet or Hail,WEATHERNAME_Snow,WEATHERNAME_Unknown
0,2015,10001,0,1,1,1,2,1754,33.878653,-87.325328,...,0,0,0,0,0,0,0,0,0,0
1,2015,10002,0,1,1,1,22,3604,34.910442,-86.908708,...,0,0,0,0,0,0,0,0,0,0
2,2015,10003,0,1,2,1,1,1958,32.142006,-85.758456,...,0,0,0,0,0,0,0,0,0,0
3,2015,10004,0,1,1,4,0,566,31.439814,-85.5103,...,0,0,0,0,0,0,0,0,0,0
4,2015,10005,0,2,2,7,7,308,31.319331,-85.5151,...,0,0,0,0,0,0,0,0,0,0


In [33]:
dummy_df_6.shape

(244695, 123)

In [34]:
object_columns = dummy_df_6.select_dtypes(include=['object']).columns
dummy_df_6[object_columns].dtypes


Series([], dtype: object)

In [35]:
X_6 = dummy_df_6.drop(columns=['FATALS'])  

y_6 = acc_sect_6['FATALS']

X_train_6, X_test_6, y_train_6, y_test_6 = train_test_split(X_6, y_6, test_size=0.25)

regr_mod_sect_6 = sm.OLS(y_train_6, X_train_6).fit()


In [36]:
# More organized print out for summary of Section 6 Model
print("Summary for Section 6 Model:")
print(f"{'Variable':<30} {'Coefficient':<15} {'Std. Error':<15} {'t-value':<15} {'P-value':<15}")
print("-" * 88)
for i, name in enumerate(regr_mod_sect_6.model.exog_names):
    coef = round(regr_mod_sect_6.params[i], 4)
    std_err = round(regr_mod_sect_6.bse[i], 4)
    t_value = round(regr_mod_sect_6.tvalues[i], 4)
    p_value = round(regr_mod_sect_6.pvalues[i], 4)
    print(f"{name:<30} {coef:<15} {std_err:<15} {t_value:<15} {p_value:<15}")
print("-" * 88)
print(f"R-squared: {round(regr_mod_sect_6.rsquared, 4)}")


Summary for Section 6 Model:
Variable                       Coefficient     Std. Error      t-value         P-value        
----------------------------------------------------------------------------------------
YEAR                           0.0019          0.0005          3.6061          0.0003         
ST_CASE                        0.0             0.0             1.045           0.296          
NUM_PEDS                       0.0563          0.0024          23.434          0.0            
NUM_VEHC                       -0.0426         0.0014          -31.5147        0.0            
PERSONS                        0.0697          0.0006          120.2231        0.0            
DAYNAME                        0.0001          0.0001          0.633           0.5267         
HOUR                           -0.0005         0.0001          -6.0609         0.0            
MILE_MARKER                    0.0             0.0             0.2631          0.7925         
LATITUDE                   

In [32]:
y_preds_6 = regr_mod_sect_6.predict(X_test_6)

mse_6 = mean_squared_error(y_test_6, y_preds_6)

r2_6 = regr_mod_sect_6.rsquared

print("Mean Squared Error Section 6 regr_mod_sect_6:", mse_6)
print("R^2 Score Section 6 regr_mod_sect_6:", r2_6)


Mean Squared Error Section 6 regr_mod_sect_6: 0.11580739321989238
R^2 Score Section 6 regr_mod_sect_6: 0.11027930838246514


#### Based on the model summary printout in 1 iteration of the model, the following regressors are not significant since their p-values >= 0.05:

ST_CASE = 0.296

DAYNAME = 0.5267

MILE_MARKER = 0.7925  

LATITUDE = 0.0904

COLLISION_TYPE_Front-to-Front = 0.5474 

COLLISION_TYPE_Not Reported = 0.231

COLLISION_TYPE_Reported as Unknown = 0.0517

TYP_INTNAME_Other Intersection Type = 0.0561

TYP_INTNAME_Traffic Circle = 0.0881 

TYP_INTNAME_Unknown = 0.07

REL_ROADNAME_In Parking Lane/Zone = 0.1546

REL_ROADNAME_Pedestrian Refuge Island or Traffic Island = 0.1728

WRK_ZONENAME_Utility = 0.0522

WEATHERNAME_Blowing Sand, Soil, Dirt = 0.4962

WEATHERNAME_Unknown = 0.0568

In [37]:
mod_6_insig_cols = ['ST_CASE', 'DAYNAME', 'MILE_MARKER', 'LATITUDE', 
                     'COLLISION_TYPE_Front-to-Front', 'COLLISION_TYPE_Not Reported', 
                     'COLLISION_TYPE_Reported as Unknown', 
                     'TYP_INTNAME_Other Intersection Type', 
                     'TYP_INTNAME_Traffic Circle', 'TYP_INTNAME_Unknown', 
                     'REL_ROADNAME_In Parking Lane/Zone', 
                     'REL_ROADNAME_Pedestrian Refuge Island or Traffic Island', 
                     'WRK_ZONENAME_Utility', 'WEATHERNAME_Blowing Sand, Soil, Dirt', 
                     'WEATHERNAME_Unknown']

In [38]:
X_6 = dummy_df_6.drop(columns=mod_6_insig_cols)  
y_6 = dummy_df_6['FATALS']

X_train_6, X_test_6, y_train_6, y_test_6 = train_test_split(X_6, y_6, test_size=0.25)

regr_mod_reduced_sect_6 = sm.OLS(y_train_6, X_train_6).fit()


Summary for Reduced Section 6 Model:
                            OLS Regression Results                            
Dep. Variable:                 FATALS   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.259e+30
Date:                Thu, 14 Mar 2024   Prob (F-statistic):               0.00
Time:                        22:54:41   Log-Likelihood:             5.6000e+06
No. Observations:              183521   AIC:                        -1.120e+07
Df Residuals:                  183419   BIC:                        -1.120e+07
Df Model:                         101                                         
Covariance Type:            nonrobust                                         
                                                                     coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------

In [39]:
# More organized print out for summary of Section 6 Model
print("Summary for Reduced Section 6 Model:")
print(f"{'Variable':<30} {'Coefficient':<15} {'Std. Error':<15} {'t-value':<15} {'P-value':<15}")
print("-" * 88)
for i, name in enumerate(regr_mod_reduced_sect_6.model.exog_names):
    coef = round(regr_mod_reduced_sect_6.params[i], 4)
    std_err = round(regr_mod_reduced_sect_6.bse[i], 4)
    t_value = round(regr_mod_reduced_sect_6.tvalues[i], 4)
    p_value = round(regr_mod_reduced_sect_6.pvalues[i], 4)
    print(f"{name:<30} {coef:<15} {std_err:<15} {t_value:<15} {p_value:<15}")
print("-" * 88)
print(f"R-squared: {round(regr_mod_reduced_sect_6.rsquared, 4)}")


Summary for Reduced Section 6 Model:
Variable                       Coefficient     Std. Error      t-value         P-value        
----------------------------------------------------------------------------------------
YEAR                           -0.0            0.0             -18.1564        0.0            
NUM_PEDS                       -0.0            0.0             -23.9179        0.0            
NUM_VEHC                       0.0             0.0             17.7868         0.0            
PERSONS                        -0.0            0.0             -11.3406        0.0            
HOUR                           -0.0            0.0             -39.3762        0.0            
LONGITUD                       0.0             0.0             7.0997          0.0            
SCH_BUS                        -0.0            0.0             -1.8019         0.0716         
FATALS                         1.0             0.0             1.0600439089396924e+16 0.0            
MONTHNAME_Ap

In [41]:
y_preds_reduced_6 = regr_mod_reduced_sect_6.predict(X_test_6)

mse_reduced_6 = mean_squared_error(y_test_6, y_preds_reduced_6)

r2_reduced_6 = regr_mod_reduced_sect_6.rsquared

print("Mean Squared Error of Section 6 Reduced Model:", mse_reduced_6)
print("R^2 Score of Section 6 Reduced Model:", r2_reduced_6)


Mean Squared Error of Section 6 Reduced Model: 1.820609569692231e-28
R^2 Score of Section 6 Reduced Model: 1.0


#### Looks like the R^2 increased significantly from 0.1103 to 1.0, meaning that getting rid of these insignificant regressors definitely improved the model quality at predicting on the test data. However, it is a bit surprising that the R^2 value is a perfect 1.0