In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats.mstats import winsorize

In [2]:
data_df = pd.read_csv('percent change regression data.csv')
data_df = data_df.drop('station_geom', axis = 1)        # don't need geometry for regression

### Skew & Winsorizing

In [3]:
data_df.drop(['station_complex_name','borough','routes','route_count',
              'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1).skew()


        # extreme skew > 2
                # ridership
                # employee
                # median rent
                # housing units

        # need to winsorize

rider_change             3.215681
emp_change               5.571259
bus_change               0.158823
pop_change               0.574156
women_share_change      -0.251318
med_age_change           0.517981
college_age_change      -0.375009
young_prof_age_change    0.553877
med_income_change        0.092778
med_rent_change          2.517839
housing_units_change     3.783578
renter_share_change     -0.542555
dtype: float64

In [4]:
# save new df for winsorizing
win_df = data_df.drop(['station_complex_name','borough','routes','route_count',
                       'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1)

# try 1% winsorize
win_df['rider_change'] = winsorize(win_df['rider_change'], limits = [0.01, 0.01])
win_df['emp_change'] = winsorize(win_df['emp_change'], limits = [0.01, 0.01])
win_df['med_rent_change'] = winsorize(win_df['med_rent_change'], limits = [0.01, 0.01])
win_df['housing_units_change'] = winsorize(win_df['housing_units_change'], limits = [0.01, 0.01])

win_df.skew()

rider_change             1.462070
emp_change               3.692988
bus_change               0.158823
pop_change               0.574156
women_share_change      -0.251318
med_age_change           0.517981
college_age_change      -0.375009
young_prof_age_change    0.553877
med_income_change        0.092778
med_rent_change          2.524603
housing_units_change     1.474229
renter_share_change     -0.542555
dtype: float64

In [5]:
win_df = data_df.drop(['station_complex_name','borough','routes','route_count',
                       'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1)

# try 5% winsorize
win_df['rider_change'] = winsorize(win_df['rider_change'], limits = [0.05, 0.05])
win_df['emp_change'] = winsorize(win_df['emp_change'], limits = [0.05, 0.05])
win_df['med_rent_change'] = winsorize(win_df['med_rent_change'], limits = [0.05, 0.05])
win_df['housing_units_change'] = winsorize(win_df['housing_units_change'], limits = [0.05, 0.05])

win_df.skew()

rider_change             0.203040
emp_change               1.901686
bus_change               0.158823
pop_change               0.574156
women_share_change      -0.251318
med_age_change           0.517981
college_age_change      -0.375009
young_prof_age_change    0.553877
med_income_change        0.092778
med_rent_change          1.545617
housing_units_change     0.253450
renter_share_change     -0.542555
dtype: float64

In [6]:
win_df = data_df.drop(['station_complex_name','borough','routes','route_count',
                       'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1)

# try 10% winsorize for employee and median rent
win_df['rider_change'] = winsorize(win_df['rider_change'], limits = [0.05, 0.05])                           # good
win_df['emp_change'] = winsorize(win_df['emp_change'], limits = [0.1, 0.1])
win_df['med_rent_change'] = winsorize(win_df['med_rent_change'], limits = [0.1, 0.1])
win_df['housing_units_change'] = winsorize(win_df['housing_units_change'], limits = [0.05, 0.05])           # good

win_df.skew()

rider_change             0.203040
emp_change               0.837628
bus_change               0.158823
pop_change               0.574156
women_share_change      -0.251318
med_age_change           0.517981
college_age_change      -0.375009
young_prof_age_change    0.553877
med_income_change        0.092778
med_rent_change          0.609662
housing_units_change     0.253450
renter_share_change     -0.542555
dtype: float64

In [7]:
# skew looks better → finalize into dataset

data_df['rider_change'] = winsorize(data_df['rider_change'], limits = [0.05, 0.05])                           # good
data_df['emp_change'] = winsorize(data_df['emp_change'], limits = [0.1, 0.1])
data_df['med_rent_change'] = winsorize(data_df['med_rent_change'], limits = [0.1, 0.1])
data_df['housing_units_change'] = winsorize(data_df['housing_units_change'], limits = [0.05, 0.05])           # good

### Check Multicollinearity

In [8]:
X = data_df.drop(['station_complex_name','borough','routes'], axis = 1)

# add intercept
X = sm.add_constant(X)
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data)

                  feature        VIF
0                   const  41.992144
1             route_count   2.725132
2                line_ACE   2.222498
3                line_123   2.146749
4               line_BDFM   1.946253
5                line_456   2.164116
6               line_NQRW   2.338492
7                  line_7   1.615963
8                  line_L   1.504634
9                  line_G   1.440262
10                line_JZ   1.765526
11           rider_change   1.223225
12             emp_change   1.501755
13             bus_change   1.898725
14             pop_change   3.183510
15     women_share_change   1.446744
16         med_age_change   2.194785
17     college_age_change   2.095107
18  young_prof_age_change   4.530200
19      med_income_change   3.044424
20        med_rent_change   2.810926
21   housing_units_change   3.799770
22    renter_share_change   1.561442


In [9]:
# all under 5 → stable

# Model

In [10]:
# make Manhattan the baseline borough as it's at the core of the transit system
data_df['borough'] = pd.Categorical(data_df['borough'], categories = ['Manhattan','Bronx','Brooklyn','Queens'])

In [11]:
formula = """
rider_change ~ borough
    + route_count
    + emp_change
    + bus_change
    + pop_change
    + women_share_change
    + med_age_change
    + college_age_change
    + young_prof_age_change
    + med_income_change
    + med_rent_change
    + housing_units_change
    + renter_share_change
    + line_ACE
    + line_123
    + line_BDFM
    + line_456
    + line_NQRW
    + line_7
    + line_L
    + line_G
    + line_JZ
"""
model = smf.ols(formula = formula, data = data_df).fit(cov_type = 'HC3')
model.summary(slim = True)

0,1,2,3
Dep. Variable:,rider_change,R-squared:,0.19
Model:,OLS,Adj. R-squared:,0.141
No. Observations:,418,F-statistic:,4.884
Covariance Type:,HC3,Prob (F-statistic):,3.58e-12

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-17.2109,5.119,-3.362,0.001,-27.243,-7.179
borough[T.Bronx],4.2811,3.565,1.201,0.230,-2.705,11.268
borough[T.Brooklyn],-2.5908,3.214,-0.806,0.420,-8.890,3.708
borough[T.Queens],-2.4661,4.483,-0.550,0.582,-11.253,6.321
route_count,-0.1978,0.988,-0.200,0.841,-2.134,1.738
emp_change,0.0307,0.088,0.351,0.726,-0.141,0.203
bus_change,0.2812,0.228,1.236,0.217,-0.165,0.727
pop_change,0.3471,0.394,0.881,0.379,-0.425,1.120
women_share_change,-0.0306,1.155,-0.026,0.979,-2.294,2.233


### Make summary more readable

In [12]:
# use * for significance
def significance_stars(p):
    if p < 0.01:
        return '***'
    elif p < 0.05:
        return '**'
    elif p < 0.10:
        return '*'
    else:
        return 'NS'
    
results = model

# create summary dataframe
summary_df = pd.DataFrame({
    'Variable': results.params.index,
    'Coefficient': results.params.values.round(1),
    'p_value': results.pvalues.values
})

# use * for significance
summary_df['Significance'] = summary_df['p_value'].apply(significance_stars)
summary_df = summary_df[['Variable', 'Coefficient', 'Significance']]

In [13]:
# rename variables for better readability
readable_names = {
    'Intercept':'Intercept (baseline when all variables = 0)',
    'borough[T.Brooklyn]':'Brooklyn',
    'borough[T.Queens]':'Queens',
    'borough[T.Bronx]':'Bronx',
    'route_count':'Number of Subway Lines',
    'emp_change':'Employee Count',
    'bus_change':'Business Count',
    'pop_change':'Population',
    'women_share_change':'Percent of Women',
    'med_age_change':'Median Age',
    'college_age_change':'18-24 Year Olds',
    'young_prof_age_change':'25-39 Year Olds',
    'med_income_change':'Median Income',
    'med_rent_change':'Median Rent',
    'housing_units_change':'Housing Units',
    'renter_share_change':'Percent of Renters',
    'line_ACE':'ACE Line',
    'line_123':'123 Line',
    'line_BDFM':'BDFM Line',
    'line_456':'456 Line',
    'line_NQRW':'NQRW Line',
    'line_7':'7 Line',
    'line_L':'L Line',
    'line_G':'G Line',
    'line_JZ':'JZ Line'
}

summary_df['Variable'] = summary_df['Variable'].replace(readable_names)

In [14]:
# insert rows for headers
boro_header = pd.DataFrame([['Borough Category — Relative to Manhattan','','']], columns = summary_df.columns)
summary_df = pd.concat([summary_df.iloc[:1], boro_header, summary_df.iloc[1:]]).reset_index(drop = True)

cont_header = pd.DataFrame([['Continuous Variables (Percent Change)','','']], columns = summary_df.columns)
summary_df = pd.concat([summary_df.iloc[:5], cont_header, summary_df.iloc[5:]]).reset_index(drop = True)

interact_header = pd.DataFrame([['Subway Line Indicator','','']], columns = summary_df.columns)
summary_df = pd.concat([summary_df.iloc[:18], interact_header, summary_df.iloc[18:]]).reset_index(drop = True)

# insert spacing
blank_row = pd.DataFrame([['','','']], columns=summary_df.columns)
summary_df = pd.concat([summary_df.iloc[:1], blank_row, summary_df.iloc[1:]]).reset_index(drop = True)
summary_df = pd.concat([summary_df.iloc[:6], blank_row, summary_df.iloc[6:]]).reset_index(drop = True)
summary_df = pd.concat([summary_df.iloc[:20], blank_row, summary_df.iloc[20:]]).reset_index(drop = True)

In [15]:
print('NYC Subway Ridership (2013-2018) — OLS Regression Results')
print('Percent Change Model')
print('')
print('r^2:', results.rsquared.round(4))
print('Significance: *** p<0.01 | ** p<0.05 | * p<0.10 | NS = Not Significant')
print('Coefficient: expected percent change in annual riders')
summary_df

NYC Subway Ridership (2013-2018) — OLS Regression Results
Percent Change Model

r^2: 0.1901
Significance: *** p<0.01 | ** p<0.05 | * p<0.10 | NS = Not Significant
Coefficient: expected percent change in annual riders


Unnamed: 0,Variable,Coefficient,Significance
0,Intercept (baseline when all variables = 0),-17.2,***
1,,,
2,Borough Category — Relative to Manhattan,,
3,Bronx,4.3,NS
4,Brooklyn,-2.6,NS
5,Queens,-2.5,NS
6,,,
7,Continuous Variables (Percent Change),,
8,Number of Subway Lines,-0.2,NS
9,Employee Count,0.0,NS
