In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats.mstats import winsorize

In [2]:
data_df = pd.read_csv('absolute change regression data.csv')
data_df = data_df.drop('station_geom', axis = 1)        # don't need geometry for regression

### Ridership data is in thousands but census data is in raw counts → scale

In [3]:
# ridership is in thousands → divide by 1000 to get figures in millions
data_df['ridership_2013'] = data_df['ridership_2013'] / 1000
data_df['rider_change'] = data_df['rider_change'] / 1000

In [4]:
# employees → for every 1000 additional employees
data_df['emp_change'] = data_df['emp_change'] / 1000

# businesses → for every 100 additional businesses
data_df['bus_change'] = data_df['bus_change'] / 100

# population → for every 1000 additional population
data_df['pop_change'] = data_df['pop_change'] / 1000

# college age → for every 1000 additional 18-24 yr olds
data_df['college_age_change'] = data_df['college_age_change'] / 1000

# young professional → for every 1000 additional 25-39 yr olds
data_df['young_prof_age_change'] = data_df['young_prof_age_change'] / 1000

# median income → for every $1000 additional income
data_df['med_income_change'] = data_df['med_income_change'] / 1000

# median rent → for every $100 additional rent
data_df['med_rent_change'] = data_df['med_rent_change'] / 100

# housing units → for every 100 additional housing unit
data_df['housing_units_change'] = data_df['housing_units_change'] / 100

In [5]:
    # median age can stay as every 1 year increase
    # woman share and renter share are already in percent increase / decrease

### Skew & Winsorizing

In [6]:
data_df.drop(['station_complex_name','borough','routes','route_count',
              'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1).skew()

        # extreme skew > 2
                # baseline ridership
                # employees
                # median rent


        # high skew > 1
                # ridership
                # businesses
                # college age
                # young professional
                # median income
                # housing units


ridership_2013           5.029658
rider_change             1.086831
emp_change               2.547076
bus_change              -1.182375
pop_change               0.437095
women_share_change      -0.251318
med_age_change           0.411629
college_age_change      -1.780760
young_prof_age_change    1.103711
med_income_change        1.009868
med_rent_change          2.928973
housing_units_change     1.126584
renter_share_change     -0.542555
dtype: float64

In [7]:
# save new df for winsorizing
win_df = data_df.drop(['station_complex_name','borough','routes','route_count',
                       'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1)

# try 1% winsorize

# skew > 2
win_df['ridership_2013'] = winsorize(win_df['med_rent_change'], limits = [0.01, 0.01])
win_df['emp_change'] = winsorize(win_df['emp_change'], limits = [0.01, 0.01])
win_df['med_rent_change'] = winsorize(win_df['med_rent_change'], limits = [0.01, 0.01])

# skew > 1
win_df['rider_change'] = winsorize(win_df['rider_change'], limits = [0.01, 0.01])
win_df['bus_change'] = winsorize(win_df['bus_change'], limits = [0.01, 0.01])
win_df['college_age_change'] = winsorize(win_df['college_age_change'], limits = [0.01, 0.01])
win_df['young_prof_age_change'] = winsorize(win_df['young_prof_age_change'], limits = [0.01, 0.01])
win_df['med_income_change'] = winsorize(win_df['med_income_change'], limits = [0.01, 0.01])
win_df['housing_units_change'] = winsorize(win_df['housing_units_change'], limits = [0.01, 0.01])

win_df.skew()

ridership_2013           2.901185
rider_change            -1.985712
emp_change               2.388571
bus_change              -1.274231
pop_change               0.437095
women_share_change      -0.251318
med_age_change           0.411629
college_age_change      -1.397583
young_prof_age_change    1.032844
med_income_change        1.016314
med_rent_change          2.901185
housing_units_change     0.948263
renter_share_change     -0.542555
dtype: float64

In [8]:
win_df = data_df.drop(['station_complex_name','borough','routes','route_count',
                       'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1)

# try 5% winsorize

# skew > 2
win_df['ridership_2013'] = winsorize(win_df['med_rent_change'], limits = [0.05, 0.05])                  # extreme skew
win_df['emp_change'] = winsorize(win_df['emp_change'], limits = [0.05, 0.05])                           # extreme skew
win_df['med_rent_change'] = winsorize(win_df['med_rent_change'], limits = [0.05, 0.05])                 # extreme skew

# skew > 1
win_df['rider_change'] = winsorize(win_df['rider_change'], limits = [0.05, 0.05])                       # high skew
win_df['bus_change'] = winsorize(win_df['bus_change'], limits = [0.05, 0.05])                           # high skew
win_df['college_age_change'] = winsorize(win_df['college_age_change'], limits = [0.05, 0.05])           # high skew
win_df['young_prof_age_change'] = winsorize(win_df['young_prof_age_change'], limits = [0.05, 0.05])     # high skew
win_df['med_income_change'] = winsorize(win_df['med_income_change'], limits = [0.05, 0.05])             # high skew
win_df['housing_units_change'] = winsorize(win_df['housing_units_change'], limits = [0.05, 0.05])       # high skew

win_df.skew()

ridership_2013           1.306683
rider_change            -0.523526
emp_change               1.854908
bus_change              -0.244193
pop_change               0.437095
women_share_change      -0.251318
med_age_change           0.411629
college_age_change      -0.827061
young_prof_age_change    0.339508
med_income_change        0.429484
med_rent_change          1.306683
housing_units_change     0.666007
renter_share_change     -0.542555
dtype: float64

In [9]:
win_df = data_df.drop(['station_complex_name','borough','routes','route_count',
                       'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1)

# base ridership, employees, college age and median rent still high → 10%

# skew > 2
win_df['ridership_2013'] = winsorize(win_df['med_rent_change'], limits = [0.1, 0.1])                    # high skew
win_df['emp_change'] = winsorize(win_df['emp_change'], limits = [0.1, 0.1])                             # high skew
win_df['med_rent_change'] = winsorize(win_df['med_rent_change'], limits = [0.1, 0.1])                   # high skew

# skew > 1
win_df['rider_change'] = winsorize(win_df['rider_change'], limits = [0.05, 0.05])                       # good at 5%
win_df['bus_change'] = winsorize(win_df['bus_change'], limits = [0.05, 0.05])                           # good at 5%
win_df['college_age_change'] = winsorize(win_df['college_age_change'], limits = [0.1, 0.1])             # moderate skew
win_df['young_prof_age_change'] = winsorize(win_df['young_prof_age_change'], limits = [0.05, 0.05])     # good at 5%
win_df['med_income_change'] = winsorize(win_df['med_income_change'], limits = [0.05, 0.05])             # good at 5%
win_df['housing_units_change'] = winsorize(win_df['housing_units_change'], limits = [0.05, 0.05])       # still moderate but closer to 0.5 → keep

win_df.skew()

ridership_2013           0.811881
rider_change            -0.523526
emp_change               1.068921
bus_change              -0.244193
pop_change               0.437095
women_share_change      -0.251318
med_age_change           0.411629
college_age_change      -0.654854
young_prof_age_change    0.339508
med_income_change        0.429484
med_rent_change          0.811881
housing_units_change     0.666007
renter_share_change     -0.542555
dtype: float64

In [10]:
win_df = data_df.drop(['station_complex_name','borough','routes','route_count',
                       'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1)

# base ridership, employees and median rent still high or close to high skew → try log transformation
win_df['emp_change'] = np.log1p(win_df['emp_change'])
win_df['med_rent_change'] = np.log1p(win_df['med_rent_change'])
win_df['ridership_2013'] = np.log1p(win_df['ridership_2013'])

print(win_df[['emp_change','med_rent_change']].skew())
print('')
print(win_df[['emp_change','med_rent_change','ridership_2013']].isna().sum())

emp_change        -0.248495
med_rent_change    1.290290
dtype: float64

emp_change         1
med_rent_change    0
ridership_2013     0
dtype: int64


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [11]:
win_df = data_df.drop(['station_complex_name','borough','routes','route_count',
                       'line_ACE','line_123','line_BDFM','line_456','line_NQRW','line_7','line_L','line_G','line_JZ'], axis = 1)

# employees and median rent look better but log transformation of employee change created nulls → go further with winsorization
win_df['emp_change'] = winsorize(win_df['emp_change'], limits = [0.15, 0.15])
win_df['emp_change'].skew()

np.float64(0.8698129906834839)

In [12]:
# skew looks better → finalize into dataset

# skew > 2
data_df['ridership_2013'] = np.log1p(data_df['ridership_2013'])
data_df['emp_change'] = winsorize(data_df['emp_change'], limits = [0.15, 0.15])
data_df['med_rent_change'] = np.log1p(data_df['med_rent_change'])

# skew > 1
data_df['rider_change'] = winsorize(data_df['rider_change'], limits = [0.05, 0.05])
data_df['bus_change'] = winsorize(data_df['bus_change'], limits = [0.05, 0.05])
data_df['college_age_change'] = winsorize(data_df['college_age_change'], limits = [0.1, 0.1])
data_df['young_prof_age_change'] = winsorize(data_df['young_prof_age_change'], limits = [0.05, 0.05])
data_df['med_income_change'] = winsorize(data_df['med_income_change'], limits = [0.05, 0.05])
data_df['housing_units_change'] = winsorize(data_df['housing_units_change'], limits = [0.1, 0.1])

### Check Multicollinearity

In [13]:
X = data_df.drop(['station_complex_name','borough','routes'], axis = 1)

# add intercept
X = sm.add_constant(X)
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data)

                  feature        VIF
0                   const  43.044459
1             route_count   3.405747
2                line_ACE   2.153435
3                line_123   2.177959
4               line_BDFM   2.063059
5                line_456   2.242041
6               line_NQRW   2.330078
7                  line_7   1.613072
8                  line_L   1.584452
9                  line_G   1.408649
10                line_JZ   1.749027
11         ridership_2013   2.517035
12           rider_change   1.356937
13             emp_change   2.111639
14             bus_change   2.463538
15             pop_change   3.151579
16     women_share_change   1.386351
17         med_age_change   1.687241
18     college_age_change   2.825912
19  young_prof_age_change   3.770776
20      med_income_change   4.811747
21        med_rent_change   4.871568
22   housing_units_change   3.848020
23    renter_share_change   1.753778


In [14]:
# all under 5 → stable

# Model

In [15]:
# make Manhattan the baseline borough as it's at the core of the transit system
data_df['borough'] = pd.Categorical(data_df['borough'], categories = ['Manhattan','Bronx','Brooklyn','Queens'])

In [16]:
formula = """
rider_change ~ borough
    + route_count
    + ridership_2013
    + emp_change
    + bus_change
    + pop_change
    + women_share_change
    + med_age_change
    + college_age_change
    + young_prof_age_change
    + med_income_change
    + med_rent_change
    + housing_units_change
    + renter_share_change
    + line_ACE
    + line_123
    + line_BDFM
    + line_456
    + line_NQRW
    + line_7
    + line_L
    + line_G
    + line_JZ
"""
model = smf.ols(formula = formula, data = data_df).fit(cov_type = 'HC3')
model.summary(slim = True)

0,1,2,3
Dep. Variable:,rider_change,R-squared:,0.319
Model:,OLS,Adj. R-squared:,0.275
No. Observations:,418,F-statistic:,8.814
Covariance Type:,HC3,Prob (F-statistic):,2.26e-25

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.4254,0.488,-0.871,0.384,-1.383,0.532
borough[T.Bronx],1.0013,0.310,3.233,0.001,0.394,1.608
borough[T.Brooklyn],0.0247,0.241,0.103,0.918,-0.447,0.496
borough[T.Queens],-0.5736,0.338,-1.697,0.090,-1.236,0.089
route_count,0.1450,0.105,1.378,0.168,-0.061,0.351
ridership_2013,-0.6578,0.112,-5.886,0.000,-0.877,-0.439
emp_change,-0.0030,0.008,-0.356,0.722,-0.019,0.013
bus_change,-0.0207,0.021,-0.977,0.329,-0.062,0.021
pop_change,-0.0156,0.010,-1.502,0.133,-0.036,0.005


#### Make summary more readable

In [17]:
# use * for significance
def significance_stars(p):
    if p < 0.01:
        return '***'
    elif p < 0.05:
        return '**'
    elif p < 0.10:
        return '*'
    else:
        return 'NS'
    
results = model

# create summary dataframe
summary_df = pd.DataFrame({
    'Variable': results.params.index,
    'Coefficient': results.params.values.round(3),
    'p_value': results.pvalues.values
})

# use * for significance
summary_df['Significance'] = summary_df['p_value'].apply(significance_stars)
summary_df = summary_df[['Variable', 'Coefficient', 'Significance']]

In [18]:
# rename variables for better readability
readable_names = {
    'Intercept':'Intercept (baseline when all variables = 0)',
    'borough[T.Brooklyn]':'Brooklyn',
    'borough[T.Queens]':'Queens',
    'borough[T.Bronx]':'Bronx',
    'route_count':'Number of Subway Lines',
    'ridership_2013':'Baseline Ridership (in 2013)',
    'emp_change':'Employee Count',
    'bus_change':'Business Count',
    'pop_change':'Population',
    'women_share_change':'Percent of Women',
    'med_age_change':'Median Age',
    'college_age_change':'18-24 Year Olds',
    'young_prof_age_change':'25-39 Year Olds',
    'med_income_change':'Median Income',
    'med_rent_change':'Median Rent',
    'housing_units_change':'Housing Units',
    'renter_share_change':'Percent of Renters',
    'line_ACE':'ACE Line',
    'line_123':'123 Line',
    'line_BDFM':'BDFM Line',
    'line_456':'456 Line',
    'line_NQRW':'NQRW Line',
    'line_7':'7 Line',
    'line_L':'L Line',
    'line_G':'G Line',
    'line_JZ':'JZ Line'
}

summary_df['Variable'] = summary_df['Variable'].replace(readable_names)

In [19]:
# insert rows for headers
boro_header = pd.DataFrame([['Borough Category — Relative to Manhattan','','']], columns = summary_df.columns)
summary_df = pd.concat([summary_df.iloc[:1], boro_header, summary_df.iloc[1:]]).reset_index(drop = True)

cont_header = pd.DataFrame([['Continuous Variables','','']], columns = summary_df.columns)
summary_df = pd.concat([summary_df.iloc[:5], cont_header, summary_df.iloc[5:]]).reset_index(drop = True)

interact_header = pd.DataFrame([['Subway Line Indicator','','']], columns = summary_df.columns)
summary_df = pd.concat([summary_df.iloc[:19], interact_header, summary_df.iloc[19:]]).reset_index(drop = True)

# insert spacing
blank_row = pd.DataFrame([['','','']], columns=summary_df.columns)
summary_df = pd.concat([summary_df.iloc[:1], blank_row, summary_df.iloc[1:]]).reset_index(drop = True)
summary_df = pd.concat([summary_df.iloc[:6], blank_row, summary_df.iloc[6:]]).reset_index(drop = True)
summary_df = pd.concat([summary_df.iloc[:21], blank_row, summary_df.iloc[21:]]).reset_index(drop = True)

In [20]:
print('NYC Subway Ridership (2013-2018) — OLS Regression Results')
print('Absolute Change Model')
print('')
print('r^2:', results.rsquared.round(4))
print('Significance: *** p<0.01 | ** p<0.05 | * p<0.10 | NS = Not Significant')
print('Coefficient: expected change in annual riders (in millions)')
summary_df

NYC Subway Ridership (2013-2018) — OLS Regression Results
Absolute Change Model

r^2: 0.3188
Significance: *** p<0.01 | ** p<0.05 | * p<0.10 | NS = Not Significant
Coefficient: expected change in annual riders (in millions)


Unnamed: 0,Variable,Coefficient,Significance
0,Intercept (baseline when all variables = 0),-0.425,NS
1,,,
2,Borough Category — Relative to Manhattan,,
3,Bronx,1.001,***
4,Brooklyn,0.025,NS
5,Queens,-0.574,*
6,,,
7,Continuous Variables,,
8,Number of Subway Lines,0.145,NS
9,Baseline Ridership (in 2013),-0.658,***
