In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyspark as ps
import scipy.stats as stats
import sys
sys.path.append("..")
from src.support_functions import get_covid_data, fixing_datetime, get_zip_income

plt.style.use('ggplot')


In [2]:
# Load the three COVID tables from the project helper, unpacked in the order it returns them.
# NOTE(review): get_covid_data lives in src.support_functions and is not visible here,
# so the source and schema of each table should be confirmed there.
data_by_zip, tests_by_day, tests_by_boro = get_covid_data()

In [3]:
# Upper-case source headers of the per-ZIP table -> Title_Case names used in this notebook.
# rename() ignores keys that are absent, so re-running this cell is harmless.
zip_column_names = {
    "MODIFIED_ZCTA": "Zip",
    "NEIGHBORHOOD_NAME": "Neighborhood",
    "BOROUGH_GROUP": "Borough",
    "COVID_CASE_COUNT": "Covid_Case_Count",
    "COVID_CASE_RATE": "Covid_Case_Rate",
    "POP_DENOMINATOR": "Pop_Denominator",
    "COVID_DEATH_COUNT": "Covid_Death_Count",
    "COVID_DEATH_RATE": "Covid_Death_Rate",
    "PERCENT_POSITIVE": "Percent_Positive",
    "TOTAL_COVID_TESTS": "Total_Covid_Tests",
}
data_by_zip = data_by_zip.rename(columns=zip_column_names)

In [4]:
# Upper-case source headers of the daily-tests table -> Title_Case names used in this notebook.
# rename() ignores keys that are absent, so re-running this cell is harmless.
day_column_names = {
    "DATE": "Date",
    "TOTAL_TESTS": "Total_Tests",
    "POSITIVE_TESTS": "Positive_Tests",
    "PERCENT_POSITIVE": "Percent_Positive",
    "TOTAL_TESTS_7DAYS_AVG": "Total_Tests_7Days_AVG",
    "POSITIVE_TESTS_7DAYS_AVG": "Positive_Tests_7Days_AVG",
    "PERCENT_POSITIVE_7DAYS_AVG": "Percent_Positive_7Days_AVG",
    "INCOMPLETE": "Incomplete",
}
tests_by_day = tests_by_day.rename(columns=day_column_names)

In [5]:
# Upper-case source headers of the per-borough table -> Title_Case names used in this notebook.
# rename() ignores keys that are absent, so re-running this cell is harmless.
boro_column_names = {
    "BOROUGH_GROUP": "Borough",
    "CASE_RATE": "Case_Rate",
    "HOSPITALIZED_RATE": "Hospitalized_Rate",
    "DEATH_RATE": "Death_Rate",
    "CASE_COUNT": "Case_Count",
    "HOSPITALIZED_COUNT": "Hospitalized_Count",
    "DEATH_COUNT": "Death_Count",
}
tests_by_boro = tests_by_boro.rename(columns=boro_column_names)

In [6]:
# Attach the 'median_income' column from the income table to the per-ZIP COVID table.
median_income = get_zip_income()
med_income = median_income['median_income']
# Drop any 'median_income' column left by an earlier run of this cell before joining:
# join() raises on overlapping column names, so without this the cell only works once
# per kernel. errors='ignore' makes the drop a no-op on the first run.
# NOTE(review): join() aligns on the index of both objects — confirm that data_by_zip
# and the income table share the same (ZIP-based) index, otherwise rows are mismatched.
data_by_zip = data_by_zip.drop(columns='median_income', errors='ignore').join(med_income)

In [7]:
# Pass the daily-tests table through the project's datetime helper and keep the result.
# NOTE(review): fixing_datetime is defined in src.support_functions and is not visible
# here — presumably it normalizes the date column; confirm whether it also filters rows
# or mutates its argument, since this cell rebinds the same name.
tests_by_day = fixing_datetime(tests_by_day)

In [8]:
# Read the 2019 and 2020 turnstile exports; low_memory=False makes pandas parse each
# file in a single pass instead of chunk by chunk.
turnstile2019_df, turnstile2020_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../data/Turnstile_Usage_Data__2019.csv',
        '../data/Turnstile_Usage_Data__2020.csv',
    )
)

In [34]:
# Preview the 2019 turnstile frame (pandas truncates the display to the first and last rows).
turnstile2019_df

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits
0,A033,R170,02-06-00,14 ST-UNION SQ,LNQR456W,BMT,2019-12-27,00:00:00,REGULAR,769115,559221
1,A033,R170,02-00-04,14 ST-UNION SQ,LNQR456W,BMT,2019-12-27,00:00:00,REGULAR,6483080,4945335
2,A033,R170,02-00-03,14 ST-UNION SQ,LNQR456W,BMT,2019-12-27,00:00:00,REGULAR,7191422,8417203
3,A033,R170,02-00-02,14 ST-UNION SQ,LNQR456W,BMT,2019-12-27,00:00:00,REGULAR,14983900,14554087
4,A033,R170,02-06-01,14 ST-UNION SQ,LNQR456W,BMT,2019-12-27,00:00:00,REGULAR,71047673,20925389
...,...,...,...,...,...,...,...,...,...,...,...
10467096,PTH02,R544,00-00-03,HARRISON,1,PTH,2018-12-29,23:57:36,REGULAR,114402,15472
10467097,PTH17,R541,01-01-02,THIRTY THIRD ST,1,PTH,2018-12-29,23:58:09,REGULAR,758475,361319
10467098,PTH04,R551,00-00-03,GROVE STREET,1,PTH,2018-12-29,23:58:55,REGULAR,8451,24664
10467099,PTH19,R549,02-02-04,NEWARK C,1,PTH,2018-12-29,23:59:03,REGULAR,26452,2129


In [9]:
# Run both turnstile frames through the project's datetime helper; the results feed
# every aggregation below.
# NOTE(review): fixing_datetime is not visible here — confirm what it does to the
# 'Date' column and whether it restricts the rows to a date window or mutates its input.
ts_df_2019 = fixing_datetime(turnstile2019_df)
ts_df_2020 = fixing_datetime(turnstile2020_df)

In [10]:
# Lowest and highest Entries reading per day for every turnstile (Unit + SCP) in 2019.
# The aggregations are passed by name: handing the builtins min/max to agg() is
# deprecated in recent pandas. The column labels ('Entries', 'min'/'max') are unchanged.
gb2019 = (
    ts_df_2019[['Unit', 'SCP', 'Date', 'Entries']]
    .groupby(['Date', 'Unit', 'SCP'])
    .agg(['min', 'max'])
)

# The code treats Entries as a running counter: a turnstile's traffic for the day is
# the spread between its highest and lowest reading.
gb2019['Total'] = gb2019[('Entries', 'max')] - gb2019[('Entries', 'min')]

# Keep only turnstile-days with 1000 <= Total <= 10000. drop() without inplace returns
# a new frame, so the grouped result is rebound rather than mutated.
# NOTE(review): the 1000 / 10000 cut-offs are unexplained magic numbers; the lower
# bound in particular removes low-traffic turnstile-days — confirm this is intended.
out_of_range_2019 = (gb2019['Total'] < 1000) | (gb2019['Total'] > 10000)
gb2019 = gb2019.drop(index=gb2019[out_of_range_2019].index)

# Sum turnstiles into (Date, Unit), then Units into one row per Date.
# NOTE: the summed ('Entries', 'min'/'max') columns are sums of counter readings and
# carry no meaning of their own; only 'Total' is used below.
sum_per_station_2019 = gb2019.groupby(level=[0, 1]).sum()

sum_per_date_2019 = sum_per_station_2019.groupby(level=[0]).sum()

# 7-day rolling mean of the daily totals (selected by label instead of position);
# the first six rows are NaN.
sum_per_date_2019['week_avg_2019'] = sum_per_date_2019['Total'].rolling(window=7).mean()

In [11]:
# Lowest and highest Entries reading per day for every turnstile (Unit + SCP) in 2020.
# The aggregations are passed by name: handing the builtins min/max to agg() is
# deprecated in recent pandas. The column labels ('Entries', 'min'/'max') are unchanged.
gb2020 = (
    ts_df_2020[['Unit', 'SCP', 'Date', 'Entries']]
    .groupby(['Date', 'Unit', 'SCP'])
    .agg(['min', 'max'])
)

# The code treats Entries as a running counter: a turnstile's traffic for the day is
# the spread between its highest and lowest reading.
gb2020['Total'] = gb2020[('Entries', 'max')] - gb2020[('Entries', 'min')]

# Keep only turnstile-days with 1000 <= Total <= 10000. drop() without inplace returns
# a new frame, so the grouped result is rebound rather than mutated.
# NOTE(review): the 1000 / 10000 cut-offs are unexplained magic numbers; the lower
# bound in particular removes low-traffic turnstile-days — confirm this is intended.
out_of_range_2020 = (gb2020['Total'] < 1000) | (gb2020['Total'] > 10000)
gb2020 = gb2020.drop(index=gb2020[out_of_range_2020].index)

# Sum turnstiles into (Date, Unit), then Units into one row per Date.
# NOTE: the summed ('Entries', 'min'/'max') columns are sums of counter readings and
# carry no meaning of their own; only 'Total' is used below.
sum_per_station_2020 = gb2020.groupby(level=[0, 1]).sum()

sum_per_date_2020 = sum_per_station_2020.groupby(level=[0]).sum()

# 7-day rolling mean of the daily totals (selected by label instead of position);
# the first six rows are NaN.
sum_per_date_2020['week_avg_2020'] = sum_per_date_2020['Total'].rolling(window=7).mean()

## Hypothesis Test on MTA Usage between 2019 and 2020

H0: There was no difference in MTA Turnstile usage between March 01, 2019 thru September 30, 2019 and March 01, 2020 thru September 30, 2020.

HA: There was a difference in MTA Turnstile usage between March 01, 2019 thru September 30, 2019 and March 01, 2020 thru September 30, 2020.

Alpha: 0.05

In [12]:
# Two-sample t-test without the equal-variance assumption (equal_var=False, i.e. Welch's
# test) on the 7-day rolling means of daily entries; dropna() removes the NaN rows that
# the rolling window leaves at the start of each series.
# NOTE(review): consecutive rolling means share most of their window, so the samples
# are not independent observations — confirm the test is meant to run on the smoothed
# series rather than on the daily 'Total' values.
weekly_usage_2019 = sum_per_date_2019['week_avg_2019'].dropna()
weekly_usage_2020 = sum_per_date_2020['week_avg_2020'].dropna()
stats.ttest_ind(weekly_usage_2019, weekly_usage_2020, equal_var=False)

Ttest_indResult(statistic=53.20656382203368, pvalue=8.146563428977898e-125)

***Conclusion***: Since the p-value is lower than our alpha, we reject the null hypothesis 
and can say there is sufficient statistical evidence that there was a difference in MTA turnstile usage between March 01, 2019 thru September 30, 2019 and March 01, 2020 thru September 30, 2020.

## 

In [13]:
import folium 
from folium.plugins import MarkerCluster 

In [14]:
# NOTE(review): disabled draft of a clustered folium marker map. `man20` is not defined
# in any visible cell and '#500cc' is not a 3- or 6-digit hex colour, so this would need
# fixing before it can be re-enabled — TODO restore properly or delete the cell.
# man_map = folium.Map(location=[40.7831, -73.9712],zoom_start=13.5)
# mc = MarkerCluster()
# for ind,row in man20.iterrows():
#     mc.add_child(folium.CircleMarker(location=[row['latitude'],row['longitude']],
#     radius=1,color='#500cc'))
#     man_map.add_child(mc)
#     man_map

In [15]:
# Preview the 2020 per-(Date, Unit) sums; only the 'Total' column is a meaningful quantity,
# the ('Entries', 'min'/'max') columns are sums of counter readings.
sum_per_station_2020

Unnamed: 0_level_0,Unnamed: 1_level_0,Entries,Entries,Total
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,Unnamed: 4_level_1
Date,Unit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2020-03-01,R010,64967419,64975577,8158
2020-03-01,R011,5135715860,5135732599,16739
2020-03-01,R012,9596889,9605312,8423
2020-03-01,R017,242236,243340,1104
2020-03-01,R018,94466153,94487583,21430
...,...,...,...,...
2020-09-25,R451,18785230,18786604,1374
2020-09-25,R551,598681,599688,1007
2020-09-25,R552,2312749,2318193,5444
2020-09-25,R570,7347053,7349863,2810


In [16]:
# Lowest and highest Entries reading per turnstile (Unit + SCP) across each year's frame.
# The aggregations are passed by name: handing the builtins min/max to agg() is
# deprecated in recent pandas. The column labels ('Entries', 'min'/'max') are unchanged.
gb_station_2020 = ts_df_2020[['Unit', 'SCP', 'Entries']].groupby(['Unit', 'SCP']).agg(['min', 'max'])
gb_station_2019 = ts_df_2019[['Unit', 'SCP', 'Entries']].groupby(['Unit', 'SCP']).agg(['min', 'max'])

# Display a flattened view of the 2019 table. reset_index() returns a new frame, so
# gb_station_2019 keeps its (Unit, SCP) index for the cells below. The equivalent
# call on the 2020 frame was removed because its result was discarded.
gb_station_2019.reset_index()

Unnamed: 0_level_0,Unit,SCP,Entries,Entries
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max
0,R001,00-00-00,4119945,4657675
1,R001,00-00-01,3333074,3731126
2,R001,00-00-02,578591,893497
3,R001,00-00-03,2586201,2817674
4,R001,00-00-04,547373,730937
...,...,...,...,...
4995,R572,01-03-00,190600,242972
4996,R572,01-03-01,221462,281510
4997,R572,01-03-02,335750,423760
4998,R572,01-03-03,579374,735886


In [17]:
# Inspect the two-level column labels produced by agg(): ('Entries', 'min') and ('Entries', 'max').
gb_station_2020.columns

MultiIndex([('Entries', 'min'),
            ('Entries', 'max')],
           )

In [18]:
# Entries counted per turnstile over each year's frame: highest minus lowest reading.
gb_station_2020['Total'] = gb_station_2020[('Entries', 'max')] - gb_station_2020[('Entries', 'min')]
gb_station_2019['Total'] = gb_station_2019[('Entries', 'max')] - gb_station_2019[('Entries', 'min')]

# Keep only turnstiles with 100 <= Total <= 10000. drop() without inplace returns a new
# frame, so the names are rebound instead of mutating the frames shown by earlier cells.
# NOTE(review): these totals span the whole period, yet the upper cap equals the one
# used for single days; in the saved outputs it leaves only a few hundred (2020) and
# a few dozen (2019) of roughly five thousand turnstiles — confirm the cap is intended.
out_of_range_2020 = (gb_station_2020['Total'] < 100) | (gb_station_2020['Total'] > 10000)
gb_station_2020 = gb_station_2020.drop(index=gb_station_2020[out_of_range_2020].index)

out_of_range_2019 = (gb_station_2019['Total'] < 100) | (gb_station_2019['Total'] > 10000)
gb_station_2019 = gb_station_2019.drop(index=gb_station_2019[out_of_range_2019].index)

In [24]:
# Display a flattened view of the filtered 2020 per-turnstile table; reset_index()
# returns a new frame and leaves gb_station_2020 itself unchanged.
gb_station_2020.reset_index()

Unnamed: 0_level_0,Unit,SCP,Entries,Entries,Total
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max,Unnamed: 5_level_1
0,R001,01-06-00,117917594,117922519,4925
1,R001,01-06-01,1664682,1671785,7103
2,R001,01-06-02,174153,176742,2589
3,R001,02-03-00,210301,213819,3518
4,R009,00-06-01,1946243322,1946252256,8934
...,...,...,...,...,...
443,R552,00-02-00,12071,16034,3963
444,R552,00-05-00,23256,33224,9968
445,R552,00-05-01,0,2600,2600
446,R552,00-05-02,362,3081,2719


In [25]:
# Display a flattened view of the filtered 2019 per-turnstile table; reset_index()
# returns a new frame and leaves gb_station_2019 itself unchanged.
gb_station_2019.reset_index()

Unnamed: 0_level_0,Unit,SCP,Entries,Entries,Total
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max,Unnamed: 5_level_1
0,R013,00-03-00,619191,621919,2728
1,R013,00-03-01,4459955,4462028,2073
2,R013,00-03-02,126122,128011,1889
3,R013,00-03-03,113606,115471,1865
4,R013,00-03-04,120158,121551,1393
...,...,...,...,...,...
79,R549,00-01-06,0,3791,3791
80,R549,03-01-07,0,6036,6036
81,R549,03-01-09,0,1609,1609
82,R550,00-02-00,5,2678,2673


In [26]:
# Collapse the turnstiles of each Unit into one row by summing every column over the
# first index level ('Unit', as set by the earlier groupby on ['Unit', 'SCP']).
station_2019 = gb_station_2019.groupby(level='Unit').sum()
station_2020 = gb_station_2020.groupby(level='Unit').sum()

In [27]:
# Per-Unit entry totals for each year, pulled out as Series.
hm_stations_2020 = station_2020['Total']
hm_stations_2019 = station_2019['Total']

# Show the 2020 totals as a flat Unit/Total table. The saved output of this cell is
# such a table, but the cell previously ended in an assignment and displayed nothing,
# so the output could not be reproduced on a fresh run.
hm_stations_2020.reset_index()

Unnamed: 0,Unit,Total
0,R001,18135
1,R009,8934
2,R011,9129
3,R012,5238
4,R013,18458
...,...,...
155,R549,100918
156,R550,77310
157,R551,9437
158,R552,19250


In [29]:
# Show the 2019 per-Unit totals as a flat Unit/Total table (does not modify hm_stations_2019).
hm_stations_2019.reset_index()

Unnamed: 0,Unit,Total
0,R013,11165
1,R018,4648
2,R019,9288
3,R020,8134
4,R022,1000
5,R025,7804
6,R029,4749
7,R043,6031
8,R044,9627
9,R050,8634


In [32]:
# Summary statistics of the 2020 per-Unit totals (count, mean, std, quartiles, min/max).
hm_stations_2020.describe()

count       160.000000
mean      15213.281250
std       14481.356488
min        1030.000000
25%        7952.500000
50%       10184.500000
75%       18649.000000
max      100918.000000
Name: Total, dtype: float64

In [33]:
# Summary statistics of the 2019 per-Unit totals (count, mean, std, quartiles, min/max).
hm_stations_2019.describe()

count       47.000000
mean      9237.638298
std       7733.088647
min       1000.000000
25%       4798.500000
50%       7804.000000
75%       9379.500000
max      43814.000000
Name: Total, dtype: float64