In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyspark as ps
import scipy.stats as stats
import sys
sys.path.append("..")
from src.support_functions import get_covid_data, fixing_datetime, get_zip_income

# Apply matplotlib's built-in 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')


In [2]:
# Load the three COVID tables through the project helper imported from
# src.support_functions. The call returns three objects, unpacked here as the
# per-zip, per-day and per-borough tables (each is renamed as a DataFrame below).
data_by_zip, tests_by_day, tests_by_boro = get_covid_data()

In [3]:
# Replace the upper-case source headers of the per-zip table with the
# Title_Case names used throughout the rest of the notebook.
zip_column_names = {
    "MODIFIED_ZCTA": "Zip",
    "NEIGHBORHOOD_NAME": "Neighborhood",
    "BOROUGH_GROUP": "Borough",
    "COVID_CASE_COUNT": "Covid_Case_Count",
    "COVID_CASE_RATE": "Covid_Case_Rate",
    "POP_DENOMINATOR": "Pop_Denominator",
    "COVID_DEATH_COUNT": "Covid_Death_Count",
    "COVID_DEATH_RATE": "Covid_Death_Rate",
    "PERCENT_POSITIVE": "Percent_Positive",
    "TOTAL_COVID_TESTS": "Total_Covid_Tests",
}
data_by_zip = data_by_zip.rename(zip_column_names, axis="columns")

In [4]:
# Replace the upper-case source headers of the per-day testing table with the
# Title_Case names used throughout the rest of the notebook.
daily_column_names = {
    "DATE": "Date",
    "TOTAL_TESTS": "Total_Tests",
    "POSITIVE_TESTS": "Positive_Tests",
    "PERCENT_POSITIVE": "Percent_Positive",
    "TOTAL_TESTS_7DAYS_AVG": "Total_Tests_7Days_AVG",
    "POSITIVE_TESTS_7DAYS_AVG": "Positive_Tests_7Days_AVG",
    "PERCENT_POSITIVE_7DAYS_AVG": "Percent_Positive_7Days_AVG",
    "INCOMPLETE": "Incomplete",
}
tests_by_day = tests_by_day.rename(daily_column_names, axis="columns")

In [5]:
# Replace the upper-case source headers of the per-borough table with the
# Title_Case names used throughout the rest of the notebook.
boro_column_names = {
    "BOROUGH_GROUP": "Borough",
    "CASE_RATE": "Case_Rate",
    "HOSPITALIZED_RATE": "Hospitalized_Rate",
    "DEATH_RATE": "Death_Rate",
    "CASE_COUNT": "Case_Count",
    "HOSPITALIZED_COUNT": "Hospitalized_Count",
    "DEATH_COUNT": "Death_Count",
}
tests_by_boro = tests_by_boro.rename(boro_column_names, axis="columns")

In [6]:
# Attach median income to the per-zip COVID table.
# get_zip_income() is a project helper (src.support_functions); only its
# 'median_income' column is used here.
median_income = get_zip_income()
med_income = median_income['median_income']

# DataFrame.join aligns rows on the index, not on a column.
# NOTE(review): this assumes data_by_zip and the income table share the same
# index (e.g. both indexed by zip code) -- confirm in src.support_functions.
#
# Any 'median_income' column left by a previous run of this cell is dropped
# first, so the cell can be re-executed without join() raising on overlapping
# column names. On a fresh kernel the drop is a no-op.
data_by_zip = (
    data_by_zip
    .drop(columns='median_income', errors='ignore')
    .join(med_income)
)

In [7]:
# Run the daily testing table through the project helper fixing_datetime
# (src.support_functions) and keep the returned frame.
# NOTE(review): presumably this parses the 'Date' column into datetimes -- confirm in the helper.
tests_by_day = fixing_datetime(tests_by_day)

In [8]:
# Read the raw 2019 and 2020 turnstile usage exports.
# low_memory=False makes pandas read each file in one pass instead of in
# chunks, so column dtypes are inferred from the whole file.
turnstile2019_df, turnstile2020_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../data/Turnstile_Usage_Data__2019.csv',
        '../data/Turnstile_Usage_Data__2020.csv',
    )
)

In [9]:
# Pass both raw turnstile frames through the project helper fixing_datetime
# (2019 first, then 2020) and keep the returned frames.
# NOTE(review): presumably the helper provides the 'Date' column that the
# aggregation cells group on -- confirm in src.support_functions.
ts_df_2019, ts_df_2020 = map(fixing_datetime, (turnstile2019_df, turnstile2020_df))

In [10]:
# --- 2019 daily ridership from the turnstile counters -----------------------
# Step 1: for every (Date, Unit, SCP) take the lowest and highest 'Entries'
# reading. The aggregators are given as strings rather than the builtins
# min/max: the resulting columns are the same ('Entries', 'min') and
# ('Entries', 'max'), but recent pandas versions emit a FutureWarning when
# builtin callables are passed to agg().
# The (Date, Unit, SCP) MultiIndex is kept on purpose: the level-based
# groupbys further down rely on it.
gb2019 = (
    ts_df_2019[['Unit', 'SCP', 'Date', 'Entries']]
    .groupby(['Date', 'Unit', 'SCP'])
    .agg(['min', 'max'])
)

# Step 2: entries recorded that day = highest reading - lowest reading.
gb2019['Total'] = gb2019[('Entries', 'max')] - gb2019[('Entries', 'min')]

# Step 3: discard rows whose daily total is below 1,000 or above 10,000.
# Written as "not out of range" so rows with a missing Total are retained.
# NOTE(review): the thresholds presumably screen out idle turnstiles and
# counter resets/glitches -- confirm the intended cut-offs.
gb2019 = gb2019.loc[~((gb2019['Total'] < 1000) | (gb2019['Total'] > 10000))]

# Step 4: roll up SCP -> Unit, then Unit -> Date.
sum_per_station_2019 = gb2019.groupby(level=['Date', 'Unit']).sum()
sum_per_date_2019 = sum_per_station_2019.groupby(level=['Date']).sum()

# Step 5: 7-row rolling mean of the daily totals. 'Total' is selected by name
# instead of by column position. The first six rows are NaN because a full
# window is required.
sum_per_date_2019['week_avg_2019'] = sum_per_date_2019['Total'].rolling(window=7).mean()

In [11]:
# --- 2020 daily ridership from the turnstile counters -----------------------
# Step 1: for every (Date, Unit, SCP) take the lowest and highest 'Entries'
# reading. The aggregators are given as strings rather than the builtins
# min/max: the resulting columns are the same ('Entries', 'min') and
# ('Entries', 'max'), but recent pandas versions emit a FutureWarning when
# builtin callables are passed to agg().
# The (Date, Unit, SCP) MultiIndex is kept on purpose: the level-based
# groupbys further down rely on it.
gb2020 = (
    ts_df_2020[['Unit', 'SCP', 'Date', 'Entries']]
    .groupby(['Date', 'Unit', 'SCP'])
    .agg(['min', 'max'])
)

# Step 2: entries recorded that day = highest reading - lowest reading.
gb2020['Total'] = gb2020[('Entries', 'max')] - gb2020[('Entries', 'min')]

# Step 3: discard rows whose daily total is below 1,000 or above 10,000.
# Written as "not out of range" so rows with a missing Total are retained.
# NOTE(review): the thresholds presumably screen out idle turnstiles and
# counter resets/glitches -- confirm the intended cut-offs.
gb2020 = gb2020.loc[~((gb2020['Total'] < 1000) | (gb2020['Total'] > 10000))]

# Step 4: roll up SCP -> Unit, then Unit -> Date.
sum_per_station_2020 = gb2020.groupby(level=['Date', 'Unit']).sum()
sum_per_date_2020 = sum_per_station_2020.groupby(level=['Date']).sum()

# Step 5: 7-row rolling mean of the daily totals. 'Total' is selected by name
# instead of by column position. The first six rows are NaN because a full
# window is required.
sum_per_date_2020['week_avg_2020'] = sum_per_date_2020['Total'].rolling(window=7).mean()

## Hypothesis Test on MTA Usage between 2019 and 2020

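H0: There was no difference in MTA turnstile usage between the period March 01, 2019 through September 30, 2019 and the period March 01, 2020 through September 30, 2020.

HA: There was a difference in MTA turnstile usage between the period March 01, 2019 through September 30, 2019 and the period March 01, 2020 through September 30, 2020.

Alpha: 0.05

Test: Welch's two-sample t-test (unequal variances) comparing the 7-day rolling averages of daily turnstile entries for the two years.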

In [12]:
# Welch's two-sample t-test (equal_var=False) on the 7-day rolling averages of
# daily entries. dropna() removes the leading rows where the rolling window is
# not yet full.
# NOTE(review): adjacent rolling-window values share six of their seven rows,
# so the samples are not independent observations as ttest_ind assumes. Treat
# the p-value as indicative only, or re-run on the unsmoothed daily totals.
weekly_avg_2019 = sum_per_date_2019['week_avg_2019'].dropna()
weekly_avg_2020 = sum_per_date_2020['week_avg_2020'].dropna()
stats.ttest_ind(weekly_avg_2019, weekly_avg_2020, equal_var=False)

Ttest_indResult(statistic=53.20656382203368, pvalue=8.146563428977898e-125)

***Conclusion***: Since the p-value (≈ 8.1e-125) is lower than our alpha (0.05), we reject the null hypothesis:
there is sufficient statistical evidence of a difference in MTA turnstile usage between March 01, 2019 through September 30, 2019 and March 01, 2020 through September 30, 2020.