In [1]:
import pandas as pd
from datetime import date, timedelta


In [79]:

class DataWrangler:
    """
    DataWrangler extracts Covid data from https://healthdata.gov/ and provides the following metrics:
        - The total number of PCR tests performed as of a particular day (total_pcr_date) in the United States.
        - The n-day (window) rolling average number of new cases per day for the last k (rolling_averages_days) days.
        - The top n (top_states) states with the highest test positivity rate (positive tests / tests performed) for tests performed in the last k (positivity_rates_days) days.

    Parameters:
        total_pcr_date: date. Date up until when total pcr tests should calculated to. Default = current date - 1 day.
        window: int. Number of days for the rolling average window. Defaul = 7.
        rolling_averages_days: int. Number of days to be caculated for the rolling average. Default = 30 days.
        positivity_rates_days: date. Date of the earliest day to start calculating positivity rate. Default = current date - 30 days.
        top_states: int. Number of top states with highest positivity rates.
    """

    def __init__(self,
            total_pcr_date=(date.today()-timedelta(days=1)).strftime('%Y-%m-%d'),
            window=7,
            rolling_averages_days=30,
            positivity_rates_days=30,
            # (date.today()-timedelta(days=30)).strftime('%Y-%m-%d'),
            top_states=10):
        
        url = 'https://healthdata.gov/resource/j8mb-icvb.json?$limit=300000'

        self.covid_data = pd.read_json(url)
        self.total_pcr_date = total_pcr_date
        self.window = window
        self.rolling_averages_days = rolling_averages_days
        self.positivity_rates_days = positivity_rates_days
        self.positivity_rates_date = (date.today()-timedelta(days=positivity_rates_days)).strftime('%Y-%m-%d')
        self.top_states = top_states

    def total_pcr_test(self):
        """
        Calculates the total number PCR tests done as at a particular day
        :return: total number of PCR test as at a given day
        """
        return self.covid_data[self.covid_data['date']<=self.total_pcr_date]['total_results_reported'].sum()

    def rolling_average(self):
        """
        Calulates the n-day rolling average of new positive cases in the last k days
        :return: rolling averages
        """
        # new cases includes only positive cases
        new_cases_sum = self.covid_data[self.covid_data['overall_outcome']=='Positive'].groupby(['date'])['new_results_reported'].sum()
        new_cases_sum.sort_index()
        
        windows = new_cases_sum.rolling(self.window)
        rolling_averages = windows.mean()
        return rolling_averages[-self.rolling_averages_days:]

    def top_n_states(self):
        """
        Calculates positivity rate by states. First gets total tests and total positive tests then divide
        :return: top n states with highest positivity rates
        """
        total_test_df = pd.DataFrame(self.covid_data[self.covid_data['date']>=self.positivity_rates_date]\
            .groupby(['state_name'])['total_results_reported'].sum())

        positive_test_df = pd.DataFrame(self.covid_data[(self.covid_data['overall_outcome']=='Positive') \
            & (self.covid_data['date']>=self.positivity_rates_date)].groupby(['state_name'])['total_results_reported'].sum())

        positive_test_df.rename(columns={'total_results_reported': 'positive_results'}, inplace=True)

        joined_df = total_test_df.join(positive_test_df)
        joined_df['positivity_rate'] = round(joined_df['positive_results'] / joined_df['total_results_reported'],3)
        
        return joined_df[['positivity_rate']].sort_values(by=['positivity_rate'], ascending=False).head(self.top_states)

    def main(self):
        print(f'\n1. The total number of PCR tests performed as at {self.total_pcr_date} in the United State is {self.total_pcr_test()}\n')

        print(f'2. The {self.window}-day rolling average number of new cases per day for the last {self.rolling_averages_days} days is \n{self.rolling_average()}\n')

        print(f'3. The {self.top_states} states with the highest test positivity rate (positive tests / tests performed) for tests performed in the last {self.positivity_rates_days} days is {self.top_n_states()}')
        

In [80]:
obj = DataWrangler()
# obj.total_pcr_test()
# obj.rolling_average()
# obj.top_n_states()
obj.main()


1. The total number of PCR tests performed as at 2022-05-22 in the United State is 317858748732

2. The 7-day rolling average number of new cases per day for the last 30 days is 
date
2022-04-21    34272.857143
2022-04-22    34821.714286
2022-04-23    35282.714286
2022-04-24    36237.428571
2022-04-25    39042.142857
2022-04-26    42950.142857
2022-04-27    46806.428571
2022-04-28    50961.000000
2022-04-29    54455.428571
2022-04-30    56555.428571
2022-05-01    58098.142857
2022-05-02    59388.142857
2022-05-03    60494.714286
2022-05-04    61724.000000
2022-05-05    62467.428571
2022-05-06    63764.428571
2022-05-07    65641.285714
2022-05-08    67213.714286
2022-05-09    69455.714286
2022-05-10    72624.714286
2022-05-11    76082.285714
2022-05-12    78761.857143
2022-05-13    80478.000000
2022-05-14    82242.000000
2022-05-15    83468.285714
2022-05-16    85481.714286
2022-05-17    86511.571429
2022-05-18    85338.142857
2022-05-19    81489.714286
2022-05-20    73715.714286
Name:

In [84]:
obj.covid_data

Unnamed: 0,state,state_name,state_fips,fema_region,overall_outcome,date,new_results_reported,total_results_reported
0,AL,Alabama,1,Region 4,Negative,2020-03-01,96,96
1,AL,Alabama,1,Region 4,Positive,2020-03-01,16,16
2,AL,Alabama,1,Region 4,Negative,2020-03-02,72,168
3,AL,Alabama,1,Region 4,Positive,2020-03-02,6,22
4,AL,Alabama,1,Region 4,Negative,2020-03-03,94,262
...,...,...,...,...,...,...,...,...
131624,WY,Wyoming,56,Region 8,Negative,2022-05-19,860,1302998
131625,WY,Wyoming,56,Region 8,Positive,2022-05-19,71,120950
131626,WY,Wyoming,56,Region 8,Inconclusive,2022-05-20,0,3345
131627,WY,Wyoming,56,Region 8,Negative,2022-05-20,269,1303267
