In [None]:
%%capture
# Install the packages listed in requirements.txt into the running kernel's
# environment; the %%capture cell magic above suppresses this cell's output.
%pip install -r requirements.txt

# Global COVID-19 Cases per Week

In [None]:
from utilities import * 

## Import and Process Data

In [None]:
# Read the public compact COVID dataset directly from its online catalog URL.
url = (
    "https://catalog.ourworldindata.org/garden/covid/latest/compact/compact.csv"
)
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Parse the date column and keep only observations before 1 June 2023.
# .copy() makes the filtered frame independent of the raw one, so the column
# assignment below does not trigger a SettingWithCopyWarning.
df['date'] = pd.to_datetime(df['date'])
df = df[df['date'] < '2023-06-01'].copy()

# Snap every date back to the first day of its week (dt.weekday is 0 on Monday).
df['week_start'] = df['date'] - pd.to_timedelta(df['date'].dt.weekday, unit='D')

# NOTE: 'date' is consumed here, so re-running this cell requires reloading df first.
df = df.drop(columns='date')

# Sum the numeric measures per week and country. numeric_only=True keeps any
# non-numeric columns out of the aggregation.
weekly_df = (
    df.groupby(['week_start', 'country'])
    .sum(numeric_only=True)
    .reset_index()
)

In [None]:
## Total number of missing cells across the whole weekly frame
# NOTE(review): weekly_df comes from groupby(...).sum(), which by default returns 0
# for all-NaN groups, so this count may understate gaps in the raw data - confirm.
weekly_df.isna().values.sum()

In [None]:
# Manually create variant indicator columns.
# Assigning the 0/1 columns directly (rather than building a temporary 'variant'
# label and one-hot encoding it) keeps this cell idempotent: re-running it
# overwrites the three columns instead of appending duplicates.

# Define the variant start dates
delta_start = pd.to_datetime('2021-05-01')
omicron_start = pd.to_datetime('2021-11-01')

# Define the three-month end dates
delta_end = delta_start + pd.DateOffset(months=3)
omicron_end = omicron_start + pd.DateOffset(months=3)

week_start = weekly_df['week_start']
in_omicron = (week_start >= omicron_start) & (week_start < omicron_end)
# Omicron takes precedence if the two windows were ever made to overlap.
in_delta = (week_start >= delta_start) & (week_start < delta_end) & ~in_omicron

## One-hot indicators for modeling down the line.
# Every week outside both three-month windows (before *and* after them) is
# flagged as 'pre_variant'.
weekly_df['variant_delta'] = in_delta.astype(int)
weekly_df['variant_omicron'] = in_omicron.astype(int)
weekly_df['variant_pre_variant'] = (~(in_delta | in_omicron)).astype(int)


In [None]:
# Preview the first five rows of the weekly frame, including the variant indicators.
weekly_df.head(n=5)

## Exploratory Data Analysis (EDA)

In [None]:
# Build a profiling report titled "Profiling Report" and display it as the cell output.
# Note that this profiles `df` (the row-level frame with `week_start`), not the
# aggregated `weekly_df`.
ProfileReport(df, title="Profiling Report")

In [None]:
# Weekly new cases vs. new tests for the United States.
# Filter once and draw both series on an explicit Axes so the figure is self-contained.
us_weekly = weekly_df[weekly_df['country'] == 'United States']

fig, ax = plt.subplots()
sns.lineplot(x='week_start', y='new_cases', data=us_weekly, label='New Cases', ax=ax)
sns.lineplot(x='week_start', y='new_tests', data=us_weekly, label='New Tests', ax=ax)
ax.set(
    title='United States: weekly new cases and new tests',
    xlabel='Week starting',
    ylabel='Weekly total',
)
# Rotate the date labels after plotting so the setting applies to the final ticks.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Weekly new cases vs. ICU patients for the United States.
# Each series gets a legend label; without one the two lines cannot be told apart.
us_weekly = weekly_df[weekly_df['country'] == 'United States']

fig, ax = plt.subplots()
sns.lineplot(x='week_start', y='new_cases', data=us_weekly, label='New Cases', ax=ax)
sns.lineplot(x='week_start', y='icu_patients', data=us_weekly, label='ICU Patients', ax=ax)
ax.set(
    title='United States: weekly new cases and ICU patients',
    xlabel='Week starting',
    ylabel='Weekly total',
)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

## Proof of Concept Model: US Cases per Week 

In [None]:
## PREP DATA FOR MODELING

# Columns carried into the modeling frame: the week key, the target, and the
# candidate regressors. 'new_deaths' and 'new_vaccinations' are currently excluded.
selected_cols = (
    ['week_start', 'new_cases', 'new_tests']
    + ['variant_delta', 'variant_omicron', 'variant_pre_variant']
    + ['icu_patients']
)

In [None]:
# United States rows only, ordered chronologically with a fresh 0..n-1 index.
us_df = (
    weekly_df.loc[weekly_df['country'].eq('United States')]
    .sort_values(by='week_start')
    .reset_index(drop=True)
)

In [None]:
# Keep the modeling columns and rename the week key to 'ds' and the target to 'y'.
prophet_names = {'week_start': 'ds', 'new_cases': 'y'}
us_input = us_df.loc[:, selected_cols].rename(columns=prophet_names)

In [None]:
# Preview the first five rows of the renamed modeling frame.
us_input.head(n=5)

In [None]:
## CREATE LAGS
# Lag the US weekly series held in us_input itself. The lags must not be taken
# from `df`: that frame holds the un-aggregated rows for every country, so its
# index does not line up with us_input and would assign unrelated values.
# .assign returns a new frame, so re-running this cell is safe.
us_input = us_input.assign(
    new_tests_lag_1=us_input['new_tests'].shift(1),    # tests one week earlier
    new_tests_lag_2=us_input['new_tests'].shift(2),    # tests two weeks earlier
    icu_lag_1=us_input['icu_patients'].shift(1),       # ICU patients one week earlier
    icu_lag_2=us_input['icu_patients'].shift(2),       # ICU patients two weeks earlier
)

In [None]:
# Replace every remaining NaN with 0, then preview the result.
# This includes the leading rows of the shifted lag columns, which have no earlier week.
us_input = us_input.fillna(value=0)
us_input.head(n=5)

In [None]:
# List the distinct week-start dates present in the modeling frame.
# Select the 'ds' column directly: indexing the frame with the 'ds' Series itself
# treats the dates as column labels and raises a KeyError.
us_input['ds'].unique()

In [None]:
## SPLIT DATA
# Weeks on or before the cutoff train the model; later weeks are held out for validation.
split_date = '2023-04-01'
in_training = us_input['ds'] <= split_date
in_validation = us_input['ds'] > split_date

training_data = us_input.loc[in_training].reset_index(drop=True)
validation_data = us_input.loc[in_validation].reset_index(drop=True)

In [None]:
## CREATE MODEL

# US holidays for 2020-2023, reshaped into a frame with 'ds' and 'holiday'
# columns for the Prophet constructor's holidays argument.
us_holidays = holidays.country_holidays('US', years=range(2020, 2024))
holiday_df = (
    pd.DataFrame.from_dict(us_holidays, orient='index')
    .reset_index()
    .rename({'index': 'ds', 0: 'holiday'}, axis='columns')
)

# Trend changepoints are fixed by hand at these two dates.
change_points = ['2021-11-15', '2022-01-10']
model = Prophet(
    holidays=holiday_df,
    changepoints=change_points,
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
)

# Register every extra regressor: testing volume, ICU load (each with its
# one- and two-week lags) and the variant indicators.
for regressor_name in (
    'new_tests',
    'new_tests_lag_1',
    'new_tests_lag_2',
    'icu_patients',
    'icu_lag_1',
    'icu_lag_2',
    'variant_pre_variant',
    'variant_omicron',
    'variant_delta',
):
    model.add_regressor(regressor_name)

model.fit(training_data)

In [None]:
## CREATE PREDICTIONS AND GET MODEL PERFORMANCE
# Predict on the training weeks and report the in-sample MAPE.
train_predict = model.predict(training_data)
mape = calculate_mape(training_data['y'], train_predict['yhat'])
print('Training MAPE:', mape, '%')

# Overlay actual ('y') and fitted ('yhat') values on a single figure.
fig, ax = plt.subplots(figsize=(8, 5))
plt.xticks(rotation=90)
sns.lineplot(data=training_data, x='ds', y='y', ax=ax)
sns.lineplot(data=train_predict, x='ds', y='yhat', ax=ax)

In [None]:
# Predict on the held-out weeks and report the out-of-sample MAPE.
validation_predict = model.predict(validation_data)
model_mape = calculate_mape(validation_data['y'], validation_predict['yhat'])

print('Performance Metrics', '------------------------', sep='\n')
print('Model MAPE:', model_mape, '%')

## Benchmark Metric
# Constant baseline: the mean of the training target, repeated for every validation week.
validation_data['benchmark'] = training_data['y'].mean()
plt.xticks(rotation=90)
sns.lineplot(data=validation_data, x='ds', y='y', label='Actual')
sns.lineplot(data=validation_predict, x='ds', y='yhat', label='Prediction')