In [2]:
import pandas as pd
import altair as alt
from outbreak_data import outbreak_data
from outbreak_tools import outbreak_tools
import requests

In [3]:
# Location ids of the four states under study.
state_list = [
    'USA_US-WA',
    'USA_US-CA',
    'USA_US-NY',
    'USA_US-LA',
]
# Parenthesised OR-expression used as the location_id filter in the query string.
states = f"({' OR '.join(state_list)})"

In [4]:
# Query-string fragment appended to requests so results are fetched in full.
nopage = '&'.join(('fetch_all=true', 'page=0'))
# Endpoint path passed to get_outbreak_data for the case-count query.
covid19_endpoint = "covid19/query"

In [5]:
# Assemble the query string piece by piece (filter, ordering, returned fields,
# paging flags), then request the matching records from the covid19 endpoint.
args_all = '&'.join([
    f'q=location_id:{states}',
    'sort=date',
    'fields=date,confirmed_numIncrease,admin1',
    nopage,
])
data_all = outbreak_data.get_outbreak_data(covid19_endpoint, args_all, collect_all=True)

In [6]:
# Tabulate the records stored under the response's 'hits' key.
# NOTE(review): states_df is not referenced by any later cell visible in this notebook — confirm it is still needed.
states_df = pd.DataFrame(data_all['hits'])

In [7]:
# Parse the date column in a single vectorized call (instead of once per row),
# order the rows chronologically and renumber the index from 0 — all without
# mutating the frame in place, so re-running the cell is harmless.
states_df = (
    states_df
    .assign(date=pd.to_datetime(states_df['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

### Below is the distribution of "Confirmed # Increase" over time for California, Louisiana, New York, and Washington

In [8]:
# Build the case-increase plot for the selected states; the result is shown in
# the next cell and reused in the combined figure at the end of the notebook.
case_increase = outbreak_tools.plot_case_increase(state_list)

In [9]:
# Render the object returned by plot_case_increase as this cell's output.
case_increase

### Below is a combined plot of case increase and lineage prevalence for the past 60 days

In [10]:
# Query prevalence for 'ba.5' once per state (CA, LA, NY, WA — same order as the
# variables on the left). Per the original note this selects the ba.5 sub-strains.
lin_increase_ca, lin_increase_la, lin_increase_ny, lin_increase_wa = (
    outbreak_data.prevalence_by_location(location_id, 'ba.5')
    for location_id in ('USA_US-CA', 'USA_US-LA', 'USA_US-NY', 'USA_US-WA')
)

In [11]:
# Number of rows (i.e. reported dates) per lineage in each state.
# GroupBy.size() is the native row counter; it gives the same Series as
# apply(len) without pandas' warning about applying over the grouping column.
num_dats_ca = lin_increase_ca.groupby('lineage').size()
num_dats_la = lin_increase_la.groupby('lineage').size()
num_dats_ny = lin_increase_ny.groupby('lineage').size()
num_dats_wa = lin_increase_wa.groupby('lineage').size()

In [12]:
# One column of per-lineage row counts per state; a lineage missing from a
# state ends up as NaN in that state's column.
counts = pd.DataFrame(
    {
        'ca_counts': num_dats_ca,
        'la_counts': num_dats_la,
        'ny_counts': num_dats_ny,
        'wa_counts': num_dats_wa,
    }
)

In [13]:
# Keep only lineages present in all four states, total their row counts across
# states, and retain the four lineages with the largest totals.
lin_choices = (
    counts
    .dropna(how='any')
    .sum(axis=1)
    .sort_values(ascending=False)
    .head(4)
)

In [14]:
# Restrict each state's data to the lineages chosen in lin_choices.
# A boolean isin() mask selects the same rows as the former where()/dropna()
# round trip, but is vectorized and does not upcast columns by injecting NaN.
# .copy() yields independent frames so later column assignments are safe.
lin_increase_ca = lin_increase_ca.loc[lin_increase_ca.lineage.isin(lin_choices.index)].copy()
lin_increase_la = lin_increase_la.loc[lin_increase_la.lineage.isin(lin_choices.index)].copy()
lin_increase_ny = lin_increase_ny.loc[lin_increase_ny.lineage.isin(lin_choices.index)].copy()
lin_increase_wa = lin_increase_wa.loc[lin_increase_wa.lineage.isin(lin_choices.index)].copy()

In [15]:
# Tag every per-state frame with a readable state name. Per the original
# author's note, the prevalence results carry no admin1 field, so the label is
# added by hand here.
for state_frame, state_name in (
    (lin_increase_ca, 'California'),
    (lin_increase_la, 'Louisiana'),
    (lin_increase_ny, 'New York'),
    (lin_increase_wa, 'Washington'),
):
    state_frame['location'] = state_name

In [16]:
# Stack the four labelled per-state frames into one long frame for plotting.
lin_increase_all = pd.concat(
    [
        lin_increase_ca,
        lin_increase_la,
        lin_increase_ny,
        lin_increase_wa,
    ]
)

In [17]:
# Convert the date column to datetimes with one vectorized call rather than
# invoking pd.to_datetime separately for every row.
lin_increase_all['date'] = pd.to_datetime(lin_increase_all['date'])

In [18]:
# Summarise the date column; its '25%' entry is used as the cutoff date for the
# recent window and 'max' is the newest date.
# The `datetime_is_numeric` keyword was removed in pandas 2.0 (datetimes are
# always summarised numerically there) and passing it raises TypeError, so fall
# back to the plain call when the keyword is rejected.
try:
    desc_all = lin_increase_all.date.describe(datetime_is_numeric=True)
except TypeError:
    desc_all = lin_increase_all.date.describe()
# Span between the newest date and the cutoff — roughly the last 60 days of
# data, kept short for easier interpretation.
desc_all['max'] - desc_all['25%']

Timedelta('61 days 00:00:00')

In [19]:
# Keep only rows dated strictly after the 25th-percentile date. A vectorized
# comparison selects the same rows as the former row-wise where()/dropna()
# filter while leaving column dtypes untouched.
past_60_all = lin_increase_all.loc[lin_increase_all.date > desc_all['25%']]

In [20]:
# Pool the four states: add up prevalence_rolling for every (date, lineage) pair.
# The native groupby sum replaces apply() with the builtin `sum`, which pandas
# >= 2.1 flags with a FutureWarning because its meaning is slated to change.
combined_past_60_all = past_60_all.groupby(['date', 'lineage'])['prevalence_rolling'].sum()

In [21]:
# Show the pooled series, indexed by (date, lineage).
combined_past_60_all

date        lineage 
2022-05-27  ba.5        0.003434
            ba.5.1      0.014740
            ba.5.2.1    0.049671
            ba.5.5      0.058588
2022-05-28  ba.5        0.004253
                          ...   
2022-07-24  ba.5.2.1    0.250000
            ba.5.5      0.000000
2022-07-25  ba.5.2.1    1.000000
            ba.5.5      0.000000
2022-07-26  ba.5.5      0.285714
Name: prevalence_rolling, Length: 234, dtype: float64

In [22]:
# For each date, divide every lineage's pooled value by that date's total so
# the values within a date sum to 1; collect one Series per date.
normalized_combined_past_60 = [
    per_date / per_date.sum()
    for _, per_date in combined_past_60_all.groupby(['date'])
]

In [23]:
# Join the per-date normalized pieces back into a single Series.
normalized_proportion = pd.concat(normalized_combined_past_60)

In [24]:
# Move the index levels into ordinary columns (reset_index keeps them by
# default) so they can be referenced by name when plotting.
normalized_proportion = normalized_proportion.reset_index()

In [25]:
# Rescale the normalized fractions to percentages.
# NOTE: this overwrites the column with itself times 100, so running the cell
# more than once scales the values again.
normalized_proportion['prevalence_rolling'] = normalized_proportion['prevalence_rolling'].mul(100)

In [26]:
# Area chart of prevalence_rolling (in percent, y-axis pinned to 0-100) over
# time, coloured by lineage.
lin_prevalence = (
    alt.Chart(normalized_proportion, title='Lineage Prevalence (7-Day Rolling)')
    .mark_area()
    .encode(
        x='date:T',
        y=alt.Y('prevalence_rolling:Q', scale=alt.Scale(domain=[0, 100])),
        color='lineage:N',
    )
)

In [27]:
# Stack the case-increase plot above the lineage-prevalence chart and make the
# two panels share a single x scale.
alt.vconcat(case_increase, lin_prevalence).resolve_scale(x='shared')