In [2]:
# Imports
from typing import Iterator, Tuple

import numpy as np
import pandas as pd
from scipy.optimize import curve_fit

In [6]:
# Variables
START_DATE = pd.to_datetime("2020-06-07")
TODAY = pd.Timestamp.today().date()
FIPS = 48029
BETA = None
BETA_MULTIPLIER = .475
GAMMA = 1/7
START_DAY = 0
STOP_DAY = 100

In [7]:
FILE_NAME = f"{START_DATE.date()}_{TODAY}_{FIPS}_{BETA}_{BETA_MULTIPLIER}_sir-model-output"

In [8]:
# Read in usafacts data
case_df = pd.read_csv(
    "https://usafactsstatic.blob.core.windows.net/"
    "public/data/covid-19/covid_confirmed_usafacts.csv",
).set_index("countyFIPS").drop(
    # Remove unallocated cases (FIPS 0 or 1)
    range(2)
).reset_index().melt(
    # Melt dataframe (wide to long format)
    id_vars=["countyFIPS", "County Name", "State", "stateFIPS"],
    value_name="confirmed",
    var_name="date",
).astype({"date": "datetime64"})

In [9]:
# Create a day variable from the date variable
case_df = case_df.assign(
    days=(case_df["date"] - case_df["date"].min()).dt.days
)

# Trim up to but not including day 30
case_df = case_df[case_df["days"].ge(30)]

# Reverse order (highest to lowest day)
case_df = case_df.sort_values(["countyFIPS", "days"], ascending=False)

# Replace incorrect values with missing values
while case_df.groupby("countyFIPS")["confirmed"].pct_change().gt(0).any():
    case_df.loc[
        case_df.groupby("countyFIPS")["confirmed"].pct_change().gt(0),
        "confirmed",
    ] = np.nan

# Replace missing values with previous values
case_df.assign(
    confirmed=case_df["confirmed"].ffill()
)

# Restore the original order (lowest to highest day)
case_df = case_df.sort_values(["countyFIPS", "days"])

case_df = case_df.assign(
    # Calculate new cases from confirmed cases
    new_cases=case_df.groupby("countyFIPS")["confirmed"]
    .diff()
    .fillna(0)
)

In [10]:
# Read in population data
cens_df = pd.read_csv(
    "https://www2.census.gov/programs-surveys/popest/datasets/"
    "2010-2019/counties/totals/co-est2019-alldata.csv",
    usecols=[
        "STATE",
        "COUNTY",
        "STNAME",
        "CTYNAME",
        "POPESTIMATE2019"
    ],
    encoding="latin-1"
)

# Combine state and county fips
cens_df = cens_df.assign(
    county_fips=(
        cens_df["STATE"].astype(str)
        + cens_df["COUNTY"].astype(str).str.zfill(3)
    ).astype(int)
)

In [11]:
# pop_df = pd.read_csv(
#     "https://usafactsstatic.blob.core.windows.net/"
#     "public/data/covid-19/covid_county_population_usafacts.csv",
#     ).set_index("countyFIPS").drop(
#         # Remove unallocated cases (FIPS 0)
#         0
#     ).reset_index()
# pop_df.head()

In [12]:
# Merge data files
df = case_df.merge(
    cens_df,
    left_on="countyFIPS",
    right_on="county_fips",
    how="left"
)

In [13]:
# Calculate model inputs

# Growth rate
df = df.assign(
    gr=df.groupby("countyFIPS")["confirmed"].pct_change()
       / df.groupby("countyFIPS")["days"].diff()
)

# Calculate 7-day moving average of growth rate
df = df.assign(
    smooth_gr=df.groupby("countyFIPS")
    .rolling(window=7, min_periods=1)["gr"]
    .mean()
    .reset_index(level=0, drop=True)
)

# Calculate a rolling mean of new cases added in the previous 7 days
df = df.assign(
    seven_day_new_cases = df.groupby("countyFIPS")
    .rolling(window=7, min_periods=1)["new_cases"]
    .mean()
    .reset_index(level=0, drop=True)
)

# Doubling time and beta
df = df.assign(
    dt=np.log(2) / np.log(df["gr"] + 1),
    smooth_dt=np.log(2) / np.log(df["smooth_gr"] + 1),
    beta=(df["gr"] + GAMMA) / df["POPESTIMATE2019"],
    smooth_beta=(df["smooth_gr"] + GAMMA) / df["POPESTIMATE2019"],
)

# Rt
df = df.assign(
    rt=df["beta"] / GAMMA * df["POPESTIMATE2019"],
    smooth_rt=df["smooth_beta"] / GAMMA * df["POPESTIMATE2019"],
)

In [5]:
# Define sir function
def sir(
        susceptible: int,
        infected: int,
        recovered: int,
        beta: float,
        gamma: float = 1 / 7,
        case_adjustment_factor: float = 10,
        start: int = 0,
        stop: int = 90
) -> Iterator[Tuple[
    int,
    float,
    float,
    float,
]]:
    """
    Calculate the number of susceptible people and incidence and prevalence
    of infection and recovery in a population based on initial values.

    Core SIR model function that forecasts new
    - susceptible (S),
    - infected (I), and
    - recovered (R) values

    Arguments:
        susceptible: Initial number of susceptible people.
        total_infected: Initial number of total confirmed infections.
        total_recovered: Initial number of people who have recovered.
        new_infected: Initial number of new infections.
        new_recovered: Initial number of newly recovered individuals.
        beta: The effective contact rate during social distancing.
            BETA = (GROWTH_RATE + GAMMA) / SUSCEPTIBLE
        gamma: The inverse of the recovery length, e.g.
            RECOVERY_TIME = 7
            GAMMA = 1 / RECOVERY_TIME
        case_adjustment_factor: A multiplier to estimate the true case count.
            Under-reporting of cases is common because many are asymptomatic.
            The case adjustment factor compensates for under-reporting.
        start: The day for which initial values are provided, e.g. day 0.
        stop: The total number of days to forecast (daily projections).

    Yields:
        Collected values across each day from start to end.
        1. day
        2. susceptible
        3. total infected (infected prevalence)
        4. total recovered (recovered prevalence)
    """
    # Yield initial values
    yield (
        start,
        susceptible,
        infected,
        recovered,
    )
    # Iterate over days and yield model outputs
    for day in range(start + 1, stop + 1):
        # New confirmed infections (new positive tests)
        infected_incidence = beta * infected * susceptible
        # Number of infected who have just recovered
        recovered_incidence = gamma * infected
        # Number of people who are susceptible as of this time point
        susceptible_new = (
                susceptible
                - case_adjustment_factor
                * infected_incidence
        )
        # Total number of people who are infected as of this time point
        infected_prevalence = (
                infected_incidence
                + infected
                - recovered_incidence
        )
        # Total number of people who have recovered as of this time point
        recovered_prevalence = (
                recovered_incidence
                + recovered
                + infected_incidence * max(1., case_adjustment_factor - 1)
        )
        yield (
            day,
            susceptible_new,
            infected_prevalence,
            recovered_prevalence,
        )
        susceptible, infected, recovered = (
            susceptible_new, infected_prevalence, recovered_prevalence
        )

In [234]:
pd.DataFrame(_sir(
    susceptible=100000,
    infected=30,
    recovered=0,
    beta=2e-6,
    gamma=1/14
), columns=(
    "day",
    "susceptible",
    "infected",
    "recovered"
)).round(0).astype(int).drop("day", axis=1)

Unnamed: 0,susceptible,infected,recovered
0,100000,30,0
1,99940,34,56
2,99872,38,119
3,99796,43,191
4,99710,49,271
...,...,...,...
86,9282,788,89959
87,9136,746,90147
88,9000,707,90324
89,8873,669,90488


In [233]:
pd.DataFrame(sim_sir(
    susceptible=100000,
    infected=30,
    recovered=0,
    beta=2e-6,
), columns=(
    "day",
    "susceptible",
    "infected",
    "recovered"
)).drop("day", axis=1)

Unnamed: 0,susceptible,infected,recovered
0,100000,30,0
1,99950,33,47
2,99900,36,94
3,99840,39,151
4,99760,44,226
...,...,...,...
86,9070,825,90135
87,8890,784,90356
88,8800,737,90493
89,8610,703,90717


In [106]:
susceptible = 10000
beta = 1e-8
infected = 30
np.random.binomial(susceptible, beta*infected, susceptible).sum()

22

In [26]:
30*1e-12

3e-11

In [89]:
a, b = 1, 2

In [100]:
x = d if (
    d
    :=a
    +b
) > 0 else 0

In [None]:
x = max(a+b, 0)

In [91]:
x

3

In [213]:
# Define sir function
def random_sir(
        susceptible: int,
        infected: int,
        recovered: int,
        beta: float,
        gamma: float = 1 / 14,
        case_adjustment_factor: float = 10,
        start: int = 0,
        stop: int = 90
) -> Iterator[Tuple[
    int,
    float,
    float,
    float,
]]:
    """
    Calculate the number of susceptible people and incidence and prevalence
    of infection and recovery in a population based on initial values.

    Core SIR model function that forecasts new
    - susceptible (S),
    - infected (I), and
    - recovered (R) values

    Arguments:
        susceptible: Initial number of susceptible people.
        total_infected: Initial number of total confirmed infections.
        recovered: Initial number of people who have recovered.
        beta: The effective contact rate during social distancing.
            BETA = (GROWTH_RATE + GAMMA) / SUSCEPTIBLE
        gamma: The inverse of the recovery length, e.g.
            RECOVERY_TIME = 7
            GAMMA = 1 / RECOVERY_TIME
        case_adjustment_factor: A multiplier to estimate the true case count.
            Under-reporting of cases is common because many are asymptomatic.
            The case adjustment factor compensates for under-reporting.
        start: The day for which initial values are provided, e.g. day 0.
        stop: The total number of days to forecast (daily projections).

    Yields:
        Collected values across each day from start to end.
        1. day
        2. susceptible
        3. total infected (infected prevalence)
        4. total recovered (recovered prevalence)
    """
    # Yield initial values
    yield (
        start,
        susceptible,
        infected,
        recovered,
    )
    # Iterate over days and yield model outputs
    for day in range(start + 1, stop + 1):
        # New confirmed infections (new positive tests)
        infected_incidence = np.random.binomial(
            susceptible,
            beta * infected / susceptible,
            susceptible
        ).sum()
        # Number of infected who have just recovered
        recovered_incidence = int(round(gamma * infected, 0))
        # Number of people who are susceptible as of this time point
        susceptible_new = max(
            susceptible - case_adjustment_factor * infected_incidence,
            0
        )
        # Total number of people who are infected as of this time point
        infected_prevalence = max(
            infected_incidence + infected - recovered_incidence,
            0
        )
        # Total number of people who have recovered as of this time point
        recovered_prevalence = max(
            recovered_incidence + recovered
            + infected_incidence * max(1., case_adjustment_factor - 1),
            0
        )
        yield (
            day,
            susceptible_new,
            infected_prevalence,
            recovered_prevalence,
        )
        susceptible, infected, recovered = (
            susceptible_new, infected_prevalence, recovered_prevalence
        )

In [79]:
def gen_confirmed(
    susceptible_infected: Tuple[int],
    beta: float,
    gamma: float = 1 / 7,
    start: int = 0,
    stop: int = 90
) -> Iterator[float]:
    """
    Calculate the number of susceptible people and incidence and prevalence
    of infection and recovery in a population based on initial values.

    SIR model function that forecasts cumulative confirmed cases.

    Arguments:
        susceptible: Initial number of susceptible people.
        infected: Initial number of total confirmed infections.
        beta: The effective contact rate during social distancing.
            BETA = (GROWTH_RATE + GAMMA) / SUSCEPTIBLE
        gamma: The inverse of the recovery length, e.g.
            RECOVERY_TIME = 7
            GAMMA = 1 / RECOVERY_TIME
        case_adjustment_factor: A multiplier to estimate the true case count.
            Under-reporting of cases is common because many are asymptomatic.
            The case adjustment factor compensates for under-reporting.
        start: The day for which initial values are provided, e.g. day 0.
        stop: The total number of days to forecast (daily projections).

    Yields:
        Collected values of infected across each day from start to end.
    """
    susceptible, infected = susceptible_infected
    confirmed = infected
    # Yield initial values
    yield confirmed
    # Iterate over days and yield model outputs
    for day in range(start + 1, stop + 1):
        # New confirmed infections (new positive tests)
        infected_incidence = beta * infected * susceptible
        # Number of infected who have just recovered
        recovered_incidence = gamma * infected
        # Number of people who are susceptible as of this time point
        susceptible_new = susceptible - infected_incidence
        # Total number of people who are infected as of this time point
        infected_prevalence = (
                infected_incidence
                + infected
                - recovered_incidence
        )
        confirmed += infected_incidence
        yield confirmed
        susceptible, infected = susceptible_new, infected_prevalence
            
# Define sir function
def get_confirmed(
    susceptible_infected: Tuple[int],
    beta: float,
    gamma: float = 1 / 7,
    start: int = 0,
    stop: int = 90
) -> Tuple[float]:
    return tuple(gen_confirmed(susceptible_infected, beta, gamma, start, stop))

In [80]:
initial_parameters = (
    (BETA if BETA else county_df.loc[START_DATE, "smooth_beta"]) * BETA_MULTIPLIER,
    GAMMA,
)

In [81]:
popt, pcov = curve_fit(get_confirmed, (county_df.loc[START_DATE, "POPESTIMATE2019"], county_df.loc[START_DATE, "confirmed"]), county_df.loc[START_DATE, "confirmed"], p0=initial_parameters)



In [82]:
pd.DataFrame(
    gen_confirmed(
        (county_df.loc[START_DATE, "POPESTIMATE2019"], county_df.loc[START_DATE, "confirmed"]),
        (BETA if BETA else county_df.loc[START_DATE, "smooth_beta"]) * BETA_MULTIPLIER,
        GAMMA,
    ),
    columns=["predicted"]
)#.assign(actual=)

Unnamed: 0,predicted
0,3290.000000
1,3547.838624
2,3789.019033
3,4014.616908
4,4225.638539
...,...
86,7267.062790
87,7267.886296
88,7268.656568
89,7269.377048


In [89]:
(BETA if BETA else county_df.loc[START_DATE, "smooth_beta"]),

(8.23488264351164e-08,)

In [87]:
actual = county_df.loc[START_DATE:START_DATE+pd.DateOffset(days=STOP_DAY-START_DAY), "confirmed"]
predicted = pd.Series(
    gen_confirmed(
        (county_df.loc[START_DATE, "POPESTIMATE2019"], county_df.loc[START_DATE, "confirmed"]),
        (BETA if BETA else county_df.loc[START_DATE, "smooth_beta"]),
        GAMMA,
    )
)[:len(actual)]

In [88]:
pd.DataFrame({"predicted": predicted.values, "actual": actual.values}, index=actual.index)

Unnamed: 0_level_0,predicted,actual
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-07,3290.0,3290.0
2020-06-08,3832.818155,3311.0
2020-06-09,4387.500281,3333.0
2020-06-10,4954.277493,3513.0
2020-06-11,5533.384104,3648.0
2020-06-12,6125.057608,3840.0
2020-06-13,6729.538659,4012.0
2020-06-14,7347.071038,4242.0
2020-06-15,7977.901633,4393.0
2020-06-16,8622.280393,4437.0


In [63]:
pd.concat(
    [actual,
    np.array(gen_confirmed(
        (county_df.loc[START_DATE, "POPESTIMATE2019"], county_df.loc[START_DATE, "confirmed"]),
        (BETA if BETA else county_df.loc[START_DATE, "smooth_beta"]) * BETA_MULTIPLIER,
        GAMMA,
    ))],
    axis=1
)

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [14]:
# Define sir function
def sir(
        susceptible: int,
        total_infected: int,
        total_recovered: int,
        new_infected: int,
        new_recovered: int,
        beta: float,
        gamma: float = 1 / 7,
        case_adjustment_factor: float = 10,
        start: int = 0,
        stop: int = 90
) -> Iterator[Tuple[
    int,
    float,
    float,
    float,
    float,
    float,
    float,
]]:
    """
    Calculate the number of susceptible people and incidence and prevalence
    of infection and recovery in a population based on initial values.

    Core SIR model function that forecasts new
    - susceptible (S),
    - infected (I), and
    - recovered (R) values

    Arguments:
        susceptible: Initial number of susceptible people.
        total_infected: Initial number of total confirmed infections.
        total_recovered: Initial number of people who have recovered.
        new_infected: Initial number of new infections.
        new_recovered: Initial number of newly recovered individuals.
        beta: The effective contact rate during social distancing.
            BETA = (GROWTH_RATE + GAMMA) / SUSCEPTIBLE
        gamma: The inverse of the recovery length, e.g.
            RECOVERY_TIME = 7
            GAMMA = 1 / RECOVERY_TIME
        case_adjustment_factor: A multiplier to estimate the true case count.
            Under-reporting of cases is common because many are asymptomatic.
            The case adjustment factor compensates for under-reporting.
        start: The day for which initial values are provided, e.g. day 0.
        stop: The total number of days to forecast (daily projections).

    Yields:
        Collected values across each day from start to end.
        1. day
        2. susceptible
        3. new cases (infected incidence)
        4. newly recovered individuals (recovered incidence)
    """
    # Yield initial values
    yield (
        start,
        susceptible,
        new_infected,
        new_recovered,
        beta
    )
    # Iterate over days and yield model outputs
    for day in range(start + 1, stop + 1):
        # New confirmed infections (new positive tests)
        infected_incidence = beta * total_infected * susceptible
        # Number of infected who have just recovered
        recovered_incidence = gamma * total_infected
        # Number of people who are susceptible as of this time point
        susceptible_new = (
                susceptible
                - case_adjustment_factor
                * infected_incidence
        )
        yield (
            day,
            susceptible_new,
            infected_incidence,
            recovered_incidence,
            beta
        )
        susceptible, total_infected, total_recovered = (
            susceptible_new, infected_prevalence, recovered_prevalence
        )

In [18]:
# Pick county and start date - Model Lever 1 and 2
county = df["countyFIPS"] == FIPS
start_date = df["date"] == START_DATE

# Get model inputs
county_df = df[county].set_index("date")

In [21]:
# Create sir output dataframe
simple_sir_df = pd.DataFrame(
    simple_sir(
        susceptible=county_df.loc[START_DATE, "POPESTIMATE2019"],
        infected=county_df.loc[START_DATE, "confirmed"],
        recovered=0,
        beta=(BETA if BETA else county_df.loc[START_DATE, "smooth_beta"]) * BETA_MULTIPLIER,
        gamma=GAMMA,
        start=START_DAY,
        stop=STOP_DAY
    ),
    columns=(
        "forecast_days",
        "susceptible",
        "total_infected",
        "total_recovered",
    )
).assign(
    start_date=START_DATE,
)

In [14]:
# Get historical confirmed from day 0 onward
historical = df[county]
historical.to_excel(f"{TODAY}_{FIPS}_history.xlsx")

In [15]:
# Create sir output dataframe
sir_df = pd.DataFrame(
    sir(
        susceptible=county_df.loc[START_DATE, "POPESTIMATE2019"],
        total_infected=county_df.loc[START_DATE, "confirmed"],
        total_recovered=0,
        new_infected=0,
        new_recovered=0,
        beta=(BETA if BETA else county_df.loc[START_DATE, "smooth_beta"]) * BETA_MULTIPLIER,
        gamma=GAMMA,
        start=START_DAY,
        stop=STOP_DAY
    ),
    columns=(
        "forecast_days",
        "susceptible",
        "total_infected",
        "total_recovered",
        "infected_incidence",
        "new_recovered",
        "input_beta"
    )
).assign(
    start_date=START_DATE,
)

In [16]:
sir_df = sir_df.assign(
    date=START_DATE + pd.to_timedelta(sir_df['forecast_days'], unit='d'),
    confirmed_forecast=sir_df["susceptible"][0]-sir_df["susceptible"]+sir_df["total_infected"][0]
)

In [17]:
# merge sir and historical
df = historical.set_index("date").join(sir_df.set_index("date"))

In [18]:
df = df.assign(loss=df["new_cases"].subtract(df["infected_incidence"]))
df = df.assign(mse=df["loss"].pow(2).mean())
df = df.assign(rmse=df["mse"] ** .5)

In [19]:
df.to_excel(f"{FILE_NAME}.xlsx")
df.to_csv(f"{FILE_NAME}.csv", index=False)