In [1]:
# Imports
from typing import Iterator, Tuple

import numpy as np
import pandas as pd

In [2]:
# Variables
START_DATE = pd.to_datetime("2020-06-07")
TODAY = pd.Timestamp.today().date()
FIPS = 48029
BETA = None
BETA_MULTIPLIER = .475
GAMMA = 1/7
START_DAY = 0
STOP_DAY = 100

In [3]:
FILE_NAME = f"{START_DATE.date()}_{TODAY}_{FIPS}_{BETA}_{BETA_MULTIPLIER}_sir-model-output"

In [4]:
# Read in usafacts data
case_df = pd.read_csv(
    "https://usafactsstatic.blob.core.windows.net/"
    "public/data/covid-19/covid_confirmed_usafacts.csv",
).set_index("countyFIPS").drop(
    # Remove unallocated cases (FIPS 0 or 1)
    range(2)
).reset_index().melt(
    # Melt dataframe (wide to long format)
    id_vars=["countyFIPS", "County Name", "State", "StateFIPS"],
    value_name="confirmed",
    var_name="date",
).astype({"date": "datetime64"})

In [5]:
# Create a day variable from the date variable
case_df = case_df.assign(
    days=(case_df["date"] - case_df["date"].min()).dt.days
)

# Trim up to but not including day 30
case_df = case_df[case_df["days"].ge(30)]

# Reverse order (highest to lowest day)
case_df = case_df.sort_values(["countyFIPS", "days"], ascending=False)

# Replace incorrect values with missing values
while case_df.groupby("countyFIPS")["confirmed"].pct_change().gt(0).any():
    case_df.loc[
        case_df.groupby("countyFIPS")["confirmed"].pct_change().gt(0),
        "confirmed",
    ] = np.nan

# Replace missing values with previous values
case_df.assign(
    confirmed=case_df["confirmed"].ffill()
)

# Restore the original order (lowest to highest day)
case_df = case_df.sort_values(["countyFIPS", "days"])

case_df = case_df.assign(
    # Calculate new cases from confirmed cases
    new_cases=case_df.groupby("countyFIPS")["confirmed"]
    .diff()
    .fillna(0)
)

In [6]:
# Read in population data
cens_df = pd.read_csv(
    "https://www2.census.gov/programs-surveys/popest/datasets/"
    "2010-2019/counties/totals/co-est2019-alldata.csv",
    usecols=[
        "STATE",
        "COUNTY",
        "STNAME",
        "CTYNAME",
        "POPESTIMATE2019"
    ],
    encoding="latin-1"
)

# Combine state and county fips
cens_df = cens_df.assign(
    county_fips=(
        cens_df["STATE"].astype(str)
        + cens_df["COUNTY"].astype(str).str.zfill(3)
    ).astype(int)
)

In [7]:
# pop_df = pd.read_csv(
#     "https://usafactsstatic.blob.core.windows.net/"
#     "public/data/covid-19/covid_county_population_usafacts.csv",
#     ).set_index("countyFIPS").drop(
#         # Remove unallocated cases (FIPS 0)
#         0
#     ).reset_index()
# pop_df.head()

In [8]:
# Merge data files
df = case_df.merge(
    cens_df,
    left_on="countyFIPS",
    right_on="county_fips",
    how="left"
)

In [9]:
# Calculate model inputs

# Growth rate
df = df.assign(
    gr=df.groupby("countyFIPS")["confirmed"].pct_change()
       / df.groupby("countyFIPS")["days"].diff()
)

# Calculate 7-day moving average of growth rate
df = df.assign(
    smooth_gr=df.groupby("countyFIPS")
    .rolling(window=7, min_periods=1)["gr"]
    .mean()
    .reset_index(level=0, drop=True)
)

# Calculate a rolling mean of new cases added in the previous 7 days
df = df.assign(
    seven_day_new_cases = df.groupby("countyFIPS")
    .rolling(window=7, min_periods=1)["new_cases"]
    .mean()
    .reset_index(level=0, drop=True)
)

# Doubling time and beta
df = df.assign(
    dt=np.log(2) / np.log(df["gr"] + 1),
    smooth_dt=np.log(2) / np.log(df["smooth_gr"] + 1),
    beta=(df["gr"] + GAMMA) / df["POPESTIMATE2019"],
    smooth_beta=(df["smooth_gr"] + GAMMA) / df["POPESTIMATE2019"],
)

# Rt
df = df.assign(
    rt=df["beta"] / GAMMA * df["POPESTIMATE2019"],
    smooth_rt=df["smooth_beta"] / GAMMA * df["POPESTIMATE2019"],
)

In [29]:
susceptible = 10000
beta = 1e-8
infected = 30
new_infected = np.random.binomial(susceptible, beta*infected, susceptible).sum()

In [30]:
new_infected

32

In [49]:
x

'a1b2'

In [10]:
# Define sir function
def sim_sir(
        susceptible: int,
        infected: int,
        recovered: int,
        new_infected: int,
        new_recovered: int,
        beta: float,
        gamma: float = 1 / 7,
        case_adjustment_factor: float = 10,
        start: int = 0,
        stop: int = 90
) -> Iterator[Tuple[
    int,
    float,
    float,
    float,
    float,
    float,
    float,
]]:
    """
    Calculate the number of susceptible people and incidence and prevalence
    of infection and recovery in a population based on initial values.

    Core SIR model function that forecasts new
    - susceptible (S),
    - infected (I), and
    - recovered (R) values

    Arguments:
        susceptible: Initial number of susceptible people.
        total_infected: Initial number of total confirmed infections.
        total_recovered: Initial number of people who have recovered.
        new_infected: Initial number of new infections.
        new_recovered: Initial number of newly recovered individuals.
        beta: The effective contact rate during social distancing.
            BETA = (GROWTH_RATE + GAMMA) / SUSCEPTIBLE
        gamma: The inverse of the recovery length, e.g.
            RECOVERY_TIME = 7
            GAMMA = 1 / RECOVERY_TIME
        case_adjustment_factor: A multiplier to estimate the true case count.
            Under-reporting of cases is common because many are asymptomatic.
            The case adjustment factor compensates for under-reporting.
        start: The day for which initial values are provided, e.g. day 0.
        stop: The total number of days to forecast (daily projections).

    Yields:
        Collected values across each day from start to end.
        1. day
        2. susceptible
        3. total infected (infected prevalence)
        4. total recovered (recovered prevalence)
        5. new cases (infected incidence)
        6. newly recovered individuals (recovered incidence)
    """
    # Yield initial values
    yield (
        start,
        susceptible,
        infected,
        recovered,
        new_infected,
        new_recovered,
        beta
    )
    # Iterate over days and yield model outputs
    for day in range(start + 1, stop + 1):
        # New confirmed infections (new positive tests)
        infected_incidence = np.random.binomial(susceptible, beta*infected, susceptible).sum()
        # Number of infected who have just recovered
        recovered_incidence = gamma * total_infected
        # Number of people who are susceptible as of this time point
        susceptible_new = (
                susceptible
                - case_adjustment_factor
                * infected_incidence
        )
        # Total number of people who are infected as of this time point
        infected_prevalence = (
                infected_incidence
                + total_infected
                - recovered_incidence
        )
        # Total number of people who have recovered as of this time point
        recovered_prevalence = (
                recovered_incidence
                + total_recovered
                + infected_incidence * max(1., case_adjustment_factor - 1)
        )
        yield (
            day,
            susceptible_new,
            infected_prevalence,
            recovered_prevalence,
            infected_incidence,
            recovered_incidence,
            beta
        )
        susceptible, total_infected, total_recovered = (
            susceptible_new, infected_prevalence, recovered_prevalence
        )

In [11]:
# Pick county and start date - Model Lever 1 and 2
county = df["countyFIPS"] == FIPS
start_date = df["date"] == START_DATE

# Get model inputs
county_df = df[county].set_index("date")

In [12]:
# Get historical confirmed from day 0 onward
historical = df[county]
historical.to_excel(f"{TODAY}_{FIPS}_history.xlsx")

In [52]:
# Concatenate a and b
# If both are empty, None
a, b = "a", "b"
x = y if (y := a + b) else None

In [53]:
x

'a12b23'

In [13]:
# Create sir output dataframe
sir_df = pd.DataFrame(
    sir(
        susceptible=county_df.loc[START_DATE, "POPESTIMATE2019"],
        total_infected=county_df.loc[START_DATE, "confirmed"],
        total_recovered=0,
        new_infected=0,
        new_recovered=0,
        beta=(BETA if BETA else county_df.loc[START_DATE, "smooth_beta"]) * BETA_MULTIPLIER,
        gamma=GAMMA,
        start=START_DAY,
        stop=STOP_DAY
    ),
    columns=(
        "forecast_days",
        "susceptible",
        "total_infected",
        "total_recovered",
        "infected_incidence",
        "new_recovered",
        "input_beta"
    )
).assign(
    start_date=START_DATE,
)

In [14]:
sir_df = sir_df.assign(
    date=START_DATE + pd.to_timedelta(sir_df['forecast_days'], unit='d'),
    confirmed_forecast=sir_df["susceptible"][0]-sir_df["susceptible"]+sir_df["total_infected"][0]
)

In [15]:
# merge sir and historical
df = historical.set_index("date").join(sir_df.set_index("date"))

In [16]:
df = df.assign(loss=df["new_cases"].subtract(df["infected_incidence"]))
df = df.assign(mse=df["loss"].pow(2).mean())
df = df.assign(rmse=df["mse"].pow(.5))

In [19]:
df.to_excel(f"{FILE_NAME}.xlsx")
df.to_csv(f"{FILE_NAME}.csv", index=False)

In [17]:
def solve_sir(data, beta, gamma, s0, i0, r0):
    """
    RMSE between actual confirmed cases and the estimated infectious people with given beta and gamma.
    """
    size = 
    def SIR(t, y):
        S, I, R = y
        return [-beta*S*I, beta*S*I-gamma*I, gamma*I]
    return solve_ivp(SIR, [0, size], [S_0,I_0,R_0], t_eval=np.arange(0, size, 1), vectorized=True)
    size = len(data)
    def sir(s, i, r):
        return [-beta*s*i, beta*s*i-gamma*i, gamma*i]
    solution = solve_ivp(sir, [0, size], [s0, i0, r0], t_eval=np.arange(0, size, 1), vectorized=True)
    
def get_loss(beta, gamma, data, s0, i0, r0):
    return np.sqrt(np.mean((solve_sir(beta, gamma, data, s0, i0, r0).y[1] - data)**2))

# TODO use curve_fit to find best beta and gamma
def get_best_params(beta, gamma, data, s0, i0, r0):
    return optimize.curve_fit(solve_sir, xdata, ydata)

In [None]:
solve_sir()