In [None]:
import pandas as pd
import numpy as np
from wbe.constants import DATA_PATH

In [None]:
# Get data: load the CDC extract shipped under DATA_PATH.
data = pd.read_csv(DATA_PATH / "wbe/cdc_data.csv")

# Unit label reported for each row; it decides how the concentration is read.
units = data["pcr_target_units"]

# Comparing a missing unit with `==` yields False, so the negated masks below
# would flag unit-less rows as both "linear" and "liquid". `has_units` is used
# to keep such rows out of the linear-liquid category.
has_units = units.notna()

# Get masks
log_mask = units == "log10 copies/l wastewater"
linear_mask = ~log_mask
solid_mask = units == "copies/g dry sludge"
liquid_mask = ~solid_mask

# Three possible values in the dataset
log_liquid = log_mask & liquid_mask
# The only category defined purely by negation, hence the explicit guard
# against rows whose unit is missing.
lin_liquid = has_units & linear_mask & liquid_mask
lin_solid = linear_mask & solid_mask
# NOTE(review): any non-missing unit string other than the two matched above
# still lands in `lin_liquid` — confirm the dataset really has only three
# unit values before relying on this.

# Initialise new columns (NaN means "no measurement of this kind on this row")
data["liquid_pcr_conc"] = np.nan
data["solid_pcr_conc"] = np.nan

# Fill: linear values are copied as-is, log10 values are exponentiated back
# to the linear scale so the liquid column uses a single scale.
data.loc[lin_liquid, "liquid_pcr_conc"] = data.loc[lin_liquid, "pcr_target_avg_conc"]
data.loc[log_liquid, "liquid_pcr_conc"] = 10.0 ** data.loc[log_liquid, "pcr_target_avg_conc"]
data.loc[lin_solid, "solid_pcr_conc"] = data.loc[lin_solid, "pcr_target_avg_conc"]

# Parse collection dates so later cells can sort and index by real timestamps.
data["sample_collect_date"] = pd.to_datetime(data["sample_collect_date"])

In [None]:
def _median_by_shed_and_date(frame: pd.DataFrame, conc_col: str) -> pd.DataFrame:
    """Collapse raw rows to one observation per sewershed and collection date.

    Parameters
    ----------
    frame
        Rows to aggregate; must contain ``sewershed_id``,
        ``sample_collect_date`` and the column named by ``conc_col``.
    conc_col
        Name of the concentration column to summarise.

    Returns
    -------
    pd.DataFrame
        One row per (``sewershed_id``, ``sample_collect_date``) pair with
        ``pcr_conc`` (median of ``conc_col``) and ``n_raw_rows`` (number of
        raw rows in the group), sorted by sewershed then date. The frame is
        indexed by ``sample_collect_date`` and the date is also kept as a
        regular column.
    """
    group_keys = ["sewershed_id", "sample_collect_date"]
    per_day = (
        frame.groupby(group_keys, as_index=False)
        .agg(
            pcr_conc=(conc_col, "median"),
            n_raw_rows=(conc_col, "size"),
        )
        .sort_values(group_keys)
    )
    # Index by date but keep the column. Dates repeat across sewersheds, so
    # the resulting index is not unique.
    return per_day.set_index("sample_collect_date", drop=False)


# Separate the targets: keep only rows that carry a value for each sample type.
liquid_data = data[data["liquid_pcr_conc"].notna()].copy()
solid_data = data[data["solid_pcr_conc"].notna()].copy()

# Get unique records by date and shed
liquid_obs = _median_by_shed_and_date(liquid_data, "liquid_pcr_conc")
solid_obs = _median_by_shed_and_date(solid_data, "solid_pcr_conc")