In [1]:
from data_filters.config import get_config_from_file
from data_filters.main import build_filter_runner_from_config, build_smoother_from_config, univariate_process
from data_filters.utilities import mirror
from data_filters.plots import UnivariatePlotter
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [2]:
def get_df_from_raw_data(
    df: pd.DataFrame, first_date=None, last_date=None
) -> pd.DataFrame:
    df["Calculated_timestamp"] = pd.to_datetime(df["Calculated_timestamp"])
    data = df.set_index("Calculated_timestamp")
    if first_date:
        first_date = pd.to_datetime(first_date, format="%Y-%m-%d")
        data = data.loc[data.index > first_date]
    if last_date:
        last_date = pd.to_datetime(last_date, format="%Y-%m-%d")
        data = data.loc[data.index < last_date]
    value_columns = [col for col in data.columns if ("value" in col)]
    quality_columns = [col for col in data.columns if ("qualityFlag" in col)]
    data = data.dropna(subset=value_columns, how="all", axis=0)
    data[quality_columns] = data[quality_columns].astype(bool)
    return data.loc[:, value_columns + quality_columns]

In [3]:
def prepare_time_series(
    ts: pd.Series,
    data: pd.DataFrame,
    flow_name: str,
    use_log=False,
    use_flow=False,
    is_flag=False,
) -> pd.Series:
    series_name = "SRAS"
    if use_flow:
        ts = ts * data[flow_name] / 1e3  # Tgc/d
        series_name += " x Débit"
    ts = ts.dropna()

    if use_log:
        ts = np.log(ts + 0.001)
        series_name = f"log({series_name})"

    series_name = "Manually flagged" if is_flag else series_name
    ts.name = series_name
    return ts.asfreq("D")

In [4]:
def retrieve_flagged_data(
    data: pd.DataFrame, series_name: str, quality_name: str
) -> pd.Series:
    ts = data[series_name]  # gc/ml
    flags = data[quality_name].fillna(False)
    flagged = ts.loc[flags]
    flagged.name = "Flagged"
    return flagged

In [5]:
def remove_flagged(time_series: pd.Series, flags: pd.Series) -> pd.Series:
    df = pd.concat([time_series, flags], axis=1)
    flag_name = flags.name
    df["keep"] = df.apply(lambda x: np.isnan(x[flag_name]), axis=1)
    return time_series.loc[df["keep"]]


In [6]:
def write_units(use_log: bool, use_flow: bool) -> str:
    units = "1e9 cg/j" if use_flow else "cg/j"
    if use_log:
        units = f"log ({units})"
    return units

In [7]:
CONFIG_PATH = "+flow-log.yaml"
config = get_config_from_file(CONFIG_PATH)
DATA_PATH = "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/wide tables 2022/qc_01.csv"

SARS_NAME = "WWMeasure_covn1_gcml_single-to-mean_value"
SARS_QUALITY_NAME = "WWMeasure_covn1_gcml_single-to-mean_qualityFlag"
FLOW_NAME = "SiteMeasure_wwflow_m3d_single-to-mean_value"

YEAR = 2022

start = config.filtration_period.start
end = config.filtration_period.end

use_mirror = True
use_log = "+" in CONFIG_PATH.split("flow")[1]
use_flow = "+" in CONFIG_PATH.split("flow")[0]
remove_flags = False

raw_data = pd.read_csv(DATA_PATH)
data = get_df_from_raw_data(raw_data, first_date=start, last_date=end)

if YEAR == 2021:
    SARS_NAME = SARS_NAME.replace("n1", "n2")
    SARS_QUALITY_NAME = SARS_QUALITY_NAME.replace("n1", "n2")

flagged = retrieve_flagged_data(data, SARS_NAME, SARS_QUALITY_NAME)
original_time_series = data[SARS_NAME]
flagged = prepare_time_series(
    flagged, data, FLOW_NAME, use_log, use_flow, is_flag=True
)
original_time_series = prepare_time_series(
    original_time_series, data, FLOW_NAME, use_log, use_flow, is_flag=False
)
if remove_flags:
    time_series = remove_flagged(original_time_series, flagged)
else:
    time_series = original_time_series.copy()
time_series = time_series.asfreq("D").interpolate()
units = write_units(use_log, use_flow)

if use_mirror:
    # lengthen the series by mirroring
    mirrored_df = mirror(time_series, first_date=start, last_date=end)
    mirrored_df = mirror(mirrored_df)

    # fill gaps
    time_series = mirrored_df.interpolate()

# seven-day rolling average
seven_days = time_series.rolling(window=7, center=True).mean()

filter_runner = build_filter_runner_from_config(config)
smoother = build_smoother_from_config(config)

filter_results = univariate_process(
    raw_data=time_series,
    calibration_period=config.calibration_period,
    filter_runner=filter_runner,
    smoother=smoother
)

# plotter
plotter = UnivariatePlotter(
    signal_name="SARS", df=filter_results, template="plotly_white", language="french"
)
fig = plotter.plot()

raw_trace = go.Scatter(
    x=original_time_series.index,
    y=original_time_series,
    name="Signal brut",
    marker=dict(color="#bbbbbb"),
)
fig.add_trace(raw_trace)

# flagged_trace = go.Scatter(x=flagged.index, y=flagged, name="Annoté manuellement", line=dict(color="#D62728"), mode='markers', marker=dict(symbol="star", size=15), showlegend=True)
# fig.add_trace(flagged_trace)

seven_day_trace = go.Scatter(
    x=seven_days.index,
    y=seven_days,
    name="Liss. moy. 7j",
    marker=dict(color="#FF6F00"),
)
fig.add_trace(seven_day_trace)

start = time_series.first_valid_index() if start is None else start
end = (time_series.last_valid_index() if end is None else end) + pd.to_timedelta(1, "D")
fig.update_layout(
    dict(
        # template="presentation",
        title=f"Surveillance des eaux usées - Ville de Québec, Station Est - {time_series.name}"
    ),
    yaxis=dict(title=f"Valeur ({units})"),
    xaxis=dict(title="Jour d'échantillonnage", range=[start, end]),
),
fig.update_layout(dict(hovermode="x unified", width=1000, height=800))
fig.update_traces(hovertemplate="%{y:.2f}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smooth_df.drop(["input_values"], axis=1, inplace=True)


In [8]:
fig.write_html("sars_x_flow.html")

In [9]:
import json
from json import JSONEncoder


In [13]:
smoother.__dict__

{'control_parameters': {'size': 2},
 'algorithm': None,
 'signal_model': None,
 'uncertainty_model': None,
 'current_position': 163,
 'input_data':             accepted_values
 date                       
 2021-11-20        42.494004
 2021-11-21        69.881528
 2021-11-22        69.881528
 2021-11-23      1668.654437
 2021-11-24       634.699395
 ...                     ...
 2022-04-29       634.699395
 2022-04-30      1668.654437
 2022-05-01      1948.908651
 2022-05-02      1547.234806
 2022-05-03        42.494004
 
 [165 rows x 1 columns],
 'results': [FilterRow(date=NaT, input_values=array([nan]), inputs_are_outliers=array([False]), accepted_values=array([nan]), predicted_values=array([nan]), predicted_upper_limits=array([nan]), predicted_lower_limits=array([nan])),
  FilterRow(date=NaT, input_values=array([nan]), inputs_are_outliers=array([False]), accepted_values=array([nan]), predicted_values=array([nan]), predicted_upper_limits=array([nan]), predicted_lower_limits=array([nan]

In [65]:
from data_filters.protocols import FilterRow
import numpy as np

a = FilterRow(
    date = pd.to_datetime("22 may 2022"),
    input_values=np.array([1]),
    inputs_are_outliers=np.array([1]),
    accepted_values=np.array([2]),
    predicted_values=np.array([2]),
    predicted_lower_limits=np.array([0]),
    predicted_upper_limits=np.array([3])
)

class FilterRowEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, FilterRow):
            return {f'__{obj.__class__.__name__}__': obj.__dict__}
        if isinstance(obj, pd.Timestamp):
            return json.dumps(obj, cls=TimestampEncoder)
        if isinstance(obj, np.ndarray):
            return json.dumps(obj, cls=NumpyEncoder)
        else:
            return json.JSONEncoder.default(self, obj)
            
def decode_filter_row(obj):
    if '__FilterRow__' in obj:
        a = FilterRow(
            obj['__FilterRow__']['date'],
            obj['__FilterRow__']['input_values'],
            obj['__FilterRow__']['inputs_are_outliers'],
            obj['__FilterRow__']['accepted_values'],
            obj['__FilterRow__']['predicted_values'],
            obj['__FilterRow__']['predicted_lower_limits'],
            obj['__FilterRow__']['predicted_upper_limits'])
        a.__dict__.update(obj['__FilterRow__'])
        return a
    elif '__Timestamp__' in obj:
        return decode_timestamp(obj)
    elif '__Array__' in obj:
         return decode_numpy(obj)
    else:
        return obj

a = json.dumps(a, cls=FilterRowEncoder)
b = json.loads(a, object_hook=decode_filter_row)
b

FilterRow(date='{"__Timestamp__": "2022-05-22 00:00:00"}', input_values='[1]', inputs_are_outliers='[1]', accepted_values='[2]', predicted_values='[2]', predicted_upper_limits='[3]', predicted_lower_limits='[0]')

Timestamp('2022-05-22 00:00:00')

numpy.int64