In [1]:
import pandas as pd
import numpy as np

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os

In [2]:
wide_dirs = [
    (
        "CentrEauCOVID-2021",
        "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/wide tables",
    ),
    (
        "CentrEauCOVID-2022",
        "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/wide tables 2022",
    ),
    (
        "St-Paul",
        "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/MI/Output/wide tables",
    ),
]
inventory = {key: {} for key, _ in wide_dirs}
for year, dir in wide_dirs:
    for file in os.listdir(dir):
        if ".parquet" not in file:
            continue
        if (
            "qc" not in file
            and "mtl" not in file
            and "lvl" not in file
            and "gtn" not in file
            and "mi" not in file
        ):
            continue

        df = pd.read_parquet(os.path.join(dir, file))
        df = df["2020-09-01":]
        values = [
            col
            for col in df.columns.to_list()
            if "value" in col
            and ("WWMeasure" in col or "SiteMeasure" in col or "CPHD" in col)
        ]
        values = [
            col
            for col in values
            if "cov" in col
            or "pmmov" in col
            or "brsv" in col
            or "flow" in col
            or "conf" in col
        ]
        values = [
            col
            for col in values
            if "pct" in col or "gcml" in col or "m3d" in col or "report" in col
        ]

        flow_col = "SiteMeasure_wwflow_m3d_single-to-mean_value"
        if flow_col not in df.columns:
            inventory[year][file.split(".")[0]] = 0
            continue
        n1_col = "WWMeasure_covn1_gcml_single-to-mean_value"
        inventory[year][file.split(".")[0]] = len(
            df[[n1_col, flow_col]].dropna(how="any")
        )

        fig = make_subplots(rows=len(values), cols=1)
        for i, value in enumerate(values):
            if "SiteMeasure" in value or "CPHD" in value:
                fig.add_trace(
                    go.Scatter(
                        x=df.index,
                        y=df[value],
                        name=value,
                        mode="markers",
                        marker={"symbol": "circle"},
                    ),
                    row=i + 1,
                    col=1,
                )
            else:
                df_true = df[df[value.replace("value", "qualityFlag")] == "True"]
                df_false = df[df[value.replace("value", "qualityFlag")] == "False"]

                fig.add_trace(
                    go.Scatter(
                        x=df_false.index,
                        y=df_false[value],
                        name=value + " Good Q",
                        mode="markers",
                        marker={"symbol": "circle"},
                    ),
                    row=i + 1,
                    col=1,
                )
                fig.add_trace(
                    go.Scatter(
                        x=df_true.index,
                        y=df_true[value],
                        name=value + " Bad Q",
                        mode="markers",
                        marker={"symbol": "x"},
                    ),
                    row=i + 1,
                    col=1,
                )
            fig.update_yaxes(title_text=value.split("_")[1], row=i + 1, col=1)
        fig.update_layout(title_text=file.replace(".parquet", "") + " " + str(year))
        fig.show()

    # Path: scratch.ipynb
    print(inventory)

{'CentrEauCOVID-2021': {'lvl_02': 91, 'qc_02': 297, 'lvl_01': 90, 'qc_01': 296, 'lvl_05': 45, 'mtl_01': 0, 'mtl_02': 0}, 'CentrEauCOVID-2022': {}, 'St-Paul': {}}


{'CentrEauCOVID-2021': {'lvl_02': 91, 'qc_02': 297, 'lvl_01': 90, 'qc_01': 296, 'lvl_05': 45, 'mtl_01': 0, 'mtl_02': 0}, 'CentrEauCOVID-2022': {'lvl_02': 354, 'qc_02': 331, 'lvl_01': 355, 'qc_01': 334, 'lvl_05': 350, 'gtn_01': 306, 'mtl_01': 346, 'mtl_02': 346}, 'St-Paul': {}}


{'CentrEauCOVID-2021': {'lvl_02': 91, 'qc_02': 297, 'lvl_01': 90, 'qc_01': 296, 'lvl_05': 45, 'mtl_01': 0, 'mtl_02': 0}, 'CentrEauCOVID-2022': {'lvl_02': 354, 'qc_02': 331, 'lvl_01': 355, 'qc_01': 334, 'lvl_05': 350, 'gtn_01': 306, 'mtl_01': 346, 'mtl_02': 346}, 'St-Paul': {'mi_01': 603}}


In [3]:
os.listdir(wide_dirs[0][1])

['lvl_02.parquet',
 'qc_02.parquet',
 'trpis_01.parquet',
 'lvl_01.parquet',
 'qc_01.parquet',
 'lvl_05.parquet',
 'mtl_01.parquet',
 'stak_01.parquet',
 'rdl_01.parquet',
 'mtne_01.parquet',
 'mtl_02.parquet',
 'riki_01.parquet']

In [4]:
inv = pd.DataFrame(inventory)
inv["total"] = inv.sum(axis=1)
inv.sort_index(inplace=True)
inv

Unnamed: 0,CentrEauCOVID-2021,CentrEauCOVID-2022,St-Paul,total
gtn_01,,306.0,,306.0
lvl_01,90.0,355.0,,445.0
lvl_02,91.0,354.0,,445.0
lvl_05,45.0,350.0,,395.0
mi_01,,,603.0,603.0
mtl_01,0.0,346.0,,346.0
mtl_02,0.0,346.0,,346.0
qc_01,296.0,334.0,,630.0
qc_02,297.0,331.0,,628.0


In [5]:
value_cols = [col for col in df1.columns if "value" in col]
fig = px.scatter(df1[value_cols])
fig.show()

NameError: name 'df1' is not defined

In [6]:
wide_dirs = [
    (
        2021,
        "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/wide tables",
    ),
    (
        2022,
        "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/wide tables 2022",
    ),
]
for year, dir in wide_dirs:
    for file in os.listdir(dir):
        if ".parquet" not in file:
            continue
        if (
            "qc" not in file
            and "mtl" not in file
            and "lvl" not in file
            and "gtn" not in file
        ):
            continue

        df = pd.read_parquet(os.path.join(dir, file))

        values = [
            col
            for col in df.columns.to_list()
            if "value" in col and "SiteMeasure" in col
        ]
        if len(values) == 0:
            continue
        fig = make_subplots(rows=len(values), cols=1)
        for i, value in enumerate(values):
            fig.add_trace(
                go.Scatter(
                    x=df.index,
                    y=df[value],
                    name=value,
                    mode="markers",
                    marker={"symbol": "circle"},
                ),
                row=i + 1,
                col=1,
            )
            fig.update_yaxes(title_text=value.split("_")[1], row=i + 1, col=1)
        fig.update_layout(title_text=file.replace(".parquet", "") + " " + str(year))
        fig.show()

In [7]:
wide_mi_path = "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/MI/Output/mi_01.parquet"
df = pd.read_parquet(wide_mi_path)
values = [col for col in df.columns.to_list() if "value" in col]
values = [col for col in values if "ct" not in col and "CPHD" not in col]
if len(values) == 0:
    pass
fig = make_subplots(rows=len(values), cols=1)
for i, value in enumerate(values):
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df[value],
            name=value,
            mode="markers",
            marker={"symbol": "circle"},
        ),
        row=i + 1,
        col=1,
    )
    fig.update_yaxes(title_text=value.split("_")[1], row=i + 1, col=1)
fig.update_layout(title_text=file.replace(".parquet", "") + " " + str(year))
fig.show()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/MI/Output/mi_01.parquet'

In [8]:
wide_dirs = [
    (
        "CentrEauCOVID-2021",
        "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/wide tables",
    ),
    (
        "CentrEauCOVID-2022",
        "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/wide tables 2022",
    ),
    (
        "St-Paul",
        "/Users/jeandavidt/Library/CloudStorage/OneDrive-UniversitéLaval/Université/Doctorat/COVID/Latest Data/MI/Output/wide tables",
    ),
]
inventory = {key: {} for key, _ in wide_dirs}
fig = make_subplots(rows=1, cols=1)
for year, dir in wide_dirs:
    for file in os.listdir(dir):
        if ".parquet" not in file:
            continue
        if (
            "qc" not in file
            and "mtl" not in file
            and "lvl" not in file
            and "gtn" not in file
            and "mi" not in file
        ):
            continue

        df = pd.read_parquet(os.path.join(dir, file))
        df = df["2020-09-01":]
        values = [col for col in df.columns.to_list() if "CPHD" in col]
        values = [col for col in values if "conf" in col]
        values = [col for col in values if "report" in col]

        for i, value in enumerate(values):
            if "CPHD" in value:
                fig.add_trace(
                    go.Scatter(
                        x=df.index,
                        y=df[value],
                        name=value.split("conf")[0].split("CPHD-")[1],
                        mode="markers",
                        marker={"symbol": "circle"},
                    ),
                    row=1,
                    col=1,
                )
            else:
                continue
            fig.update_yaxes(title_text=value.split("_")[1], row=1, col=1)
fig.update_layout(title_text="cases")
fig.show()