In [2]:
import clean
import holoviews as hv
import numpy as np
import polars as pl
from clean import data_dir

In [3]:
# Read the two parquet inputs used throughout this notebook from the project
# data directory: the SAS sensor table and the joined samples table.
# (File contents are not visible here; later cells read the "time" and
# "sensor_voltage" columns from both frames.)
df_sas, df_join = (
    pl.read_parquet(data_dir(relative_path))
    for relative_path in ("sas/avg_cleaned.pq", "samples/simple_joined.pq")
)

In [4]:
def _load_npy(relative_path):
    """Load a saved NumPy array located under the project data directory."""
    return np.load(clean.data_dir(relative_path))


# Observed sensor voltages, used as the reference distribution.
sas_true = df_sas["sensor_voltage"]

# Saved model outputs. Judging by the file names, "*_mse" and "*_wgh" are two
# variants of the same model (presumably plain vs. weighted loss; confirm
# against the training code).
linear_pred = _load_npy("pred/linear.npy")
svm_pred = _load_npy("pred/svm.npy")
base_pred = _load_npy("pred/base_mse.npy")
base_wgh = _load_npy("pred/base_wgh.npy")
space_pred = _load_npy("pred/lstm_mse.npy")
space_wgh = _load_npy("pred/lstm_wgh.npy")
time_pred = _load_npy("pred/time_mse.npy")
time_wgh = _load_npy("pred/time_wgh.npy")

In [16]:
# Display label -> values for every distribution being compared. Prediction
# arrays are flattened to 1-D so each element counts as one observation.
labelled_values = {
    "Actual (SAS)": sas_true,
    "Linear": linear_pred.flatten(),
    "SVM": svm_pred.flatten(),
    "Simple": base_pred.flatten(),
    "Simple, DWMSE": base_wgh.flatten(),
    "Space": space_pred.flatten(),
    "Space, DWMSE": space_wgh.flatten(),
    "Time": time_pred.flatten(),
    "Time, DWMSE": time_wgh.flatten(),
}

# One frame per distribution, each holding a single struct column with the
# fields `name` (the label, broadcast) and `val` (the values).
labelled_frames = [
    pl.select(pl.struct(name=pl.lit(label), val=values))
    for label, values in labelled_values.items()
]

# Share of each distribution's observations that fall in a given voltage bin.
relative_frequency = (
    pl.col("Frequency")
    .truediv(pl.col("Frequency").sum())
    .over("Distribution")
    .alias("Relative frequency")
)

freq_df: pl.DataFrame = (
    # "vertical_relaxed" stacks the frames even when their dtypes differ.
    pl.concat(labelled_frames, how="vertical_relaxed")
    .lazy()
    .unnest("name")  # expand the struct column into `name` and `val`
    .with_row_index()
    # Bin by label and by voltage rounded to the nearest whole number.
    .group_by(Distribution="name", Voltage=pl.col("val").round())
    .agg(Frequency=pl.col("index").count())
    .with_columns(relative_frequency)
    .sort("Distribution", "Voltage")
    .collect()
)
freq_df.plot.line(
    x="Voltage",
    y="Relative frequency",
    by="Distribution",
    title="Full distribution chart",
)

In [23]:
# Compare the measured SAS distribution with the non-DWMSE model outputs,
# restricted to the -10..80 voltage range.
mse_labels = ["Actual (SAS)", "Simple", "SVM", "Space", "Time"]
mse_selection = pl.col("Distribution").is_in(mse_labels)
mse_voltage_window = pl.col("Voltage").is_between(-10, 80)

freq_df.filter(mse_selection & mse_voltage_window).plot.line(
    x="Voltage",
    y="Relative frequency",
    by="Distribution",
    title="Model output voltage distributions",
    # First series dashed, the remaining four solid.
    line_dash=["dashed", "solid", "solid", "solid", "solid"],
    ylim=(-0.01, 0.3),
)

In [22]:
# Compare the measured SAS distribution with the three ", DWMSE" model
# outputs, restricted to the -10..120 voltage range.
dwmse_labels = ["Actual (SAS)", "Space, DWMSE", "Time, DWMSE", "Simple, DWMSE"]
dwmse_selection = pl.col("Distribution").is_in(dwmse_labels)
dwmse_voltage_window = pl.col("Voltage").is_between(-10, 120)

freq_df.filter(dwmse_selection & dwmse_voltage_window).plot.line(
    x="Voltage",
    y="Relative frequency",
    by="Distribution",
    title="Adjusted loss model output voltage distributions",
    color=["#31A2DA", "#FD9857", "#78D32B", "#C6A6A6"],
    # First series dashed, the remaining three solid.
    line_dash=["dashed", "solid", "solid", "solid"],
    ylim=(-0.01, 0.3),
)

In [19]:
# High-voltage tail (rounded voltage >= 100) of the measured data and the
# adjusted-loss (", DWMSE") models, shown as absolute counts.
#
# The distributions are selected explicitly so the chart always matches its
# title and the four colours / dash styles supplied below. With a voltage-only
# filter, any other model that happens to output >= 100 would be drawn too,
# with styling that was not chosen for it.
high_voltage_labels = [
    "Actual (SAS)",
    "Simple, DWMSE",
    "Space, DWMSE",
    "Time, DWMSE",
]
freq_df.filter(
    pl.col("Distribution").is_in(high_voltage_labels)
    & pl.col("Voltage").ge(100)
).plot.line(
    x="Voltage",
    y="Frequency",
    by="Distribution",
    title="Adjusted loss high-voltage output distributions",
    color=["#31A2DA", "#FD9857", "#78D32B", "#C6A6A6"],
    line_dash=["dotted"] + ["solid"] * 3,
)

In [20]:
# Measured SAS distribution next to both "Time" model variants, over the full
# voltage range. Line dashes are left at the plotting default.
time_labels = ["Actual (SAS)", "Time", "Time, DWMSE"]
time_selection = pl.col("Distribution").is_in(time_labels)

freq_df.filter(time_selection).plot.line(
    x="Voltage",
    y="Relative frequency",
    by="Distribution",
    title="Distributions of SAS data and Time model, repeated",
    color=["#31A2DA", "#8B8B8B", "#C6A6A6"],
)

In [21]:
# Highlighted point on the curve. Both numbers are hard-coded (they also appear
# in the title below), so they must be updated by hand if the underlying data
# changes. NOTE(review): presumably read off this same curve; confirm.
marker_voltage = 140
marker_count = 253

# Shared look of the two dotted guide lines.
guide_style = dict(color="blue", line_dash="dotted", line_width=1)

# For the measured data at rounded voltage >= 120: number of observations at or
# above each voltage bin (reverse cumulative sum; freq_df is sorted by Voltage
# within each distribution).
sas_tail_cumulative = freq_df.filter(
    pl.col("Distribution").eq("Actual (SAS)") & pl.col("Voltage").ge(120)
).with_columns(
    pl.col("Frequency").cum_sum(reverse=True).alias("Cumulative frequency")
)

marker_point = pl.DataFrame(
    {"Voltage": [marker_voltage], "Cumulative frequency": [marker_count]}
)

# Overlay: horizontal guide, vertical guide, the curve, and the marker point.
(
    hv.HLine(marker_count).opts(**guide_style, xlim=(120, 200))
    * hv.VLine(marker_voltage).opts(**guide_style, ylim=(0, 500))
    * sas_tail_cumulative.plot.line(
        x="Voltage",
        y="Cumulative frequency",
        title="SAS high-voltage cumulative frequencies (253 measurements ≥ 140V)",
        xlabel="Voltage",
        ylabel="Cumulative frequency",
    )
    * marker_point.plot.scatter(x="Voltage", y="Cumulative frequency")
)

In [11]:
# Mean sensor voltage for every (weekday, hour-of-day) combination, drawn as
# one line per weekday (polars numbers weekdays 1 = Monday ... 7 = Sunday).
mean_by_day_hour = (
    df_sas.lazy()
    .group_by(
        day=pl.col("time").dt.weekday(),
        hour=pl.col("time").dt.hour(),
    )
    .agg(pl.col("sensor_voltage").mean())
    .collect()
    .sort("day", "hour")  # group_by output order is not guaranteed
)
mean_by_day_hour.plot.line(
    x="hour",
    y="sensor_voltage",
    by="day",
    title="Mean voltage by day, hour",
    ylabel="Voltage",
    xlabel="Hour of day",
)

In [12]:
# Median sensor voltage for every (weekday, hour-of-day) combination, drawn as
# one line per weekday (polars numbers weekdays 1 = Monday ... 7 = Sunday).
# Axis labels are set explicitly so this chart reads the same as the mean
# voltage chart instead of showing raw column names.
df_sas.lazy().group_by(
    day=pl.col("time").dt.weekday(),
    hour=pl.col("time").dt.hour(),
).agg(pl.col("sensor_voltage").median()).collect().sort("day", "hour").plot.line(
    x="hour",
    y="sensor_voltage",
    by="day",
    title="Median voltage by day, hour",
    ylabel="Voltage",
    xlabel="Hour of day",
)

In [13]:
# Per hour of day: minimum, median, 95th / 99th percentile and maximum of the
# SAS sensor voltage, drawn as one line per statistic.
sas_voltage = pl.col("sensor_voltage")

sas_hourly_quantiles = (
    df_sas.lazy()
    .group_by(hour=pl.col("time").dt.hour())
    .agg(
        low=sas_voltage.min(),
        med=sas_voltage.median(),
        q99=sas_voltage.quantile(0.99),
        q95=sas_voltage.quantile(0.95),
        high=sas_voltage.max(),
    )
    .collect()
    .sort("hour")
)
sas_hourly_quantiles.plot.line(
    x="hour",
    y=["low", "med", "q95", "q99", "high"],
    title="Quantiles of sensor data",
    xlabel="Hour of day",
    ylabel="Voltage",
)

In [14]:
# Same hourly summary as for the SAS table, computed on the joined samples
# table: minimum (q0), median (q50), 95th / 99th percentile and maximum (q100).
joined_voltage = pl.col("sensor_voltage")

joined_hourly_quantiles = (
    df_join.lazy()
    .group_by(hour=pl.col("time").dt.hour())
    .agg(
        q0=joined_voltage.min(),
        q50=joined_voltage.median(),
        q95=joined_voltage.quantile(0.95),
        q99=joined_voltage.quantile(0.99),
        q100=joined_voltage.max(),
    )
    .collect()
    .sort("hour")
)
joined_hourly_quantiles.plot.line(
    x="hour",
    y=["q0", "q50", "q95", "q99", "q100"],
    title="Quantiles of joined sensor data (after RTM linking)",
    xlabel="Hour of day",
    ylabel="Voltage",
)

In [15]:
# Voltage histogram of the joined samples for five selected hours of the day:
# rows are counted per (hour, voltage rounded to a whole number) and drawn as
# one line per hour.
sample_hour = pl.col("time").dt.hour()

joined_counts_by_hour = (
    df_join.lazy()
    .filter(sample_hour.is_in([3, 8, 12, 17, 22]))
    .group_by(hour=sample_hour, voltage=pl.col("sensor_voltage").round())
    .agg(num=pl.col("sensor").len())
    .sort("hour", "voltage")
    .collect()
)
joined_counts_by_hour.plot.line(
    x="voltage",
    y="num",
    by="hour",
    title="Distribution of joined sensor data (after RTM linking)",
    xlabel="Voltage",
    ylabel="Frequency",
)

In [None]:
import matplotlib.pyplot as plt
import polars as pl
from clean import constants

In [None]:
# SAS sensor table; the map cell below reads its "longitude" and "latitude"
# columns.
sas_parquet_path = constants.data_dir("sas/avg_cleaned.pq")
df_sas = pl.read_parquet(sas_parquet_path)

In [None]:
# Coordinates rounded to two decimals act as grid cells for counting rows.
lat_cell = pl.col("lat").round(2)
lon_cell = pl.col("lon").round(2)

# Cell count relative to the busiest cell, compressed with a fourth root
# (two square roots) so sparsely populated cells are not squashed towards zero.
compressed_density = pl.col("count").truediv(pl.col("count").max()).sqrt().sqrt()

df_rtm = (
    pl.scan_parquet(constants.data_dir("mtps/gps_3.pq"))
    .with_row_index()
    .group_by(lat_cell, lon_cell)
    .agg(count=pl.col("index").count())
    .with_columns(compressed_density)  # overwrites "count" in place
    # Ascending order: when scattered, the densest cells are drawn last.
    .sort("count")
    .collect()
)
df_rtm.head()

In [None]:
# Map view: gridded RTM density as small colour-coded dots, with the SAS
# positions overlaid as larger red dots.
fig, ax = plt.subplots()
ax.scatter(x=df_rtm["lon"], y=df_rtm["lat"], alpha=1, s=2, c=df_rtm["count"])
ax.scatter(x=df_sas["longitude"], y=df_sas["latitude"], s=15, color="red")
fig.set_dpi(800)
# NOTE(review): the ratio looks like metres per degree of latitude over metres
# per degree of longitude at the mapped latitude; confirm the source of these
# constants.
ax.set_aspect(111_139 / 87_578)
plt.show()