In [13]:
from datetime import datetime
import polars as pl
import numpy as np
from pyarrow.dataset import write_dataset

In [14]:
# Fixed seed so that Restart & Run All regenerates the same synthetic data.
# Change RNG_SEED to draw a different (but still reproducible) series.
RNG_SEED = 42
rng = np.random.default_rng(RNG_SEED)

In [15]:
# Read only the "WindDirection" column of the id file.
df = pl.read_csv("external_ids.csv", columns=["WindDirection"])
# Constant column that is True on every row.
df["dummy"] = pl.repeat(n=df.height, value=True, eager=True)
# One starting value per row, drawn uniformly from [0, 360).
df["init_wdir_value"] = pl.Series(rng.uniform(0.0, 360.0, df.height))
df

WindDirection,dummy,init_wdir_value
str,bool,f64
"""wdir1""",True,120.084212
"""wdir2""",True,291.825755
"""wdir3""",True,114.876751
"""wdir4""",True,182.311029
"""wdir5""",True,0.748241
"""wdir6""",True,48.709457
"""wdir7""",True,237.744829
"""wdir8""",True,254.154982


In [16]:
def add_timestamp_partition_columns(df):
    """Add ``year``, ``month`` and ``day`` columns derived from ``df["timestamp"]``.

    The columns are assigned onto ``df`` itself (in place), so callers that
    ignore the return value keep working.

    Parameters
    ----------
    df
        Frame-like object with a ``timestamp`` column whose ``.dt`` accessor
        provides ``year()``, ``month()`` and ``day()``, and which supports
        column assignment via ``df[name] = values``.

    Returns
    -------
    The same ``df`` object that was passed in, so the call can also be used
    as ``df = add_timestamp_partition_columns(df)``.
    """
    timestamps = df["timestamp"]
    df["year"] = timestamps.dt.year()
    df["month"] = timestamps.dt.month()
    df["day"] = timestamps.dt.day()
    return df

In [17]:
# Column order used both to sort each hourly frame and as the `partitioning`
# argument passed to write_dataset in the generation loop.
by = ["year", "month", "day", "id"]

In [18]:
# Running offset for the numeric suffix of datapoint_id. It is carried across
# all hours and days, so the suffix never repeats within one run of this cell.
datapoint_counter = 0
for (month, day) in [(8, 29), (8, 30), (9, 1)]:
    # Hourly frames of the current day; concatenated and written once per day.
    out_wd = []
    for hour in range(0, 24):
        print(month, day, hour)
        # One timestamp every 10 seconds within the current hour.
        timestamps = pl.DataFrame({"timestamp": pl.date_range(low=datetime(2022, month, day, hour, 0, 0),
                                                              high=datetime(2022, month, day, hour, 59, 59), interval="10s")})
        timestamps["dummy"] = pl.repeat(value=True, n=timestamps.height, eager=True)
        # One normally distributed step (mean 0, std 3.6) per (id, timestamp) row.
        wind_direction_deltas = rng.normal(0, 3.6, size=df.height * timestamps.height)
        # Joining on the constant `dummy` key pairs every id with every timestamp.
        wd = df[["init_wdir_value", "WindDirection", "dummy"]].join(timestamps, left_on=["dummy"],
                                                                    right_on=["dummy"]).drop(
            "dummy").rename({"WindDirection": "id"})
        wd["wdir_value"] = wind_direction_deltas
        # Turn the per-row steps into a per-id random walk (cumulative sum within each id).
        # NOTE(review): this positional assignment (and the one updating `df` below)
        # assumes each id's rows are contiguous in `wd` and that ids appear in the
        # same order as in `df` -- confirm for the polars version in use.
        wd["wdir_value"] = wd.groupby(["id"], maintain_order=True).apply(lambda x: pl.DataFrame({"wdir_value": x["wdir_value"].cumsum()}))[
            "wdir_value"]
        # Offset the walk by the value each id ended on in the previous hour.
        wd["wdir_value"] = wd["init_wdir_value"] + wd["wdir_value"]
        # Wrap into [0, 360). Adding 360 before a second modulo maps a negative
        # remainder onto the circle (e.g. -10 -> 350) whichever sign convention
        # the Series `%` uses; the former `.abs()` mirrored it instead (-10 -> 10).
        wd["wdir_value"] = ((wd["wdir_value"] % 360) + 360) % 360
        # Carry the last value of each id over as the starting point of the next hour.
        df["init_wdir_value"] = wd.groupby(["id"], maintain_order=True).apply(lambda x:pl.DataFrame({"init_wdir_value":x["wdir_value"].tail(1)}))["init_wdir_value"]

        wd = wd.drop("init_wdir_value")
        wd = wd.rename({"wdir_value": "value"})
        add_timestamp_partition_columns(wd)
        wd = wd.sort(by)
        # datapoint_id = "<id>_<running counter>", numbered in the sorted row order.
        wd["datapoint_id"] = wd.get_column("id") + "_" + pl.Series("counter", range(datapoint_counter, datapoint_counter + wd.height)).cast(str)
        datapoint_counter += wd.height
        out_wd.append(wd)
    # Write the whole day in one call, partitioned by the columns in `by`.
    wd_arrow = pl.concat(out_wd, rechunk=True).to_arrow()
    write_dataset(wd_arrow, format="parquet", base_dir="timeseries_double", partitioning=by, use_threads=False,
                  existing_data_behavior="overwrite_or_ignore", min_rows_per_group=1073741824, max_rows_per_file=1073741824,
                  max_rows_per_group=1073741824, max_partitions=10000, max_open_files=10000)

8 29 0
8 29 1
8 29 2
8 29 3
8 29 4
8 29 5
8 29 6
8 29 7
8 29 8
8 29 9
8 29 10
8 29 11
8 29 12
8 29 13
8 29 14
8 29 15
8 29 16
8 29 17
8 29 18
8 29 19
8 29 20
8 29 21
8 29 22
8 29 23
8 30 0
8 30 1
8 30 2
8 30 3
8 30 4
8 30 5
8 30 6
8 30 7
8 30 8
8 30 9
8 30 10
8 30 11
8 30 12
8 30 13
8 30 14
8 30 15
8 30 16
8 30 17
8 30 18
8 30 19
8 30 20
8 30 21
8 30 22
8 30 23
9 1 0
9 1 1
9 1 2
9 1 3
9 1 4
9 1 5
9 1 6
9 1 7
9 1 8
9 1 9
9 1 10
9 1 11
9 1 12
9 1 13
9 1 14
9 1 15
9 1 16
9 1 17
9 1 18
9 1 19
9 1 20
9 1 21
9 1 22
9 1 23
