In [1]:
from datetime import datetime
import polars as pl
import numpy as np
from pyarrow.dataset import write_dataset

In [2]:
# Fixed seed so that a fresh "Restart Kernel -> Run All" regenerates exactly the
# same synthetic values; change SEED to obtain a different (still repeatable) set.
SEED = 42
rng = np.random.default_rng(SEED)

In [3]:
# Read only the "WindSpeed" id column from the CSV, then attach two helper columns:
#   * "dummy"          - constant True, used by a later cell as a join key
#   * "init_wsp_value" - one log-normal draw per row, the starting value per id
df = pl.read_csv("external_ids.csv", columns=["WindSpeed"])
n_ids = df.height
df["dummy"] = pl.repeat(value=True, n=n_ids, eager=True)
initial_speeds = rng.lognormal(mean=2, sigma=0.5, size=n_ids)
df["init_wsp_value"] = pl.Series(initial_speeds)
df

WindSpeed,dummy,init_wsp_value
str,bool,f64
"""wsp1""",True,6.362013
"""wsp2""",True,6.241776
"""wsp3""",True,16.536649
"""wsp4""",True,3.355752
"""wsp5""",True,8.058262
"""wsp6""",True,4.023754
"""wsp7""",True,6.739918
"""wsp8""",True,22.248208


In [4]:
def add_timestamp_partition_columns(df):
    """Add "year", "month" and "day" columns derived from df["timestamp"].

    The frame is modified in place through item assignment; nothing is returned.

    Args:
        df: frame-like object with a "timestamp" column exposing a ``.dt``
            accessor that provides ``year()``, ``month()`` and ``day()``.
    """
    ts_accessor = df["timestamp"].dt
    for part in ("year", "month", "day"):
        # e.g. part == "year"  ->  df["year"] = df["timestamp"].dt.year()
        df[part] = getattr(ts_accessor, part)()

In [5]:
# Column order used both to sort each generated frame and as the partitioning
# keys of the parquet dataset written by the generation cell.
by = [
    "year",
    "month",
    "day",
    "id",
]

In [6]:
# Generate synthetic wind-speed series at a 10-second step, one hour at a time,
# for three calendar days of 2022, and write each finished day to the
# "timeseries_double" parquet dataset partitioned by `by`.
#
# Per hour: every id in `df` is paired with every timestamp of that hour, a
# normal(0, 1) step is drawn per row, steps are accumulated per id on top of the
# id's current "init_wsp_value", and the absolute value is kept. The last value
# of each id becomes its starting value for the next hour (stored back in `df`).


def _running_total(group):
    """Return the cumulative sum of one id group's "value" column as a frame."""
    return pl.DataFrame({"value": group["value"].cumsum()})


def _closing_value(group):
    """Return one id group's last "value" as a one-row "init_wsp_value" frame."""
    return pl.DataFrame({"init_wsp_value": group["value"].tail(1)})


# 2**30; the same value is passed for every row limit of write_dataset below.
row_cap = 1073741824

datapoint_counter = 0
for month, day in [(8, 29), (8, 30), (9, 1)]:
    out_wsp = []
    for hour in range(24):
        print(month, day, hour)

        # All timestamps of this hour, tagged with the constant join key.
        hour_start = datetime(2022, month, day, hour, 0, 0)
        hour_end = datetime(2022, month, day, hour, 59, 59)
        timestamps = pl.DataFrame(
            {"timestamp": pl.date_range(low=hour_start, high=hour_end, interval="10s")}
        )
        timestamps["dummy"] = pl.repeat(value=True, n=timestamps.height, eager=True)

        # One random step per (id, timestamp) pair.
        wind_speed_deltas = rng.normal(0, 1.0, size=df.height * timestamps.height)

        # Joining on the always-True "dummy" key pairs every id with every timestamp.
        wsp = (
            df[["init_wsp_value", "WindSpeed", "dummy"]]
            .join(timestamps, left_on=["dummy"], right_on=["dummy"])
            .drop("dummy")
            .rename({"WindSpeed": "id"})
        )

        wsp["value"] = wind_speed_deltas
        # NOTE(review): the group-by results are written back positionally, which
        # presumably relies on the joined rows being contiguous per id and in the
        # same id order as `df` - confirm for the polars version in use.
        wsp["value"] = wsp.groupby(["id"], maintain_order=True).apply(_running_total)["value"]
        wsp["value"] = (wsp["init_wsp_value"] + wsp["value"]).abs()

        # Carry each id's final value of this hour over as the next starting value.
        df["init_wsp_value"] = (
            wsp.groupby(["id"], maintain_order=True).apply(_closing_value)["init_wsp_value"]
        )

        wsp = wsp.drop("init_wsp_value")
        add_timestamp_partition_columns(wsp)
        wsp = wsp.sort(by)

        # Unique row label: "<id>_<running counter>", counter continuing across hours/days.
        counter_end = datapoint_counter + wsp.height
        counter_labels = pl.Series("counter", range(datapoint_counter, counter_end)).cast(str)
        wsp["datapoint_id"] = wsp.get_column("id") + "_" + counter_labels
        datapoint_counter = counter_end

        out_wsp.append(wsp)

    # One write per day: all 24 hourly frames concatenated into a single Arrow table.
    wsp_arrow = pl.concat(out_wsp, rechunk=True).to_arrow()
    write_dataset(
        wsp_arrow,
        format="parquet",
        base_dir="timeseries_double",
        partitioning=by,
        use_threads=False,
        existing_data_behavior="overwrite_or_ignore",
        min_rows_per_group=row_cap,
        max_rows_per_group=row_cap,
        max_partitions=10000,
        max_open_files=10000,
        max_rows_per_file=row_cap,
    )


8 29 0
8 29 1
8 29 2
8 29 3
8 29 4
8 29 5
8 29 6
8 29 7
8 29 8
8 29 9
8 29 10
8 29 11
8 29 12
8 29 13
8 29 14
8 29 15
8 29 16
8 29 17
8 29 18
8 29 19
8 29 20
8 29 21
8 29 22
8 29 23
8 30 0
8 30 1
8 30 2
8 30 3
8 30 4
8 30 5
8 30 6
8 30 7
8 30 8
8 30 9
8 30 10
8 30 11
8 30 12
8 30 13
8 30 14
8 30 15
8 30 16
8 30 17
8 30 18
8 30 19
8 30 20
8 30 21
8 30 22
8 30 23
9 1 0
9 1 1
9 1 2
9 1 3
9 1 4
9 1 5
9 1 6
9 1 7
9 1 8
9 1 9
9 1 10
9 1 11
9 1 12
9 1 13
9 1 14
9 1 15
9 1 16
9 1 17
9 1 18
9 1 19
9 1 20
9 1 21
9 1 22
9 1 23
