In [2]:
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq

# Period selecting which processed file to load.
year = 2023
month = 1

# File name pattern is ts_data_<year>_<zero-padded month>.parquet; the location
# is resolved relative to the current working directory.
path = Path("..", "data", "processed", f"ts_data_{year}_{month:02}.parquet")

# Read the parquet file with pyarrow, then convert it to a pandas DataFrame.
table = pq.read_table(path)
ts_data = table.to_pandas()

# Preview the first rows.
ts_data.head()

Unnamed: 0,start_hour,start_station_id,rides
0,2023-01-01 00:00:00,5905.14,0
1,2023-01-01 01:00:00,5905.14,5
2,2023-01-01 02:00:00,5905.14,7
3,2023-01-01 03:00:00,5905.14,3
4,2023-01-01 04:00:00,5905.14,2


In [3]:
import numpy as np

def transform_time_series_to_tabular(
    df: pd.DataFrame,
    location_id,
    feature_col: str = 'rides',
    window_size: int = 12,
    step_size: int = 1,
) -> pd.DataFrame:
    """
    Transforms time series data for a given location ID into a tabular format.

    Rows of `df` whose "start_station_id" equals `location_id` are selected (in
    their existing order). Each output row holds `window_size` consecutive
    values of `feature_col` as features, followed by the next value as the
    target. The window then slides down by `step_size` rows to build the next
    output row.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing time series data.
            Must have a "start_station_id" column and a `feature_col` column.
        location_id: The station ID to filter the data for. It is compared with
            `==` against the "start_station_id" column, so its type must match
            the values stored there (e.g. pass "5905.14" as a string if the
            column holds strings).
        feature_col (str): The column name containing the values to use as features and target (default is "rides").
        window_size (int): The number of rows to use as features (default is 12). Must be >= 1.
        step_size (int): The number of rows to slide the window by (default is 1). Must be >= 1.

    Returns:
        pd.DataFrame: A transformed DataFrame with columns "feature_1" ...
                      "feature_<window_size>" followed by "target".

    Raises:
        ValueError: If `window_size` or `step_size` is smaller than 1, if no
            rows match `location_id`, or if the matching rows are too few to
            build one window of features plus a target.
    """
    # Reject degenerate parameters up front: a negative step would otherwise
    # yield an empty range (and a silently empty result), and a non-positive
    # window would produce rows that do not match the column layout.
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size!r}.")
    if step_size < 1:
        raise ValueError(f"step_size must be a positive integer, got {step_size!r}.")

    # Filter the data and extract the feature column as a NumPy array
    location_mask = df["start_station_id"] == location_id
    values = df.loc[location_mask, feature_col].to_numpy()

    # A type mismatch between `location_id` and the column (e.g. float vs. str)
    # matches nothing; report that distinctly from "series too short".
    if len(values) == 0:
        raise ValueError(
            f"No rows found for start_station_id == {location_id!r}; "
            "check that the ID exists and that its type matches the column."
        )

    # Ensure one window can be created
    if len(values) <= window_size:
        raise ValueError("Not enough data to create even one window of features and target.")

    # Create tabular data using a sliding window: each row is one contiguous
    # slice of `window_size` features plus the value right after them (target).
    row_width = window_size + 1
    rows = [
        values[start:start + row_width]
        for start in range(0, len(values) - window_size, step_size)
    ]

    # Convert list of rows into a data frame
    column_names = [f"feature_{i+1}" for i in range(window_size)] + ["target"]
    return pd.DataFrame(rows, columns=column_names)

In [4]:
# Build the feature/target table for station "5905.14": each row uses 24
# consecutive values of "rides" as features and the following value as target.
features_targets = transform_time_series_to_tabular(
    ts_data,
    location_id="5905.14",
    feature_col="rides",
    window_size=24,
    step_size=1,
)
features_targets

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,target
0,0,5,7,3,2,1,2,0,2,2,...,19,18,5,12,3,6,3,11,4,2
1,5,7,3,2,1,2,0,2,2,4,...,18,5,12,3,6,3,11,4,2,0
2,7,3,2,1,2,0,2,2,4,4,...,5,12,3,6,3,11,4,2,0,0
3,3,2,1,2,0,2,2,4,4,10,...,12,3,6,3,11,4,2,0,0,0
4,2,1,2,0,2,2,4,4,10,9,...,3,6,3,11,4,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,28,23,10,7,7,3,1,0,0,0,...,19,8,9,22,15,20,28,34,34,22
716,23,10,7,7,3,1,0,0,0,1,...,8,9,22,15,20,28,34,34,22,22
717,10,7,7,3,1,0,0,0,1,3,...,9,22,15,20,28,34,34,22,22,12
718,7,7,3,1,0,0,0,1,3,4,...,22,15,20,28,34,34,22,22,12,17
