In [1]:
import pandas as pd

ts_data = pd.read_parquet("../data/transformed/ts_data_2022_01.parquet")
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,11,4
1,2022-01-01 01:00:00,15,4
2,2022-01-01 02:00:00,26,4
3,2022-01-01 03:00:00,8,4
4,2022-01-01 04:00:00,9,4
...,...,...,...
191203,2022-01-31 19:00:00,0,176
191204,2022-01-31 20:00:00,0,176
191205,2022-01-31 21:00:00,0,176
191206,2022-01-31 22:00:00,0,176


In [2]:
# Filter the loaded data to keep only the records related to location 43.
# Reset the index to have a clean DataFrame.
ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == 43, :].reset_index(drop=True)
ts_data_one_location.head(15)

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,97,43
1,2022-01-01 01:00:00,60,43
2,2022-01-01 02:00:00,22,43
3,2022-01-01 03:00:00,8,43
4,2022-01-01 04:00:00,6,43
5,2022-01-01 05:00:00,5,43
6,2022-01-01 06:00:00,3,43
7,2022-01-01 07:00:00,10,43
8,2022-01-01 08:00:00,7,43
9,2022-01-01 09:00:00,19,43


In [3]:
def get_cutoff_indices(data: pd.DataFrame, n_features: int, step_size: int) -> list:
    """
    Calculate indices for slicing the DataFrame into subsequences for model training.
    
    Parameters:
    - data (pd.DataFrame): The DataFrame containing the time series data.
    - n_features (int): The number of features to include in each subsequence.
    - step_size (int): The step size to move the window for each new subsequence.
    
    Returns:
    - list: A list of tuples where each tuple contains the start, middle, and end indices of each subsequence.
    """
    
    # Determine the last valid index position in the DataFrame
    stop_position = len(data) - 1

    # Initialize the indices for the first, middle, and last positions of the subsequence
    subseq_first_idx = 0
    subseq_mid_idx = n_features
    subseq_last_idx = n_features + 1
    indices = []

    # Loop until the end index of the subsequence is within the DataFrame
    while subseq_last_idx <= stop_position:
        # Append the current set of indices as a tuple to the list
        indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx))
        
        # Update the indices for the next iteration based on the step size
        subseq_first_idx += step_size
        subseq_mid_idx += step_size
        subseq_last_idx += step_size

    return indices

In [4]:
n_features = 24
step_size = 1

indices = get_cutoff_indices(data=ts_data_one_location, n_features=n_features, step_size=step_size)
indices[:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]

In [5]:
import numpy as np

n_examples = len(indices)
x = np.ndarray(shape=(n_examples, n_features), dtype=np.float32)
y = np.ndarray(shape=(n_examples), dtype=np.float32)
pickup_hours = []
for i, idx in enumerate(indices):
    x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]["rides"].values
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
    pickup_hours.append(ts_data_one_location.iloc[idx[1]]["pickup_hour"])

  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values


In [7]:
print(f"{x.shape=}")
print(f"{x=}")
print(f"{pickup_hours[:5]=}")

x.shape=(719, 24)
x=array([[ 97.,  60.,  22., ...,  16.,  18.,   6.],
       [ 60.,  22.,   8., ...,  18.,   6.,   3.],
       [ 22.,   8.,   6., ...,   6.,   3.,   1.],
       ...,
       [ 28.,  16.,  13., ..., 102.,  66.,  61.],
       [ 16.,  13.,   8., ...,  66.,  61.,  73.],
       [ 13.,   8.,   1., ...,  61.,  73.,  33.]], dtype=float32)
pickup_hours[:5]=[Timestamp('2022-01-02 00:00:00'), Timestamp('2022-01-02 01:00:00'), Timestamp('2022-01-02 02:00:00'), Timestamp('2022-01-02 03:00:00'), Timestamp('2022-01-02 04:00:00')]


In [8]:
# Creating a DataFrame 'features_one_location' to store the features in a structured manner.
# Each column represents the ride counts of previous hours, labeled in descending order.
features_one_location = pd.DataFrame(
    x,  # The data to populate the DataFrame. It's the feature array 'x' created previously.
    columns=[f"rides_previous_{i+1}_hour" for i in reversed(range(n_features))]
    # Naming each column based on the previous hours in descending order.
)
features_one_location

Unnamed: 0,rides_previous_2_hour,rides_previous_2_hour.1,rides_previous_2_hour.2,rides_previous_2_hour.3,rides_previous_2_hour.4,rides_previous_2_hour.5,rides_previous_2_hour.6,rides_previous_2_hour.7,rides_previous_2_hour.8,rides_previous_2_hour.9,...,rides_previous_2_hour.10,rides_previous_2_hour.11,rides_previous_2_hour.12,rides_previous_2_hour.13,rides_previous_2_hour.14,rides_previous_2_hour.15,rides_previous_2_hour.16,rides_previous_2_hour.17,rides_previous_2_hour.18,rides_previous_2_hour.19
0,97.0,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,...,70.0,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0
1,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,...,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0
2,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,...,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0
3,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,...,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0
4,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,77.0,...,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,52.0,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,...,78.0,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0
715,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,...,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0
716,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,...,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0
717,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,9.0,...,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0,73.0


In [9]:
# Creating a DataFrame 'targets_one_location' to store the target values (labels) in a structured manner.
# The DataFrame has a single column named 'target_rides_next_hour' that stores the ride counts to be predicted.
targets_one_location = pd.DataFrame(y, columns=[f"target_rides_next_hour"])
targets_one_location

Unnamed: 0,target_rides_next_hour
0,3.0
1,1.0
2,1.0
3,0.0
4,0.0
...,...
714,66.0
715,61.0
716,73.0
717,33.0


from tqdm import tqdm

def transform_ts_data_into_features_and_targets(
        ts_data: pd.DataFrame,
        input_seq_len: int,
        step_size: int
) -> pd.DataFrame:
    """
    Transforms the time series data into features and targets for model training.
    
    Parameters:
    - ts_data (pd.DataFrame): DataFrame containing the time series data.
    - input_seq_len (int): Number of previous hours to use for feature creation.
    - step_size (int): Step size to move the window to create subsequences.
    
    Returns:
    - features (pd.DataFrame): DataFrame containing the features.
    - targets (pd.Series): Series containing the target values.
    """
    
    # Ensuring the input DataFrame has the expected columns
    assert set(ts_data.columns) == {"pickup_hour", "rides", "pickup_location_id"}

    # Getting unique location IDs
    location_ids = ts_data["pickup_location_id"].unique()
    features = pd.DataFrame()
    targets = pd.DataFrame()

    # Looping through each location ID to process data location-wise
    for location_id in tqdm(location_ids):
        # Filtering the data for one specific location
        ts_data_one_location = ts_data.loc[
            ts_data.pickup_location_id == location_id, ["pickup_hour", "rides"]
        ]
        
        # Getting indices for slicing the data into subsequences
        indices = get_cutoff_indices(
            ts_data_one_location,
            input_seq_len,
            step_size
        )
        
        # Creating arrays to hold features and targets
        n_examples = len(indices)
        x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32)
        y = np.ndarray(shape=(n_examples), dtype=np.float32)
        pickup_hours = []

        # Populating the feature and target arrays
        for i, idx in enumerate(indices):
            x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]["rides"].values
            y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
            pickup_hours.append(ts_data_one_location.iloc[idx[1]]["pickup_hour"])

        # Creating DataFrames for features and targets
        features_one_location = pd.DataFrame(
            x,
            columns=[f"rides_previous_{i+1}_hour" for i in reversed(range(input_seq_len))]
        )
        features_one_location["pickup_hour"] = pickup_hours
        features_one_location["pickup_location_id"] = location_id
        
        targets_one_location = pd.DataFrame(y, columns=[f"target_rides_next_hour"])

        # Concatenating the features and targets of each location
        features = pd.concat([features, features_one_location])
        targets = pd.concat([targets, targets_one_location])
    
    # Resetting index for clean DataFrames
    features.reset_index(inplace=True, drop=True)
    targets.reset_index(inplace=True, drop=True)

    return features, targets["target_rides_next_hour"]


In [15]:
from tqdm import tqdm

def transform_ts_data_into_features_and_targets_optimized(
        ts_data: pd.DataFrame,
        input_seq_len: int,
        step_size: int
) -> (pd.DataFrame, pd.Series):
    """
    Transforms the time series data into features and targets for model training.
    
    Parameters:
    - ts_data (pd.DataFrame): DataFrame containing the time series data.
    - input_seq_len (int): Number of previous hours to use for feature creation.
    - step_size (int): Step size to move the window to create subsequences.
    
    Returns:
    - features (pd.DataFrame): DataFrame containing the features.
    - targets (pd.Series): Series containing the target values.
    """
    
    # Getting unique location IDs from the data
    location_ids = ts_data["pickup_location_id"].unique()
    all_features = []  # List to store features DataFrames for each location
    all_targets = []  # List to store targets Series for each location

    # Processing data per location
    for location_id in tqdm(location_ids):
        # Filtering data for one specific location
        ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == location_id, ["pickup_hour", "rides"]]
        
        # Getting indices for creating subsequences
        indices = get_cutoff_indices(ts_data_one_location, input_seq_len, step_size)
        
        # Pre-allocating numpy arrays for features and targets
        x = np.zeros((len(indices), input_seq_len))
        y = np.zeros(len(indices))
        pickup_hours = []  # List to store pickup hours corresponding to targets

        # Populating the feature and target arrays using calculated indices
        for i, (start, mid, end) in enumerate(indices):
            x[i, :] = ts_data_one_location.iloc[start:mid]["rides"].values
            y[i] = ts_data_one_location.iloc[mid:end]["rides"].values[0]
            pickup_hours.append(ts_data_one_location.iloc[mid]["pickup_hour"])
        
        # Creating a DataFrame for features and appending it to the list
        features_one_location = pd.DataFrame(x, columns=[f"rides_previous_{i+1}_hour" for i in reversed(range(input_seq_len))])
        features_one_location["pickup_hour"] = pickup_hours
        features_one_location["pickup_location_id"] = location_id
        
        # Creating a Series for targets and appending it to the list
        targets_one_location = pd.Series(y, name="target_rides_next_hour")
        
        all_features.append(features_one_location)
        all_targets.append(targets_one_location)
    
    # Concatenating all DataFrames and Series in the lists to get final features and targets
    features = pd.concat(all_features).reset_index(drop=True)
    targets = pd.concat(all_targets).reset_index(drop=True)

    return features, targets


In [16]:
features, targets = transform_ts_data_into_features_and_targets(
    ts_data=ts_data,
    input_seq_len=24*7*1,
    step_size=24,
)

print(f"{features.shape=}")
print(f"{targets.shape=}")

  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["rides"].values
  y[i] = t

features.shape=(6168, 170)
targets.shape=(6168,)



