In [19]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [2]:
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import calendar

# Add the parent directory to the Python path
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import pytz
import requests

from src.config import RAW_DATA_DIR

def transform_ts_data_info_features_and_target(
    df, feature_col="rides", window_size=12, step_size=1
):
    """
    Transforms time series data for all unique location IDs into a tabular format.
    The first `window_size` rows are used as features, and the next row is the target.
    The process slides down by `step_size` rows at a time to create the next set of features and target.
    Feature columns are named based on their hour offsets relative to the target.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing time series data with 'pickup_hour' column.
        feature_col (str): The column name containing the values to use as features and target (default is "rides").
        window_size (int): The number of rows to use as features (default is 12).
        step_size (int): The number of rows to slide the window by (default is 1).

    Returns:
        tuple: (features DataFrame with pickup_hour, targets Series, complete DataFrame)
    """
    # Get all unique location IDs
    location_ids = df["pickup_location_id"].unique()
    # List to store transformed data for each location
    transformed_data = []

    # Loop through each location ID and transform the data
    for location_id in location_ids:
        try:
            # Filter the data for the given location ID
            location_data = df[df["pickup_location_id"] == location_id].reset_index(
                drop=True
            )

            # Extract the feature column and pickup_hour as NumPy arrays
            values = location_data[feature_col].values
            times = location_data["pickup_hour"].values

            # Ensure there are enough rows to create at least one window
            if len(values) <= window_size:
                raise ValueError("Not enough data to create even one window.")

            # Create the tabular data using a sliding window approach
            rows = []
            for i in range(0, len(values) - window_size, step_size):
                # The first `window_size` values are features, and the next value is the target
                features = values[i : i + window_size]
                target = values[i + window_size]
                # Get the corresponding target timestamp
                target_time = times[i + window_size]
                # Combine features, target, location_id, and timestamp
                row = np.append(features, [target, location_id, target_time])
                rows.append(row)

            # Convert the list of rows into a DataFrame
            feature_columns = [
                f"{feature_col}_t-{window_size - i}" for i in range(window_size)
            ]
            all_columns = feature_columns + [
                "target",
                "pickup_location_id",
                "pickup_hour",
            ]
            transformed_df = pd.DataFrame(rows, columns=all_columns)

            # Append the transformed data to the list
            transformed_data.append(transformed_df)

        except ValueError as e:
            print(f"Skipping location_id {location_id}: {str(e)}")

    # Combine all transformed data into a single DataFrame
    if not transformed_data:
        raise ValueError(
            "No data could be transformed. Check if input DataFrame is empty or window size is too large."
        )

    final_df = pd.concat(transformed_data, ignore_index=True)

    # Extract features (including pickup_hour), targets, and keep the complete DataFrame
    features = final_df[feature_columns + ["pickup_hour", "pickup_location_id"]]
    targets = final_df["target"]



    return features, targets

In [5]:
from scipy.fft import fft, fftfreq

In [6]:
import hopsworks

project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME, api_key_value=config.HOPSWORKS_API_KEY
)

feature_store = project.get_feature_store()
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION
)

# Create a feature view if it doesn't already exist
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all(),
    )
    print(f"Feature view '{config.FEATURE_VIEW_NAME}' (version {config.FEATURE_VIEW_VERSION}) created successfully.")
except Exception as e:
    print(f"Error creating feature view: {e}")

# Retrieve the feature view
try:
    feature_view = feature_store.get_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
    )
    print(f"Feature view '{config.FEATURE_VIEW_NAME}' (version {config.FEATURE_VIEW_VERSION}) retrieved successfully.")
except Exception as e:
    print(f"Error retrieving feature view: {e}")

ts_data, _ = feature_view.training_data(
    description="Time-series hourly taxi rides"
)

ts_data = ts_data.sort_values(["pickup_location_id", "pickup_hour"]).reset_index(drop=True)

  from .autonotebook import tqdm as notebook_tqdm


2025-03-01 15:37:00,283 INFO: Initializing external client
2025-03-01 15:37:00,284 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-01 15:37:03,215 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212635
Error creating feature view: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/1212635/featurestores/1200268/featureview). Server response: 
HTTP code: 400, HTTP reason: Bad Request, body: b'{"errorCode":270179,"usrMsg":"Feature view: time_series_hourly_feature_view, version: 1","errorMsg":"The provided feature view name and version already exists"}', error code: 270179, error msg: The provided feature view name and version already exists, user msg: Feature view: time_series_hourly_feature_view, version: 1
Feature view 'time_series_hourly_feature_view' (version 1) retrieved successfully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (14.83s) 




In [7]:
ts_data["pickup_hour"] = pd.to_datetime(ts_data["pickup_hour"], errors="coerce")
ts_data["pickup_hour"] = ts_data["pickup_hour"].dt.tz_localize(None)  # Remove timezone

In [8]:
from src.data_utils import transform_ts_data_info_features_and_target

features, targets = transform_ts_data_info_features_and_target(ts_data, window_size=24*28, step_size=23)

In [9]:
ts_data

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2023-01-01 00:00:00,2,0
1,2023-01-01 01:00:00,2,0
2,2023-01-01 02:00:00,2,0
3,2023-01-01 03:00:00,2,0
4,2023-01-01 04:00:00,2,0
...,...,...,...
4925594,2025-03-01 16:00:00,263,168
4925595,2025-03-01 17:00:00,263,216
4925596,2025-03-01 18:00:00,263,237
4925597,2025-03-01 19:00:00,263,170


In [10]:
features

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,2023-01-29 00:00:00,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-01-29 23:00:00,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-01-30 22:00:00,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-01-31 21:00:00,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-02-01 20:00:00,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206721,136,106,115,98,95,63,42,13,6,3,...,142,88,101,107,101,130,111,90,2025-02-25 17:00:00,263
206722,101,108,131,105,101,92,79,53,21,13,...,145,139,124,97,100,99,147,109,2025-02-26 16:00:00,263
206723,108,120,120,125,126,133,96,86,54,42,...,164,173,147,137,113,123,114,141,2025-02-27 15:00:00,263
206724,120,127,115,147,163,164,137,125,135,113,...,57,146,160,131,98,108,129,120,2025-02-28 14:00:00,263


In [11]:
# Apply FFT to each feature (column)
fft_features = []
col_names = []
idx = 0
for column in features.columns[0:672]:  # Skip the time or index column
    col_names.append(f"magnitude_t-{672-idx}")
    idx += 1
    fft_result = np.fft.fft(features[column])
    fft_magnitude = np.abs(fft_result)

    frequencies = np.fft.fftfreq(len(features[column]), d=1)  # assuming hourly data

    # Store the FFT magnitudes for each feature
    fft_features.append(fft_magnitude)

# Convert FFT features to a DataFrame (if necessary)
fft_features_df = pd.DataFrame(fft_features).T  # Each row will have FFT magnitude
fft_features_df.columns = col_names

In [12]:
len(fft_features[0])

206726

In [13]:
fft_features_df

Unnamed: 0,magnitude_t-672,magnitude_t-671,magnitude_t-670,magnitude_t-669,magnitude_t-668,magnitude_t-667,magnitude_t-666,magnitude_t-665,magnitude_t-664,magnitude_t-663,...,magnitude_t-10,magnitude_t-9,magnitude_t-8,magnitude_t-7,magnitude_t-6,magnitude_t-5,magnitude_t-4,magnitude_t-3,magnitude_t-2,magnitude_t-1
0,3.539675e+06,3.533770e+06,3.526683e+06,3.514628e+06,3.509963e+06,3.497010e+06,3.494242e+06,3.488696e+06,3.475090e+06,3.486084e+06,...,3.530014e+06,3.531663e+06,3.537916e+06,3.551135e+06,3.548870e+06,3.543154e+06,3.543484e+06,3.540947e+06,3.535641e+06,3.536427e+06
1,8.052157e+05,8.033336e+05,8.013300e+05,7.978235e+05,7.945220e+05,7.923725e+05,7.883765e+05,7.931467e+05,7.895434e+05,7.914975e+05,...,7.930865e+05,7.954382e+05,8.005082e+05,8.084643e+05,8.074613e+05,8.067887e+05,8.059661e+05,8.049636e+05,8.027543e+05,8.013005e+05
2,8.434705e+05,8.446293e+05,8.385159e+05,8.352248e+05,8.335815e+05,8.357059e+05,8.348507e+05,8.349654e+05,8.296359e+05,8.342519e+05,...,8.451751e+05,8.432160e+05,8.401663e+05,8.395022e+05,8.448455e+05,8.441469e+05,8.468133e+05,8.414927e+05,8.393537e+05,8.402155e+05
3,1.557531e+06,1.558114e+06,1.550816e+06,1.546341e+06,1.542680e+06,1.539685e+06,1.535847e+06,1.536154e+06,1.530236e+06,1.536856e+06,...,1.561072e+06,1.558481e+06,1.563193e+06,1.567732e+06,1.565108e+06,1.560828e+06,1.563947e+06,1.558585e+06,1.556732e+06,1.557180e+06
4,6.925566e+05,6.852768e+05,6.886921e+05,6.887779e+05,6.818361e+05,6.777058e+05,6.782140e+05,6.762782e+05,6.771227e+05,6.809865e+05,...,6.963269e+05,6.990283e+05,7.036423e+05,7.047857e+05,7.020454e+05,6.968070e+05,6.899742e+05,6.943891e+05,6.957043e+05,6.897349e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206721,5.691037e+05,5.707208e+05,5.679837e+05,5.663023e+05,5.651034e+05,5.650229e+05,5.655153e+05,5.646825e+05,5.638889e+05,5.686009e+05,...,5.658252e+05,5.664965e+05,5.607786e+05,5.625001e+05,5.685347e+05,5.697457e+05,5.709922e+05,5.680261e+05,5.665392e+05,5.678481e+05
206722,6.925566e+05,6.852768e+05,6.886921e+05,6.887779e+05,6.818361e+05,6.777058e+05,6.782140e+05,6.762782e+05,6.771227e+05,6.809865e+05,...,6.963269e+05,6.990283e+05,7.036423e+05,7.047857e+05,7.020454e+05,6.968070e+05,6.899742e+05,6.943891e+05,6.957043e+05,6.897349e+05
206723,1.557531e+06,1.558114e+06,1.550816e+06,1.546341e+06,1.542680e+06,1.539685e+06,1.535847e+06,1.536154e+06,1.530236e+06,1.536856e+06,...,1.561072e+06,1.558481e+06,1.563193e+06,1.567732e+06,1.565108e+06,1.560828e+06,1.563947e+06,1.558585e+06,1.556732e+06,1.557180e+06
206724,8.434705e+05,8.446293e+05,8.385159e+05,8.352248e+05,8.335815e+05,8.357059e+05,8.348507e+05,8.349654e+05,8.296359e+05,8.342519e+05,...,8.451751e+05,8.432160e+05,8.401663e+05,8.395022e+05,8.448455e+05,8.441469e+05,8.468133e+05,8.414927e+05,8.393537e+05,8.402155e+05


In [14]:
targets

0           0
1           0
2           0
3           0
4           0
         ... 
206721    122
206722    108
206723    124
206724    142
206725    209
Name: target, Length: 206726, dtype: int32

In [15]:
info_df = features[["pickup_hour", "pickup_location_id"]]

In [16]:
info_df

Unnamed: 0,pickup_hour,pickup_location_id
0,2023-01-29 00:00:00,2
1,2023-01-29 23:00:00,2
2,2023-01-30 22:00:00,2
3,2023-01-31 21:00:00,2
4,2023-02-01 20:00:00,2
...,...,...
206721,2025-02-25 17:00:00,263
206722,2025-02-26 16:00:00,263
206723,2025-02-27 15:00:00,263
206724,2025-02-28 14:00:00,263


In [21]:
final_fft = pd.concat([fft_features_df, info_df, targets], axis=1)

In [33]:
final_fft

Unnamed: 0,magnitude_t-672,magnitude_t-671,magnitude_t-670,magnitude_t-669,magnitude_t-668,magnitude_t-667,magnitude_t-666,magnitude_t-665,magnitude_t-664,magnitude_t-663,...,magnitude_t-7,magnitude_t-6,magnitude_t-5,magnitude_t-4,magnitude_t-3,magnitude_t-2,magnitude_t-1,pickup_hour,pickup_location_id,target
0,3.539675e+06,3.533770e+06,3.526683e+06,3.514628e+06,3.509963e+06,3.497010e+06,3.494242e+06,3.488696e+06,3.475090e+06,3.486084e+06,...,3.551135e+06,3.548870e+06,3.543154e+06,3.543484e+06,3.540947e+06,3.535641e+06,3.536427e+06,2023-01-29 00:00:00,2,0
1,8.052157e+05,8.033336e+05,8.013300e+05,7.978235e+05,7.945220e+05,7.923725e+05,7.883765e+05,7.931467e+05,7.895434e+05,7.914975e+05,...,8.084643e+05,8.074613e+05,8.067887e+05,8.059661e+05,8.049636e+05,8.027543e+05,8.013005e+05,2023-01-29 23:00:00,2,0
2,8.434705e+05,8.446293e+05,8.385159e+05,8.352248e+05,8.335815e+05,8.357059e+05,8.348507e+05,8.349654e+05,8.296359e+05,8.342519e+05,...,8.395022e+05,8.448455e+05,8.441469e+05,8.468133e+05,8.414927e+05,8.393537e+05,8.402155e+05,2023-01-30 22:00:00,2,0
3,1.557531e+06,1.558114e+06,1.550816e+06,1.546341e+06,1.542680e+06,1.539685e+06,1.535847e+06,1.536154e+06,1.530236e+06,1.536856e+06,...,1.567732e+06,1.565108e+06,1.560828e+06,1.563947e+06,1.558585e+06,1.556732e+06,1.557180e+06,2023-01-31 21:00:00,2,0
4,6.925566e+05,6.852768e+05,6.886921e+05,6.887779e+05,6.818361e+05,6.777058e+05,6.782140e+05,6.762782e+05,6.771227e+05,6.809865e+05,...,7.047857e+05,7.020454e+05,6.968070e+05,6.899742e+05,6.943891e+05,6.957043e+05,6.897349e+05,2023-02-01 20:00:00,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206721,5.691037e+05,5.707208e+05,5.679837e+05,5.663023e+05,5.651034e+05,5.650229e+05,5.655153e+05,5.646825e+05,5.638889e+05,5.686009e+05,...,5.625001e+05,5.685347e+05,5.697457e+05,5.709922e+05,5.680261e+05,5.665392e+05,5.678481e+05,2025-02-25 17:00:00,263,122
206722,6.925566e+05,6.852768e+05,6.886921e+05,6.887779e+05,6.818361e+05,6.777058e+05,6.782140e+05,6.762782e+05,6.771227e+05,6.809865e+05,...,7.047857e+05,7.020454e+05,6.968070e+05,6.899742e+05,6.943891e+05,6.957043e+05,6.897349e+05,2025-02-26 16:00:00,263,108
206723,1.557531e+06,1.558114e+06,1.550816e+06,1.546341e+06,1.542680e+06,1.539685e+06,1.535847e+06,1.536154e+06,1.530236e+06,1.536856e+06,...,1.567732e+06,1.565108e+06,1.560828e+06,1.563947e+06,1.558585e+06,1.556732e+06,1.557180e+06,2025-02-27 15:00:00,263,124
206724,8.434705e+05,8.446293e+05,8.385159e+05,8.352248e+05,8.335815e+05,8.357059e+05,8.348507e+05,8.349654e+05,8.296359e+05,8.342519e+05,...,8.395022e+05,8.448455e+05,8.441469e+05,8.468133e+05,8.414927e+05,8.393537e+05,8.402155e+05,2025-02-28 14:00:00,263,142


In [38]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    final_fft,
    cutoff_date=datetime(2024, 10, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(165880, 674)
(165880,)
(40846, 674)
(40846,)


In [42]:
def average_magnitudes_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
            f"magnitude_t-{7*24}",  # 1 week ago
            f"magnitude_t-{14*24}", # 2 weeks ago
            f"magnitude_t-{21*24}", # 3 weeks ago
            f"magnitude_t-{28*24}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_magnitudes_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

from sklearn.preprocessing import FunctionTransformer

add_feature_average_magnitudes_last_4_weeks = FunctionTransformer(
    average_magnitudes_last_4_weeks, validate=False
)

In [46]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek

        return X_.drop(columns=["pickup_hour", "pickup_location_id"])

add_temporal_features = TemporalFeatureEngineer()

In [47]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_magnitudes_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

In [48]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

# Define the parameter grid for LGBMRegressor
param_distributions = {
    "lgbmregressor__num_leaves": [2, 50, 70, 256],
    "lgbmregressor__max_depth": [-1, 10, 20, 30],
    "lgbmregressor__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "lgbmregressor__n_estimators": [100, 200, 500, 1000],
    "lgbmregressor__min_child_samples": [10, 20, 30, 50],
    "lgbmregressor__subsample": [0.6, 0.8, 1.0],
    "lgbmregressor__colsample_bytree": [0.6, 0.8, 1.0],
    "lgbmregressor__reg_alpha": [0, 0.1, 0.5, 1.0],
    "lgbmregressor__reg_lambda": [0, 0.1, 0.5, 1.0],
    "lgbmregressor__feature_fraction": [0.6, 0.7, 0.8, 0.9, 1.0],
    "lgbmregressor__bagging_fraction": [0.6, 0.7, 0.8, 0.9, 1.0],
    "lgbmregressor__bagging_freq": [1, 5, 10],
}

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=5,  # Number of parameter settings sampled
    scoring="neg_mean_absolute_error",  # Use MAE as the scoring metric
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
)

# Fit the RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Get the best parameters and the best score
print("Best Parameters:", random_search.best_params_)
print("Best Score (Negative MAE):", random_search.best_score_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Test Set MAE:", mae)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.275227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171646
[LightGBM] [Info] Number of data points in the train set: 110586, number of used features: 675
[LightGBM] [Info] Start training from score 21.706798
[CV] END lgbmregressor__bagging_fraction=0.8, lgbmregressor__bagging_freq=10, lgbmregressor__colsample_bytree=1.0, lgbmregressor__feature_fraction=0.6, lgbmregressor__learning_rate=0.1, lgbmregressor__max_depth=20, lgbmregressor__min_child_samples=50, lgbmregressor__n_estimators=1000, lgbmregressor__num_leaves=2, lgbmregressor__reg_alpha=0.5, lgbmregressor__reg_lambda=0.1, lgbmregressor__subsample=0.8; total time=  30.1s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.298563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[Light