In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import src.config as config
import pandas as pd

In [3]:
import hopsworks

# connect to the project

project = hopsworks.login(
    project = config.HOPSWORKS_PROJECT_NAME,
    api_key_value= config.HOPSWORKS_API_KEY
)

# connect to the feature store

feature_store = project.get_feature_store()

# connect to the feature group

feature_group = feature_store.get_feature_group(
    name=config.FEATRURE_GROUP_NAME,
    version=config.FEATRURE_GROUP_VERSION
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/696468
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
# create a feature view (if it doesn't exit yet)
# This feature view daily uses on feature group, as the query is trivial

try :
    #create feature group if it does not exist
    feature_store.create_feature_view(
        name = config.FEATURE_VIEW_NAME,
        version = config.FEATRURE_VIEW_VERSION,
        query= feature_group.select_all()
    )

except :
        print("feature view already existed; skip creation")

# get feature view

feature_view = feature_store.get_feature_view(
      name = config.FEATURE_VIEW_NAME,
      version = config.FEATURE_VIEW_VERSION
)

feature view already existed; skip creation


In [5]:
ts_data, _ = feature_view.training_data(
    description= "time-series hourly taxi rides"
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (393.03s) 



In [6]:
ts_data.sort_values(by = ['pickup_location_id', 'pickup_hour'], inplace=True)
# ts_data['pickup_hour']=pd.to_datetime(ts_data['pickup_hour'] )
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
1311814,1970-01-01 00:28:33.895200+00:00,0,1
1254544,1970-01-01 00:28:33.898800+00:00,0,1
1338280,1970-01-01 00:28:33.902400+00:00,0,1
1300488,1970-01-01 00:28:33.906000+00:00,0,1
1264408,1970-01-01 00:28:33.909600+00:00,0,1
...,...,...,...
3218559,2024-02-29 19:00:00+00:00,1,265
4948023,2024-02-29 20:00:00+00:00,3,265
3020329,2024-02-29 21:00:00+00:00,1,265
717202,2024-02-29 22:00:00+00:00,2,265


In [7]:
from src.data import transform_ts_data_into_features_and_target

features, targets =  transform_ts_data_into_features_and_target(
    ts_data,
   input_seq_len=24*28, # ONE MONTH
   step_size = 23
   )

features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f"{features_and_target.shape=}")

100%|██████████| 263/263 [03:17<00:00,  1.33it/s]


features_and_target.shape=(216917, 675)


In [8]:
features_and_target['pickup_hour']=pd.to_datetime(features_and_target['pickup_hour'])

In [18]:
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# training data -> from Jan 2022 up untill 6 months ago
# test data -> last 3 months
cutoff_date = pd.to_datetime(date.today()- timedelta(days=28*6), utc=True)

print(f"{cutoff_date=}")

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name= "target_rides_next_hour"
)

print(f"{X_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_train.shape=}")
print(f"{y_test.shape=}")

cutoff_date=Timestamp('2023-12-25 00:00:00+0000', tz='UTC')
X_train.shape=(198507, 674)
X_test.shape=(18410, 674)
y_train.shape=(198507,)
y_test.shape=(18410,)


In [19]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna


from src.model import get_pipeline


def objective(trial:optuna.trial.Trial) -> float :
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit

    """

# pick hyper-parameters
    hyperparams = {
        'metric': 'mae',
        'verbose': -1,
        'num_leaves': trial.suggest_int("num_leaves", 2, 256),
        'feature_fraction': trial.suggest_float("feature_fraction", 0.2, 1.0),
        'bagging_fraction' : trial.suggest_float("bagging_fraction", 0.2, 1.0),
        'min_child_samples' : trial.suggest_int("min_child_samples", 3, 100)
    }

    tss = TimeSeriesSplit(n_splits= 5)
    scores = []

    for train_index,val_index in tss.split(X_train):

        #split data for training and validation

        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        # train the model

        pipeline =get_pipeline(**hyperparams)
        pipeline.fit(X_train_,y_train_)
        
        # evaluate the model

        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)


        # return the mean score

    return np.array(scores).mean()






In [20]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1)

[I 2024-06-10 14:32:02,402] A new study created in memory with name: no-name-c33b6675-c698-4c9c-89c8-0d7f858245c3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-06-10 14:32:57,904] Trial 0 finished with value: 3.28185846307582 and parameters: {'num_leaves': 14, 'feature_fraction': 0.8549122706287391, 'bagging_fraction': 0.4151201751240528, 'min_child_samples': 92}. Best is trial 0 with value: 3.28185846307582.


In [21]:
best_params = study.best_trial.params
print(f"{best_params=}")

best_params={'num_leaves': 14, 'feature_fraction': 0.8549122706287391, 'bagging_fraction': 0.4151201751240528, 'min_child_samples': 92}


In [22]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.035388 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171900
[LightGBM] [Info] Number of data points in the train set: 198507, number of used features: 676
[LightGBM] [Info] Start training from score 16.934577


In [23]:
predictions = pipeline.predict(X_test)
test_mae  = mean_absolute_error(y_test, predictions)
print(f"{test_mae=:.4f}")

test_mae=3.0583


In [24]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR/ "model.pkl")

['C:\\Users\\karthikeya\\taxi_demand_predictor\\models\\model.pkl']

In [25]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema= input_schema, output_schema= output_schema)

In [26]:
model_registry  = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name = "taxi_demad_predictor_next_hour",
    metrics= {"test_mae": test_mae},
    description = "LightGBM regressor with a bit of hyper-parameter tuning",
    input_example = X_train.sample(),
    model_schema  = model_schema
)

model_file_path = MODELS_DIR / "model.pkl"

# Convert the Path object to a string before passing it to the save method
model.save(str(model_file_path))

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/194418 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/3400 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/60849 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/696468/models/taxi_demad_predictor_next_hour/3


Model(name: 'taxi_demad_predictor_next_hour', version: 3)