In [1]:
from dotenv import load_dotenv
import os
# Read variables from a .env file into the process environment
# (presumably supplies the HOPSWORKS_* settings read via os.getenv later — confirm).
load_dotenv()

True

In [2]:
import sys
import os

# Add the parent directory of the current working directory to the Python path
# so that `src.*` imports resolve when the notebook runs from a subdirectory.
# The membership check keeps re-runs of this cell from appending duplicates.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [27]:
# Scratch check: range() excludes its stop value, so this prints 2022, 2023 and 2024.
years_to_show = range(2022, 2025)
for year in years_to_show:
    print(year)

2022
2023
2024


In [28]:
from datetime import datetime
import pandas as pd
from src.data_utils import load_and_process_taxi_data

# Inclusive range of years to load. Swap in the commented line below to
# follow the current calendar year instead of the pinned end year.
from_year = 2022
# to_year = datetime.now().year
to_year = 2024
print(f"Download raw data from {from_year} to {to_year}")

# Collect one processed frame per year and concatenate once at the end;
# a single concat avoids re-copying the accumulated rows on every iteration.
chunks = []
for year in range(from_year, to_year + 1):
    chunks.append(load_and_process_taxi_data(year))

# Concatenate all chunks at the end, renumbering the index from 0.
rides = pd.concat(chunks, ignore_index=True)
print("Data loading complete.")

Download raw data from 2022 to 2024
File already exists for 2022-01.
Loading data for 2022-01...
Total records: 2,463,931
Valid records: 2,415,141
Records dropped: 48,790 (1.98%)
Successfully processed data for 2022-01.
File already exists for 2022-02.
Loading data for 2022-02...
Total records: 2,979,431
Valid records: 2,921,118
Records dropped: 58,313 (1.96%)
Successfully processed data for 2022-02.
File already exists for 2022-03.
Loading data for 2022-03...
Total records: 3,627,882
Valid records: 3,551,986
Records dropped: 75,896 (2.09%)
Successfully processed data for 2022-03.
File already exists for 2022-04.
Loading data for 2022-04...
Total records: 3,599,920
Valid records: 3,522,113
Records dropped: 77,807 (2.16%)
Successfully processed data for 2022-04.
File already exists for 2022-05.
Loading data for 2022-05...
Total records: 3,588,295
Valid records: 3,509,056
Records dropped: 79,239 (2.21%)
Successfully processed data for 2022-05.
File already exists for 2022-06.
Loading dat

In [29]:
# Display the concatenated raw rides frame (pandas truncates the rendering to head and tail rows).
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68
...,...,...
112952043,2024-11-30 23:11:15,162
112952044,2024-11-30 23:49:30,132
112952045,2024-11-30 23:31:46,100
112952046,2024-11-30 23:41:21,42


In [30]:
# (rows, columns) of the concatenated rides frame.
rides.shape

(112952048, 2)

In [31]:
# Column dtypes and memory usage of the raw rides frame.
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112952048 entries, 0 to 112952047
Data columns (total 2 columns):
 #   Column              Dtype         
---  ------              -----         
 0   pickup_datetime     datetime64[us]
 1   pickup_location_id  int64         
dtypes: datetime64[us](1), int64(1)
memory usage: 1.7 GB


In [32]:
from src.data_utils import transform_raw_data_into_ts_data

# Convert the raw ride records into a time-series frame. Judging by the outputs below,
# the result has one row per (pickup_hour, pickup_location_id) with a `rides` count;
# the exact aggregation lives in src.data_utils — confirm there.
ts_data = transform_raw_data_into_ts_data(rides)

In [33]:
# (rows, columns) of the time-series frame.
ts_data.shape

(6645600, 3)

In [34]:
# Column dtypes and memory usage of the time-series frame before timezone handling.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6645600 entries, 0 to 6645599
Data columns (total 3 columns):
 #   Column              Dtype         
---  ------              -----         
 0   pickup_hour         datetime64[ns]
 1   pickup_location_id  int16         
 2   rides               int16         
dtypes: datetime64[ns](1), int16(2)
memory usage: 76.1 MB


In [35]:
# Display the time-series frame (pandas truncates the rendering to head and tail rows).
ts_data

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2022-01-01 00:00:00,2,0
1,2022-01-01 01:00:00,2,0
2,2022-01-01 02:00:00,2,0
3,2022-01-01 03:00:00,2,0
4,2022-01-01 04:00:00,2,0
...,...,...,...
6645595,2024-11-30 19:00:00,263,132
6645596,2024-11-30 20:00:00,263,97
6645597,2024-11-30 21:00:00,263,82
6645598,2024-11-30 22:00:00,263,91


In [36]:
# Attach the US/Eastern timezone to the naive hourly timestamps.
# - ambiguous=True: wall times repeated at the autumn DST change are treated as DST.
# - nonexistent='shift_forward': wall times skipped at the spring DST change are
#   moved forward to the first valid time.
# The tz check makes this cell safe to re-run: calling tz_localize on a column
# that is already tz-aware raises TypeError.
# NOTE(review): if ts_data holds a row for the skipped hour (02:00) as well as for
# 03:00 on spring-forward days, shift_forward makes them share the same
# (pickup_location_id, pickup_hour) key — verify key uniqueness before inserting.
if ts_data['pickup_hour'].dt.tz is None:
    ts_data['pickup_hour'] = ts_data['pickup_hour'].dt.tz_localize(
        tz='US/Eastern', ambiguous=True, nonexistent='shift_forward'
    )

In [37]:
# Check the pickup_hour dtype after the timezone localization step.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6645600 entries, 0 to 6645599
Data columns (total 3 columns):
 #   Column              Dtype                     
---  ------              -----                     
 0   pickup_hour         datetime64[ns, US/Eastern]
 1   pickup_location_id  int16                     
 2   rides               int16                     
dtypes: datetime64[ns, US/Eastern](1), int16(2)
memory usage: 76.1 MB


In [40]:
import hopsworks

# Hopsworks credentials are read from the environment; os.getenv yields None when a variable is unset.
project_name = os.getenv('HOPSWORKS_PROJECT_NAME')
api_key = os.getenv('HOPSWORKS_API_KEY')

# NOTE: confluent-kafka may need to be installed first (pip install confluent-kafka).
# Open the connection to the Hopsworks project.
login_kwargs = {"api_key_value": api_key, "project": project_name}
project = hopsworks.login(**login_kwargs)
print(f"Successfully connected to Hopsworks project: {project_name}")

2025-02-17 14:53:37,156 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-02-17 14:53:37,173 INFO: Initializing external client
2025-02-17 14:53:37,175 INFO: Base URL: https://c.app.hopsworks.ai:443


2025-02-17 14:53:38,162 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212635
Successfully connected to Hopsworks project: nolanphi


In [41]:
# Handle to the feature store of the logged-in Hopsworks project.
feature_store = project.get_feature_store()

In [42]:
# Name and version that identify the feature group fetched or created below.
FEATURE_GROUP_NAME = "time_series_hourly_feature_group"
FEATURE_GROUP_VERSION = 1

In [43]:
# Fetch the hourly time-series feature group, or create it if this name/version is not registered yet.
# Rows are keyed by location and hour; pickup_hour also serves as the event time.
feature_group_spec = {
    "name": FEATURE_GROUP_NAME,
    "version": FEATURE_GROUP_VERSION,
    "description": "Time-series data at hourly frequency",
    "primary_key": ["pickup_location_id", "pickup_hour"],
    "event_time": "pickup_hour",
}
feature_group = feature_store.get_or_create_feature_group(**feature_group_spec)

In [44]:
# Upload the time-series frame to the feature group.
# wait_for_job=False presumably lets the call return without blocking on the
# materialization job it launches — confirm against the Hopsworks documentation.
feature_group.insert(ts_data, write_options={"wait_for_job": False})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1212635/fs/1200268/fg/1400941


Uploading Dataframe: 100.00% |██████████| Rows 6645600/6645600 | Elapsed Time: 12:40 | Remaining Time: 00:00


Launching job: time_series_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1212635/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions


(Job('time_series_hourly_feature_group_1_offline_fg_materialization', 'SPARK'),
 None)

In [45]:
# Deep memory footprint of the raw rides frame, converted from bytes to mebibytes.
total_bytes = rides.memory_usage(deep=True).sum()
df_memory_mb = total_bytes / 1024 ** 2
print(f"DataFrame size: {df_memory_mb:.2f} MB")

DataFrame size: 2585.27 MB


In [46]:
# Re-inspect the dtypes and memory usage of the time-series frame.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6645600 entries, 0 to 6645599
Data columns (total 3 columns):
 #   Column              Dtype                     
---  ------              -----                     
 0   pickup_hour         datetime64[ns, US/Eastern]
 1   pickup_location_id  int16                     
 2   rides               int16                     
dtypes: datetime64[ns, US/Eastern](1), int16(2)
memory usage: 76.1 MB
