In [None]:
# Name of the Hopsworks project this notebook logs into (passed to hopsworks.login below).
HOPSWORKS_PROJECT_NAME = "taxi_dmd"

In [None]:
import os
from dotenv import load_dotenv
from src.paths import PARENT_DIR

# Read the key-value pairs stored in the .env file that sits in PARENT_DIR.
load_dotenv(dotenv_path=PARENT_DIR / '.env')

# Index os.environ directly (rather than .get) so a missing key raises
# KeyError in this cell instead of surfacing later at login time.
HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

In [None]:
from datetime import datetime
import pandas as pd
from src.data import load_raw_data

# Download window: every calendar year from `from_year` through the current year.
from_year = 2022
to_year = datetime.now().year
print(f'Downloading raw data from {from_year} to {to_year}')

# Fetch each year once and collect the frames in a list. Concatenating a single
# time at the end avoids re-copying every previously loaded row on each
# iteration, and avoids concatenating onto an empty seed DataFrame.
yearly_rides = [load_raw_data(year) for year in range(from_year, to_year + 1)]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# if the year range is ever empty. Each yearly frame keeps its own row index,
# exactly as the incremental concat did.
rides = pd.concat(yearly_rides) if yearly_rides else pd.DataFrame()

In [None]:
# Sanity check: report the total number of raw ride rows that were loaded.
print(f'{len(rides)=}')

In [None]:
from src.data import transform_raw_data_into_ts_data

# Convert the raw rides into the time-series table that is uploaded further down.
# NOTE(review): presumably one row per pickup location and hour — confirm in src.data.
ts_data = transform_raw_data_into_ts_data(rides)

In [None]:
# Display the column dtypes of the time-series data before it is written to the feature store.
ts_data.dtypes

In [None]:
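# NOTE(review): this conversion is currently disabled. The feature group created
# further down uses 'pickup_ts' both as a primary key and as its event time, so
# confirm that transform_raw_data_into_ts_data() already produces that column;
# if it does not, re-enable the two statements below.

# # string to datetime
# ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True)

# # add column with Unix epoch milliseconds
# ts_data['pickup_ts'] = ts_data['pickup_hour'].astype(int) // 10**6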

In [None]:
import hopsworks

In [None]:
# Log in to Hopsworks with the API key read from the environment and keep a
# handle to the target project.
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY
)

In [None]:
# Handle to the project's feature store; used below to fetch or create the feature group.
feature_store = project.get_feature_store()

In [None]:
# Name and version that together identify the feature group holding the hourly time series.
FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'
# NOTE(review): presumably bumped whenever the schema changes — confirm before reusing a version.
FEATURE_GROUP_VERSION = 2

In [None]:
# Fetch the feature group with this name/version, or declare it with the
# metadata below if it does not exist yet.
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description='Time-series data at hourly frequency',
    # Rows are keyed by (location, timestamp).
    # NOTE(review): ts_data is expected to carry both columns — confirm before inserting.
    primary_key=['pickup_location_id', 'pickup_ts'],
    # The same timestamp column is also declared as the event time.
    event_time='pickup_ts',
)

In [None]:
# Upload the time-series rows into the feature group.
# NOTE(review): 'wait_for_job': False presumably makes insert() return without
# waiting for the Hopsworks ingestion job to finish — confirm against the hsfs docs.
feature_group.insert(ts_data, write_options={'wait_for_job': False})