In [1]:
#!/usr/bin/env python3
"""
clean_preprocess_to_hopsworks.py

1) Loads raw Citibike CSVs,
2) Cleans & feature-engineers them,
3) Filters to the top-K busiest start stations,
4) Writes out a local Parquet, AND
5) Pushes the DataFrame into Hopsworks Feature Store.
"""

import os
import glob
import pandas as pd
import hopsworks

# ──────────────────────────────────────────────────────────────────────────────
# LOCAL CONFIGURATION — update these to your paths
CSV_DIR     = "/Users/manu/Desktop/cda_final/data/processed/raw_citibike_csvs"
OUTPUT_DIR  = "/Users/manu/Desktop/cda_final/data/processed/cleaned_citibike"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "citibike_2023_top3.parquet")
TOP_K       = 3
# ──────────────────────────────────────────────────────────────────────────────

# Columns to load
COLUMNS = [
    "ride_id",
    "rideable_type",
    "started_at",
    "ended_at",
    "start_station_name",
    "start_station_id",
    "end_station_name",
    "end_station_id",
    "start_lat",
    "start_lng",
    "end_lat",
    "end_lng",
    "member_casual",
]

def load_all_csvs(csv_dir: str) -> pd.DataFrame:
    paths = glob.glob(os.path.join(csv_dir, "*.csv"))
    dfs = []
    for p in paths:
        df = pd.read_csv(
            p,
            usecols=COLUMNS,
            dtype={
                "ride_id": str,
                "rideable_type": "category",
                "start_station_id": str,
                "end_station_id": str,
                "member_casual": "category",
            },
        )
        print(f"Loaded {len(df):,} rows from {os.path.basename(p)}")
        dfs.append(df)
    combined = pd.concat(dfs, ignore_index=True)
    print(f"Total rows after concat: {len(combined):,}")
    return combined


def parse_and_engineer(df: pd.DataFrame) -> pd.DataFrame:
    df["started_at"] = pd.to_datetime(df["started_at"])
    df["ended_at"]   = pd.to_datetime(df["ended_at"])
    df["trip_duration_min"] = (
        (df["ended_at"] - df["started_at"])
        .dt.total_seconds().div(60).clip(lower=0)
    )
    df["start_hour"]      = df["started_at"].dt.hour
    df["start_dayofweek"] = df["started_at"].dt.dayofweek
    return df


def filter_top_stations(df: pd.DataFrame, k: int) -> pd.DataFrame:
    top = df["start_station_name"].value_counts().nlargest(k).index.tolist()
    print(f"Top {k} stations:\n  " + "\n  ".join(top))
    return df[df["start_station_name"].isin(top)].copy()


def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    before = len(df)
    df = df.drop_duplicates(subset=["ride_id"])
    df = df.dropna(subset=[
        "started_at",
        "ended_at",
        "start_station_name",
        "end_station_name",
        "trip_duration_min",
    ])
    df = df[df["trip_duration_min"] > 0]
    print(f"Dropped {before - len(df):,} invalid/duplicate rows")
    return df


def store_to_hopsworks(df: pd.DataFrame):
    project = hopsworks.login(
        host=os.getenv("HOPSWORKS_HOST", "c.app.hopsworks.ai"),
        project="ny_taxi_manognat",
        api_key_value=os.getenv("HOPSWORKS_API_KEY"),
    )
    fs = project.get_feature_store()
    fg = fs.get_or_create_feature_group(
        name="citibike_top3_trips",
        version=1,
        primary_key=["ride_id"],
        description="Cleaned Citibike 2023 trips for top-3 busiest start stations",
    )
    fg.insert(df, write_options={"wait_for_job": False})
    print("✅ Data inserted to Hopsworks feature store 'citibike_top3_trips'")


def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = load_all_csvs(CSV_DIR)
    df = parse_and_engineer(df)
    df = filter_top_stations(df, TOP_K)
    df = clean_df(df)
    df.to_parquet(OUTPUT_FILE, index=False)
    print(f"\n✓ Saved cleaned data ({len(df):,} rows) to {OUTPUT_FILE}\n")
    store_to_hopsworks(df)

if __name__ == "__main__":
    main()


Loaded 1,000,000 rows from 202312-citibike-tripdata_2.csv
Loaded 1,000,000 rows from 202301-citibike-tripdata_1.csv
Loaded 204,874 rows from 202312-citibike-tripdata_3.csv
Loaded 453,152 rows from 202305-citibike-tripdata_4.csv
Loaded 1,000,000 rows from 202312-citibike-tripdata_1.csv
Loaded 795,412 rows from 202301-citibike-tripdata_2.csv
Loaded 1,000,000 rows from 202305-citibike-tripdata_1.csv
Loaded 1,000,000 rows from 202305-citibike-tripdata_2.csv
Loaded 1,000,000 rows from 202305-citibike-tripdata_3.csv
Loaded 1,000,000 rows from 202306-citibike-tripdata_2.csv
Loaded 1,000,000 rows from 202309-citibike-tripdata_3.csv
Loaded 1,000,000 rows from 202309-citibike-tripdata_2.csv
Loaded 1,000,000 rows from 202306-citibike-tripdata_3.csv
Loaded 1,000,000 rows from 202306-citibike-tripdata_1.csv
Loaded 1,000,000 rows from 202309-citibike-tripdata_1.csv
Loaded 451,549 rows from 202306-citibike-tripdata_4.csv
Loaded 1,000,000 rows from 202311-citibike-tripdata_1.csv
Loaded 696,171 rows fr

Uploading Dataframe: 100.00% |█| Rows 365696/365696 | Elapsed Time: 00:27 | Rema


Launching job: citibike_top3_trips_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1215653/jobs/named/citibike_top3_trips_1_offline_fg_materialization/executions
✅ Data inserted to Hopsworks feature store 'citibike_top3_trips'


In [3]:
#!/usr/bin/env python3
"""
clean_preprocess_to_hopsworks.py

1) Loads raw Citibike CSVs,
2) Cleans & feature-engineers them,
3) Filters to the top-K busiest start stations,
4) Writes out a local Parquet, AND
5) Pushes the DataFrame into Hopsworks Feature Store.
"""

import os
import glob
import pandas as pd
import hopsworks

# ──────────────────────────────────────────────────────────────────────────────
# LOCAL CONFIGURATION — update these to your paths
CSV_DIR     = "/Users/manu/Desktop/cda_final/data/processed/raw_citibike_csvs"
OUTPUT_DIR  = "/Users/manu/Desktop/cda_final/data/processed/cleaned_citibike"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "citibike_2023_top3.parquet")
TOP_K       = 3
# ──────────────────────────────────────────────────────────────────────────────

# Columns to load
COLUMNS = [
    "ride_id",
    "rideable_type",
    "started_at",
    "ended_at",
    "start_station_name",
    "start_station_id",
    "end_station_name",
    "end_station_id",
    "start_lat",
    "start_lng",
    "end_lat",
    "end_lng",
    "member_casual",
]

# Hopsworks credentials
HOPSWORKS_PROJECT = "cda_finalproject"
HOPSWORKS_API_KEY = "6gzWAMNbeMfiWgiV.spXHHGk4dHXcQPVR11bSPrr2xSdpqzvj5SrtBolFJCb1V2wSu4v1nUYSB25aNPTt"


def load_all_csvs(csv_dir: str) -> pd.DataFrame:
    paths = glob.glob(os.path.join(csv_dir, "*.csv"))
    dfs = []
    for p in paths:
        df = pd.read_csv(
            p,
            usecols=COLUMNS,
            dtype={
                "ride_id": str,
                "rideable_type": "category",
                "start_station_id": str,
                "end_station_id": str,
                "member_casual": "category",
            },
        )
        print(f"Loaded {len(df):,} rows from {os.path.basename(p)}")
        dfs.append(df)
    combined = pd.concat(dfs, ignore_index=True)
    print(f"Total rows after concat: {len(combined):,}")
    return combined


def parse_and_engineer(df: pd.DataFrame) -> pd.DataFrame:
    df["started_at"] = pd.to_datetime(df["started_at"])
    df["ended_at"]   = pd.to_datetime(df["ended_at"])
    df["trip_duration_min"] = (
        (df["ended_at"] - df["started_at"])
        .dt.total_seconds().div(60).clip(lower=0)
    )
    df["start_hour"]      = df["started_at"].dt.hour
    df["start_dayofweek"] = df["started_at"].dt.dayofweek
    return df


def filter_top_stations(df: pd.DataFrame, k: int) -> pd.DataFrame:
    top = df["start_station_name"].value_counts().nlargest(k).index.tolist()
    print(f"Top {k} stations:\n  " + "\n  ".join(top))
    return df[df["start_station_name"].isin(top)].copy()


def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    before = len(df)
    df = df.drop_duplicates(subset=["ride_id"])
    df = df.dropna(subset=[
        "started_at",
        "ended_at",
        "start_station_name",
        "end_station_name",
        "trip_duration_min",
    ])
    df = df[df["trip_duration_min"] > 0]
    print(f"Dropped {before - len(df):,} invalid/duplicate rows")
    return df


def store_to_hopsworks(df: pd.DataFrame):
    # Login to Hopsworks with embedded credentials
    project = hopsworks.login(
        host="c.app.hopsworks.ai",
        project=HOPSWORKS_PROJECT,
        api_key_value=HOPSWORKS_API_KEY
    )
    print(f"✅ Logged into Hopsworks project: {project.name}")
    fs = project.get_feature_store()
    fg = fs.get_or_create_feature_group(
        name="citibike_top3_trips",
        version=1,
        primary_key=["ride_id"],
        description="Cleaned Citibike 2023 trips for top-3 busiest start stations",
        event_time="started_at",
        online_enabled=True
    )
    fg.insert(df, write_options={"wait_for_job": False})
    print("✅ Data inserted to Hopsworks feature store 'citibike_top3_trips'")


def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = load_all_csvs(CSV_DIR)
    df = parse_and_engineer(df)
    df = filter_top_stations(df, TOP_K)
    df = clean_df(df)
    df.to_parquet(OUTPUT_FILE, index=False)
    print(f"\n✓ Saved cleaned data ({len(df):,} rows) to {OUTPUT_FILE}\n")
    store_to_hopsworks(df)


if __name__ == "__main__":
    main()


Loaded 1,000,000 rows from 202312-citibike-tripdata_2.csv
Loaded 1,000,000 rows from 202301-citibike-tripdata_1.csv
Loaded 204,874 rows from 202312-citibike-tripdata_3.csv
Loaded 453,152 rows from 202305-citibike-tripdata_4.csv
Loaded 1,000,000 rows from 202312-citibike-tripdata_1.csv
Loaded 1,000,000 rows from 202305-citibike-tripdata_1.csv
Loaded 1,000,000 rows from 202305-citibike-tripdata_2.csv
Loaded 1,000,000 rows from 202305-citibike-tripdata_3.csv
Loaded 1,000,000 rows from 202306-citibike-tripdata_2.csv
Loaded 1,000,000 rows from 202309-citibike-tripdata_3.csv
Loaded 1,000,000 rows from 202309-citibike-tripdata_2.csv
Loaded 1,000,000 rows from 202306-citibike-tripdata_3.csv
Loaded 1,000,000 rows from 202306-citibike-tripdata_1.csv
Loaded 1,000,000 rows from 202309-citibike-tripdata_1.csv
Loaded 451,549 rows from 202306-citibike-tripdata_4.csv
Loaded 1,000,000 rows from 202311-citibike-tripdata_1.csv
Loaded 696,171 rows from 202302-citibike-tripdata_2.csv
Loaded 471,150 rows fr

Uploading Dataframe: 100.00% |█| Rows 346605/346605 | Elapsed Time: 00:26 | Rema


Launching job: citibike_top3_trips_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1231004/jobs/named/citibike_top3_trips_1_offline_fg_materialization/executions
✅ Data inserted to Hopsworks feature store 'citibike_top3_trips'


In [4]:
import hopsworks

# 1. Login (you’ve already done this)
project = hopsworks.login()

# 2. Get the Feature Store handle
fs = project.get_feature_store()

# 3. Retrieve your offline feature group (default version is 1)
fg = fs.get_feature_group(name="citibike_top3_trips", version=1)

# 4. Read it as a Pandas DataFrame
df = fg.read()
print(df.head())


2025-05-10 21:46:09,464 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 21:46:09,470 INFO: Initializing external client
2025-05-10 21:46:09,470 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-10 21:46:10,097 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215653
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (9.99s) 
            ride_id  rideable_type                       started_at  \
0  439DA3C765AD8DF2   classic_bike 2023-05-23 19:31:31.187000+00:00   
1  AB8C9E939FE8275B  electric_bike 2023-06-25 13:15:42.200000+00:00   
2  7D93A6F009E0D3BE   classic_bike 2023-10-25 08:54:27.573000+00:00   
3  A6F9CC7562535822   classic_bike 2023-11-02 12:57:54.247000+00:00   
4  8E5E51833C743C3D   classic_bike 2023-08-03 00:31:16.463000+00:00   

                          ended_at  start_station_name start_station_id  \
0 2023-05-23 19:34:58.165000+00:00     W 2