# Merge Processed Batches

This notebook merges all the individual processed batch files from Google Cloud Storage (GCS) into a single, final Parquet file. This creates a clean, unified dataset that will be the input for the modeling phase.

In [None]:
import pandas as pd
import gcsfs
from google.cloud import storage

In [None]:
# Bucket used for both the per-batch inputs and the merged output file.
GCS_BUCKET_NAME = "llm-feature-engineering-thesis-bucket"

# Object-name prefix (the "folder" in GCS) where the processed batches are stored.
PROCESSED_BATCHES_PREFIX = "processed_batches/"

# Object name of the merged Parquet file written to the bucket.
FINAL_OUTPUT_FILE = "final_llm_features_dataset.parquet"

In [None]:
# Colab sign-in: authenticate the notebook user before any GCS client is created.
from google.colab import auth

auth.authenticate_user()

# Two handles onto Cloud Storage, both created after authentication:
#   gcs            - filesystem-style interface (gcsfs)
#   storage_client - google-cloud-storage client
gcs = gcsfs.GCSFileSystem()
storage_client = storage.Client()

In [None]:
# Merge every processed batch into a single de-duplicated Parquet file.
#
# Steps: list batch files -> read each into a dataframe -> concatenate ->
# keep one row per (user_id, movie_id) -> write the result back to the bucket.

# Columns that together identify a unique row in the merged dataset.
DEDUP_KEYS = ['user_id', 'movie_id']

# List all processed batch files.
# Sort explicitly: drop_duplicates() keeps the FIRST occurrence of each key, so
# a fixed read order makes the surviving duplicate independent of listing order.
print(f"Listing files from gs://{GCS_BUCKET_NAME}/{PROCESSED_BATCHES_PREFIX}...")
batch_files = sorted(gcs.glob(f"gs://{GCS_BUCKET_NAME}/{PROCESSED_BATCHES_PREFIX}*.parquet"))
print(f"Found {len(batch_files)} batch files to merge.")

if not batch_files:
    print("No batch files found. Exiting.")
else:
    # Load all dataframes into a list (one concat at the end, not one per file)
    df_list = []
    for f in batch_files:
        print(f"Reading {f}...")
        with gcs.open(f, 'rb') as f_handle:
            df_list.append(pd.read_parquet(f_handle))

    # Concatenate into a single dataframe
    print("Concatenating all dataframes...")
    merged_df = pd.concat(df_list, ignore_index=True)

    # Fail early with a readable message if the key columns are absent.
    missing_keys = [col for col in DEDUP_KEYS if col not in merged_df.columns]
    if missing_keys:
        raise KeyError(
            f"Cannot deduplicate: merged data is missing key column(s) {missing_keys}. "
            f"Available columns: {list(merged_df.columns)}"
        )

    print(f"Total rows before deduplication: {len(merged_df)}")

    # Remove duplicate rows based on user_id and movie_id.
    # Build a new frame instead of mutating in place, and reset the index so the
    # gaps left by dropped rows do not leak into the saved file.
    final_df = merged_df.drop_duplicates(subset=DEDUP_KEYS).reset_index(drop=True)

    print(f"Total rows after deduplication: {len(final_df)}")

    # Save the final merged dataframe to GCS.
    # index=False: the index carries no information here, so keep it out of the
    # Parquet file (a non-range index would be written as an extra column).
    final_gcs_path = f'gs://{GCS_BUCKET_NAME}/{FINAL_OUTPUT_FILE}'
    print(f"Saving merged dataframe to {final_gcs_path}...")
    final_df.to_parquet(final_gcs_path, index=False)

    print("\nVerification")
    # Check the invariant the merge is meant to guarantee.
    assert not final_df.duplicated(subset=DEDUP_KEYS).any(), \
        "Duplicate (user_id, movie_id) rows remain after deduplication"
    print(f"Successfully created final dataset with {len(final_df)} rows.")
    print("Final Dataframe Info:")
    final_df.info()
    print("\nFirst 5 rows:")
    print(final_df.head())