In [None]:
import pandas as pd
import pyarrow.parquet as pq
from google.cloud import storage

# Copy every Parquet file under the input folder to the output folder of the
# same bucket, renaming two columns on the way. The originals are not modified.

# Initialize Google Cloud Storage client
storage_client = storage.Client()

# Define the GCS bucket and folder paths (object-name prefixes inside the bucket)
bucket_name = 'project-batch-processing'
input_folder = 'processed_files/parquet_cleaned_data'
output_folder = 'processed_files/parquet_modified_data'

# Columns whose names contain parentheses, mapped to their replacement names.
# Built once here instead of on every loop iteration.
column_renames = {
    'Distance(mi)': 'distance_mi',
    'Temperature(F)': 'temperature_f',
}

# List with an explicit trailing slash so that sibling prefixes such as
# "<input_folder>_old/" are not matched by accident.
input_prefix = input_folder.rstrip('/') + '/'
blobs = storage_client.list_blobs(bucket_name, prefix=input_prefix)

# Process each file
for blob in blobs:
    # Skip anything that is not a Parquet file (e.g. folder placeholder objects)
    if not blob.name.endswith('.parquet'):
        continue

    input_uri = f"gs://{bucket_name}/{blob.name}"

    # Swap only the leading folder prefix. str.replace() would also rewrite
    # any later occurrence of the folder string inside the object name.
    relative_name = blob.name[len(input_prefix):]
    # NOTE: despite its name this is the object name inside the bucket,
    # not a full gs:// URI; the full URI is output_path below.
    output_uri = f"{output_folder.rstrip('/')}/{relative_name}"

    # Read the Parquet file into a DataFrame
    df = pd.read_parquet(input_uri)

    # Rename the problematic columns; with pandas' default errors='ignore',
    # columns missing from a given file are silently left as they are.
    df = df.rename(columns=column_renames)

    # Save the modified DataFrame back as Parquet
    output_path = f"gs://{bucket_name}/{output_uri}"
    df.to_parquet(output_path)

    print(f"Processed and saved {output_uri}")
