### Setup and authentication

In [None]:
# Install required packages
!pip install google-cloud-storage

# Authenticate
from google.colab import auth
auth.authenticate_user()

# Imports
from google.cloud import storage
import pandas as pd
import io

### Helper function to save a pandas DataFrame as a parquet file to the data bucket

In [None]:
# Install required packages if not already installed
!pip install google-cloud-storage pandas pyarrow

# Authenticate
from google.colab import auth
auth.authenticate_user()

# Imports
from google.cloud import storage
import pandas as pd
import io

def save_parquet_to_bucket(
    df: pd.DataFrame,
    project_id: str,
    destination_path: str,
    row_group_size: int = 750,
    environment: str = "dev"
):
    """
    Save a pandas DataFrame as a parquet file in the project's data bucket.

    The DataFrame is serialized in memory with the pyarrow engine and then
    uploaded to the bucket named ``{project_id}-{environment}-data``.

    Args:
        df: pandas DataFrame to save
        project_id: your GCP project ID
        destination_path: path within the bucket (e.g., 'raw-data/myfile.parquet')
        row_group_size: number of rows per group in parquet file (default: 750)
        environment: environment name (default: "dev")

    Raises:
        ValueError: if destination_path is empty or row_group_size is 0.
    """
    # Fail fast on arguments that can never succeed, before a storage client
    # is created or the DataFrame is serialized.
    if not destination_path:
        raise ValueError("destination_path must be a non-empty path within the bucket")
    if row_group_size == 0:
        raise ValueError("row_group_size must not be 0")

    # Initialize storage client and resolve the target bucket
    client = storage.Client(project=project_id)
    bucket_name = f"{project_id}-{environment}-data"
    bucket = client.bucket(bucket_name)

    # Create a blob reference
    blob = bucket.blob(destination_path)

    # Save DataFrame to parquet in memory with specified row group size
    parquet_buffer = io.BytesIO()
    df.to_parquet(
        parquet_buffer,
        row_group_size=row_group_size,
        engine='pyarrow'
    )

    # Upload to bucket. getvalue() returns the whole buffer regardless of the
    # current stream position, so no rewind is needed.
    blob.upload_from_string(
        parquet_buffer.getvalue(),
        content_type='application/octet-stream'
    )

    print(f"DataFrame saved as parquet to gs://{bucket_name}/{destination_path}")
    print(f"Row group size: {row_group_size}")


### Sample usage

In [None]:
# Target project for the upload
PROJECT_ID = "your-project-id"  # Replace with your project ID

# Build a 2000-row demo frame: an integer column plus a matching label column
df = pd.DataFrame({'column1': range(2000)})
df['column2'] = [f'value_{i}' for i in range(2000)]

# Upload the frame as parquet, 750 rows per row group
save_parquet_to_bucket(
    df,
    PROJECT_ID,
    "raw-data/my_data.parquet",
    row_group_size=750,
)


### List files in bucket

In [None]:
def list_files(bucket, prefix: str = None):
    """Print the name and size of every object in the bucket.

    Args:
        bucket: object exposing ``list_blobs(prefix=...)``; each listed item
            is read for its ``name`` and ``size`` attributes
        prefix: forwarded to ``list_blobs`` unchanged (default: None)
    """
    for entry in bucket.list_blobs(prefix=prefix):
        name, size = entry.name, entry.size
        print(f"- {name} ({size} bytes)")

# `bucket` only exists inside save_parquet_to_bucket, so build a handle to the
# same "<project>-dev-data" bucket (the helper's default environment) here.
client = storage.Client(project=PROJECT_ID)
bucket = client.bucket(f"{PROJECT_ID}-dev-data")

# List all files in raw-data/
list_files(bucket, prefix='raw-data/')

In [None]:
def read_dataframe(source_path: str, bucket) -> pd.DataFrame:
    """Read a CSV or parquet file from cloud storage into a DataFrame.

    Args:
        source_path: path of the object within the bucket. Paths ending in
            '.parquet' (case-insensitive) are parsed as parquet; every other
            path is parsed as CSV.
        bucket: bucket handle exposing ``blob(path)``

    Returns:
        The parsed file contents as a pandas DataFrame.
    """
    blob = bucket.blob(source_path)
    # download_as_string is a deprecated alias of download_as_bytes
    content = blob.download_as_bytes()
    buffer = io.BytesIO(content)

    # The save helper in this notebook writes parquet, so support reading it
    # back; CSV remains the default for any other extension.
    if source_path.lower().endswith('.parquet'):
        return pd.read_parquet(buffer)
    return pd.read_csv(buffer)

# `bucket` only exists inside save_parquet_to_bucket, so build a handle to the
# same "<project>-dev-data" bucket (the helper's default environment) here.
bucket = storage.Client(project=PROJECT_ID).bucket(f"{PROJECT_ID}-dev-data")

# Usage example:
# NOTE: the earlier cells only upload raw-data/my_data.parquet, so this CSV
# object must already exist in the bucket.
df = read_dataframe('raw-data/my_data.csv', bucket)
print(df.head())