# R2 Clickstream Data to Iceberg

This notebook reads clickstream data from R2 for a specific day and hour, then writes it to an Iceberg table.

In [None]:
import os
import pandas as pd
import pyarrow as pa
import sys
import json
# Install Boto3 if not already installed
!{sys.executable} -m pip install boto3
import boto3
# Install duckdb if not already installed
!{sys.executable} -m pip install duckdb
import duckdb
# Install PyIceberg if not already installed
!{sys.executable} -m pip install pyiceberg
from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.exceptions import NamespaceAlreadyExistsError

# Import our UUID helper functions
sys.path.append(os.path.join(os.getcwd()))
from uuid_fix import stringify_uuids, check_for_uuids, convert_uuids_in_dataframe

## Configure Environment

Read the environment variables that configure the R2 and Iceberg catalog connections, and warn about any that are missing.

In [None]:
# Connection settings are read from the environment so that no secrets are
# stored in the notebook itself.
r2_access_key = os.environ.get("R2_ACCESS_KEY_ID")
r2_secret_key = os.environ.get("R2_SECRET_ACCESS_KEY")
r2_endpoint = os.environ.get("R2_ENDPOINT")
r2_bucket = os.environ.get("R2_CLICKSTREAM_BUCKET", "analytics-pipeline")

warehouse = os.environ.get("WAREHOUSE")
token = os.environ.get("TOKEN")
catalog_uri = os.environ.get("CATALOG_URI")

# Key each value by the exact environment variable name it was read from, so
# the warnings below tell the user which variable to export.
required_r2 = {
    "R2_ENDPOINT": r2_endpoint,
    "R2_ACCESS_KEY_ID": r2_access_key,
    "R2_SECRET_ACCESS_KEY": r2_secret_key,
}
required_iceberg = {
    "WAREHOUSE": warehouse,
    "TOKEN": token,
    "CATALOG_URI": catalog_uri,
}

# Unset and empty values are both treated as missing.
missing_r2 = [name for name, value in required_r2.items() if not value]
missing_iceberg = [name for name, value in required_iceberg.items() if not value]

if missing_r2:
    print(f"Warning: Missing required R2 credentials: {', '.join(missing_r2)}")
if missing_iceberg:
    print(f"Warning: Missing required Iceberg credentials: {', '.join(missing_iceberg)}")

# Only non-secret settings are echoed.
print(f"R2 Bucket: {r2_bucket}")
print(f"Warehouse: {warehouse}")
print(f"Catalog URI: {catalog_uri}")

## Connect to R2 and Iceberg

Establish connections to R2 Storage and Iceberg catalog.

In [None]:
# R2 client. `s3_client` stays None whenever a client cannot be built, and
# later cells check for None before using it.
s3_client = None
try:
    # The endpoint is required as well as the keys: with endpoint_url=None
    # boto3 would fall back to its default AWS S3 endpoint instead of R2.
    if all([r2_endpoint, r2_access_key, r2_secret_key]):
        s3_client = boto3.client(
            's3',
            aws_access_key_id=r2_access_key,
            aws_secret_access_key=r2_secret_key,
            endpoint_url=r2_endpoint
        )
        print("Connected to R2 successfully!")
    else:
        print("Cannot connect to R2 - missing required credentials")
except Exception as e:
    print(f"R2 connection failed: {str(e)}")
    s3_client = None

# Iceberg REST catalog. `catalog` stays None whenever it cannot be built.
catalog = None
try:
    if all([warehouse, token, catalog_uri]):
        catalog = RestCatalog(
            name="my_catalog",
            warehouse=warehouse,
            uri=catalog_uri,
            token=token,
        )
        print("Connected to Iceberg catalog successfully!")
    else:
        print("Cannot connect to Iceberg catalog - missing required credentials")
except Exception as e:
    print(f"Iceberg connection failed: {str(e)}")
    catalog = None

## Setup Parameters

Configure which day and hour of data to process.

In [None]:
# Partition (day + hour) of clickstream data to ingest in this run.
event_date = "2025-05-11"  # Format: YYYY-MM-DD
event_hour = "01"          # Format: HH (24-hour)

# Object keys in the bucket are prefixed by date and hour.
s3_prefix = "event_date={}/hr={}/".format(event_date, event_hour)
print("Will process data in: " + s3_prefix)

# Destination table in the Iceberg catalog.
iceberg_namespace, iceberg_table = "default", "clickstream_events"

## Create Namespace (if needed)

Ensure the target namespace exists in the Iceberg catalog.

In [None]:
# Nothing to do without a catalog connection.
if catalog is not None:
    try:
        catalog.create_namespace(iceberg_namespace)
    except NamespaceAlreadyExistsError:
        # An existing namespace is the expected state on re-runs.
        print(f"'{iceberg_namespace}' namespace already exists")
    except Exception as exc:
        print(f"Error creating namespace: {str(exc)}")
    else:
        print(f"Created '{iceberg_namespace}' namespace")

## List Available S3 Objects

Check what data files are available for the selected day and hour.

In [None]:
def list_s3_objects(bucket, prefix):
    """List every object key under ``prefix`` in ``bucket``.

    Uses the module-level ``s3_client``.

    Args:
        bucket: Name of the bucket to list.
        prefix: Key prefix to restrict the listing to.

    Returns:
        A list of object keys (possibly empty) on success, or a string
        describing the problem when there is no client or the listing fails.
    """
    if s3_client is None:
        return "Not connected to R2"

    try:
        keys = []
        request = {"Bucket": bucket, "Prefix": prefix}
        # A single list_objects_v2 call returns only one page of results, so
        # keep following the continuation token until the listing is complete.
        while True:
            response = s3_client.list_objects_v2(**request)
            keys.extend(obj['Key'] for obj in response.get('Contents', []))

            next_token = response.get('NextContinuationToken')
            if not response.get('IsTruncated') or not next_token:
                break
            request['ContinuationToken'] = next_token
        return keys
    except Exception as e:
        return f"Error listing objects: {str(e)}"

s3_objects = list_s3_objects(r2_bucket, s3_prefix)

# Number of keys echoed before the listing is summarised.
preview_count = 5

if isinstance(s3_objects, list):
    print(f"Found {len(s3_objects)} objects in the specified path")
    if s3_objects:
        for obj in s3_objects[:preview_count]:
            print(f"- {obj}")
        if len(s3_objects) > preview_count:
            print(f"... and {len(s3_objects) - preview_count} more")
    else:
        print("No data files found for the specified date and hour")
else:
    # list_s3_objects returns a message string instead of a list on failure.
    print(s3_objects)

## Load Data from R2

Load the clickstream data from R2 into a pandas DataFrame.

In [None]:
# DuckDB database file; its parent directory is created on first use.
db_file = "/home/jovyan/duckdb_data/clickstream.db"
db_dir = os.path.dirname(db_file)
os.makedirs(db_dir, exist_ok=True)

# Shared connection used by read_s3_with_duckdb below.
con = duckdb.connect(db_file)

# Optional: Set some pragmas for better performance
for pragma in (
    "PRAGMA temp_directory='/home/jovyan/duckdb_data/temp'",
    "PRAGMA memory_limit='100GB'",
):
    con.execute(pragma)

def read_s3_with_duckdb(bucket, key):
    """Read one object from R2 into a pandas DataFrame using DuckDB.

    Uses the module-level DuckDB connection ``con`` and the R2 settings
    ``r2_endpoint``, ``r2_access_key`` and ``r2_secret_key``. The reader is
    chosen from the key's extension (case-insensitive): JSON / JSON-lines
    (optionally ``.gz``), Parquet, or delimited text (``.csv``/``.tsv``/``.txt``).

    Args:
        bucket: Bucket that holds the object.
        key: Object key inside the bucket.

    Returns:
        ``(dataframe, None)`` on success, or ``(None, error_message)`` when the
        file type is unsupported or the read fails.
    """
    def sql_literal(value):
        # Values are spliced into SQL string literals; doubling embedded single
        # quotes keeps a quote in a secret or object key from ending the
        # literal early.
        return str(value).replace("'", "''")

    json_suffixes = ('.json', '.jsonl', '.ndjson',
                     '.json.gz', '.jsonl.gz', '.ndjson.gz')
    delimited_suffixes = ('.csv', '.tsv', '.txt')

    try:
        # s3_endpoint is given as a bare host: drop the scheme and any
        # trailing slash.
        endpoint_host = r2_endpoint.replace('https://', '').rstrip('/')
        con.execute(f"""
            SET s3_region='auto';
            SET s3_endpoint='{sql_literal(endpoint_host)}';
            SET s3_access_key_id='{sql_literal(r2_access_key)}';
            SET s3_secret_access_key='{sql_literal(r2_secret_key)}';
        """)

        s3_url = f"s3://{bucket}/{key}"
        print(f"Reading {s3_url}")
        quoted_url = sql_literal(s3_url)

        suffix_probe = key.lower()
        if suffix_probe.endswith(json_suffixes):
            query = f"""
                SELECT * FROM read_json_auto('{quoted_url}', 
                    filename=true, 
                    maximum_object_size=3000000000,
                    ignore_errors=true)
            """
        elif suffix_probe.endswith('.parquet'):
            query = f"SELECT * FROM read_parquet('{quoted_url}')"
        elif suffix_probe.endswith(delimited_suffixes):
            query = f"SELECT * FROM read_csv_auto('{quoted_url}')"
        else:
            return None, f"Unsupported file type: {key}"

        result = con.execute(query).fetchdf()
        return result, None
    except Exception as e:
        return None, f"Error reading file with DuckDB: {str(e)}"

# Per-file frames plus success/failure counters for the summary below.
all_data = []
error_count = 0
valid_count = 0

# Stays None unless at least one file loads.
clickstream_data = None

if not (isinstance(s3_objects, list) and s3_objects):
    print("No data files to process")
else:
    for obj_key in s3_objects:
        print(f"Processing {obj_key} with DuckDB...")

        df, error = read_s3_with_duckdb(r2_bucket, obj_key)

        if df is None:
            print(f"Failed to load {obj_key}: {error}")
            error_count += 1
            continue

        row_count = len(df)
        print(f"Successfully loaded {row_count} rows from {obj_key}")
        all_data.append(df)
        valid_count += 1

    if not all_data:
        print("No data was loaded successfully")
    else:
        # Combine the per-file frames into one frame with a fresh index.
        clickstream_data = pd.concat(all_data, ignore_index=True)
        print(f"Loaded {len(clickstream_data)} total rows from {valid_count} files")
        print(f"Failed to load {error_count} files")

        print(clickstream_data.head())

## Process Data for Iceberg

Clean and prepare the data for Iceberg storage.

In [None]:
def process_clickstream_data(df):
    """Return a cleaned copy of the raw clickstream frame.

    The input is never modified. If a ``timestamp`` column is present with
    ``object`` dtype, it is parsed with ``pd.to_datetime``.

    Args:
        df: Raw clickstream DataFrame, or None.

    Returns:
        The processed copy, or None when ``df`` is None or has no rows.
    """
    if df is None:
        return None
    if len(df) == 0:
        return None

    cleaned = df.copy()

    needs_parsing = (
        'timestamp' in cleaned.columns
        and cleaned['timestamp'].dtype == 'object'
    )
    if needs_parsing:
        cleaned['timestamp'] = pd.to_datetime(cleaned['timestamp'])

    return cleaned

# `processed_data` is always defined after this cell; None means there is
# nothing to write downstream.
processed_data = None

if clickstream_data is not None:
    processed_data = process_clickstream_data(clickstream_data)
    if processed_data is not None:
        print(f"Data processed successfully: {len(processed_data)} rows")
        # A bare `.head()` expression inside an `if` block is discarded; only
        # the last top-level expression of a cell is displayed, so print it.
        print(processed_data.head())
    else:
        print("Data processing failed")

## Create Iceberg Table

Create a new Iceberg table if it doesn't exist yet.

In [None]:
def df_to_pyarrow(df):
    """Convert a processed DataFrame into a PyArrow table.

    Steps, in order:
      1. Pass the frame through ``convert_uuids_in_dataframe``
         (NOTE(review): presumably returns a copy with UUID values turned
         into strings; confirm in ``uuid_fix``).
      2. Render a datetime ``timestamp`` column as
         ``%Y-%m-%dT%H:%M:%S.%f`` text, converting timezone-aware values to
         naive UTC first.
      3. Render a datetime ``event_date`` column as ``%Y-%m-%d`` text.
      4. Serialise dict/list values in the semi-structured columns to JSON
         text.

    Args:
        df: DataFrame to convert, or None.

    Returns:
        A ``pyarrow.Table`` built without the index, or None when ``df`` is
        None or any step raises (the error and traceback are printed).
    """
    if df is None:
        return None

    def _as_json_text(value):
        # dict/list -> JSON; plain scalars and None pass through untouched;
        # anything else falls back to its str() form.
        if isinstance(value, (dict, list)):
            return json.dumps(value)
        if isinstance(value, (str, int, float, bool, type(None))):
            return value
        return str(value)

    try:
        frame = convert_uuids_in_dataframe(df)
        is_datetime = pd.api.types.is_datetime64_any_dtype

        if 'timestamp' in frame.columns and is_datetime(frame['timestamp']):
            print("Converting timestamp to ISO format string")
            stamps = frame['timestamp']
            if stamps.dt.tz is not None:
                # Normalise to UTC, then drop the timezone before formatting.
                stamps = stamps.dt.tz_convert('UTC').dt.tz_localize(None)
            frame['timestamp'] = stamps.dt.strftime('%Y-%m-%dT%H:%M:%S.%f')

        if 'event_date' in frame.columns:
            if not is_datetime(frame['event_date']):
                print("event_date is not a datetime, keeping as is")
            else:
                print("Converting event_date datetime to string format")
                frame['event_date'] = frame['event_date'].dt.strftime('%Y-%m-%d')

        json_columns = ('session_data', 'device_info', 'event_data', 'raw_event')
        for col in json_columns:
            if col not in frame.columns:
                continue
            print(f"Converting column {col} to JSON strings")
            frame[col] = frame[col].apply(_as_json_text)

        return pa.Table.from_pandas(frame, preserve_index=False)
    except Exception as exc:
        import traceback
        print(f"Error converting to PyArrow: {str(exc)}")
        traceback.print_exc()
        return None

def create_iceberg_table(catalog, namespace, table_name, schema):
    """Create the Iceberg table if it does not exist yet.

    A new table is created from ``schema`` and then identity-partitioned by
    ``event_date`` and ``hr`` through a partition-spec update. The partition
    columns are not passed to ``create_table`` as a list of names because
    that argument expects a ``PartitionSpec`` object, not strings. Partition
    columns that are absent from the created table's schema are skipped and
    named in the returned message.

    Args:
        catalog: Iceberg catalog, or None when not connected.
        namespace: Namespace that holds the table.
        table_name: Name of the table.
        schema: Schema for a newly created table.

    Returns:
        A status message describing what happened; errors are reported in
        the message rather than raised.
    """
    if catalog is None:
        return "Not connected to Iceberg catalog"

    partition_columns = ("event_date", "hr")
    table_identifier = (namespace, table_name)

    try:
        if catalog.table_exists(table_identifier):
            # Loading confirms the existing table is readable; a failure is
            # reported through the except branch below.
            catalog.load_table(table_identifier)
            return f"Table already exists: {table_name}"

        table = catalog.create_table(table_identifier, schema=schema)

        available = set(table.schema().column_names)
        present = [col for col in partition_columns if col in available]
        skipped = [col for col in partition_columns if col not in available]

        if present:
            # The spec update is committed when the `with` block exits.
            with table.update_spec() as update:
                for col in present:
                    update.add_identity(col)

        if skipped:
            return (
                f"Created table: {table_name} "
                f"(partition columns not in schema, skipped: {', '.join(skipped)})"
            )
        return f"Created table: {table_name}"
    except Exception as e:
        return f"Error creating/loading table: {str(e)}"

# Define `arrow_table` on every path: the append cell reads it, and leaving
# it unassigned when there is no processed data would raise a NameError
# there (or silently reuse a value from an earlier run of the kernel).
arrow_table = None

if processed_data is None:
    print("No processed data available")
else:
    arrow_table = df_to_pyarrow(processed_data)
    if arrow_table is None:
        print("Failed to convert data to PyArrow Table")
    else:
        print("Data converted to PyArrow Table successfully")
        print(f"Schema: {arrow_table.schema}")

        table_result = create_iceberg_table(catalog, iceberg_namespace, iceberg_table, arrow_table.schema)
        print(table_result)

## Write Data to Iceberg Table

Write the processed clickstream data to the Iceberg table.

In [None]:
def append_to_iceberg_table(catalog, namespace, table_name, data):
    """Append ``data`` to an existing Iceberg table.

    Args:
        catalog: Iceberg catalog, or None when not connected.
        namespace: Namespace that holds the table.
        table_name: Name of the table.
        data: Rows to append, or None.

    Returns:
        A status message: success, a missing prerequisite (no catalog, no
        data, no table), or the text of any error raised while appending.
    """
    if catalog is None:
        return "Not connected to Iceberg catalog"
    if data is None:
        return "No data to append"

    try:
        ident = (namespace, table_name)
        if not catalog.table_exists(ident):
            return f"Table does not exist: {table_name}"

        catalog.load_table(ident).append(data)
        return f"Data appended to table: {table_name}"
    except Exception as exc:
        return f"Error appending data: {str(exc)}"

# Check `processed_data` first: `and` short-circuits, so `arrow_table` is
# never read when the earlier cell had no data to convert (in which case it
# may not have been defined, or may hold a stale value from a previous run).
if processed_data is not None and arrow_table is not None:
    append_result = append_to_iceberg_table(catalog, iceberg_namespace, iceberg_table, arrow_table)
    print(append_result)
else:
    print("No Arrow table available; skipping append")

## Verify Data in Iceberg Table

Query the Iceberg table to verify the data was written successfully.

In [None]:
def query_iceberg_table(catalog, namespace, table_name):
    """Scan the whole Iceberg table and return it as a pandas DataFrame.

    Args:
        catalog: Iceberg catalog, or None when not connected.
        namespace: Namespace that holds the table.
        table_name: Name of the table.

    Returns:
        The scanned rows converted with ``to_pandas()`` when the table has
        data; otherwise a status message (no catalog, no table, empty table,
        or the text of any error raised).
    """
    if catalog is None:
        return "Not connected to Iceberg catalog"

    try:
        ident = (namespace, table_name)
        if not catalog.table_exists(ident):
            return f"Table does not exist: {table_name}"

        arrow_rows = catalog.load_table(ident).scan().to_arrow()
        if len(arrow_rows) == 0:
            return "Table exists but has no data"
        return arrow_rows.to_pandas()
    except Exception as exc:
        return f"Error querying table: {str(exc)}"

result = query_iceberg_table(catalog, iceberg_namespace, iceberg_table)
if isinstance(result, pd.DataFrame):
    print(f"Retrieved {len(result)} rows from Iceberg table")
    # A bare `.head()` expression inside an `if` block is discarded; only the
    # last top-level expression of a cell is displayed, so print it.
    print(result.head())
else:
    # Anything other than a DataFrame is a status/error message.
    print(result)

## Filter Data by Partition

Query the Iceberg table with partition filters to verify partitioning works correctly.

In [None]:
def query_with_filters(catalog, namespace, table_name, filters):
    """Scan the Iceberg table with a conjunction of simple column filters.

    Each filter becomes a PyIceberg predicate and is applied with
    ``scan.filter``; this function relies on successive ``filter`` calls
    narrowing the same scan, so every filter must match.

    Args:
        catalog: Iceberg catalog, or None when not connected.
        namespace: Namespace that holds the table.
        table_name: Name of the table.
        filters: Iterable of ``(column, operator, value)`` triples. Supported
            operators: ``equals``, ``not_equals``, ``greater_than``,
            ``greater_than_or_equal``, ``less_than``, ``less_than_or_equal``,
            ``in`` and ``not_in`` (the last two take a collection as value).

    Returns:
        The matching rows converted with ``to_pandas()``, or a status message
        (no catalog, no table, no matching rows, or the text of any error,
        including an unsupported operator).
    """
    if catalog is None:
        return "Not connected to Iceberg catalog"

    try:
        # Imported here so the function carries its own dependency; any
        # import failure is reported through the except branch below.
        from pyiceberg.expressions import (
            EqualTo,
            GreaterThan,
            GreaterThanOrEqual,
            In,
            LessThan,
            LessThanOrEqual,
            NotEqualTo,
            NotIn,
        )

        predicate_types = {
            "equals": EqualTo,
            "not_equals": NotEqualTo,
            "greater_than": GreaterThan,
            "greater_than_or_equal": GreaterThanOrEqual,
            "less_than": LessThan,
            "less_than_or_equal": LessThanOrEqual,
            "in": In,
            "not_in": NotIn,
        }

        table_identifier = (namespace, table_name)
        if not catalog.table_exists(table_identifier):
            return f"Table does not exist: {table_name}"

        table = catalog.load_table(table_identifier)

        scan = table.scan()
        for col, op, val in filters:
            if op not in predicate_types:
                raise ValueError(f"Unsupported filter operator: {op!r}")
            scan = scan.filter(predicate_types[op](col, val))

        results = scan.to_arrow()
        if len(results) > 0:
            return results.to_pandas()
        return "No data matching filters"
    except Exception as e:
        return f"Error querying table: {str(e)}"

# Restrict the scan to the partition that was loaded above.
filters = [
    ("event_date", "equals", event_date),
    ("hr", "equals", event_hour)
]

filtered_results = query_with_filters(catalog, iceberg_namespace, iceberg_table, filters)
if isinstance(filtered_results, pd.DataFrame):
    print(f"Retrieved {len(filtered_results)} rows matching filters")
    # A bare `.head()` expression inside an `if` block is discarded; only the
    # last top-level expression of a cell is displayed, so print it.
    print(filtered_results.head())
else:
    # Anything other than a DataFrame is a status/error message.
    print(filtered_results)