# Connecting to Iceberg Tables

This notebook demonstrates how to connect to and interact with Iceberg tables using PyIceberg.

In [None]:
# Import required libraries
import os
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
import sys
!{sys.executable} -m pip install pyiceberg
from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.exceptions import NamespaceAlreadyExistsError

## Connection Setup

Connect to the Cloudflare R2 Data Catalog using credentials from environment variables.

In [None]:
# Read the connection settings from the environment (None when a variable is unset)
warehouse, token, catalog_uri = (
    os.environ.get(var) for var in ("WAREHOUSE", "TOKEN", "CATALOG_URI")
)

# Echo the settings; for the token only report whether one was supplied, never its value
print(f"Warehouse: {warehouse}")
print(f"Catalog URI: {catalog_uri}")
print("Token: " + ("Provided" if token else "Not provided"))

# Warn (without stopping the notebook) when any required setting is empty or unset
if not (warehouse and token and catalog_uri):
    missing = [
        name
        for name, value in (
            ("WAREHOUSE", warehouse),
            ("TOKEN", token),
            ("CATALOG_URI", catalog_uri),
        )
        if not value
    ]
    print(f"Warning: Missing required environment variables: {', '.join(missing)}")

In [None]:
# Connect to R2 Data Catalog.
# `catalog` ends up as a RestCatalog on success and as None otherwise, so later
# cells can test `catalog is not None` before using it.
try:
    if not all([warehouse, token, catalog_uri]):
        # At least one credential is missing; do not attempt a connection
        catalog = None
        print("Cannot connect - missing required credentials")
    else:
        catalog = RestCatalog(
            name="my_catalog",
            warehouse=warehouse,
            uri=catalog_uri,
            token=token,
        )
        print("Connected to R2 Data Catalog successfully!")
except Exception as err:
    # Any failure while connecting is reported instead of raised
    catalog = None
    print(f"Connection failed: {err}")

## Namespace Management

Create a namespace if it doesn't exist.

In [None]:
# Make sure the 'default' namespace exists (skipped when there is no catalog connection)
if catalog is not None:
    try:
        catalog.create_namespace("default")
    except NamespaceAlreadyExistsError:
        # The namespace is already there; nothing to do
        print("'default' namespace already exists")
    except Exception as err:
        print(f"Error creating namespace: {err}")
    else:
        print("Created 'default' namespace")

## List Available Tables

List all tables in the default namespace.

In [None]:
# List tables in the default namespace (skipped when there is no catalog connection)
if catalog is not None:
    try:
        tables = catalog.list_tables("default")
        print("Available tables in 'default' namespace:")
        if tables:
            # list_tables() returns identifier tuples such as ("default", "employees"),
            # not objects with a `.name` attribute; the table name is the last element.
            for identifier in tables:
                print(f"- {identifier[-1]}")
        else:
            print("No tables found")
    except Exception as e:
        # Report the failure instead of raising so the rest of the notebook can run
        print(f"Error listing tables: {e}")

## Create Sample Data

Create a sample PyArrow table for demonstration.

In [None]:
# Sample rows, laid out column by column
sample_columns = {
    "id": [1, 2, 3, 4, 5],
    "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
    "score": [80.0, 92.5, 88.0, 74.5, 95.0],
    "department": ["Engineering", "Marketing", "Sales", "Engineering", "Product"],
}

# Wrap the columns in a PyArrow table; its schema is reused when creating the Iceberg table
sample_data = pa.table(sample_columns)

# Show the sample as a DataFrame (the last expression is the cell output)
sample_data.to_pandas()

## Create a New Table

Create a new Iceberg table using the sample data schema.

In [None]:
# Helper that makes sure a table exists in the 'default' namespace
def create_table(table_name):
    """Create the table ``("default", table_name)`` from the sample data schema if it is missing.

    Args:
        table_name: Name of the table inside the 'default' namespace.

    Returns:
        A status string saying the table was created, already existed, that
        there is no catalog connection, or describing the exception raised
        by a catalog call.
    """
    if catalog is None:
        return "Not connected to catalog"

    identifier = ("default", table_name)
    try:
        if catalog.table_exists(identifier):
            # The loaded table itself is not used; if loading fails, the
            # failure is reported through the error string below.
            catalog.load_table(identifier)
            return f"Table already exists: {table_name}"

        catalog.create_table(identifier, schema=sample_data.schema)
        return f"Created table: {table_name}"
    except Exception as err:
        return f"Error creating/loading table: {err}"

# Create a new table called 'employees'.
# create_table returns a status string; as the last expression in the cell it is
# what the notebook displays.
create_table("employees")

## Load a Table

Load a specific table and confirm that it loaded by printing its identifier.

In [None]:
# Helper that loads a table from the 'default' namespace
def load_table(table_name):
    """Load the table ``("default", table_name)`` from the catalog.

    Args:
        table_name: Name of the table inside the 'default' namespace.

    Returns:
        The loaded table object on success. Otherwise a string: no catalog
        connection, the table does not exist, or the text of the exception
        raised by a catalog call. Callers tell the two apart with
        ``isinstance(result, str)``.
    """
    if catalog is None:
        return "Not connected to catalog"

    identifier = ("default", table_name)
    try:
        if not catalog.table_exists(identifier):
            return f"Table does not exist: {table_name}"
        return catalog.load_table(identifier)
    except Exception as err:
        return f"Error loading table: {err}"

# Try to load the employees table.
# load_table returns a string when something went wrong, so print that message
# as-is; otherwise report the identifier of the table that was loaded.
employees_table = load_table("employees")
print(
    employees_table
    if isinstance(employees_table, str)
    else f"Table loaded: {employees_table.identifier}"
)

## Append Data to a Table

Append sample data to an existing table.

In [None]:
# Helper that appends rows to a table in the 'default' namespace
def append_to_table(table_name, data):
    """Append ``data`` to the existing table ``("default", table_name)``.

    Args:
        table_name: Name of the table inside the 'default' namespace.
        data: The data handed unchanged to the table's ``append`` method.

    Returns:
        A status string saying the data was appended, that there is no catalog
        connection, that the table does not exist, or describing the exception
        raised by a catalog or table call.
    """
    if catalog is None:
        return "Not connected to catalog"

    identifier = ("default", table_name)
    try:
        if not catalog.table_exists(identifier):
            return f"Table does not exist: {table_name}"

        catalog.load_table(identifier).append(data)
        return f"Data appended to table: {table_name}"
    except Exception as err:
        return f"Error appending data: {err}"

# Append the sample data to the employees table.
# The returned status string is displayed as the cell output.
# NOTE(review): every run of this cell calls append again, so re-running it
# presumably adds the same rows a second time - confirm before re-executing.
append_to_table("employees", sample_data)

## Query Table Data

Query and display data from a table.

In [None]:
# Helper that reads a whole table from the 'default' namespace
def query_table(table_name):
    """Scan the table ``("default", table_name)`` and return its rows.

    Args:
        table_name: Name of the table inside the 'default' namespace.

    Returns:
        The result of ``to_pandas()`` on the scanned Arrow data when the scan
        yields at least one row. Otherwise a string: no catalog connection,
        the table does not exist, the table has no data, or the text of the
        exception raised along the way.
    """
    if catalog is None:
        return "Not connected to catalog"

    identifier = ("default", table_name)
    try:
        if not catalog.table_exists(identifier):
            return f"Table does not exist: {table_name}"

        # Full, unfiltered scan materialised as Arrow data
        arrow_data = catalog.load_table(identifier).scan().to_arrow()
        if len(arrow_data) == 0:
            return "Table exists but has no data"
        return arrow_data.to_pandas()
    except Exception as err:
        return f"Error querying table: {err}"

# Query the employees table.
# The return value (the table's rows as a DataFrame, or a status string) is the
# last expression in the cell, so the notebook displays it.
query_table("employees")

## Drop a Table

Delete a table when it's no longer needed.

In [None]:
# Helper that removes a table from the 'default' namespace
def drop_table(table_name):
    """Drop the table ``("default", table_name)`` if it exists.

    Args:
        table_name: Name of the table inside the 'default' namespace.

    Returns:
        A status string saying the table was dropped, that there is no catalog
        connection, that the table does not exist, or describing the exception
        raised by a catalog call.
    """
    if catalog is None:
        return "Not connected to catalog"

    identifier = ("default", table_name)
    try:
        if not catalog.table_exists(identifier):
            return f"Table does not exist: {table_name}"

        catalog.drop_table(identifier)
        return f"Table dropped: {table_name}"
    except Exception as err:
        return f"Error dropping table: {err}"

# Uncomment to drop the employees table
# drop_table("employees")