# 02 - Unity Catalog Connection Setup

This notebook configures Unity Catalog connections for SFTP data sources:
- Create Unity Catalog connections for source and target SFTP
- Test connections using AutoLoader
- Configure external locations

## 1. Configure Catalog and Schema

Set the catalog and schema names. These should match the values used in notebook 01.

In [0]:
# Register the two text widgets (name, default value, label) that parameterize this notebook.
for _widget_name, _default_value, _label in (
    ("catalog_name", "sftp_demo", "Catalog Name"),
    ("schema_name", "default", "Schema Name"),
):
    dbutils.widgets.text(_widget_name, _default_value, _label)

# Read the current widget values into the constants used by the rest of the notebook.
CATALOG_NAME = dbutils.widgets.get("catalog_name")
SCHEMA_NAME = dbutils.widgets.get("schema_name")

# Echo the effective settings so the run output records what was used.
for _title, _value in (("Catalog", CATALOG_NAME), ("Schema", SCHEMA_NAME)):
    print(f"{_title}: {_value}")

## 2. Configure Connection Names

In [None]:
# Text widgets for this notebook: catalog/schema plus the names of the two SFTP connections.
# Each entry is (widget name, default value, label).
_widget_specs = [
    ("catalog_name", "sftp_demo", "Catalog Name"),
    ("schema_name", "default", "Schema Name"),
    ("source_connection_name", "source_sftp_connection", "Source Connection Name"),
    ("target_connection_name", "target_sftp_connection", "Target Connection Name"),
]
for _spec in _widget_specs:
    dbutils.widgets.text(*_spec)

# Resolve the widget values into the constants referenced by later cells.
CATALOG_NAME = dbutils.widgets.get("catalog_name")
SCHEMA_NAME = dbutils.widgets.get("schema_name")
SOURCE_CONNECTION_NAME = dbutils.widgets.get("source_connection_name")
TARGET_CONNECTION_NAME = dbutils.widgets.get("target_connection_name")

# Echo the effective settings.
_summary = (
    ("Catalog", CATALOG_NAME),
    ("Schema", SCHEMA_NAME),
    ("Source Connection", SOURCE_CONNECTION_NAME),
    ("Target Connection", TARGET_CONNECTION_NAME),
)
for _title, _value in _summary:
    print(f"{_title}: {_value}")

## 3. Load Configuration from Previous Setup

**Note:** Creating the Unity Catalog connections in the sections that follow requires the `CREATE CONNECTION` privilege on the metastore (for example, a metastore admin).

In [None]:
# Load the key/value connection parameters from the config table into a plain dict.
config_df = spark.table(f"{CATALOG_NAME}.config.connection_params")
config_dict = {row.key: row.value for row in config_df.collect()}

# Fail fast with a single actionable error listing every missing key, instead of a
# bare KeyError for whichever lookup happens to run first.
REQUIRED_CONFIG_KEYS = (
    "source_host",
    "source_username",
    "target_host",
    "target_username",
    "secret_scope",
    "ssh_key_secret",
)
missing_config_keys = [key for key in REQUIRED_CONFIG_KEYS if key not in config_dict]
if missing_config_keys:
    raise KeyError(
        f"Missing required keys in {CATALOG_NAME}.config.connection_params: "
        + ", ".join(missing_config_keys)
    )

# Optional values fall back to the widget settings when absent from the table.
catalog_name = config_dict.get("catalog_name", CATALOG_NAME)
schema_name = config_dict.get("schema_name", SCHEMA_NAME)
source_connection_name = config_dict.get("source_connection_name", SOURCE_CONNECTION_NAME)
target_connection_name = config_dict.get("target_connection_name", TARGET_CONNECTION_NAME)

# Required values (presence verified above).
source_host = config_dict["source_host"]
source_username = config_dict["source_username"]
target_host = config_dict["target_host"]
target_username = config_dict["target_username"]
secret_scope = config_dict["secret_scope"]
ssh_key_secret = config_dict["ssh_key_secret"]

print("Configuration loaded successfully")
print(f"Catalog: {catalog_name}")
print(f"Schema: {schema_name}")
print(f"Source Connection: {source_connection_name}")
print(f"Target Connection: {target_connection_name}")
print(f"Secret scope: {secret_scope}")
print(f"SSH key secret: {ssh_key_secret}")

# The following cells use the widget constants (CATALOG_NAME, SOURCE_CONNECTION_NAME, ...),
# not the values stored in the config table, so flag any disagreement rather than
# printing stored values that are silently ignored.
for _setting, _stored_value, _widget_value in (
    ("catalog_name", catalog_name, CATALOG_NAME),
    ("schema_name", schema_name, SCHEMA_NAME),
    ("source_connection_name", source_connection_name, SOURCE_CONNECTION_NAME),
    ("target_connection_name", target_connection_name, TARGET_CONNECTION_NAME),
):
    if _stored_value != _widget_value:
        print(
            f"WARNING: {_setting} is '{_stored_value}' in the config table, "
            f"but the widget value '{_widget_value}' is what the following cells use"
        )

## 4. Create Unity Catalog Connection for Source SFTP

In [None]:
# Switch the session to the working catalog before issuing the DDL below.
spark.sql(f"USE CATALOG {CATALOG_NAME}")

# DDL for the source SFTP connection.
# Unity Catalog expects the option names 'user' and 'pem_private_key'
# (not 'username' / 'privateKey'); both are supplied as SECRET(...) references
# so no credential text is embedded in the statement.
create_source_connection_sql = f"""
CREATE CONNECTION IF NOT EXISTS {SOURCE_CONNECTION_NAME}
TYPE sftp
OPTIONS (
  host '{source_host}',
  port '22',
  user SECRET ('{secret_scope}', 'source-username'),
  pem_private_key SECRET ('{secret_scope}', '{ssh_key_secret}')
)
"""
spark.sql(create_source_connection_sql)

print(f"Source SFTP connection created: {CATALOG_NAME}.{SOURCE_CONNECTION_NAME}")

## 5. Create Unity Catalog Connection for Target SFTP

In [None]:
# DDL for the target SFTP connection; host comes from the loaded configuration and
# the user / private key are supplied as SECRET(...) references into the secret scope.
create_target_connection_sql = f"""
CREATE CONNECTION IF NOT EXISTS {TARGET_CONNECTION_NAME}
TYPE sftp
OPTIONS (
  host '{target_host}',
  port '22',
  user SECRET ('{secret_scope}', 'target-username'),
  pem_private_key SECRET ('{secret_scope}', '{ssh_key_secret}')
)
"""
spark.sql(create_target_connection_sql)

print(f"Target SFTP connection created: {CATALOG_NAME}.{TARGET_CONNECTION_NAME}")

## 6. Test Source Connection with AutoLoader

Read data from source SFTP using AutoLoader to verify the connection works.

In [0]:
# Test reading customers.csv from source SFTP
source_path = f"sftp://{source_host}/customers.csv"

# NOTE(review): Auto Loader documents cloudFiles.schemaLocation as required for schema
# inference and cloudFiles.inferColumnTypes for typed CSV columns; neither is set
# here — confirm this read behaves as intended.
customers_df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    # Use the widget-configured connection name rather than a hard-coded one, so the
    # test exercises the connection this notebook actually created.
    .option("cloudFiles.connectionName", f"{CATALOG_NAME}.{SOURCE_CONNECTION_NAME}")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(source_path)
)

# Display schema
customers_df.printSchema()

# Write to an in-memory table for verification. availableNow processes the files
# that exist right now and then terminates the query on its own, so the stream
# does not keep running in the background after this cell.
customers_query = (
    customers_df.writeStream
    .format("memory")
    .queryName("test_customers")
    .outputMode("append")
    .trigger(availableNow=True)
    .start()
)

# start() is asynchronous: block until the query has finished so that a connection or
# read failure is raised here, before the success message is printed.
customers_query.awaitTermination()

print("Source SFTP connection verified successfully")

In [0]:
# If the "test_customers" query started by the previous cell is still running, wait
# until it has processed everything currently available and stop it. This avoids
# sampling an empty table and avoids leaving a stream running in the background.
for _active_query in spark.streams.active:
    if _active_query.name == "test_customers":
        _active_query.processAllAvailable()
        _active_query.stop()

# Display sample data.
# The stream definition that used to be repeated here was removed: starting a second
# query named "test_customers" while the first one is active is rejected by Spark.
display(spark.sql("SELECT * FROM test_customers LIMIT 10"))

In [0]:
# Make sure the catalog exists, then create one schema per pipeline layer plus the
# schema selected through the notebook widget.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG_NAME}")

# (schema name, description shown in the summary), in creation order.
_schema_layers = [
    ("bronze", "raw data from source SFTP"),
    ("silver", "cleaned and validated data"),
    ("gold", "aggregated business-level data"),
    (SCHEMA_NAME, "default schema"),
]

for _schema, _ in _schema_layers:
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{_schema}")

print("Catalog structure created:")
for _schema, _description in _schema_layers:
    print(f"  - {CATALOG_NAME}.{_schema} ({_description})")

## 8. Create Checkpoint Location in DBFS

In [0]:
# Directory for streaming checkpoints, namespaced by catalog.
# NOTE(review): dbutils.fs is documented to resolve scheme-less paths against the DBFS
# root, so the leading "/dbfs" here presumably becomes a folder inside DBFS
# (dbfs:/dbfs/<catalog>/checkpoints) rather than the local /dbfs mount — confirm this
# is the intended location before other notebooks depend on it.
checkpoint_location = "/".join(["/dbfs", CATALOG_NAME, "checkpoints"])

dbutils.fs.mkdirs(checkpoint_location)

print(f"Checkpoint location created: {checkpoint_location}")

## 9. Grant Permissions (if needed)

Grant necessary permissions to use the connections in DLT pipelines.

In [0]:
# Opt-in switch for granting access to the two SFTP connections. Left off by default
# so the notebook never changes permissions unless explicitly asked to.
GRANT_CONNECTION_PERMISSIONS = False

# Principal that receives the privilege when the switch is on (adjust as needed).
CONNECTION_GRANTEE = "account users"

if GRANT_CONNECTION_PERMISSIONS:
    # Connections are created above with unqualified names, and the Unity Catalog
    # privilege for using a connection is USE CONNECTION, so grant on those names.
    for _connection_name in (SOURCE_CONNECTION_NAME, TARGET_CONNECTION_NAME):
        spark.sql(
            f"GRANT USE CONNECTION ON CONNECTION {_connection_name} "
            f"TO `{CONNECTION_GRANTEE}`"
        )
        print(f"Granted USE CONNECTION on {_connection_name} to {CONNECTION_GRANTEE}")
    print("Connection permissions configured")
else:
    # Report honestly that nothing was changed instead of claiming success.
    print("Connection grants skipped (set GRANT_CONNECTION_PERMISSIONS = True to apply them)")

## 10. Test Complete Data Flow

In [0]:
# Read orders.csv from source SFTP
orders_path = f"sftp://{source_host}/orders.csv"

# NOTE(review): Auto Loader documents cloudFiles.schemaLocation as required for schema
# inference and cloudFiles.inferColumnTypes for typed CSV columns; neither is set
# here — confirm this read behaves as intended.
orders_df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    # Use the widget-configured connection name rather than a hard-coded one, so the
    # test exercises the connection this notebook actually created.
    .option("cloudFiles.connectionName", f"{CATALOG_NAME}.{SOURCE_CONNECTION_NAME}")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(orders_path)
)

# Display schema
orders_df.printSchema()

# Write to an in-memory table. availableNow processes the files that exist right now
# and then terminates the query on its own, so no stream is left running.
orders_query = (
    orders_df.writeStream
    .format("memory")
    .queryName("test_orders")
    .outputMode("append")
    .trigger(availableNow=True)
    .start()
)

# start() is asynchronous: block until the query has finished so that a read failure
# is raised here, before the success message is printed.
orders_query.awaitTermination()

print("Orders data loaded successfully")

In [0]:
# If the "test_orders" query started by the previous cell is still running, wait until
# it has processed everything currently available and stop it. This avoids sampling
# an empty table and avoids leaving a stream running in the background.
for _active_query in spark.streams.active:
    if _active_query.name == "test_orders":
        _active_query.processAllAvailable()
        _active_query.stop()

# Display sample orders data.
# The stream definition that used to be repeated here was removed: starting a second
# query named "test_orders" while the first one is active is rejected by Spark.
display(spark.sql("SELECT * FROM test_orders LIMIT 10"))