# 02 - Unity Catalog Connection Setup

This notebook configures Unity Catalog connections for SFTP data sources:
- Create Unity Catalog connections for source and target SFTP
- Test connections using AutoLoader
- Configure external locations

## 1. Load Configuration

In [None]:
# No imports needed - using built-in Databricks functionality

## 1. Configure Catalog and Schema

Set the catalog and schema names. These should match the values used in notebook 01.

In [None]:
# Create widgets for catalog and schema configuration
dbutils.widgets.text("catalog_name", "sftp_demo", "Catalog Name")
dbutils.widgets.text("schema_name", "default", "Schema Name")

# Get widget values
CATALOG_NAME = dbutils.widgets.get("catalog_name")
SCHEMA_NAME = dbutils.widgets.get("schema_name")

print(f"Catalog: {CATALOG_NAME}")
print(f"Schema: {SCHEMA_NAME}")

## 2. Load Configuration from Previous Setup

In [None]:
# Load configuration
config_df = spark.table(f"{CATALOG_NAME}.config.connection_params")
config_dict = {row.key: row.value for row in config_df.collect()}

# Get catalog and schema from config (verify they match)
catalog_name = config_dict.get("catalog_name", CATALOG_NAME)
schema_name = config_dict.get("schema_name", SCHEMA_NAME)

source_host = config_dict["source_host"]
source_username = config_dict["source_username"]
target_host = config_dict["target_host"]
target_username = config_dict["target_username"]
secret_scope = config_dict["secret_scope"]
ssh_key_secret = config_dict["ssh_key_secret"]

print("Configuration loaded successfully")
print(f"Catalog: {catalog_name}")
print(f"Schema: {schema_name}")
print(f"Secret scope: {secret_scope}")
print(f"SSH key secret: {ssh_key_secret}")

## 3. Create Unity Catalog Connection for Source SFTP

**Note:** This requires Databricks workspace admin privileges.

In [None]:
# Create source SFTP connection using SQL
spark.sql(f"""
CREATE CONNECTION IF NOT EXISTS {CATALOG_NAME}.source_sftp_connection
TYPE sftp
OPTIONS (
  host '{source_host}',
  port '22',
  username '{source_username}',
  privateKey SECRET ('{secret_scope}', '{ssh_key_secret}')
)
""")

print(f"Source SFTP connection created: {CATALOG_NAME}.source_sftp_connection")

## 4. Create Unity Catalog Connection for Target SFTP

In [None]:
# Create target SFTP connection using SQL
spark.sql(f"""
CREATE CONNECTION IF NOT EXISTS {CATALOG_NAME}.target_sftp_connection
TYPE sftp
OPTIONS (
  host '{target_host}',
  port '22',
  username '{target_username}',
  privateKey SECRET ('{secret_scope}', '{ssh_key_secret}')
)
""")

print(f"Target SFTP connection created: {CATALOG_NAME}.target_sftp_connection")

## 5. Verify Connections

In [None]:
# List all connections
connections_df = spark.sql(f"SHOW CONNECTIONS IN {CATALOG_NAME}")
display(connections_df)

## 6. Test Source Connection with AutoLoader

Read data from source SFTP using AutoLoader to verify the connection works.

In [None]:
# Test reading customers.csv from source SFTP
source_path = f"sftp://{source_host}/customers.csv"

customers_df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.connectionName", f"{CATALOG_NAME}.source_sftp_connection")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(source_path)
)

# Display schema
customers_df.printSchema()

# Write to temporary table for verification
(
    customers_df.writeStream
    .format("memory")
    .queryName("test_customers")
    .outputMode("append")
    .start()
)

print("Source SFTP connection verified successfully")

In [None]:
# Display sample data
display(spark.sql("SELECT * FROM test_customers LIMIT 10"))

## 7. Create Catalog and Schema for Pipeline

In [None]:
# Create catalog structure for DLT pipeline
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG_NAME}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.bronze")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.silver")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.gold")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{SCHEMA_NAME}")

print("Catalog structure created:")
print(f"  - {CATALOG_NAME}.bronze (raw data from source SFTP)")
print(f"  - {CATALOG_NAME}.silver (cleaned and validated data)")
print(f"  - {CATALOG_NAME}.gold (aggregated business-level data)")
print(f"  - {CATALOG_NAME}.{SCHEMA_NAME} (default schema)")

## 8. Create External Location for Checkpoints

In [None]:
# Create checkpoint location in DBFS
checkpoint_location = f"/dbfs/{CATALOG_NAME}/checkpoints"
dbutils.fs.mkdirs(checkpoint_location)

print(f"Checkpoint location created: {checkpoint_location}")

## 9. Grant Permissions (if needed)

Grant necessary permissions to use the connections in DLT pipelines.

In [None]:
# Grant USAGE on connections to all users (adjust as needed)
# Uncomment if you need to grant permissions:

# spark.sql(f"""
# GRANT USAGE ON CONNECTION {CATALOG_NAME}.source_sftp_connection 
# TO `account users`
# """)

# spark.sql(f"""
# GRANT USAGE ON CONNECTION {CATALOG_NAME}.target_sftp_connection 
# TO `account users`
# """)

print("Connection permissions configured")

## 10. Test Complete Data Flow

In [None]:
# Read orders.csv from source SFTP
orders_path = f"sftp://{source_host}/orders.csv"

orders_df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.connectionName", f"{CATALOG_NAME}.source_sftp_connection")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(orders_path)
)

# Display schema
orders_df.printSchema()

# Write to temporary table
(
    orders_df.writeStream
    .format("memory")
    .queryName("test_orders")
    .outputMode("append")
    .start()
)

print("Orders data loaded successfully")

In [None]:
# Display sample orders data
display(spark.sql("SELECT * FROM test_orders LIMIT 10"))

## Summary

Unity Catalog connection setup completed:
- ✓ Source SFTP connection created and tested
- ✓ Target SFTP connection created
- ✓ Catalog and schema structure created (bronze, silver, gold)
- ✓ AutoLoader successfully reading from source SFTP
- ✓ Checkpoint locations configured

Next step: Run notebook `03_dlt_pipeline.ipynb` to create and execute the Delta Live Tables pipeline