# 02 - Unity Catalog Connection Setup

This notebook configures Unity Catalog connections for SFTP data sources and demonstrates both reading and writing with SFTP.

**Note:** AutoLoader with SFTP is straightforward and built into Databricks. The main purpose of this repository is to demonstrate the **custom SFTPWriter** for writing data back to SFTP servers.

This notebook will:
- Create Unity Catalog SFTP connections (stores credentials securely)
- Verify AutoLoader can read from SFTP using simple URI format
- **Demonstrate custom SFTPWriter API for writing to SFTP**
- Create catalog and schema structure for the DLT pipeline

In [0]:
# Install the Python dependencies listed in requirements.txt, one directory above this notebook
%pip install -r ../requirements.txt
# Disabled alternative, kept for reference: quiet editable install of the repository package itself
#%pip install -q -e ../
# Restart the Python process after the install; variables defined before this point do not survive,
# so this cell has to run before any other cell in the notebook
dbutils.library.restartPython()

## 1. Load Configuration from Previous Setup

In [0]:
# Notebook parameters: each text widget carries a default so the notebook also runs unattended
dbutils.widgets.text("catalog_name", "sftp_demo", "Catalog Name")
dbutils.widgets.text("schema_name", "default", "Schema Name")
dbutils.widgets.text("source_connection_name", "source_sftp_connection", "Source Connection Name")
dbutils.widgets.text("target_connection_name", "target_sftp_connection", "Target Connection Name")

# Read the current widget values (in declaration order) into the constants used by later cells
CATALOG_NAME, SCHEMA_NAME, SOURCE_CONNECTION_NAME, TARGET_CONNECTION_NAME = map(
    dbutils.widgets.get,
    ("catalog_name", "schema_name", "source_connection_name", "target_connection_name"),
)

# Echo the effective parameters so the run output records what was used
print(
    "\n".join(
        [
            f"Catalog: {CATALOG_NAME}",
            f"Schema: {SCHEMA_NAME}",
            f"Source Connection: {SOURCE_CONNECTION_NAME}",
            f"Target Connection: {TARGET_CONNECTION_NAME}",
        ]
    )
)

In [0]:
# Read the key/value settings table and flatten it into a plain dict
# (if a key appears more than once, the last row wins)
config_df = spark.table(f"{CATALOG_NAME}.config.connection_params")
config_dict = dict((row.key, row.value) for row in config_df.collect())

# Optional settings: fall back to the widget values when the table has no entry
catalog_name = config_dict.get("catalog_name", CATALOG_NAME)
schema_name = config_dict.get("schema_name", SCHEMA_NAME)
source_connection_name = config_dict.get("source_connection_name", SOURCE_CONNECTION_NAME)
target_connection_name = config_dict.get("target_connection_name", TARGET_CONNECTION_NAME)

# Required settings: a missing key raises KeyError here rather than failing later
source_host = config_dict["source_host"]
source_username = config_dict["source_username"]
target_host = config_dict["target_host"]
target_username = config_dict["target_username"]
secret_scope = config_dict["secret_scope"]
ssh_key_secret = config_dict["ssh_key_secret"]
ssh_key_fingerprint = config_dict["ssh_key_fingerprint"]

# Summary of what was loaded; only the secret scope/key names are shown, never secret values
print(
    "\n".join(
        [
            "Configuration loaded successfully",
            f"Catalog: {catalog_name}",
            f"Schema: {schema_name}",
            f"Source Connection: {source_connection_name}",
            f"Target Connection: {target_connection_name}",
            f"Secret scope: {secret_scope}",
            f"SSH key secret: {ssh_key_secret}",
            f"SSH key fingerprint: {ssh_key_fingerprint}",
        ]
    )
)

## 2. Create Unity Catalog Connection for Source SFTP

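**Note:** This requires the `CREATE CONNECTION` privilege on the Unity Catalog metastore (metastore admins have it); workspace admin rights alone may not be sufficient.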

In [0]:
# Make the configured catalog the session's current catalog before issuing the DDL
spark.sql(f"USE CATALOG {CATALOG_NAME}")

# DDL for the source connection; the user name and private key are passed as
# SECRET(scope, key) references, so no secret value is embedded in the statement text
create_source_sql = f"""
CREATE CONNECTION IF NOT EXISTS {SOURCE_CONNECTION_NAME}
TYPE sftp
OPTIONS (
  host '{source_host}',
  port '22',
  user SECRET ('{secret_scope}', 'source-username'),
  pem_private_key SECRET ('{secret_scope}', '{ssh_key_secret}'),
  key_fingerprint '{ssh_key_fingerprint}'
)
"""

# Troubleshooting echo of the option values that go into the statement
print(
    "\n".join(
        [
            "Creating source SFTP connection with:",
            f"  host: {source_host}",
            "  port: 22",
            f"  user: SECRET('{secret_scope}', 'source-username')",
            f"  pem_private_key: SECRET('{secret_scope}', '{ssh_key_secret}')",
            f"  key_fingerprint: {ssh_key_fingerprint}",
        ]
    )
)

# IF NOT EXISTS makes this a no-op when the connection is already present
spark.sql(create_source_sql)

print(f"\n✓ Source SFTP connection created: {SOURCE_CONNECTION_NAME} (in catalog {CATALOG_NAME})")

## 3. Create Unity Catalog Connection for Target SFTP

In [0]:
# DDL for the target connection; the user name and private key are passed as
# SECRET(scope, key) references, so no secret value is embedded in the statement text
create_target_sql = f"""
CREATE CONNECTION IF NOT EXISTS {TARGET_CONNECTION_NAME}
TYPE sftp
OPTIONS (
  host '{target_host}',
  port '22',
  user SECRET ('{secret_scope}', 'target-username'),
  pem_private_key SECRET ('{secret_scope}', '{ssh_key_secret}'),
  key_fingerprint '{ssh_key_fingerprint}'
)
"""

# Troubleshooting echo of the option values that go into the statement
print(
    "\n".join(
        [
            "Creating target SFTP connection with:",
            f"  host: {target_host}",
            "  port: 22",
            f"  user: SECRET('{secret_scope}', 'target-username')",
            f"  pem_private_key: SECRET('{secret_scope}', '{ssh_key_secret}')",
            f"  key_fingerprint: {ssh_key_fingerprint}",
        ]
    )
)

# IF NOT EXISTS makes this a no-op when the connection is already present
spark.sql(create_target_sql)

print(f"\n✓ Target SFTP connection created: {TARGET_CONNECTION_NAME} (in catalog {CATALOG_NAME})")

## 4. Verify AutoLoader with SFTP

AutoLoader automatically finds the Unity Catalog connection based on the host in the SFTP URI.

In [0]:
# Smoke test: stream customers.csv from the source SFTP server through AutoLoader.
# The URI carries user, host and port; no connection name is passed explicitly.
source_sftp_uri = f"sftp://{source_username}@{source_host}:22/customers.csv"

# AutoLoader (cloudFiles) reader options, applied in this order
autoloader_options = {
    "cloudFiles.format": "csv",
    "cloudFiles.schemaLocation": f"/tmp/{CATALOG_NAME}/schema/customers",
    "header": "true",
}

stream_reader = spark.readStream.format("cloudFiles")
for option_name, option_value in autoloader_options.items():
    stream_reader = stream_reader.option(option_name, option_value)
customers_df = stream_reader.load(source_sftp_uri)

# Show the schema of the streaming DataFrame
print("Schema:")
customers_df.printSchema()

# Land the stream in an in-memory table named test_customers so the next cell can query it;
# the availableNow trigger is used so the query terminates on its own (original note: for serverless)
memory_sink = customers_df.writeStream.format("memory").queryName("test_customers")
query = memory_sink.outputMode("append").trigger(availableNow=True).start()

# Block until the streaming query has terminated
query.awaitTermination()

In [0]:
# Show up to ten rows from the in-memory table filled by the streaming query
print("\nSample data:")
sample_customers_df = spark.sql("SELECT * FROM test_customers LIMIT 10")
display(sample_customers_df)

# Recap of what was verified
print(
    "\n✓ Source SFTP AutoLoader verified successfully\n"
    f"  URI: {source_sftp_uri}\n"
    f"  Connection matched automatically by host: {source_host}"
)

## 5. Demonstrate Custom SFTPWriter API

The main focus of this repository is the **custom SFTPWriter** for writing data to SFTP using **Paramiko**.

In [0]:
import sys
import os
import tempfile

# Derive the repository layout from this notebook's workspace path:
#   <repo_root>/<notebook folder>/<this notebook>   and   <repo_root>/src
notebook_path = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get())
repo_root = os.path.dirname(notebook_path)
src_path = os.path.join(repo_root, 'src')

# Kept from the original cell: put src/ itself at the front of the module search path
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# `from src.ingest import ...` resolves the `src` package inside a sys.path entry, so the
# directory that CONTAINS src/ (the repo root) must be on sys.path, not only src/ itself.
# NOTE(review): notebookPath() normally returns the workspace path without the /Workspace
# filesystem prefix, so both spellings are registered - confirm against your runtime.
# These are appended (not inserted) so an import that already resolves is not shadowed.
workspace_repo_root = repo_root if repo_root.startswith("/Workspace/") else "/Workspace" + repo_root
for import_root in (repo_root, workspace_repo_root):
    if import_root not in sys.path:
        sys.path.append(import_root)

# Import custom SFTP Data Source
from src.ingest import SFTPDataSource

# Register the SFTP data source with Spark so format("sftp") can be used by later cells
spark.dataSource.register(SFTPDataSource)

print(f"✓ Custom SFTP package imported from: {src_path}")
print(f"✓ SFTP data source registered with Spark")
print(f"  Usage: df.write.format('sftp').option(...).save()")

In [0]:
# Create demo DataFrame to write to target SFTP
from datetime import datetime

# Evaluate the date once so every row carries the same signup_date, even if the
# cell happens to run across midnight
signup_date = datetime.now().strftime("%Y-%m-%d")

demo_data = [
    (1, "Demo Customer 1", "demo1@example.com", "USA", signup_date),
    (2, "Demo Customer 2", "demo2@example.com", "UK", signup_date),
    (3, "Demo Customer 3", "demo3@example.com", "Canada", signup_date),
]

demo_df = spark.createDataFrame(demo_data, ["customer_id", "name", "email", "country", "signup_date"])

print("Demo DataFrame created:")
demo_df.show()

# Pandas copy of the demo data; in this notebook it is only used for the row count below
# (the SFTP write in the next cell goes through the Spark DataFrame, demo_df)
demo_pdf = demo_df.toPandas()

print(f"\n✓ Created demo DataFrame with {len(demo_pdf)} rows")

In [0]:
# Write demo data to target SFTP using Databricks Python Data Source API
# This demonstrates the proper Spark-based approach with Paramiko

remote_path = "/demo_customers.csv"

# Get SSH private key from secrets and write it to a temporary file, because the
# data source is given a key file path rather than the key contents
ssh_key_content = dbutils.secrets.get(scope=secret_scope, key=ssh_key_secret)
tmp_key_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_sftp_key')

try:
    # The context manager closes (and flushes) the file even if the write raises
    with tmp_key_file:
        tmp_key_file.write(ssh_key_content)
    # Restrict the key file to owner read/write
    os.chmod(tmp_key_file.name, 0o600)

    print(f"Writing demo DataFrame to SFTP using Spark DataSource API")
    print(f"Target: {target_username}@{target_host}{remote_path}")
    print(f"Technology: Paramiko SSHv2 library (version 3.4.0)\n")

    # Write using Spark DataSource API - THIS IS THE PROPER WAY
    # (parenthesised chain instead of backslash line continuations)
    (
        demo_df.write
        .format("sftp")
        .option("host", target_host)
        .option("username", target_username)
        .option("private_key_path", tmp_key_file.name)
        .option("port", "22")
        .option("path", remote_path)
        .option("format", "csv")
        .option("header", "true")
        .mode("overwrite")
        .save()
    )
finally:
    # Always remove the private key from disk, including when the write above fails
    if os.path.exists(tmp_key_file.name):
        os.remove(tmp_key_file.name)
        print(f"\n✓ Cleaned up temporary SSH key file")

print("\n" + "="*70)
print("Custom SFTP Data Source Demo Complete")
print("="*70)
print(f"Technology: Paramiko SSHv2 library")
print(f"API: Databricks Python Data Source API")
print(f"Pattern: spark.dataSource.register() + df.write.format('sftp')")
print(f"Written: {demo_df.count()} rows to {remote_path}")
print("="*70)

## 6. Create Catalog Structure for DLT Pipeline

In [0]:
# Create catalog structure for DLT pipeline
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG_NAME}")

# Medallion schemas plus the schema chosen in the widget. The summary below lists
# {CATALOG_NAME}.{SCHEMA_NAME} as part of the structure, so create it here as well;
# IF NOT EXISTS keeps this idempotent when it already exists or repeats another name.
for schema in ("bronze", "silver", "gold", SCHEMA_NAME):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{schema}")

print("Catalog structure created:")
print(f"  - {CATALOG_NAME}.bronze (raw data from source SFTP)")
print(f"  - {CATALOG_NAME}.silver (cleaned and validated data)")
print(f"  - {CATALOG_NAME}.gold (aggregated business-level data)")
print(f"  - {CATALOG_NAME}.{SCHEMA_NAME} (default schema)")