# Delta Lake Viewer

This notebook demonstrates how to read and query Delta Lake tables stored in MinIO (S3-compatible storage).

## 1. Setup Spark Session with Delta Lake

In [None]:
import os

from delta import *
from pyspark.sql import SparkSession

# Connection settings for the MinIO (S3-compatible) object store.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are the
# local-development values this notebook has always used.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")
# Keep the S3A TLS flag in step with the endpoint's URL scheme.
s3a_ssl_enabled = "true" if minio_endpoint.lower().startswith("https://") else "false"

# Create Spark session with Delta Lake configuration
spark = (
    SparkSession.builder
    .appName("DeltaLakeViewer")
    # Delta Lake SQL extension and catalog implementation
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A filesystem settings pointing at MinIO
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", s3a_ssl_enabled)
    .getOrCreate()
)

print("Spark session created successfully!")
print(f"Spark version: {spark.version}")

## 2. Read Delta Lake Table

In [None]:
# Location of the customers Delta table (s3a:// URI)
delta_table_path = "s3a://delta-lake/tables/customers"

# Load the table into a Spark DataFrame using the Delta format
df = spark.read.load(delta_table_path, format="delta")

# Column layout first ...
print("Table Schema:")
df.printSchema()

# ... then the rows, with cell values left untruncated
print("\nTable Data:")
df.show(truncate=False)

## 3. Basic Queries

In [None]:
# Number of rows currently in the DataFrame
total_count = df.count()
print(f"Total records: {total_count}")

# Summary statistics as produced by DataFrame.describe()
summary_df = df.describe()
summary_df.show()

In [None]:
# Filter by name
from pyspark.sql.functions import col

# Example: keep only the customers whose name contains the substring 'John'
name_contains_john = col("name").contains("John")
filtered_df = df.where(name_contains_john)
filtered_df.show(truncate=False)

## 4. Delta Lake Time Travel

In [None]:
# Obtain a DeltaTable handle for the table path so its history can be inspected
deltaTable = DeltaTable.forPath(spark, delta_table_path)

# Columns of the history listing that are displayed
history_columns = ["version", "timestamp", "operation", "operationMetrics"]

print("Delta Table History:")
history_df = deltaTable.history().select(*history_columns)
history_df.show(truncate=False)

In [None]:
# Time travel: load the table as of a specific version
# Example: version 0
snapshot_version = 0
versioned_reader = spark.read.format("delta").options(versionAsOf=snapshot_version)
version_0_df = versioned_reader.load(delta_table_path)

print("Data at version 0:")
version_0_df.show(truncate=False)

## 5. SQL Queries

In [None]:
# Expose the DataFrame to Spark SQL as the temporary view "customers"
df.createOrReplaceTempView("customers")

# Name, email and the date part of created_at, ordered by created_at descending
signup_query = """
    SELECT 
        name,
        email,
        DATE(created_at) as signup_date
    FROM customers
    ORDER BY created_at DESC
"""
result = spark.sql(signup_query)

result.show(truncate=False)

## 6. Data Visualization (Optional)

In [None]:
# Convert to Pandas for visualization
import pandas as pd
import matplotlib.pyplot as plt

# toPandas() collects every selected row into the driver's memory, so cap the
# number of rows pulled across. Raise the cap if the full table is known to fit;
# note that counts plotted below reflect at most this many rows.
MAX_PANDAS_ROWS = 10_000
pandas_df = df.limit(MAX_PANDAS_ROWS).toPandas()
print(pandas_df.head())

# Example: Plot count by operation type. The chart is skipped when the
# converted frame has no 'operation' column.
if 'operation' in pandas_df.columns:
    operation_counts = pandas_df['operation'].value_counts()
    # Explicit figure/axes so the chart carries its own axis labels
    fig, ax = plt.subplots()
    operation_counts.plot(kind='bar', title='Operations Count', ax=ax)
    ax.set_xlabel('operation')
    ax.set_ylabel('count')
    plt.show()

## 7. Cleanup

In [None]:
# Stop the Spark session (optional).
# Uncomment the line below once you are finished with the notebook.
# spark.stop()