In [0]:
# Parameters: the storage account and container this notebook reads from
storage_account_name = "ungcapstor01"
container_name = "guided"

# Host name shared by the Spark config key and the wasbs:// URL below
blob_host = f"{storage_account_name}.blob.core.windows.net"

# Fetch the SAS token from the secret scope instead of hardcoding it
sas_token = dbutils.secrets.get(scope="azure-secrets", key="guided-container-sas")

# Register the SAS token with Spark for this specific container
spark.conf.set(f"fs.azure.sas.{container_name}.{blob_host}", sas_token)

# ✅ Connection test - list the container root; a failure is reported, not raised
try:
    print("Listing container contents:")
    container_listing = dbutils.fs.ls(f"wasbs://{container_name}@{blob_host}/")
    display(container_listing)
    print("✅ Azure Blob access confirmed.")
except Exception as err:
    print("❌ Azure Blob connection failed:", err)


In [0]:
# ============================================================
# Guided Capstone Step 2 – Data Ingestion (Databricks CE version)
# Works with Azure Blob Storage (HNS = false)
# ============================================================

import json

from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql.functions import input_file_name, regexp_extract, current_timestamp, lit

# Get the Spark session for this step (an already-active session is reused)
session_builder = SparkSession.builder.appName("guided_step2_ingestion")
spark = session_builder.getOrCreate()

# === 1. Common schema ===
# One (column name, Spark type) pair per output column, in column order.
# Both the CSV and the JSON records are loaded into this single layout.
schema = T.StructType([
    T.StructField(column_name, spark_type())
    for column_name, spark_type in (
        ("trade_dt", T.StringType),
        ("rec_type", T.StringType),
        ("symbol", T.StringType),
        ("exchange", T.StringType),
        ("event_tm", T.StringType),
        ("event_seq_nb", T.IntegerType),
        ("arrival_tm", T.StringType),
        ("trade_pr", T.DoubleType),
        ("bid_pr", T.DoubleType),
        ("bid_size", T.IntegerType),
        ("ask_pr", T.DoubleType),
        ("ask_size", T.IntegerType),
        ("partition", T.StringType),
    )
])

# === 2. CSV parser ===
def parse_csv(line: str):
    """Parse one comma-separated record into a dict keyed by schema column.

    Blank fields are dropped before positional lookup, so the first seven
    remaining values are read as trade_dt, arrival_tm, rec_type, symbol,
    event_tm, event_seq_nb and exchange. The values after that depend on
    the record type:

    * ``rec_type == "T"``: value 7 is read as ``trade_pr`` and the four
      quote columns are set to ``None``.
      NOTE(review): assumes the trade price is the first value after the
      exchange -- confirm against the source files.
    * any other ``rec_type``: values 7-10 are read as bid_pr, bid_size,
      ask_pr and ask_size, and ``trade_pr`` is set to ``None``.

    Args:
        line: One raw text line from a CSV input file.

    Returns:
        A dict containing every schema column (``partition`` mirrors
        ``rec_type``), or ``None`` when the line has too few values or a
        numeric field cannot be converted.
    """
    try:
        # Dropping blanks means positions are relative to the non-empty values.
        vals = [x.strip() for x in line.split(",") if x.strip()]
        rec_type = vals[2]
        record = {
            "trade_dt": vals[0],
            "arrival_tm": vals[1],
            "rec_type": rec_type,
            "symbol": vals[3],
            "event_tm": vals[4],
            "event_seq_nb": int(vals[5]),
            "exchange": vals[6],
            "trade_pr": None,
            "bid_pr": None,
            "bid_size": None,
            "ask_pr": None,
            "ask_size": None,
            "partition": rec_type,
        }
        if rec_type == "T":
            # Trade rows are shorter than quote rows; reading the quote
            # positions here would raise IndexError and drop the record.
            record["trade_pr"] = float(vals[7])
        else:
            record["bid_pr"] = float(vals[7])
            record["bid_size"] = int(vals[8])
            record["ask_pr"] = float(vals[9])
            record["ask_size"] = int(vals[10])
        return record
    except Exception:
        # Best effort by design: one malformed line must not fail the whole
        # job. The caller filters out None results.
        return None

# === 3. JSON parser ===
def parse_json(line: str):
    """Parse one JSON-object line into a dict compatible with the common schema.

    The record keeps every key present in the JSON object and gains:

    * ``partition``: the record's ``event_type``, or ``"B"`` when that key
      is absent.
    * ``rec_type``: filled from ``event_type`` when the object does not
      already carry a ``rec_type`` key, so the schema column is not null.

    Price fields are converted to ``float`` and sequence/size fields to
    ``int`` when present, matching the DoubleType / IntegerType columns of
    the schema and the conversions done by ``parse_csv``.

    Args:
        line: One raw text line expected to hold a single JSON object.

    Returns:
        The parsed dict, or ``None`` when the line is not valid JSON, is not
        a JSON object, or a numeric field cannot be converted.
    """
    try:
        rec = json.loads(line)
        if not isinstance(rec, dict):
            # Valid JSON that is not an object (list, number, ...) is not a record.
            return None

        rec["partition"] = rec.get("event_type", "B")
        # The schema column is "rec_type" while the JSON feed uses "event_type".
        rec.setdefault("rec_type", rec.get("event_type"))

        # PySpark schema verification rejects a Python int for DoubleType and
        # a float for IntegerType, so normalise the numeric fields up front.
        for key in ("trade_pr", "bid_pr", "ask_pr"):
            if rec.get(key) is not None:
                rec[key] = float(rec[key])
        for key in ("event_seq_nb", "bid_size", "ask_size"):
            if rec.get(key) is not None:
                rec[key] = int(rec[key])
        return rec
    except Exception:
        # Best effort by design: one malformed line must not fail the whole
        # job. The caller filters out None results.
        return None

# === 4. Paths ===
# Input files are matched two directory levels below data/csv and data/json;
# the parquet output goes to output_dir/ in the same container.
base_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/data"
csv_path = f"{base_path}/csv/*/*/*.txt"
json_path = f"{base_path}/json/*/*/*.txt"
output_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/output_dir/"

# Convert to DataFrames
# Every text line goes through the Python parser; lines the parser rejects
# (it returns None) are dropped before the schema is applied.
csv_rdd = spark.sparkContext.textFile(csv_path).map(parse_csv).filter(lambda r: r is not None)
json_rdd = spark.sparkContext.textFile(json_path).map(parse_json).filter(lambda r: r is not None)

csv_df = spark.createDataFrame(csv_rdd, schema=schema)
json_df = spark.createDataFrame(json_rdd, schema=schema)

# Combine both DataFrames
combined_df = csv_df.unionByName(json_df, allowMissingColumns=True)

# === Add audit metadata ===
# source_path : full path of the file the row came from
# source_file : last path segment of source_path (text after the final '/')
# ingest_ts   : timestamp of this ingestion run
# NOTE(review): these frames are built from Python RDDs rather than a
# file-based reader -- confirm input_file_name() is actually populated here.
combined_df = (
    combined_df
    .withColumn("source_path", input_file_name())
    .withColumn("source_file", regexp_extract("source_path", r"([^/]+)$", 1))
    .withColumn("ingest_ts", current_timestamp())
)

# Optional: drop source_path if you only want the short name
# combined_df = combined_df.drop("source_path")

# The count, the per-partition summary and the write below are three separate
# actions. Without caching each one re-reads and re-parses every input file
# in Python; caching also keeps the printed count consistent with what is
# written.
combined_df.cache()

# === Write output ===
combined_count = combined_df.count()
print("Combined Count:", combined_count)

if combined_count > 0:
    combined_df.groupBy("partition").count().show()
    combined_df.write.partitionBy("partition").mode("overwrite").parquet(output_path)
    print(f"✅ Data written successfully to: {output_path}")
else:
    print("⚠️ No data to write – check parser output.")

combined_df.printSchema()


In [0]:
# Print the start of one raw file per input format to inspect the line layout.
max_bytes = 1000  # upper bound on how much of each file is fetched

# data/csv/2020-08-05/NYSE/
csv_sample_file = "wasbs://guided@ungcapstor01.blob.core.windows.net/data/csv/2020-08-05/NYSE/part-00000-5e4ced0a-66e2-442a-b020-347d0df4df8f-c000.txt"
# data/json/2020-08-06/NASDAQ/
json_sample_file = "wasbs://guided@ungcapstor01.blob.core.windows.net/data/json/2020-08-06/NASDAQ/part-00000-092ec1db-39ab-4079-9580-f7c7b516a283-c000.txt"

print(dbutils.fs.head(csv_sample_file, max_bytes))
print("----")
print(dbutils.fs.head(json_sample_file, max_bytes))
