# Data Ingestion Notebook
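This notebook demonstrates how to ingest CSV, JSON, and Parquet files from the `raw/` folder of an Azure Data Lake Storage Gen2 container and write them, with audit columns added, to the `bronze/` folder as Parquet.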

## Prerequisites
- Azure Synapse workspace configured
- Data Lake Storage Gen2 connected
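- Permissions set up so that the identity running this notebook can read from `raw/` and write to `bronze/` in the storage container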

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
import pandas as pd

In [None]:
# Obtain the Spark session used by every later cell. getOrCreate() hands back
# the already-running session when there is one and builds a new one otherwise.
session_builder = SparkSession.builder.appName("DataIngestion")
spark = session_builder.getOrCreate()

# Record the runtime version and the wall-clock start of this run.
print(f"Spark version: {spark.version}")
print(f"Ingestion started at: {datetime.now()}")

In [None]:
# Storage settings -- edit these two values to match your environment.
storage_account_name = "your_storage_account_name"
container_name = "synapsefs"

# Both zones live in the same container; only the trailing folder differs.
lake_root = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
raw_data_path = f"{lake_root}/raw/"
bronze_data_path = f"{lake_root}/bronze/"

# Echo the resolved locations so a misconfigured account is easy to spot.
print(f"Raw data path: {raw_data_path}")
print(f"Bronze data path: {bronze_data_path}")

## Ingest CSV Data

In [None]:
# Customer CSV layout: every column is read as a string; the flag is the
# declared nullability (only CustomerID and CustomerName are marked required).
# NOTE(review): Spark's file readers may relax a user-supplied schema to
# nullable regardless of this flag - confirm before relying on it as a constraint.
customer_fields = [
    ("CustomerID", False),
    ("CustomerName", False),
    ("CustomerType", True),
    ("Email", True),
    ("Phone", True),
    ("City", True),
    ("State", True),
    ("Country", True),
    ("Region", True),
]
customer_schema = StructType([
    StructField(field_name, StringType(), is_nullable)
    for field_name, is_nullable in customer_fields
])

# Load the file with the explicit schema above; the first row is a header and
# schema inference is switched off.
customers_csv_path = f"{raw_data_path}customers.csv"
customer_df = (
    spark.read
    .options(header="true", inferSchema="false")
    .schema(customer_schema)
    .csv(customers_csv_path)
)

# Audit columns: when the row was ingested and which file it came from.
customer_df = (
    customer_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("source_file", input_file_name())
)

# Report the row count, then preview the first five rows.
print(f"Loaded {customer_df.count()} customer records")
customer_df.show(5)

## Ingest JSON Data

In [None]:
# Load the product JSON with the reader's multiline option enabled; the schema
# is whatever Spark infers from the file.
products_json_path = f"{raw_data_path}products.json"
product_df = (
    spark.read
    .option("multiline", "true")
    .json(products_json_path)
)

# Audit columns: when the row was ingested and which file it came from.
product_df = (
    product_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("source_file", input_file_name())
)

# Report the row count, then show the inferred schema and the first five rows.
print(f"Loaded {product_df.count()} product records")
product_df.printSchema()
product_df.show(5)

## Ingest Parquet Data

In [None]:
# Load every Parquet file directly under the raw sales/ folder.
sales_parquet_glob = f"{raw_data_path}sales/*.parquet"
sales_df = spark.read.parquet(sales_parquet_glob)

# Audit columns: when the row was ingested and which file it came from.
sales_df = (
    sales_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("source_file", input_file_name())
)

# Report the row count, then preview the first five rows.
print(f"Loaded {sales_df.count()} sales records")
sales_df.show(5)

## Data Quality Checks

In [None]:
# Null audit: for each dataset, print one row holding the number of NULLs found
# in every column (the audit columns are included in the scan).
quality_targets = [
    ("Customer Data Quality:", customer_df),
    ("\nProduct Data Quality:", product_df),
    ("\nSales Data Quality:", sales_df),
]

for heading, frame in quality_targets:
    print(heading)
    # when(...) yields a non-null marker only for NULL cells, and count()
    # skips NULLs, so each aggregate is that column's NULL tally.
    null_counts = [
        count(when(col(c).isNull(), c)).alias(c)
        for c in frame.columns
    ]
    frame.select(null_counts).show()

In [None]:
# Duplicate audit: the surplus of total rows over rows that remain after
# keeping one row per key value.
customer_total = customer_df.count()
customer_unique = customer_df.dropDuplicates(["CustomerID"]).count()
customer_duplicates = customer_total - customer_unique

# NOTE(review): assumes the inferred product schema has a top-level ProductID
# column - confirm against products.json.
product_total = product_df.count()
product_unique = product_df.dropDuplicates(["ProductID"]).count()
product_duplicates = product_total - product_unique

print(f"Customer duplicates: {customer_duplicates}")
print(f"Product duplicates: {product_duplicates}")

## Write to Bronze Layer

In [None]:
# Write customer data to the Bronze layer, one partition per ingestion day.
# ingestion_date is filled from current_timestamp() when the frame is built, so
# partitioning on it directly would key the output directory on a sub-second
# instant. Keep that instant as ingestion_timestamp and partition on the
# calendar day instead.
# NOTE(review): ingestion_date is therefore stored as a date rather than a
# timestamp - confirm downstream readers are fine with that.
# NOTE(review): with Spark's default (static) partition overwrite mode,
# mode("overwrite") replaces everything already under customers/ - confirm that
# a full refresh on every run is intended.
customer_bronze_df = customer_df \
    .withColumn("ingestion_timestamp", col("ingestion_date")) \
    .withColumn("ingestion_date", to_date(col("ingestion_date")))

customer_bronze_df.write \
    .mode("overwrite") \
    .partitionBy("ingestion_date") \
    .parquet(f"{bronze_data_path}customers/")

print("Customer data written to Bronze layer")

In [None]:
# Write product data to the Bronze layer, one partition per ingestion day.
# ingestion_date is filled from current_timestamp() when the frame is built, so
# partitioning on it directly would key the output directory on a sub-second
# instant. Keep that instant as ingestion_timestamp and partition on the
# calendar day instead.
# NOTE(review): ingestion_date is therefore stored as a date rather than a
# timestamp - confirm downstream readers are fine with that.
# NOTE(review): with Spark's default (static) partition overwrite mode,
# mode("overwrite") replaces everything already under products/ - confirm that
# a full refresh on every run is intended.
product_bronze_df = product_df \
    .withColumn("ingestion_timestamp", col("ingestion_date")) \
    .withColumn("ingestion_date", to_date(col("ingestion_date")))

product_bronze_df.write \
    .mode("overwrite") \
    .partitionBy("ingestion_date") \
    .parquet(f"{bronze_data_path}products/")

print("Product data written to Bronze layer")

In [None]:
# Write sales data to the Bronze layer, one partition per ingestion day.
# ingestion_date is filled from current_timestamp() when the frame is built, so
# partitioning on it directly would key the output directory on a sub-second
# instant. Keep that instant as ingestion_timestamp and partition on the
# calendar day instead.
# NOTE(review): ingestion_date is therefore stored as a date rather than a
# timestamp - confirm downstream readers are fine with that.
# NOTE(review): with Spark's default (static) partition overwrite mode,
# mode("overwrite") replaces everything already under sales/ - confirm that
# a full refresh on every run is intended.
sales_bronze_df = sales_df \
    .withColumn("ingestion_timestamp", col("ingestion_date")) \
    .withColumn("ingestion_date", to_date(col("ingestion_date")))

sales_bronze_df.write \
    .mode("overwrite") \
    .partitionBy("ingestion_date") \
    .parquet(f"{bronze_data_path}sales/")

print("Sales data written to Bronze layer")

## Summary

In [None]:
# Closing report: a framed banner with the finish time and the row count of
# each dataset. Every count() below runs a fresh Spark action.
banner_rule = "=" * 50

print(banner_rule)
print("Data Ingestion Summary")
print(banner_rule)
print(f"Ingestion completed at: {datetime.now()}")
print(f"Customer records: {customer_df.count()}")
print(f"Product records: {product_df.count()}")
print(f"Sales records: {sales_df.count()}")
print(banner_rule)