In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Point the Spark session at the catalog and schema used by the rest of the
# notebook, so later unqualified table names resolve against them.
use_catalog_stmt = "USE CATALOG `midterm`"
use_schema_stmt = "USE SCHEMA `source1_layer`"

spark.sql(use_catalog_stmt)
spark.sql(use_schema_stmt)


In [0]:


# Load the merged CSV export from the volume into a managed table, then
# read the table back and confirm nothing was lost on the way.
CSV_PATH = "/Volumes/midterm/source1_layer/datastore/Final_Merged_Output.csv"
TARGET_TABLE = "midterm.source1_layer.raw_table"

# ============================================
# STEP 1: Read CSV with CORRECT delimiter
# ============================================
print(f" Reading CSV from: {CSV_PATH}")

# header      -> first row supplies the column names
# inferSchema -> column types are inferred from the data (extra pass over the file)
# escape='"'  -> the quote character is also the escape character inside quoted values
# multiLine   -> a quoted value may span several physical lines
df = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .option("sep", ",")
    .option("escape", "\"")
    .option("multiLine", "true")
    .csv(CSV_PATH)
)

# Count once and keep the number: every count() re-reads the CSV, and the
# value is needed again in STEP 3 to validate the written table.
source_count = df.count()

print(" CSV loaded successfully!")
print(f"   Total columns: {len(df.columns)}")
print(f"   Total records: {source_count:,}")

# Display column names to verify
print("\n Column names:")
# The loop variable is deliberately NOT named `col`: the notebook star-imports
# pyspark.sql.functions, and a loop variable called `col` would rebind the
# `col()` function to a plain string for every cell that runs afterwards.
for position, column_name in enumerate(df.columns, 1):
    print(f"   {position}. {column_name}")

# ============================================
# STEP 2: Write to Delta Table with Column Mapping
# ============================================
print(f"\n Writing to table: {TARGET_TABLE}")

# mode("overwrite") + overwriteSchema replaces both the data and the schema of
# an existing table, so re-running this cell is safe.
# NOTE(review): column mapping mode "name" is presumably required because the
# CSV headers contain characters not otherwise allowed in column names - confirm.
(
    df.write
    .mode("overwrite")
    .option("delta.columnMapping.mode", "name")
    .option("overwriteSchema", "true")
    .saveAsTable(TARGET_TABLE)
)

print(f"\n SUCCESS! Table created: {TARGET_TABLE}")

# ============================================
# STEP 3: Verify
# ============================================
result_df = spark.table(TARGET_TABLE)
result_count = result_df.count()

print("\n Verification:")
print(f"   Records in table: {result_count:,}")
print(f"   Columns in table: {len(result_df.columns)}")

# Fail loudly instead of relying on someone eyeballing the two printed numbers.
if result_count != source_count:
    raise ValueError(
        f"Row count mismatch after load: CSV has {source_count:,} rows, "
        f"{TARGET_TABLE} has {result_count:,} rows"
    )

display(result_df.limit(5))

In [0]:
%sql
-- Set the delta.enableChangeDataFeed table property on the raw table.
ALTER TABLE midterm.source1_layer.raw_table SET TBLPROPERTIES (
    'delta.enableChangeDataFeed' = true
);

In [0]:
%sql
-- Show the columns plus the extended table metadata of the raw table.
DESCRIBE TABLE EXTENDED midterm.source1_layer.raw_table;

In [0]:
%sql
-- Remove the restaurant_cdf_type2_stage table.
-- IF EXISTS keeps this cell re-runnable: a plain DROP TABLE raises an error
-- once the table has already been dropped, which aborts a full notebook run.
DROP TABLE IF EXISTS midterm.source1_layer.restaurant_cdf_type2_stage;

In [0]:
%sql
-- Switch the session to the midterm catalog, then preview up to 100 rows
-- of dim_violation.
USE CATALOG `midterm`;

SELECT *
FROM `source1_layer`.`dim_violation`
LIMIT 100;

In [0]:
%sql
-- Total number of rows in silver_table.
SELECT COUNT(*)
FROM midterm.source1_layer.silver_table;

In [0]:
%sql
-- List every inspection_id that occurs more than once in fact_inspection,
-- with its row count and how many different inspection_fact_key values it
-- carries; the most duplicated ids come first.
SELECT fi.inspection_id
     , COUNT(*)                               AS duplicate_count
     , COUNT(DISTINCT fi.inspection_fact_key) AS distinct_keys
FROM midterm.source1_layer.fact_inspection AS fi
GROUP BY fi.inspection_id
HAVING COUNT(*) > 1
ORDER BY duplicate_count DESC;

In [0]:
%sql
-- All fact_inspection rows recorded against restaurant_key 2.
SELECT *
FROM midterm.source1_layer.fact_inspection
WHERE restaurant_key = 2

In [0]:
%sql
-- Switch the session to the midterm catalog, then preview up to 100 rows
-- of fact_inspection.
USE CATALOG `midterm`;

SELECT *
FROM `source1_layer`.`fact_inspection`
LIMIT 100;

In [0]:
%sql
-- Check dim_restaurant: sample of up to 10 rows flagged is_current = true.
SELECT r.restaurant_key
     , r.license_no
     , r.dba_name
     , r.city
     , r.is_current
FROM midterm.source1_layer.dim_restaurant AS r
WHERE r.is_current = true
LIMIT 10;

In [0]:
%sql
-- Check dim_location: sample of up to 10 rows with key and address columns.
SELECT loc.location_key
     , loc.location_business_key
     , loc.address
     , loc.city
     , loc.zip_code
FROM midterm.source1_layer.dim_location AS loc
LIMIT 10;

In [0]:
%sql
-- Check what license_no and city look like in fact_inspection source.
-- The selected columns are trimmed and upper-cased, but the filter compares
-- the RAW City value, so only rows spelled exactly 'Chicago' are sampled.
-- The literal is single-quoted on purpose: a double-quoted token is read as
-- an identifier (not a string) when double-quoted identifiers are enabled.
SELECT DISTINCT
    UPPER(TRIM(Inspection_ID)) AS inspection_id,
    UPPER(TRIM(License_No))    AS license_no,
    UPPER(TRIM(City))          AS city
FROM midterm.source1_layer.silver_table
WHERE City = 'Chicago'
LIMIT 20;

In [0]:
%sql
-- Switch the session to the midterm catalog, then preview up to 100 rows
-- of fact_inspection.
USE CATALOG `midterm`;

SELECT *
FROM `source1_layer`.`fact_inspection`
LIMIT 100;

In [0]:
%sql
-- Compare with dim_restaurant: number of distinct normalised
-- "LICENSE_NO-CITY" strings in silver_table versus the rows of
-- dim_restaurant flagged is_current = true.
WITH silver_combos AS (
    SELECT COUNT(DISTINCT CONCAT(UPPER(TRIM(License_No)), '-', UPPER(TRIM(City)))) AS unique_combos
    FROM midterm.source1_layer.silver_table
),
dim_combos AS (
    SELECT COUNT(DISTINCT CONCAT(UPPER(TRIM(license_no)), '-', UPPER(TRIM(city)))) AS unique_combos
    FROM midterm.source1_layer.dim_restaurant
    WHERE is_current = true
)
SELECT 'silver_table' AS source, unique_combos
FROM silver_combos

UNION ALL

SELECT 'dim_restaurant' AS source, unique_combos
FROM dim_combos;

In [0]:
%sql
-- Find records in fact source that don't match dim_restaurant.
-- Anti-join: take every distinct (license_no, city) pair from silver_table,
-- left-join it to the current dim_restaurant rows, and keep the pairs with no
-- partner (r.restaurant_key IS NULL). Up to 20 unmatched pairs are shown.
-- The distinct pairs are NOT limited before the join: sampling 10 arbitrary
-- pairs first would make the result non-deterministic and hide most mismatches.
SELECT
    'Not matching' AS status,
    COUNT(*)       AS count,
    f.license_no   AS fact_license_sample,
    f.city         AS fact_city_sample
FROM (
    SELECT DISTINCT
        TRIM(License_No) AS license_no,
        TRIM(City)       AS city
    FROM midterm.source1_layer.silver_table
) f
LEFT JOIN midterm.source1_layer.dim_restaurant r
    -- f.* is already trimmed in the subquery; only the dimension side needs TRIM
    ON  f.license_no = TRIM(r.license_no)
    AND f.city       = TRIM(r.city)
    AND r.is_current = true
WHERE r.restaurant_key IS NULL
GROUP BY f.license_no, f.city
LIMIT 20;

In [0]:
%sql
-- All dim_restaurant rows flagged is_current = false.
-- Uses a boolean literal, matching the other cells that filter on
-- is_current = true, instead of comparing against a quoted string that
-- depends on an implicit string-to-boolean cast.
SELECT *
FROM midterm.source1_layer.dim_restaurant
WHERE is_current = false;