In [0]:
-- Session variable holding the fully qualified name of the bronze events table.
-- NOTE(review): the statements visible in this script spell out
-- workspace.bronze.events_raw literally and never read this variable; confirm
-- whether it is consumed elsewhere or should be wired in (e.g. via IDENTIFIER).
DECLARE OR REPLACE VARIABLE table_name STRING DEFAULT 'workspace.bronze.events_raw';
-----------------------------------BRONZE DATA CLEANING----------------------------------
-- Bronze overview: row volume, cardinality of the main keys, and the time span covered.
-- The aggregates are computed once in a CTE so the day span can be derived from the
-- already-named earliest/latest bounds.
WITH bronze_summary AS (
  SELECT
    COUNT(*)                     AS total_events,
    COUNT(DISTINCT user_id)      AS unique_users,
    COUNT(DISTINCT user_session) AS unique_sessions,
    COUNT(DISTINCT product_id)   AS unique_products,
    MIN(event_time)              AS earliest_event,
    MAX(event_time)              AS latest_event
  FROM workspace.bronze.events_raw
)
SELECT
  total_events,
  unique_users,
  unique_sessions,
  unique_products,
  earliest_event,
  latest_event,
  -- Two-argument DATEDIFF(end, start): difference in days between latest and earliest event
  DATEDIFF(latest_event, earliest_event) AS days_of_data
FROM bronze_summary;

-- Share of each event_type in the raw bronze events, largest first.
WITH events_per_type AS (
  SELECT
    event_type,
    COUNT(*) AS event_count
  FROM workspace.bronze.events_raw
  GROUP BY event_type
)
SELECT
  event_type,
  event_count,
  -- Each type's count as a percentage of the total across all types
  ROUND(event_count * 100.0 / SUM(event_count) OVER (), 2) AS percentage
FROM events_per_type
ORDER BY event_count DESC;

-- NULL audit of the bronze table: one NULL counter per column, plus the NULL
-- percentage for the two descriptive columns (category_code, brand).
SELECT
  COUNT(*)                        AS total_rows,
  COUNT_IF(event_time IS NULL)    AS null_event_time,
  COUNT_IF(event_type IS NULL)    AS null_event_type,
  COUNT_IF(product_id IS NULL)    AS null_product_id,
  COUNT_IF(category_id IS NULL)   AS null_category_id,
  COUNT_IF(category_code IS NULL) AS null_category_code,
  COUNT_IF(brand IS NULL)         AS null_brand,
  COUNT_IF(price IS NULL)         AS null_price,
  COUNT_IF(user_id IS NULL)       AS null_user_id,
  COUNT_IF(user_session IS NULL)  AS null_user_session,
  -- Percentages of rows where the column is NULL
  ROUND(COUNT_IF(category_code IS NULL) * 100.0 / COUNT(*), 2) AS pct_null_category_code,
  ROUND(COUNT_IF(brand IS NULL) * 100.0 / COUNT(*), 2)         AS pct_null_brand
FROM workspace.bronze.events_raw;

------------------------
-- Create cleaned Silver table.
-- Builds workspace.silver.events_cleaned from workspace.bronze.events_raw:
--   * drops rows missing any of event_time, event_type, product_id, user_id, user_session
--   * derives date / hour / day-of-week columns from event_time
--   * lower-cases and trims event_type, category_code and brand
--   * replaces NULL or blank category_code / brand with a placeholder value
--   * nulls out non-positive prices and rounds the rest to 2 decimals
--   * adds 0/1 data quality flags that use the SAME conditions as the cleaning
--     rules above, so each flag counts exactly the rows that received a
--     placeholder / NULL in the corresponding cleaned column
CREATE SCHEMA IF NOT EXISTS workspace.silver;

CREATE OR REPLACE TABLE workspace.silver.events_cleaned AS
SELECT
  -- Time fields
  event_time,
  DATE(event_time)      AS event_date,
  HOUR(event_time)      AS event_hour,
  DAYOFWEEK(event_time) AS day_of_week,

  -- Event info
  LOWER(TRIM(event_type)) AS event_type,

  -- Product info
  product_id,
  category_id,
  CASE
    WHEN category_code IS NULL OR TRIM(category_code) = '' THEN 'uncategorized'
    ELSE LOWER(TRIM(category_code))
  END AS category_code,
  CASE
    WHEN brand IS NULL OR TRIM(brand) = '' THEN 'unknown'
    ELSE LOWER(TRIM(brand))
  END AS brand,

  -- Price: NULL and non-positive values are treated as invalid and stored as NULL
  CASE
    WHEN price IS NULL OR price <= 0 THEN NULL
    ELSE ROUND(price, 2)
  END AS price,

  -- User info
  user_id,
  user_session,

  -- Data quality flags.
  -- Blank / whitespace-only source values are flagged as missing too, because
  -- they are replaced by 'unknown' / 'uncategorized' above just like NULLs.
  CASE WHEN brand IS NULL OR TRIM(brand) = '' THEN 1 ELSE 0 END                 AS is_brand_missing,
  CASE WHEN category_code IS NULL OR TRIM(category_code) = '' THEN 1 ELSE 0 END AS is_category_missing,
  CASE WHEN price IS NULL OR price <= 0 THEN 1 ELSE 0 END                       AS is_price_invalid

FROM workspace.bronze.events_raw
WHERE
  -- Remove obvious bad records: rows without a timestamp, type, product, user or session
  event_time IS NOT NULL
  AND event_type IS NOT NULL
  AND product_id IS NOT NULL
  AND user_id IS NOT NULL
  AND user_session IS NOT NULL;

-- Post-cleaning sanity check: row volume and cardinality of the main dimensions
-- in the silver table.
SELECT
  COUNT(*)                         AS total_cleaned_events,
  COUNT(DISTINCT ec.user_id)       AS unique_users,
  COUNT(DISTINCT ec.user_session)  AS unique_sessions,
  COUNT(DISTINCT ec.product_id)    AS unique_products,
  COUNT(DISTINCT ec.category_code) AS unique_categories,
  COUNT(DISTINCT ec.brand)         AS unique_brands
FROM workspace.silver.events_cleaned AS ec;

-- Share of each (normalized) event_type in the cleaned silver events, largest first.
WITH events_per_type AS (
  SELECT
    event_type,
    COUNT(*) AS event_count
  FROM workspace.silver.events_cleaned
  GROUP BY event_type
)
SELECT
  event_type,
  event_count,
  -- Each type's count as a percentage of the total across all types
  ROUND(event_count * 100.0 / SUM(event_count) OVER (), 2) AS percentage
FROM events_per_type
ORDER BY event_count DESC;

-- Summary of the data quality flags: how many silver rows have each flag set,
-- and what share of all rows the brand / category flags represent.
WITH flag_totals AS (
  SELECT
    COUNT(*)                 AS total_rows,
    SUM(is_brand_missing)    AS records_without_brand,
    SUM(is_category_missing) AS records_without_category,
    SUM(is_price_invalid)    AS records_with_invalid_price
  FROM workspace.silver.events_cleaned
)
SELECT
  records_without_brand,
  records_without_category,
  records_with_invalid_price,
  ROUND(records_without_brand * 100.0 / total_rows, 2)    AS pct_brand_missing,
  ROUND(records_without_category * 100.0 / total_rows, 2) AS pct_category_missing
FROM flag_totals;

-- Eyeball 20 rows of the cleaned table. Columns are listed explicitly, in the
-- order the table is created with. No ORDER BY, so which 20 rows come back is
-- not guaranteed.
SELECT
  event_time,
  event_date,
  event_hour,
  day_of_week,
  event_type,
  product_id,
  category_id,
  category_code,
  brand,
  price,
  user_id,
  user_session,
  is_brand_missing,
  is_category_missing,
  is_price_invalid
FROM workspace.silver.events_cleaned
LIMIT 20;

-- Check for duplicate events (same user, product, event_type at same time).
-- The key columns are grouped directly instead of being concatenated into one
-- string: without a delimiter, different tuples can yield the same text
-- (user_id 1 + product_id 23 and user_id 12 + product_id 3 both start '123'),
-- which would understate unique_events and overstate potential_duplicates.
WITH event_keys AS (
  SELECT
    user_id,
    product_id,
    event_type,
    event_time,
    COUNT(*) AS occurrences
  FROM workspace.silver.events_cleaned
  GROUP BY user_id, product_id, event_type, event_time
)
SELECT
  -- COALESCE keeps the totals at 0 (not NULL) when the table is empty
  COALESCE(SUM(occurrences), 0)            AS total_records,
  COUNT(*)                                 AS unique_events,
  COALESCE(SUM(occurrences), 0) - COUNT(*) AS potential_duplicates
FROM event_keys;

-- Checking for duplicates: list the 10 most repeated
-- (user_id, product_id, event_type, event_time) combinations.
WITH event_key_counts AS (
  SELECT
    user_id,
    product_id,
    event_type,
    event_time,
    COUNT(*) AS duplicate_count
  FROM workspace.silver.events_cleaned
  GROUP BY user_id, product_id, event_type, event_time
)
SELECT
  user_id,
  product_id,
  event_type,
  event_time,
  duplicate_count
FROM event_key_counts
-- Only combinations that occur more than once are duplicates
WHERE duplicate_count > 1
ORDER BY duplicate_count DESC
LIMIT 10;


-- Price statistics per event_type, computed over rows that have a price
-- (rows whose price is NULL in the silver table are excluded up front).
WITH priced_events AS (
  SELECT
    event_type,
    price
  FROM workspace.silver.events_cleaned
  WHERE price IS NOT NULL
)
SELECT
  event_type,
  COUNT(*)                         AS event_count,
  ROUND(MIN(price), 2)             AS min_price,
  ROUND(MAX(price), 2)             AS max_price,
  ROUND(AVG(price), 2)             AS avg_price,
  ROUND(PERCENTILE(price, 0.5), 2) AS median_price,
  ROUND(STDDEV(price), 2)          AS stddev_price
FROM priced_events
GROUP BY event_type
ORDER BY event_type;

-- Potential price outliers using the IQR rule: a price is an outlier when it is
-- below q1 - 1.5 * IQR or above q3 + 1.5 * IQR, where IQR = q3 - q1.
-- Note: total_products counts priced event rows, not distinct products.
WITH priced_events AS (
  SELECT price
  FROM workspace.silver.events_cleaned
  WHERE price IS NOT NULL
),
quartiles AS (
  -- First and third quartile over all priced rows
  SELECT
    PERCENTILE(price, 0.25) AS q1,
    PERCENTILE(price, 0.75) AS q3
  FROM priced_events
),
fences AS (
  -- Lower / upper cut-offs derived from the quartiles
  SELECT
    q1 - 1.5 * (q3 - q1) AS lower_fence,
    q3 + 1.5 * (q3 - q1) AS upper_fence
  FROM quartiles
),
flagged AS (
  -- One 0/1 marker per priced row
  SELECT
    CASE
      WHEN p.price < f.lower_fence OR p.price > f.upper_fence THEN 1
      ELSE 0
    END AS is_outlier
  FROM priced_events AS p
  CROSS JOIN fences AS f
)
SELECT
  COUNT(*)                                     AS total_products,
  SUM(is_outlier)                              AS outlier_count,
  ROUND(SUM(is_outlier) * 100.0 / COUNT(*), 2) AS outlier_percentage
FROM flagged;


