In [0]:
# Airflow Integration Parameters
#
# Reads the job parameters passed by Airflow through Databricks widgets.
# When widgets are unavailable (dbutils missing or widget calls failing),
# every parameter falls back to its manual-run default.

def parse_quality_threshold(raw_value, default=0.8):
    """Convert the ``quality_threshold`` widget text to a float.

    Args:
        raw_value: Text read from the widget (or any value accepted by ``float``).
        default: Value returned when ``raw_value`` cannot be converted.

    Returns:
        ``float(raw_value)``, or ``default`` if the conversion raises
        ``TypeError`` or ``ValueError``.
    """
    try:
        return float(raw_value)
    except (TypeError, ValueError):
        print(f"⚠️ Invalid quality_threshold {raw_value!r}; using default {default}")
        return default


try:
    # Create widgets for Airflow parameters
    dbutils.widgets.text("batch_id", "manual_run", "Batch ID from Airflow")
    dbutils.widgets.text("execution_date", "", "Execution Date from Airflow")
    dbutils.widgets.text("force_refresh", "false", "Force data refresh")
    dbutils.widgets.text("quality_threshold", "0.8", "Data quality threshold")
    dbutils.widgets.text("dag_run_id", "", "DAG Run ID")

    # Get parameter values
    batch_id = dbutils.widgets.get("batch_id")
    execution_date = dbutils.widgets.get("execution_date")
    force_refresh = dbutils.widgets.get("force_refresh").strip().lower() == "true"
    # Parsed separately so a malformed threshold does not discard the other
    # parameters by sending the whole cell to the fallback branch below.
    quality_threshold = parse_quality_threshold(dbutils.widgets.get("quality_threshold"))
    dag_run_id = dbutils.widgets.get("dag_run_id")

    print(f"🎯 Airflow Parameters:")
    print(f"   Batch ID: {batch_id}")
    print(f"   Execution Date: {execution_date}")
    print(f"   Force Refresh: {force_refresh}")
    print(f"   Quality Threshold: {quality_threshold}")
    print(f"   DAG Run ID: {dag_run_id}")

except Exception as e:
    print(f"⚠️ Widget creation failed (normal in some contexts): {e}")
    # Fallback values for manual runs
    batch_id = "manual_run"
    execution_date = ""
    force_refresh = False
    quality_threshold = 0.8
    dag_run_id = ""

In [0]:
# Banner: title line followed by a 70-character rule
print("Stock Data Consumer Script", "=" * 70, sep="\n")

# Keep a handle on Python's own round(): the star import from
# pyspark.sql.functions further down rebinds the bare name `round`.
import builtins
python_round = getattr(builtins, "round")

# Now import other modules
import json
import time
import threading
from datetime import datetime, timezone
from azure.eventhub import EventHubConsumerClient, TransportType
from pyspark.sql.functions import *
from pyspark.sql.types import *
import traceback

# Enhanced schema setup with ALL STRING types to prevent ANY conflicts
print("🏗️ Setting up database schemas with ALL STRING types...")

def create_schemas_and_tables():
    """Ensure the silver/bronze schemas exist and (re)create the consumer table.

    Uses the notebook-global ``spark`` session. The table
    ``<current_catalog>.silver.stock_data_consumer`` is dropped and recreated
    on every call, with every column declared as STRING.

    Returns:
        tuple: ``(current_catalog, table_name)`` on success, or
        ``(None, None)`` if any step outside the best-effort sections fails
        (the error and traceback are printed, not raised).
    """

    try:
        print("🔍 Checking existing schemas...")

        # Get current catalog name
        current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0]
        current_database = spark.sql("SELECT current_database()").collect()[0][0]

        print(f"📁 Current catalog: {current_catalog}")
        print(f"📁 Current database: {current_database}")

        # List existing schemas; failure here is non-fatal because the
        # CREATE statements below use IF NOT EXISTS anyway.
        try:
            schemas_df = spark.sql("SHOW SCHEMAS")
            schema_columns = schemas_df.columns
            print(f"📋 Schema columns available: {schema_columns}")

            # The name column of SHOW SCHEMAS is looked up under several
            # known labels; fall back to the first column by position.
            name_column = 0
            for candidate in ('namespace', 'databaseName', 'schemaName'):
                if candidate in schema_columns:
                    name_column = candidate
                    break
            existing_schemas = [row[name_column] for row in schemas_df.collect()]

            print(f"📋 Existing schemas: {existing_schemas}")

        except Exception as schema_list_error:
            print(f"⚠️ Could not list schemas: {schema_list_error}")
            existing_schemas = []

        # Create the silver schema, and the bronze schema (for backup
        # verification), when they are not already listed.
        for schema_name in ('silver', 'bronze'):
            label = schema_name.capitalize()
            if schema_name not in existing_schemas:
                print(f"🔧 Creating {schema_name} schema...")
                spark.sql(f"CREATE SCHEMA IF NOT EXISTS {current_catalog}.{schema_name}")
                print(f"✅ {label} schema created")
            else:
                print(f"✅ {label} schema already exists")

        # Create COMPLETELY NEW TABLE with ALL STRING types to avoid ALL conflicts
        new_stock_table = f"{current_catalog}.silver.stock_data_consumer"

        # Drop existing table if it exists. Still best effort, but the failure
        # is now reported instead of being silently swallowed, and only
        # Exception subclasses are caught (not KeyboardInterrupt/SystemExit).
        try:
            spark.sql(f"DROP TABLE IF EXISTS {new_stock_table}")
            print("🗑️ Dropped any existing consumer table")
        except Exception as drop_error:
            print(f"⚠️ Could not drop {new_stock_table}: {drop_error}")

        # Create silver stock_data table with ALL STRING types
        stock_table_ddl = f"""
        CREATE TABLE {new_stock_table} (
            symbol STRING,
            timestamp STRING,
            open_price STRING,          -- STRING to avoid precision conflicts
            high_price STRING,          -- STRING to avoid precision conflicts
            low_price STRING,           -- STRING to avoid precision conflicts
            close_price STRING,         -- STRING to avoid precision conflicts
            volume STRING,              -- STRING to avoid type conflicts
            source STRING,
            volatility STRING,          -- STRING to avoid precision conflicts
            price_range STRING,         -- STRING to avoid precision conflicts
            volume_indicator STRING,
            consumer_timestamp STRING,
            partition_id STRING,
            processing_mode STRING,
            silver_processing_time STRING,
            data_quality_score STRING, -- STRING to avoid precision conflicts
            ingestion_time STRING,
            processed_date STRING,
            ingestion_batch STRING,
            layer STRING,
            ingestion_source STRING
        )
        USING DELTA
        """

        spark.sql(stock_table_ddl)
        print(f"✅ Table {new_stock_table} created with ALL STRING types")

        # Verify tables exist
        tables = spark.sql(f"SHOW TABLES IN {current_catalog}.silver").collect()
        table_names = [row.tableName for row in tables]
        print(f"📋 Available tables in silver schema: {table_names}")

        return current_catalog, new_stock_table

    except Exception as e:
        print(f"❌ Error creating schemas: {e}")
        import traceback
        traceback.print_exc()
        return None, None

# Create schemas and get catalog name
current_catalog, stock_table_name = create_schemas_and_tables()

print("\n" + "=" * 70)
print("📈 Running Stock Data Consumer")
print("=" * 70)

# Configuration with validation: secrets come from the "stock-project" scope;
# any failure is printed and re-raised so the notebook stops here.
try:
    eh_connection_string = dbutils.secrets.get(scope="stock-project", key="event-hub-connection-string")
    # Normalise before appending: a stored value ending in ";" would otherwise
    # produce ";;EntityPath=...", i.e. an empty segment in the connection string.
    eh_connection_string = (eh_connection_string or "").strip().rstrip(";")
    if not eh_connection_string:
        raise ValueError("Secret 'event-hub-connection-string' is empty")
    # Case-insensitive check so an existing "entitypath=" is not duplicated
    if "entitypath=" not in eh_connection_string.lower():
        eh_connection_string = f"{eh_connection_string};EntityPath=stock-data-hub"

    storage_account_key = dbutils.secrets.get(scope="stock-project", key="storage-account-key")
    storage_account_name = "dlsstocksentiment2025"
    container_name = "data"

    # Register the account key on the Spark session for abfss:// access
    spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", storage_account_key)

    adls_base_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
    silver_stock_path = f"{adls_base_path}/silver/stock_data_consumer"

    print("✅ Configuration loaded")

except Exception as config_error:
    print(f"❌ Configuration error: {config_error}")
    raise

def safe_save_to_catalog(df, table_name, mode="append"):
    """Write ``df`` as a Delta table named ``table_name`` without raising.

    Args:
        df: DataFrame-like object exposing a ``write`` builder.
        table_name: Target table identifier passed to ``saveAsTable``.
        mode: Save mode passed to the writer (default ``"append"``).

    Returns:
        bool: ``True`` when the write completed, ``False`` when any exception
        was raised (the error is printed).
    """
    try:
        writer = df.write.format("delta").mode(mode)
        # Schema merging stays disabled to avoid conflicts
        writer = writer.option("mergeSchema", "false")
        writer.saveAsTable(table_name)
    except Exception as e:
        print(f"⚠️ Unity Catalog save failed for {table_name}: {e}")
        return False
    return True

def safe_parse_json(message_body):
    """Parse a message body as JSON, returning ``None`` instead of raising.

    Args:
        message_body: Raw message text.

    Returns:
        The decoded JSON value when the stripped text starts with ``{`` or
        ``[`` and decodes successfully; ``None`` for empty/whitespace input,
        text that does not start with ``{`` or ``[``, invalid JSON, or any
        other error (each case prints a warning).
    """
    try:
        # Nothing to parse: falsy input or whitespace only
        if not message_body:
            print(f"⚠️ Empty message body")
            return None
        text = message_body.strip()
        if not text:
            print(f"⚠️ Empty message body")
            return None

        # Plain-text messages (like "Test connection message") are rejected
        looks_like_json = text.startswith('{') or text.startswith('[')
        if not looks_like_json:
            print(f"⚠️ Non-JSON message: {text[:50]}...")
            return None

        return json.loads(text)

    except json.JSONDecodeError as je:
        print(f"⚠️ JSON decode error: {je} - Raw message: {text[:100]}...")
        return None
    except Exception as e:
        print(f"⚠️ Message parsing error: {e}")
        return None

def debug_print_message_structure(event, index):
    """Print the keys of ``event`` plus its ``data_type`` and ``symbol`` values.

    For ``index`` of 2 or less, every key/value pair is also printed together
    with the value's type name. Any exception is caught and reported with a
    warning line.

    Args:
        event: Mapping-like message (must support ``keys``, ``get``, ``items``).
        index: Message number shown in the header line.
    """
    try:
        print(f"\n🔍 DEBUG - Message {index} structure:")
        print(f"   Keys: {list(event.keys())}")
        print(f"   data_type: {event.get('data_type', 'MISSING')}")
        print(f"   symbol: {event.get('symbol', 'MISSING')}")

        # Full field dump only for the first couple of messages
        if index > 2:
            return
        for field_name, field_value in event.items():
            print(f"   {field_name}: {field_value} (type: {type(field_value).__name__})")

    except Exception as e:
        print(f"⚠️ Debug print error: {e}")

def _safe_string_float(value, default="1.0", field_name="unknown"):
    """Return ``value`` as the string form of a float, or ``str(default)`` on failure.

    ``None`` and values rejected by ``float`` yield the default; each outcome
    is printed for debugging.
    """
    if value is None:
        print(f"   {field_name} is None, using default {default}")
        return str(default)
    try:
        result = str(float(value))
    except (ValueError, TypeError) as e:
        print(f"   {field_name} conversion error: {e}, using default {default}")
        return str(default)
    print(f"   {field_name}: {value} -> {result}")
    return result


def _safe_string_int(value, default="0", field_name="unknown"):
    """Return ``value`` as the string form of an int, or ``str(default)`` on failure.

    The value goes through ``float`` first so text such as ``"12.0"`` is
    accepted. ``OverflowError`` (raised by ``int`` for infinities) is handled
    like any other conversion failure.
    """
    if value is None:
        print(f"   {field_name} is None, using default {default}")
        return str(default)
    try:
        result = str(int(float(value)))
    except (ValueError, TypeError, OverflowError) as e:
        print(f"   {field_name} conversion error: {e}, using default {default}")
        return str(default)
    print(f"   {field_name}: {value} -> {result}")
    return result


def _consume_stock_messages(max_messages, timeout_seconds):
    """Collect up to ``max_messages`` stock events from Event Hub within ``timeout_seconds``.

    The blocking ``receive`` call runs in a daemon thread. Once enough
    messages arrived, the thread ended, or the timeout elapsed, the client is
    closed from this thread so the receive loop stops instead of lingering.
    Returns the list of accepted message dicts (possibly empty).
    """
    messages = []
    stop_flag = threading.Event()
    client_holder = {}  # lets the waiting thread reach the client to close it

    def message_handler(partition_context, event):
        if stop_flag.is_set() or len(messages) >= max_messages:
            return
        if not (event and hasattr(event, 'body_as_str')):
            return
        try:
            data = safe_parse_json(event.body_as_str(encoding='UTF-8'))
            # Skip invalid messages, JSON arrays, and non-stock messages
            if not isinstance(data, dict) or data.get('data_type') != 'stock_price':
                return

            data['consumer_timestamp'] = datetime.now(timezone.utc).isoformat()
            data['partition_id'] = partition_context.partition_id
            messages.append(data)
            print(f"📈 {len(messages)}: STOCK - {data.get('symbol', 'N/A')} @ ${data.get('close_price', 'N/A')}")

            if len(messages) >= max_messages:
                stop_flag.set()
        except Exception as me:
            print(f"⚠️ Message processing error: {me}")

    def consumer_thread():
        try:
            client = EventHubConsumerClient.from_connection_string(
                eh_connection_string,
                consumer_group="$Default",
                transport_type=TransportType.AmqpOverWebsocket
            )
            client_holder['client'] = client
            with client:
                # Skip receiving if the caller already gave up while connecting
                if not stop_flag.is_set():
                    client.receive(
                        on_event=message_handler,
                        starting_position="-1",
                        max_wait_time=10
                    )
        except Exception as e:
            print(f"⚠️ Consumer error: {e}")
        finally:
            # Wake the waiting thread whether receive ended or failed
            stop_flag.set()

    thread = threading.Thread(target=consumer_thread, daemon=True)
    thread.start()

    # Returns early when the handler or the thread sets the flag
    stop_flag.wait(timeout=timeout_seconds)
    stop_flag.set()

    # Setting the flag only silences the handler; closing the client is what
    # ends the blocking receive call and releases the connection.
    client = client_holder.get('client')
    if client is not None:
        try:
            client.close()
        except Exception as close_error:
            print(f"⚠️ Error closing Event Hub client: {close_error}")
    thread.join(timeout=3)

    return list(messages)


def _build_stock_record(event, index):
    """Turn one raw stock event into an all-string record dict.

    ``index`` is the zero-based position of the event (used for log lines and
    the ``UNKNOWN_<index>`` symbol fallback). Returns ``None`` when the derived
    metrics cannot be calculated (the error is printed).
    """
    print(f"\n🔍 Processing stock {index+1}: {event.get('symbol', 'UNKNOWN')}")
    print(f"   Available fields: {list(event.keys())}")

    symbol = str(event.get('symbol', f'UNKNOWN_{index}'))
    timestamp = str(event.get('timestamp', datetime.now(timezone.utc).isoformat()))

    print(f"   Symbol: {symbol}")
    print(f"   Timestamp: {timestamp}")

    # Extract price fields with fallbacks - ALL AS STRINGS.
    # open/high/low fall back to the close price when missing or invalid.
    close_price_str = _safe_string_float(event.get('close_price', event.get('price', 1.0)), "1.0", 'close_price')
    open_price_str = _safe_string_float(event.get('open_price', event.get('close_price', 1.0)), close_price_str, 'open_price')
    high_price_str = _safe_string_float(event.get('high_price', event.get('close_price', 1.0)), close_price_str, 'high_price')
    low_price_str = _safe_string_float(event.get('low_price', event.get('close_price', 1.0)), close_price_str, 'low_price')
    volume_str = _safe_string_int(event.get('volume', 0), "0", 'volume')

    try:
        close_price_float = float(close_price_str)
        price_range = float(high_price_str) - float(low_price_str)
        volatility = price_range / close_price_float if close_price_float > 0 else 0.0

        volume_int = int(float(volume_str))
        if volume_int > 50000000:
            volume_indicator = 'high'
        elif volume_int > 20000000:
            volume_indicator = 'medium'
        else:
            volume_indicator = 'low'

        print(f"   Final prices - O:{open_price_str}, H:{high_price_str}, L:{low_price_str}, C:{close_price_str}, V:{volume_str}")

        # python_round is the built-in round captured before the PySpark star import
        processed_stock = {
            'symbol': symbol,
            'timestamp': timestamp,
            'open_price': open_price_str,
            'high_price': high_price_str,
            'low_price': low_price_str,
            'close_price': close_price_str,
            'volume': volume_str,
            'source': str(event.get('source', 'event_hub')),
            'volatility': str(python_round(volatility, 6)),
            'price_range': str(python_round(price_range, 4)),
            'volume_indicator': volume_indicator,
            'consumer_timestamp': str(event.get('consumer_timestamp', '')),
            'partition_id': str(event.get('partition_id', '')),
            'processing_mode': 'streaming_stock_consumer',
            'silver_processing_time': datetime.now(timezone.utc).isoformat(),
            'data_quality_score': str(1.0)
        }
    except Exception as calc_error:
        print(f"❌ Calculation error for {symbol}: {calc_error}")
        return None

    print(f"✅ Stock {index+1}: {symbol} @ ${close_price_str} processed successfully")
    return processed_stock


def _save_stock_records(processed_stocks, batch_id):
    """Stamp ingestion metadata on the records and append them to the silver layer.

    Writes to the ADLS Delta path first, then attempts the Unity Catalog table.
    Returns ``True`` once the ADLS write succeeded, ``False`` otherwise.
    """
    saved = False
    try:
        print(f"\n💾 Saving {len(processed_stocks)} processed stocks...")

        # One UTC instant for both fields, matching the other UTC timestamps
        now_utc = datetime.now(timezone.utc)
        current_time_str = now_utc.isoformat()
        current_date_str = now_utc.strftime('%Y-%m-%d')

        for record in processed_stocks:
            record["ingestion_time"] = current_time_str
            record["processed_date"] = current_date_str
            record["ingestion_batch"] = str(batch_id)
            record["layer"] = "silver"
            record["ingestion_source"] = "stock_consumer"

        stock_df = spark.createDataFrame(processed_stocks)

        # Save to ADLS (primary) - NO SCHEMA MERGING
        (stock_df.write
         .format("delta")
         .mode("append")
         .option("mergeSchema", "false")
         .save(silver_stock_path))

        print("✅ Saved stocks to Silver layer (ADLS)")
        saved = True

        # Try Unity Catalog (secondary)
        if current_catalog and stock_table_name and safe_save_to_catalog(stock_df, stock_table_name):
            print("✅ Also saved stocks to Unity Catalog")
        else:
            print("⚠️ Unity Catalog stock save skipped")

    except Exception as e:
        print(f"❌ Stock save error: {e}")
        traceback.print_exc()

    return saved


def _verify_saved_stocks(batch_id):
    """Read the silver Delta path back and print counts, a sample, and a per-symbol summary."""
    try:
        # Verify ADLS storage
        stock_files = dbutils.fs.ls(silver_stock_path)
        print(f"✅ Silver layer stock files: {len(stock_files)}")

        # Verify data integrity
        stock_verify = spark.read.format("delta").load(silver_stock_path)
        consumer_rows = stock_verify.filter(col("ingestion_source") == "stock_consumer")
        total_stocks = stock_verify.count()
        # "New" means written by this batch; filtering on the source alone
        # would count every row the consumer has ever appended.
        recent_stocks = consumer_rows.filter(col("ingestion_batch") == batch_id).count()
        print(f"✅ Verified: {recent_stocks} new stock records ({total_stocks} total)")

        # Show sample of what was saved
        print(f"\n📊 Sample Stock Records:")
        (consumer_rows
         .select("symbol", "close_price", "volume", "volatility", "volume_indicator", "silver_processing_time")
         .orderBy(col("silver_processing_time").desc())
         .limit(5)
         .show(truncate=False))

        # Show summary statistics
        print(f"\n📈 Stock Data Summary:")
        (consumer_rows
         .groupBy("symbol")
         .agg(count("*").alias("record_count"))
         .orderBy("symbol")
         .show(truncate=False))

    except Exception as ve:
        print(f"⚠️ Stock verification error: {ve}")


def stock_only_consume_and_process(max_messages=15, timeout_seconds=20):
    """Consume stock events from Event Hub, convert them to all-string records, and save them.

    Steps: (1) consume messages with a timeout, (2) build string-typed records
    with derived metrics, (3) append them to the silver Delta path and the
    Unity Catalog table, (4) read the data back and print a report.

    Relies on notebook globals: ``eh_connection_string``, ``silver_stock_path``,
    ``current_catalog``, ``stock_table_name``, ``spark`` and ``dbutils``.

    Args:
        max_messages: Stop consuming once this many stock messages were accepted.
        timeout_seconds: Maximum time to wait for messages.

    Returns:
        None. Progress and results are printed.
    """

    print("\n🔄 Starting stock data consumption and processing...")

    # Step 1: Consume messages with timeout and validation
    messages = _consume_stock_messages(max_messages, timeout_seconds)

    print(f"✅ Consumed {len(messages)} stock messages")

    if not messages:
        print("⚠️ No stock messages consumed - Event Hub might be empty or only contains news data")
        return

    # Debug: Print first message structure
    debug_print_message_structure(messages[0], 1)

    # Step 2: Process stock data with ALL STRING types
    stock_events = [m for m in messages if m.get('data_type') == 'stock_price']

    print(f"\n📈 Processing {len(stock_events)} stock events:")

    processed_stocks = []
    for i, event in enumerate(stock_events):
        try:
            record = _build_stock_record(event, i)
        except Exception as e:
            print(f"❌ Stock processing error {i+1}: {e}")
            print(f"   Full error details:")
            traceback.print_exc()
            continue
        if record is not None:
            processed_stocks.append(record)

    # Step 3: Save stock data with ALL STRING metadata.
    # This batch id is local to the function; it is generated per run from the
    # current UTC time.
    batch_id = f"stock_consumer_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"
    save_success = False

    if processed_stocks:
        save_success = _save_stock_records(processed_stocks, batch_id)

    # Step 4: Verification and reporting
    print(f"\n🔍 Verification:")
    if save_success:
        _verify_saved_stocks(batch_id)

    # Final summary
    print(f"\n📋 Final Summary:")
    print(f"✅ Consumed: {len(messages)} stock messages")
    print(f"✅ Processed: {len(processed_stocks)} stock records")
    print(f"✅ ADLS save: {save_success}")
    print(f"📁 Batch ID: {batch_id}")
    print(f"🎯 Focus: Stock data with ALL STRING types (no conflicts possible)")
    print(f"📋 New Table: {stock_table_name}")

# Run the consumer. Any exception is printed with its traceback rather than
# propagated, so the lines after this block always execute.
try:
    print("🚀 Starting Stock Consumer Script")
    print(f"⏰ {datetime.now():%Y-%m-%d %H:%M:%S}")

    stock_only_consume_and_process()

except Exception as e:
    print(f"❌ Stock consumer failed: {e}")
    print("📋 Full error traceback:")
    traceback.print_exc()

# Completion banner (time of day only)
print(f"\n⏰ COMPLETED: {datetime.now():%H:%M:%S}")
print("🎯 Stock data consumer with ALL STRING types")

Stock Data Consumer Script
🏗️ Setting up database schemas with ALL STRING types...
🔍 Checking existing schemas...
📁 Current catalog: databricks_stock_sentiment_canada
📁 Current database: default
📋 Schema columns available: ['databaseName']
📋 Existing schemas: ['bronze', 'default', 'information_schema']
🔧 Creating silver schema...
✅ Silver schema created
✅ Bronze schema already exists
🗑️ Dropped any existing consumer table
✅ Table databricks_stock_sentiment_canada.silver.stock_data_consumer created with ALL STRING types
📋 Available tables in silver schema: ['stock_data_consumer']

📈 Running Stock Data Consumer
✅ Configuration loaded
🚀 Starting Stock Consumer Script
⏰ 2025-07-21 22:12:10

🔄 Starting stock data consumption and processing...
📈 1: STOCK - AAPL @ $211.18
📈 2: STOCK - GOOGL @ $185.06
📈 3: STOCK - MSFT @ $510.05
📈 4: STOCK - AMZN @ $226.13
📈 5: STOCK - META @ $704.28
📈 6: STOCK - TSLA @ $329.65
✅ Consumed 6 stock messages

🔍 DEBUG - Message 1 structure:
   Keys: ['symbol', 'ti

In [0]:
%python
import json
from datetime import datetime

# Airflow Integration - Success/Failure Reporting
#
# Builds a JSON status payload and hands it to dbutils.notebook.exit.
# The exit call is deliberately made OUTSIDE the try/except below.
# NOTE(review): dbutils.notebook.exit is reported to stop the notebook by
# raising internally; inside a broad `except Exception` that signal could be
# intercepted and re-reported as a failure - confirm against the Databricks docs.

try:
    # If we reach here, notebook executed successfully
    notebook_globals = globals()
    airflow_result = {
        "status": "SUCCESS",
        "message": "Notebook execution completed successfully",
        # Looked up defensively so a missing parameter cell cannot crash reporting
        "batch_id": notebook_globals.get("batch_id", "unknown"),
        "execution_timestamp": datetime.now().isoformat(),
        "records_processed": notebook_globals.get('total_records_processed', 0),  # Update based on your variables
        "data_quality_score": notebook_globals.get('data_quality_score', 1.0)     # Update based on your variables
    }
    # default=str keeps serialisation from failing on non-JSON values
    airflow_result_pretty = json.dumps(airflow_result, indent=2, default=str)
    airflow_result_payload = json.dumps(airflow_result, default=str)

    print(f"✅ Notebook Success:")

except Exception as e:
    # If building the success payload fails, report failure instead
    airflow_result = {
        "status": "FAILED",
        "message": f"Notebook execution failed: {str(e)}",
        "batch_id": str(globals().get("batch_id", "unknown")),
        "execution_timestamp": datetime.now().isoformat(),
        "error_type": type(e).__name__
    }
    airflow_result_pretty = json.dumps(airflow_result, indent=2, default=str)
    airflow_result_payload = json.dumps(airflow_result, default=str)

    print(f"❌ Notebook Failure:")

print(airflow_result_pretty)

# Exit with the status payload for Airflow; passed as a JSON string rather
# than a dict so the caller receives parseable text.
dbutils.notebook.exit(airflow_result_payload)