In [12]:
from kafka import KafkaConsumer
import json
import os
import pandas as pd
from sqlalchemy import create_engine

# PostgreSQL connection
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5435")
POSTGRES_DB = os.getenv("POSTGRES_DB")

# SQLAlchemy engine
engine = create_engine(f'postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}')


# Define Kafka consumer
consumer = KafkaConsumer(
    os.getenv("TOPIC_NAME"),  # The topic to consume messages from
    bootstrap_servers=os.getenv("KAFKA_BROKER"),  # List of Kafka brokers to connect to
    auto_offset_reset='earliest',  # Where to start reading messages when no offset is stored ('earliest' to read from the beginning)
    enable_auto_commit=True,  # Automatically commit offsets after consuming messages
    value_deserializer=lambda x: x.decode('utf-8') if x else None  # Deserialize message values from bytes to UTF-8 strings
)

# Consume messages with error handling for non-JSON messages
for msg in consumer:
    try:
        data = json.loads(msg.value)

        room_id = data.get('room_id')
        room_created_at = data.get('room_created_at')
        channel = data.get('channel')
        customer = data.get('customer', {})
        messages = data.get('messages', [])

        rows = []
        for message in messages:
            rows.append({
                "message_id": message.get("message_id"),
                "room_id": room_id,
                "room_created_at": room_created_at,
                "channel": channel,
                "customer_id": customer.get("customer_id"),
                "customer_name": customer.get("customer_name"),
                "phone": customer.get("phone"),
                "sender_type": message.get("sender_type"),
                "message_text": message.get("message_text"),
                "message_date": message.get("message_date")
            })

        df = pd.DataFrame(rows)

        # Insert into Postgres
        df.to_sql("fact_message", engine, if_exists='append', index=False)

        print(f"Inserted {len(df)} messages into Postgres.")

    except Exception as e:
        print(f"Error processing message: {e}")

Inserted 4 messages into Postgres.
Inserted 2 messages into Postgres.
Inserted 1 messages into Postgres.
Inserted 3 messages into Postgres.
Inserted 2 messages into Postgres.
Inserted 2 messages into Postgres.
Inserted 2 messages into Postgres.
Error processing message: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "fact_message_pkey"
DETAIL:  Key (message_id)=(msg_001) already exists.

[SQL: INSERT INTO fact_message (message_id, room_id, room_created_at, channel, customer_id, customer_name, phone, sender_type, message_text, message_date) VALUES (%(message_id__0)s, %(room_id__0)s, %(room_created_at__0)s, %(channel__0)s, %(customer_id__0)s ... 584 characters truncated ... s, %(customer_name__3)s, %(phone__3)s, %(sender_type__3)s, %(message_text__3)s, %(message_date__3)s)]
[parameters: {'channel__0': 'ads', 'customer_id__0': 'cust_001', 'room_id__0': '300001', 'room_created_at__0': '2024-10-22T04:42:30', 'customer_name__0': 'Alice Johnson', 'sender_type

KeyboardInterrupt: 

In [None]:
import json
import logging
import os
import sys
from kafka import KafkaConsumer
from minio import Minio


def setup_logging():
    """Configure logging for the application."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    return logging.getLogger(__name__)


def create_minio_client():
    """Create MinIO client with environment variables."""
    access_key = os.getenv("MINIO_ACCESS_KEY")
    secret_key = os.getenv("MINIO_SECRET_KEY")
    endpoint = os.getenv("MINIO_ENDPOINT", "localhost:9000")
    
    if not access_key or not secret_key:
        raise ValueError("MINIO_ACCESS_KEY and MINIO_SECRET_KEY are required")
    
    return Minio(
        endpoint=endpoint,
        access_key=access_key,
        secret_key=secret_key,
        secure=False
    )


def main():
    """Main function to run the Kafka consumer."""
    logger = setup_logging()
    
    try:
        # Create MinIO client
        client = create_minio_client()
        bucket_name = os.getenv("MINIO_BUCKET", "kafka-messages")
        
        # Ensure bucket exists
        if not client.bucket_exists(bucket_name):
            client.make_bucket(bucket_name)
            logger.info(f"Created bucket: {bucket_name}")
        
        # Create Kafka consumer
        consumer = KafkaConsumer(
            'test-topic',
            bootstrap_servers=['localhost:9092'],
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: x.decode('utf-8') if x else None
        )
        
        logger.info("Starting Kafka message consumer...")
        
        # Simple consumer loop
        for count, message in enumerate(consumer):
            try:
                # Create object name
                object_name = f"messages/message_{count}.json"
                
                # Upload message to MinIO
                message_data = json.dumps(json.loads(message.value), indent=2)
                client.put_object(
                    bucket_name,
                    object_name,
                    data=message_data.encode('utf-8'),
                    length=len(message_data.encode('utf-8')),
                    content_type='application/json'
                )
                
                logger.info(f"Uploaded message {count} to {bucket_name}/{object_name}")
                
            except Exception as e:
                logger.error(f"Error processing message {count}: {e}")
                
    except KeyboardInterrupt:
        logger.info("Consumer interrupted by user")
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)
    finally:
        try:
            consumer.close()
            logger.info("Kafka consumer closed")
        except:
            pass


if __name__ == "__main__":
    main()

2025-06-04 22:40:44,352 - INFO - Database connection established successfully
2025-06-04 22:40:44,352 - INFO - <BrokerConnection node_id=bootstrap-0 host=localhost:9092 <connecting> [IPv6 ('::1', 9092, 0, 0)]>: connecting to localhost:9092 [('::1', 9092, 0, 0) IPv6]
2025-06-04 22:40:44,364 - INFO - Probing node bootstrap-0 broker version
2025-06-04 22:40:44,364 - INFO - <BrokerConnection node_id=bootstrap-0 host=localhost:9092 <connecting> [IPv6 ('::1', 9092, 0, 0)]>: Connection complete.
2025-06-04 22:40:44,468 - INFO - Broker version identified as 2.6.0
2025-06-04 22:40:44,468 - INFO - Set configuration api_version=(2, 6, 0) to skip auto check_version requests on startup
2025-06-04 22:40:44,468 - INFO - Updating subscribed topics to: ('test-topic',)
2025-06-04 22:40:44,468 - INFO - Kafka consumer created for topic: test-topic
2025-06-04 22:40:44,468 - INFO - Starting Kafka message consumer...
2025-06-04 22:40:44,479 - INFO - Updated partition assignment: [TopicPartition(topic='test-t