In [None]:
import os
import json
import boto3
import psycopg2
import numpy as np
from openai import OpenAI
from datetime import datetime
from dotenv import load_dotenv
from psycopg2.extras import Json

In [None]:
# Load variables from a local .env file into the process environment so the
# os.getenv() lookup for the OpenAI key can see them.
load_dotenv()

In [None]:
# API key for the OpenAI client; os.getenv returns None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# PostgreSQL connection settings passed to psycopg2.connect() by the helpers in this notebook.
# NOTE(review): host, port and credentials are hard-coded; presumably they target a
# local development database — confirm before reusing against a shared environment.
DB_HOST = "localhost"
DB_PORT = "5435"
DB_NAME = "postgres"
DB_USER = "postgres"
DB_PASSWORD = "postgres"
# S3 settings used to build the boto3 client and to name the ticket-image bucket.
# NOTE(review): the "test"/"test" key pair and the localhost endpoint look like
# LocalStack placeholders (the bucket helper mentions LocalStack) — confirm.
S3_BUCKET_NAME = "xyz-support-images"
AWS_ACCESS_KEY_ID = "test"
AWS_SECRET_ACCESS_KEY = "test"
AWS_REGION = "us-east-1"
AWS_ENDPOINT_URL = "http://localhost:4566"

In [None]:
# Echo the configured database host as the cell output (quick sanity check).
DB_HOST

In [None]:
# Shared OpenAI client; generate_embedding() uses it to request embeddings.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Shared S3 client used by every bucket/object helper in this notebook.
# Endpoint and region come first, then the credential pair, all taken from
# the AWS_* constants defined in the configuration cell.
s3_client = boto3.client(
    "s3",
    endpoint_url=AWS_ENDPOINT_URL,
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

In [None]:
# Sample support tickets used as seed data for the ingestion run.
# Each ticket is a dict with:
#   - "ticket_id", "subject", "description": plain strings
#   - "customer": id, name, email, company, account_type
#   - "metadata": timestamps, status, priority, category, product/platform details
#     and an "images" list whose entries carry "s3_key", "description", "uploaded_at"
#   - "resolution": agent, resolution_time, solution, tags
# The "s3_key" values are the object keys the mock images are uploaded under.
support_tickets = [
    {
        "ticket_id": "XYZ-1001",
        "subject": "Unable to login to dashboard after password reset",
        "description": "After requesting a password reset this morning, I'm unable to login to my account on the customer dashboard. I receive 'Invalid credentials' error even though I'm sure I'm using the correct new password.",
        "customer": {
            "id": "C8932",
            "name": "John Davis",
            "email": "john.davis@example.com",
            "company": "Acme Corp",
            "account_type": "Enterprise",
        },
        "metadata": {
            "created_at": "2025-03-12T09:23:45Z",
            "updated_at": "2025-03-12T14:30:22Z",
            "status": "resolved",
            "priority": "medium",
            "category": "authentication",
            "product": "XYZ Dashboard",
            "version": "3.2.1",
            "platform": "web",
            "browser": "Chrome 124.0",
            "images": [
                {
                    "s3_key": "tickets/XYZ-1001/screenshot_error.png",
                    "description": "Screenshot of error message",
                    "uploaded_at": "2025-03-12T09:25:30Z",
                }
            ],
        },
        "resolution": {
            "agent": "Sarah Miller",
            "resolution_time": "4h 46m",
            "solution": "Identified cache issue in the authentication flow. Instructed customer to clear browser cache and cookies, after which login was successful. Added case to knowledge base for future reference.",
            "tags": ["login", "cache", "authentication", "password-reset"],
        },
    },
    {
        "ticket_id": "XYZ-1002",
        "subject": "XYZ Analytics API returning 503 errors intermittently",
        "description": "We're experiencing intermittent 503 Service Unavailable errors when calling the /analytics/reports/daily endpoint. This started occurring approximately 2 hours ago. About 30% of our requests are failing. We've verified our authentication tokens are valid and rate limits haven't been exceeded.",
        "customer": {
            "id": "C5467",
            "name": "Maria Rodriguez",
            "email": "m.rodriguez@techfirm.co",
            "company": "TechFirm Inc.",
            "account_type": "Premium",
        },
        "metadata": {
            "created_at": "2025-03-15T16:42:18Z",
            "updated_at": "2025-03-16T08:10:05Z",
            "status": "resolved",
            "priority": "high",
            "category": "api",
            "product": "XYZ Analytics API",
            "version": "v2",
            "platform": "server",
            "images": [
                {
                    "s3_key": "tickets/XYZ-1002/api_error_log.png",
                    "description": "API error log screenshot",
                    "uploaded_at": "2025-03-15T16:45:10Z",
                },
                {
                    "s3_key": "tickets/XYZ-1002/monitoring_dashboard.png",
                    "description": "Monitoring dashboard showing error spike",
                    "uploaded_at": "2025-03-15T17:12:23Z",
                },
            ],
        },
        "resolution": {
            "agent": "David Chen",
            "resolution_time": "15h 28m",
            "solution": "Identified database connection pool saturation in one of our API service clusters. Scaled up database connection pool and implemented circuit breaker pattern to prevent cascading failures. Monitored for 1 hour after changes with no further 503 errors. Engineering team has scheduled permanent capacity increase for next maintenance window.",
            "tags": ["api", "503-error", "database", "connection-pool", "scaling"],
        },
    },
    {
        "ticket_id": "XYZ-1003",
        "subject": "Data export feature timing out for large datasets",
        "description": "When attempting to export more than 50,000 records from our account, the export process starts but then times out after approximately 3 minutes. We need to be able to export our full dataset which contains roughly 200,000 records for quarterly compliance reporting.",
        "customer": {
            "id": "C2390",
            "name": "Robert Johnson",
            "email": "r.johnson@financeplus.com",
            "company": "Finance Plus",
            "account_type": "Enterprise",
        },
        "metadata": {
            "created_at": "2025-03-18T11:05:32Z",
            "updated_at": "2025-03-19T15:22:10Z",
            "status": "resolved",
            "priority": "medium",
            "category": "data",
            "product": "XYZ Data Management",
            "version": "4.1.0",
            "platform": "web",
            "images": [
                {
                    "s3_key": "tickets/XYZ-1003/export_timeout.png",
                    "description": "Export process timeout error",
                    "uploaded_at": "2025-03-18T11:10:15Z",
                }
            ],
        },
        "resolution": {
            "agent": "Amanda Wright",
            "resolution_time": "28h 17m",
            "solution": "Identified timeout issue in the export service. Implemented immediate workaround by enabling asynchronous export for the customer's account, allowing them to request the export and receive an email notification when the file is ready for download. Product team has scheduled enhancement in next sprint to make this option available to all Enterprise customers by default.",
            "tags": ["export", "timeout", "large-dataset", "async-processing"],
        },
    },
    {
        "ticket_id": "XYZ-1004",
        "subject": "Mobile app crashes on notification tap",
        "description": "Since updating to the latest version of the XYZ Mobile app (version 2.5.0) on my iPhone 15 Pro, the app crashes whenever I tap on a notification. I've already tried uninstalling and reinstalling the app, but the issue persists.",
        "customer": {
            "id": "C7721",
            "name": "Emma Thompson",
            "email": "emma.t@smallbusiness.org",
            "company": "Thompson Consulting",
            "account_type": "Standard",
        },
        "metadata": {
            "created_at": "2025-03-20T13:45:12Z",
            "updated_at": "2025-03-20T16:30:18Z",
            "status": "resolved",
            "priority": "medium",
            "category": "mobile",
            "product": "XYZ Mobile App",
            "version": "2.5.0",
            "platform": "iOS",
            "device": "iPhone 15 Pro",
            "os_version": "iOS 18.2",
            "images": [
                {
                    "s3_key": "tickets/XYZ-1004/crash_report.png",
                    "description": "Crash report screenshot",
                    "uploaded_at": "2025-03-20T13:48:30Z",
                },
                {
                    "s3_key": "tickets/XYZ-1004/notification_screenshot.png",
                    "description": "Notification before crash",
                    "uploaded_at": "2025-03-20T13:49:12Z",
                },
            ],
        },
        "resolution": {
            "agent": "James Wilson",
            "resolution_time": "2h 45m",
            "solution": "Identified bug in the notification handling logic for iOS 18.2. Provided customer with beta access to version 2.5.1 which contains the fix. Verified with customer that the issue is resolved with the update. Version 2.5.1 is scheduled for public release in 3 days.",
            "tags": ["mobile", "crash", "notifications", "ios", "bug"],
        },
    },
    {
        "ticket_id": "XYZ-1005",
        "subject": "Need help setting up multi-factor authentication for team",
        "description": "We recently purchased an Enterprise account and need assistance setting up multi-factor authentication for our team of 25 users. Specifically, we want to require MFA for all admin accounts but make it optional for regular users. We also would like to know if we can integrate with our existing Azure AD setup.",
        "customer": {
            "id": "C9103",
            "name": "Michael Patel",
            "email": "m.patel@innovatech.com",
            "company": "InnovaTech Solutions",
            "account_type": "Enterprise",
        },
        "metadata": {
            "created_at": "2025-03-22T10:18:40Z",
            "updated_at": "2025-03-23T11:45:55Z",
            "status": "resolved",
            "priority": "low",
            "category": "account",
            "product": "XYZ Platform",
            "version": "Enterprise",
            "platform": "web",
            "images": [
                {
                    "s3_key": "tickets/XYZ-1005/current_settings.png",
                    "description": "Current account settings screenshot",
                    "uploaded_at": "2025-03-22T10:22:18Z",
                },
                {
                    "s3_key": "tickets/XYZ-1005/azure_ad_config.png",
                    "description": "Azure AD configuration",
                    "uploaded_at": "2025-03-22T10:25:45Z",
                },
            ],
        },
        "resolution": {
            "agent": "Olivia Parker",
            "resolution_time": "25h 27m",
            "solution": "Scheduled and conducted 1-hour video call with customer's IT team. Walked through the MFA setup process, demonstrated role-based MFA policies, and configured Azure AD integration for their account. Provided additional documentation for reference and created custom setup guide for their specific requirements. Customer confirmed successful implementation.",
            "tags": ["mfa", "enterprise", "azure-ad", "security", "onboarding"],
        },
    },
]

In [None]:
def create_database_tables():
    """Create the ``support_tickets`` table, enabling pgvector when possible.

    The function first tries to enable the ``vector`` extension. When that
    works, the table is created with an ``embedding vector(1536)`` column, and
    the column is also added to a table left behind by an earlier run that
    had no pgvector support. When the extension cannot be enabled, the table
    is created without the column.

    Returns:
        bool: True when the table exists with the ``embedding`` column;
        False when pgvector is unavailable or any database error occurred
        (the error is printed, not raised).
    """

    def build_table_ddl(with_embedding):
        # One template for both variants keeps the two schemas from drifting apart.
        embedding_column = "embedding vector(1536)," if with_embedding else ""
        return f"""
            CREATE TABLE IF NOT EXISTS support_tickets (
                id SERIAL PRIMARY KEY,
                ticket_id VARCHAR(20) UNIQUE NOT NULL,
                subject TEXT NOT NULL,
                description TEXT NOT NULL,
                customer JSONB NOT NULL,
                metadata JSONB NOT NULL,
                resolution JSONB,
                {embedding_column}
                created_at TIMESTAMP NOT NULL DEFAULT NOW()
            );
        """

    conn = None
    try:
        conn = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
        )
        cur = conn.cursor()

        # Only the extension step decides whether vectors are available; a
        # later table error must not be misreported as a missing extension.
        try:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            conn.commit()
            has_vector = True
            print("pgvector extension enabled successfully.")
        except Exception as e:
            # The failed statement aborts the transaction; reset it before continuing.
            conn.rollback()
            has_vector = False
            print(f"Warning: Could not create pgvector extension: {e}")
            print("Creating table without vector column...")

        cur.execute(build_table_ddl(has_vector))
        if has_vector:
            # CREATE TABLE IF NOT EXISTS is a no-op for a pre-existing table, so
            # make sure a table created by the fallback path gains the column.
            cur.execute(
                "ALTER TABLE support_tickets "
                "ADD COLUMN IF NOT EXISTS embedding vector(1536);"
            )
        conn.commit()

        if has_vector:
            print("Table with vector column created successfully.")
        else:
            print("Table without vector column created successfully.")
        return has_vector

    except Exception as e:
        print(f"Error creating database tables: {e}")
        if conn:
            conn.rollback()
        return False
    finally:
        if conn:
            conn.close()


def create_s3_bucket_if_not_exists():
    """Ensure the configured S3 bucket exists, creating it when the lookup fails.

    Any exception from the existence check is treated as "bucket missing" and
    triggers a creation attempt; a failed creation is reported on stdout
    rather than raised.
    """
    try:
        s3_client.head_bucket(Bucket=S3_BUCKET_NAME)
    except Exception:
        bucket_found = False
    else:
        bucket_found = True

    if bucket_found:
        print(f"Bucket {S3_BUCKET_NAME} already exists.")
        return

    try:
        # For LocalStack we don't need CreateBucketConfiguration
        s3_client.create_bucket(Bucket=S3_BUCKET_NAME)
    except Exception as err:
        print(f"Error creating S3 bucket: {err}")
    else:
        print(f"Created S3 bucket: {S3_BUCKET_NAME}")


def upload_mock_images_with_content():
    """Render a labelled placeholder PNG for every ticket image and upload it to S3.

    The bucket is created first if needed. For each image listed under a
    ticket's ``metadata["images"]`` an 800x600 white PNG is drawn with the
    ticket id, the image description, a fixed "mock image" notice and the S3
    key, then stored under that key in ``S3_BUCKET_NAME``.

    Failures are reported on stdout per image and never raised, so one bad
    image does not stop the remaining uploads.
    """
    # Pillow is imported locally so the rest of the notebook does not need it.
    from PIL import Image, ImageDraw, ImageFont
    import io

    def load_fonts():
        """Return (regular, small) fonts, falling back to Pillow's default font."""
        try:
            return (
                ImageFont.truetype("Arial.ttf", 24),
                ImageFont.truetype("Arial.ttf", 16),
            )
        except IOError:
            # Arial is not installed on this system.
            default_font = ImageFont.load_default()
            return default_font, default_font

    # First create the bucket
    try:
        create_s3_bucket_if_not_exists()
    except Exception as e:
        print(f"Error with S3 setup: {e}")
        return

    # Flatten every ticket's image list into one work list.
    image_data = [
        {
            "s3_key": image["s3_key"],
            "description": image["description"],
            "ticket_id": ticket["ticket_id"],
        }
        for ticket in support_tickets
        if "metadata" in ticket and "images" in ticket["metadata"]
        for image in ticket["metadata"]["images"]
    ]

    # Fonts do not change between images, so they are loaded once. The load
    # happens lazily inside the per-image try block so that a font failure is
    # still reported through the same per-image error path.
    fonts = None

    for img_info in image_data:
        try:
            if fonts is None:
                fonts = load_fonts()
            font, small_font = fonts

            # White canvas with a thin black border.
            img = Image.new("RGB", (800, 600), color=(255, 255, 255))
            draw = ImageDraw.Draw(img)

            draw.text(
                (50, 50), f"Ticket: {img_info['ticket_id']}", fill=(0, 0, 0), font=font
            )
            draw.text(
                (50, 100),
                f"Description: {img_info['description']}",
                fill=(0, 0, 0),
                font=font,
            )
            draw.text(
                (50, 200),
                "This is a mock image for RAG testing",
                fill=(0, 0, 150),
                font=font,
            )
            draw.text(
                (50, 500),
                f"Image path: {img_info['s3_key']}",
                fill=(100, 100, 100),
                font=small_font,
            )
            draw.rectangle([(20, 20), (780, 580)], outline=(0, 0, 0), width=2)

            # Encode in memory; getvalue() returns the whole buffer regardless
            # of the stream position, so no seek is needed.
            buffer = io.BytesIO()
            img.save(buffer, format="PNG")

            s3_client.put_object(
                Bucket=S3_BUCKET_NAME,
                Key=img_info["s3_key"],
                Body=buffer.getvalue(),
                ContentType="image/png",
            )
            print(
                f"Uploaded image with content to s3://{S3_BUCKET_NAME}/{img_info['s3_key']}"
            )
        except Exception as e:
            print(f"Error creating/uploading image to {img_info['s3_key']}: {e}")


def generate_embedding(text):
    """Request an embedding for ``text`` from the ``text-embedding-ada-002`` model.

    Returns the embedding of the first item in the response, or None when the
    request (or reading the response) raises; the error is printed.
    """
    try:
        result = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002",
        )
        first_item = result.data[0]
        return first_item.embedding
    except Exception as err:
        print(f"Error generating embedding: {err}")
    return None


def verify_s3_images(ticket):
    """Check each ticket image in S3 and annotate it in place.

    For every entry in ``ticket["metadata"]["images"]`` the object is looked
    up in ``S3_BUCKET_NAME``. When the lookup succeeds the entry gains
    ``exists=True`` and a ``presigned_url``; when it fails the entry gets
    ``exists=False`` and any ``presigned_url`` left from an earlier run is
    removed so the two fields never contradict each other.

    Args:
        ticket: Ticket dict; a missing or empty ``metadata``/``images`` is
            treated as "no images".

    Returns:
        The same ``ticket`` object (mutated in place).
    """
    images = (ticket.get("metadata") or {}).get("images") or []

    for image in images:
        s3_key = image["s3_key"]
        try:
            # Check if the image exists in S3
            s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=s3_key)

            # Presigned GET link, valid for 1 hour (3600 seconds).
            image["presigned_url"] = s3_client.generate_presigned_url(
                "get_object",
                Params={"Bucket": S3_BUCKET_NAME, "Key": s3_key},
                ExpiresIn=3600,
            )
            image["exists"] = True

        except Exception as e:
            print(f"Warning: Image {s3_key} not found in S3: {e}")
            image["exists"] = False
            # The ticket is mutated in place, so a URL from a previous
            # successful check could otherwise survive next to exists=False.
            image.pop("presigned_url", None)

    return ticket


def insert_ticket_data(ticket, has_vector=True):
    """Upsert one support ticket, with its embedding when available.

    The ticket's images are verified against S3 first (which annotates the
    ticket in place). When ``has_vector`` is true an embedding is generated
    from the subject, description and, if present, the resolution's solution
    text. The row is inserted, or updated when ``ticket_id`` already exists;
    ``created_at`` is set to the current time on every upsert.

    Args:
        ticket: Ticket dict with at least ``ticket_id``, ``subject``,
            ``description``, ``customer`` and ``metadata``.
        has_vector: Whether the table has an ``embedding`` column. If the
            embedding cannot be generated the row is written without it.

    Returns:
        bool: True on success; False when anything fails (the error is
        printed and the transaction rolled back).
    """
    # Resolved up front so the error handler can never fail on a missing key.
    ticket_label = (
        ticket.get("ticket_id", "<unknown>") if isinstance(ticket, dict) else "<unknown>"
    )
    conn = None
    try:
        # Verify S3 images
        ticket = verify_s3_images(ticket)

        conn = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
        )
        cur = conn.cursor()

        embedding = None
        if has_vector:
            embedding_text = f"{ticket['subject']} {ticket['description']}"
            # A ticket may carry no resolution (or an explicit None) yet.
            resolution = ticket.get("resolution") or {}
            if "solution" in resolution:
                embedding_text += f" {resolution['solution']}"
            embedding = generate_embedding(embedding_text)

        # (column, placeholder, value) triples; the embedding entry is added
        # only when there is a vector to store, so one statement serves both cases.
        fields = [
            ("ticket_id", "%s", ticket["ticket_id"]),
            ("subject", "%s", ticket["subject"]),
            ("description", "%s", ticket["description"]),
            ("customer", "%s", Json(ticket["customer"])),
            ("metadata", "%s", Json(ticket["metadata"])),
            (
                "resolution",
                "%s",
                Json(ticket["resolution"]) if "resolution" in ticket else None,
            ),
        ]
        if has_vector and embedding:
            # The list is sent as its string form and cast to vector in SQL.
            fields.append(("embedding", "%s::vector", str(embedding)))
        fields.append(("created_at", "%s", datetime.now()))

        # Column names are fixed identifiers from the list above, never user
        # input; all values still go through query parameters.
        columns = ", ".join(name for name, _, _ in fields)
        placeholders = ", ".join(placeholder for _, placeholder, _ in fields)
        updates = ", ".join(
            f"{name} = EXCLUDED.{name}" for name, _, _ in fields if name != "ticket_id"
        )

        cur.execute(
            f"""
            INSERT INTO support_tickets ({columns})
            VALUES ({placeholders})
            ON CONFLICT (ticket_id)
            DO UPDATE SET {updates}
            """,
            tuple(value for _, _, value in fields),
        )

        conn.commit()
        print(f"Ticket {ticket['ticket_id']} inserted successfully.")
        return True

    except Exception as e:
        print(f"Error inserting ticket {ticket_label}: {e}")
        if conn:
            conn.rollback()
        return False
    finally:
        if conn:
            conn.close()


def perform_vector_search(query_text, limit=5):
    """Return the tickets closest to ``query_text``, with a text-search fallback.

    The query is embedded and tickets are ordered by the pgvector ``<->``
    (Euclidean / L2) distance to that embedding. If the vector query fails,
    a case-insensitive substring match on subject and description is tried
    on the same connection instead.

    Args:
        query_text: Free-text search query.
        limit: Maximum number of rows to return.

    Returns:
        list[dict]: One dict per ticket with ``ticket_id``, ``subject``,
        ``description``, ``customer``, ``metadata``, ``resolution`` and
        ``distance``. Fallback rows have ``distance=None`` and
        ``method="text_fallback"``. An empty list is returned when the
        embedding cannot be generated or both searches fail.
    """
    # Order of the first six selected columns in both queries below.
    result_keys = (
        "ticket_id",
        "subject",
        "description",
        "customer",
        "metadata",
        "resolution",
    )

    conn = None
    try:
        query_embedding = generate_embedding(query_text)
        if not query_embedding:
            return []

        conn = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
        )
        cur = conn.cursor()

        # The list is sent as its string form and cast to vector in SQL.
        embedding_literal = str(query_embedding)
        cur.execute(
            """
            SELECT
                ticket_id,
                subject,
                description,
                customer,
                metadata,
                resolution,
                embedding <-> %s::vector AS distance
            FROM support_tickets
            ORDER BY embedding <-> %s::vector
            LIMIT %s;
            """,
            (embedding_literal, embedding_literal, limit),
        )

        # zip stops after the six named columns; the seventh is the distance.
        return [
            {**dict(zip(result_keys, row)), "distance": row[6]}
            for row in cur.fetchall()
        ]

    except Exception as e:
        print(f"Error performing vector search: {e}")
        # Try an alternative query if the first one fails
        try:
            if conn:
                # A failed statement leaves the transaction aborted, and
                # PostgreSQL rejects every further command until it is rolled
                # back; without this the fallback could never succeed.
                conn.rollback()
                cur = conn.cursor()
                pattern = f"%{query_text}%"
                cur.execute(
                    """
                    SELECT
                        ticket_id,
                        subject,
                        description,
                        customer,
                        metadata,
                        resolution
                    FROM support_tickets
                    WHERE
                        subject ILIKE %s OR
                        description ILIKE %s
                    LIMIT %s;
                    """,
                    (pattern, pattern, limit),
                )

                results = [
                    {
                        **dict(zip(result_keys, row)),
                        "distance": None,  # No distance in text search
                        "method": "text_fallback",
                    }
                    for row in cur.fetchall()
                ]

                print("Used text search fallback instead of vector search")
                return results
        except Exception as inner_e:
            print(f"Fallback search also failed: {inner_e}")
        return []
    finally:
        if conn:
            conn.close()


def debug_test_connection():
    """Print a short diagnostic report for the PostgreSQL and S3 connections.

    The database check prints the server version and whether the ``vector``
    extension is available and installed. The S3 check lists the buckets
    visible to ``s3_client``. Every failure is printed rather than raised, so
    the function always runs to completion and returns None.
    """
    print("\n--- Testing Database Connection ---")
    conn = None
    try:
        conn = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
        )
        cur = conn.cursor()
        cur.execute("SELECT version();")
        version = cur.fetchone()
        print(f"Connected to PostgreSQL: {version[0]}")

        # "Available" means the server can install the extension ...
        cur.execute(
            "SELECT name, installed_version FROM pg_available_extensions WHERE name = 'vector';"
        )
        ext = cur.fetchone()
        if ext:
            print(f"pgvector is available: {ext}")

            # ... "installed" means it is enabled in this database.
            cur.execute(
                "SELECT extname, extversion FROM pg_extension WHERE extname = 'vector';"
            )
            installed = cur.fetchone()
            if installed:
                print(f"pgvector is installed: {installed}")
            else:
                print("pgvector is available but not installed yet")
        else:
            print("pgvector extension is not available")
    except Exception as e:
        print(f"PostgreSQL connection error: {e}")
    finally:
        # Close even when a query above failed, so the connection never leaks.
        if conn:
            conn.close()

    print("\n--- Testing S3 Connection ---")
    try:
        # List S3 buckets
        response = s3_client.list_buckets()
        print(f"Connected to S3, found {len(response['Buckets'])} buckets")
        for bucket in response["Buckets"]:
            print(f" - {bucket['Name']}")
    except Exception as e:
        print(f"S3 connection error: {e}")

    print("\n--- End of Connection Tests ---\n")

In [None]:
def main():
    """Run the ingestion pipeline end to end and print a sample search.

    Steps: connection diagnostics, table setup, mock image upload, one upsert
    per entry in ``support_tickets``, then (only when the table has vector
    support) an example similarity search whose results are printed.
    """
    print("Starting support ticket data ingestion process...")

    debug_test_connection()

    # False here means "no vector column", whether pgvector is missing or
    # table setup failed; ingestion is attempted either way.
    has_vector = create_database_tables()

    upload_mock_images_with_content()

    for ticket in support_tickets:
        if insert_ticket_data(ticket, has_vector):
            continue
        print(f"Failed to process ticket {ticket['ticket_id']}")

    if not has_vector:
        print("\nSkipping vector search (pgvector not available)")
        return

    print("\nPerforming example vector search:")
    search_query = "login problems after password reset"
    matches = perform_vector_search(search_query, limit=2)

    print(f"Search results for query: '{search_query}'")
    for position, match in enumerate(matches, start=1):
        print(f"\nResult {position}:")

        distance = match.get("distance")
        if distance is not None:
            print(f"Distance: {distance:.4f}")
        if match.get("method") == "text_fallback":
            print("(Using text search fallback)")

        print(f"Ticket ID: {match['ticket_id']}")
        print(f"Subject: {match['subject']}")

        resolution = match["resolution"]
        solution_text = resolution["solution"] if resolution else "N/A"
        print(f"Resolution: {solution_text}")

In [None]:
# Run the ingestion pipeline: connection checks, table setup, mock image
# upload, ticket upserts and (when pgvector is available) a sample search.
main()