In [1]:

import os
from openai import OpenAI
import json
from dotenv import load_dotenv
from app.document_processor import JabilInvoice, ElcamInvoice

# Load environment variables
load_dotenv()

# Initialize OpenAI client
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set.")
    
client = OpenAI(api_key=api_key)

def validate_pdf(file):
    """
    Validate if the uploaded file is a PDF
    """
    try:
        # Check file extension
        if not file.filename.lower().endswith('.pdf'):
            return False
            
        # Try to read it as a PDF
        reader = PyPDF2.PdfReader(file)
        # If we get here, it's a valid PDF
        file.seek(0)  # Reset file pointer for later use
        return True
    except Exception as e:
        print(f"PDF validation error: {str(e)}")
        return False

def extract_pdf_fields(file_path):
    """
    Extract fields from PDF file
    Returns a dictionary of extracted fields
    """
    try:
        reader = PyPDF2.PdfReader(file_path)
        num_pages = len(reader.pages)
        text = ""
        
        # Extract text from all pages
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text += page.extract_text()
            
        # For now, just return basic info about the PDF
        # You would implement your specific field extraction logic here
        extracted_data = {
            'filename': os.path.basename(file_path),
            'pages': num_pages,
            'text_sample': text[:200] + '...' if len(text) > 200 else text
            # Add more extracted fields here based on your requirements
        }
        
        return extracted_data
    except Exception as e:
        print(f"Error extracting PDF fields: {str(e)}")
        return {'error': str(e)}

def extract_jabil_data_with_openai(file_path):
    """
    Extract Jabil invoice data using OpenAI API.
    
    Args:
        file_path (str): Path to the Jabil PDF file
        
    Returns:
        dict: Extracted data in JSON format or error message
    """
    try:
        print(f"Extracting Jabil data using OpenAI from: {file_path}")
        # Upload file to OpenAI
        with open(file_path, "rb") as file_data:
            file = client.files.create(
                file=file_data,
                purpose="user_data"
            )
        print(f"File uploaded with ID: {file.id}")
        
        # Make the API call
        print("Starting Jabil data extraction API call")
        completion = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": "You are a helpful assistant that can extract information from Jabil invoices. You will be provided with an invoice file and need to extract all required fields."
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "file",
                            "file": {
                                "file_id": file.id,
                            }
                        },
                        {
                            "type": "text",
                            "text": "Extract all fields from this Jabil invoice including Part Name/Description, Jabil Part, Jabil Lot, Press #, Mold #, Date of Manufacture, Ship Date, Box Qty, Purchasing Specification #, Purchasing Specification Rev., Customer PO, Qty Released, Order Qty Shipped, Elcam Drawing #, Elcam Drawing Rev, Elcam Part #, Abbvie Part #, Expiration Date, and Tailgate Qty Sent."
                        }
                    ]
                }
            ],
            response_format=JabilInvoice,
        )
        print("Jabil API call completed")
        return completion.choices[0].message.content
        # Clean up the file from OpenAI
        client.files.delete(file.id)
        
        # Get the content and add debug info
        result = completion.choices[0].message.content
        print(f"Jabil extraction result type: {type(result)}")
        
        if hasattr(result, 'model_dump'):
            # If it has model_dump method, use it
            data = result.model_dump()
            print(f"Jabil data after model_dump: {data}")
            return data
        elif isinstance(result, dict):
            # If it's already a dictionary
            print(f"Jabil data (already dict): {result}")
            return result
        else:
            # If it's something else, return an error
            print(f"Unexpected result type: {type(result)}")
            return {"error": f"Unexpected result type: {type(result)}"}
    
    except Exception as e:
        # In case of error, return an error message
        print(f"Error in extract_jabil_data_with_openai: {str(e)}")
        return {"error": f"Failed to extract Jabil invoice data: {str(e)}"}

def extract_elcam_data_with_openai(file_path):
    """
    Extract Elcam invoice data using OpenAI API.
    
    Args:
        file_path (str): Path to the Elcam PDF file
        
    Returns:
        dict: Extracted data in JSON format or error message
    """
    try:
        print(f"Extracting Elcam data using OpenAI from: {file_path}")
        # Upload file to OpenAI
        with open(file_path, "rb") as file_data:
            file = client.files.create(
                file=file_data,
                purpose="user_data"
            )
        print(f"File uploaded with ID: {file.id}")
        
        # Make the API call
        print("Starting Elcam data extraction API call")
        completion = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": "You are a helpful assistant that can extract information from Elcam invoices. You will be provided with an invoice file and need to extract all required fields."
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "file",
                            "file": {
                                "file_id": file.id,
                            }
                        },
                        {
                            "type": "text",
                            "text": "Extract all fields from this Elcam invoice: Elcam's Part #, AbbVie Part #, Specs # and Rev., Part Name, Batch #, Manufacture Date, Expiry Date, Quantity, PO #, and Tailgate samples Quantity."
                        }
                    ]
                }
            ],
            response_format=ElcamInvoice,
        )
        print("Elcam API call completed")
        
        # Clean up the file from OpenAI
        client.files.delete(file.id)
        
        # Get the content and add debug info
        result = completion.choices[0].message.content
        print(f"Elcam extraction result type: {type(result)}")
        
        if hasattr(result, 'model_dump'):
            # If it has model_dump method, use it
            data = result.model_dump()
            print(f"Elcam data after model_dump: {data}")
            return data
        elif isinstance(result, dict):
            # If it's already a dictionary
            print(f"Elcam data (already dict): {result}")
            return result
        else:
            # If it's something else, return an error
            print(f"Unexpected result type: {type(result)}")
            return {"error": f"Unexpected result type: {type(result)}"}
    
    except Exception as e:
        # In case of error, return an error message
        print(f"Error in extract_elcam_data_with_openai: {str(e)}")
        return {"error": f"Failed to extract Elcam invoice data: {str(e)}"}


In [5]:
extract_elcam_data_with_openai('/Users/matansharon/python/AI/COC/app/uploads/Elcan_COC.pdf')

Extracting Elcam data using OpenAI from: /Users/matansharon/python/AI/COC/app/uploads/Elcan_COC.pdf
File uploaded with ID: file-V4VWCsnSThcgrhbJmzYHz2
Starting Elcam data extraction API call
Elcam API call completed
Elcam extraction result type: <class 'str'>
Unexpected result type: <class 'str'>


{'error': "Unexpected result type: <class 'str'>"}