In [8]:
from dotenv import load_dotenv
load_dotenv()
import os

In [11]:
import openai
import json

# Sample Vietnamese product advertisement (a MacBook Pro listing with price,
# configuration and hotline) used as the input for the example run below.
# NOTE(review): the non-ASCII characters in this literal look mis-encoded
# (mojibake) in this view - confirm the file is saved and read as UTF-8.
text = """üíªMacBook Pro - Gi√°: 5.5 tri·ªáu

üî• C·∫•u h√¨nh : Chip i5, ram 8G, SSD 256G ,m√°y 99%

/~Li M√°y ch√≠nh h√£ng zin 100%
/~Li Gi√° s·ªâ TO√ÄN QU·ªêC si√™u r√™
/~Li M√°y ch·ªçn ƒë·∫πp 99%, b·∫£o h√†nh
/~Li M√†n ƒë·∫πp, loa to, m√°y nhanh
/~Li Free ship To√†n Qu·ªëc + 4 Qu√†
/~Li ƒê·ªïi tu·ª≥ √Ω 15 ng√†y ƒë·∫ßu Free
üìû Hotline : 0982.729.830"""

# Prompt template for the template-based extraction.
# Sections: the task description, the extraction steps, the expected JSON
# output shape, and the input slot. The literal "{{text}}" marker is swapped
# for the advertisement text with str.replace() (this is NOT an f-string, so
# the double braces are kept verbatim until then).
# The example under "# Output format:" must itself be valid JSON (no trailing
# comma after the last member) so the model is not shown a malformed object.
template = """
# Task: R√∫t tr√≠ch th√¥ng tin chi ti·∫øt v·ªÅ s·∫£n ph·∫©m m√°y t√≠nh t·ª´ m·∫´u qu·∫£ng c√°o.
Bao g·ªìm c√°c th√¥ng tin sau:
- T√™n s·∫£n ph·∫©m
- Gi√°
- C·∫•u h√¨nh m√°y
- Hotline ho·∫∑c th√¥ng tin li√™n h·ªá

# Steps:
1. X√°c ƒë·ªãnh t√™n s·∫£n ph·∫©m.
2. X√°c ƒë·ªãnh gi√° s·∫£n ph·∫©m.
3. X√°c ƒë·ªãnh c√°c th√¥ng tin v·ªÅ c·∫•u h√¨nh m√°y (chip, RAM, SSD, t√¨nh tr·∫°ng m√°y, v.v.).
4. Tr√≠ch xu·∫•t th√¥ng tin li√™n h·ªá (s·ªë ƒëi·ªán tho·∫°i hotline).

# Output format:
{
"product_name": "<t√™n s·∫£n ph·∫©m>",
"price": "<gi√° s·∫£n ph·∫©m>",
"features": "<c·∫•u h√¨nh m√°y>",
"hotline": "<ƒëi·ªán tho·∫°i li√™n h·ªá>"
}

# Input:
{{text}}
"""

# Initialize the module-level OpenAI client used by both extraction functions.
# The key is read from the OPENAI_API_KEY environment variable; indexing
# os.environ raises KeyError right here if the variable is not set.
# NOTE(review): `os` and load_dotenv() come from an earlier notebook cell -
# that cell must have been run first.
client = openai.OpenAI(api_key=os.environ['OPENAI_API_KEY'])

def extract_vietnamese_product_info(text):
    """
    Extract product information from Vietnamese text using the template.

    The ``{{text}}`` marker in the module-level ``template`` is replaced with
    ``text`` and the resulting prompt is sent to the ``gpt-4o`` chat model in
    JSON mode (``response_format={"type": "json_object"}``).

    Args:
        text: Advertisement text to analyse.

    Returns:
        The value decoded from the model's JSON reply.

    Raises:
        ValueError: If the reply has no text content, or the content is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # Plain str.replace: the template is not an f-string, so the literal
    # "{{text}}" marker is what gets substituted.
    prompt = template.replace("{{text}}", text)

    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that extracts product information from Vietnamese product advertisements. Always respond with valid JSON."
        },
        {
            "role": "user",
            "content": prompt
        },
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        response_format={"type": "json_object"},
    )

    choice = response.choices[0]
    content = choice.message.content

    # Print the raw response for debugging
    print("Raw response content:", content)

    # The message content is optional; passing None to json.loads would raise
    # an unhelpful TypeError, so report the situation explicitly instead.
    if content is None:
        raise ValueError(
            "Model returned no content "
            f"(finish_reason={getattr(choice, 'finish_reason', None)!r})"
        )

    # Parse the JSON response
    return json.loads(content)


# Alternative: Using structured output with JSON schema
def extract_with_schema(text):
    """
    Extract Vietnamese product information using structured outputs.

    Sends the advertisement to ``gpt-4o-2024-08-06`` with a strict JSON schema
    (``product_extraction``) that requires four string fields and forbids any
    additional properties.

    Args:
        text: Advertisement text to analyse.

    Returns:
        The value decoded from the model's JSON reply; per the schema this is
        an object with the keys ``product_name``, ``price``, ``features`` and
        ``hotline``.

    Raises:
        ValueError: If the model refuses, returns no content, or the content
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    prompt = f"""R√∫t tr√≠ch th√¥ng tin chi ti·∫øt v·ªÅ s·∫£n ph·∫©m m√°y t√≠nh t·ª´ m·∫´u qu·∫£ng c√°o sau:

{text}

Bao g·ªìm: t√™n s·∫£n ph·∫©m, gi√°, c·∫•u h√¨nh m√°y, v√† hotline."""

    # Every property is a required string; extra keys are rejected.
    schema = {
        "type": "object",
        "properties": {
            "product_name": {
                "type": "string",
                "description": "T√™n s·∫£n ph·∫©m"
            },
            "price": {
                "type": "string",
                "description": "Gi√° s·∫£n ph·∫©m"
            },
            "features": {
                "type": "string",
                "description": "Th√¥ng tin c·∫•u h√¨nh m√°y"
            },
            "hotline": {
                "type": "string",
                "description": "S·ªë ƒëi·ªán tho·∫°i li√™n h·ªá"
            }
        },
        "required": ["product_name", "price", "features", "hotline"],
        "additionalProperties": False
    }

    response = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "system",
                "content": "B·∫°n l√† tr·ª£ l√Ω tr√≠ch xu·∫•t th√¥ng tin s·∫£n ph·∫©m t·ª´ qu·∫£ng c√°o ti·∫øng Vi·ªát."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "product_extraction",
                "strict": True,
                "schema": schema
            }
        }
    )

    choice = response.choices[0]
    message = choice.message

    # A structured-output reply may carry a refusal instead of content.
    # getattr keeps this working if the installed SDK's message type has no
    # `refusal` attribute.
    refusal = getattr(message, "refusal", None)
    if refusal:
        raise ValueError(f"Model refused the extraction request: {refusal}")

    # Without this guard a missing content would reach json.loads as None
    # and surface as an opaque TypeError.
    if message.content is None:
        raise ValueError(
            "Model returned no content "
            f"(finish_reason={getattr(choice, 'finish_reason', None)!r})"
        )

    return json.loads(message.content)


# Example usage: run both extraction methods on the sample advertisement and
# pretty-print each result.
if __name__ == "__main__":
    print("Vietnamese Product Information Extraction")
    print("=" * 60)
    print(f"\nInput Text:\n{text}\n")
    print("-" * 60)

    # Top-level boundary of a demo script: any failure (network, auth, bad
    # reply) is reported to the user instead of ending in a traceback.
    try:
        # Method 1: Template-based
        print("\nMethod 1: Template-based extraction")
        result1 = extract_vietnamese_product_info(text)
        # ensure_ascii=False keeps non-ASCII (Vietnamese) characters readable
        # instead of emitting \uXXXX escapes.
        print(json.dumps(result1, indent=2, ensure_ascii=False))

        print("\n" + "-" * 60)

        # Method 2: Structured output
        print("\nMethod 2: Structured output extraction")
        result2 = extract_with_schema(text)
        print(json.dumps(result2, indent=2, ensure_ascii=False))

    except Exception as e:
        print(f"Error: {e}")
        # The key is read from the environment, not from a placeholder in the
        # code, so point the user at the variable that is actually used.
        print("\nNote: Make sure the OPENAI_API_KEY environment variable is set to a valid OpenAI API key")

Vietnamese Product Information Extraction

Input Text:
üíªMacBook Pro - Gi√°: 5.5 tri·ªáu

üî• C·∫•u h√¨nh : Chip i5, ram 8G, SSD 256G ,m√°y 99%

/~Li M√°y ch√≠nh h√£ng zin 100%
/~Li Gi√° s·ªâ TO√ÄN QU·ªêC si√™u r√™
/~Li M√°y ch·ªçn ƒë·∫πp 99%, b·∫£o h√†nh
/~Li M√†n ƒë·∫πp, loa to, m√°y nhanh
/~Li Free ship To√†n Qu·ªëc + 4 Qu√†
/~Li ƒê·ªïi tu·ª≥ √Ω 15 ng√†y ƒë·∫ßu Free
üìû Hotline : 0982.729.830

------------------------------------------------------------

Method 1: Template-based extraction
Raw response content: {
  "product_name": "MacBook Pro",
  "price": "5.5 tri·ªáu",
  "features": "Chip i5, ram 8G, SSD 256G, m√°y 99%",
  "hotline": "0982.729.830"
}
{
  "product_name": "MacBook Pro",
  "price": "5.5 tri·ªáu",
  "features": "Chip i5, ram 8G, SSD 256G, m√°y 99%",
  "hotline": "0982.729.830"
}

------------------------------------------------------------

Method 2: Structured output extraction
{
  "product_name": "MacBook Pro",
  "price": "5.5 tri·ªáu",
  "features": "Chip i5, r