In [1]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import json
import re

# Load your API key from .env: load_dotenv() reads the .env file so that
# OPENAI_API_KEY can be looked up through the process environment below.
load_dotenv()
# Module-level OpenAI client used by modelRequest(). os.getenv returns None
# when OPENAI_API_KEY is not set, in which case None is passed as api_key.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
# Folder that is scanned for input text files.
folder_path = "food"

# os.listdir returns entries in arbitrary order; sort them so the processing
# order (and the printed list) is reproducible across runs and machines.
file_names = sorted(os.listdir(folder_path))

# Keep only regular files ending in ".txt", stored as "<folder>/<name>" paths.
file_names = [
    folder_path + "/" + f
    for f in file_names
    if os.path.isfile(folder_path + "/" + f) and f.endswith(".txt")
]

print(file_names)


['food/izmir_foods.txt']


In [3]:
import re
import json

def build_coref_prompt(text: str) -> str:
    """Build the coreference-resolution prompt for ``text``.

    The prompt is made of four parts separated by blank lines: the task
    instruction, two worked examples (an original sentence followed by its
    resolved form), and the input text followed by a ``Rewritten Text:`` cue.

    Args:
        text: Passage whose pronouns and indirect references should be
            replaced by the entities they refer to.

    Returns:
        The complete prompt string.
    """
    instruction = (
        "Resolve all coreferences in the text below. Replace pronouns (he, she, it, there, this, etc.) "
        "and indirect references with the appropriate named entities to make the text fully self-contained."
    )
    first_example = "\n".join([
        "example-1:",
        'Original: "Elon Musk was born in South Africa. There, he briefly attended classes at the University of Pretoria."',
        'After Coreference Resolution: "Elon Musk was born in South Africa. In South Africa, Elon Musk briefly attended classes at the University of Pretoria."',
    ])
    second_example = "\n".join([
        "example-2:",
        'Original: "İzmir is a bustling city on the Aegean coast. It is known for its vibrant culture and seaside promenades."',
        "After Coreference Resolution: \"İzmir is a bustling city on the Aegean coast. İzmir is known for İzmir's vibrant culture and seaside promenades.\"",
    ])
    # The f-string keeps the original formatting semantics for the input text.
    request = f"Text:\n{text}\n\nRewritten Text:"
    return "\n\n".join([instruction, first_example, second_example, request])

def chunk_text_prompt(text: str) -> str:
    """Build the prompt that asks the model to peel off one food chunk.

    The prompt instructs the model to answer with ``Food:``, ``Chunk:`` and
    ``Other part:`` lines for exactly one food or drink, or with ``None`` for
    all three when nothing is left. The input text is embedded at the end,
    wrapped in triple double quotes.

    Args:
        text: The text that still has to be split into food chunks.

    Returns:
        The prompt string (it starts and ends with a newline).
    """
    prompt_lines = [
        "",
        "You are an assistant helping organize culinary information for a Smart Travel Planner in İzmir.",
        "",
        "Split the input text into chunks, each chunk describing **exactly one food or drink**.",
        "",
        "Return your output in this format:",
        "Food: <name of food>",
        "Chunk: <related description and features>",
        "Other part: <remaining text not yet chunked>",
        "",
        "Only return one food chunk at a time. If no more food remains, return:",
        "Food: None",
        "Chunk: None",
        "Other part: None",
        "",
        "Here is the food text:",
        f'"""{text}"""',
        "",
    ]
    return "\n".join(prompt_lines)

def get_food_and_chunk(chunked_response):
    """Extract the food name and its description from one chunking response.

    The response is expected to follow the layout requested by the chunking
    prompt::

        Food: <name>
        Chunk: <description>
        Other part: <remaining text>

    The ``Other part:`` trailer is treated as optional: when the model leaves
    it out, everything after ``Chunk:`` is taken as the description instead
    of discarding the whole response.

    Args:
        chunked_response: Raw model reply. Non-string values (for example
            ``None`` when the API returned no content) are tolerated.

    Returns:
        dict | None: ``{"food": ..., "chunk": ...}`` with both values
        stripped, or ``None`` when the input is not a string, the
        ``Food:``/``Chunk:`` labels are missing, or either value is empty.
    """
    if not isinstance(chunked_response, str):
        return None

    food_match = re.search(r"Food:\s*(.+?)\s*Chunk:", chunked_response, re.DOTALL)
    if not food_match:
        return None
    food = food_match.group(1).strip()

    # Everything after the "Chunk:" label up to the first "Other part:" label
    # (or to the end of the response when that label is absent).
    description, _, _ = chunked_response[food_match.end():].partition("Other part:")
    chunk = description.strip()

    # An empty name or description is useless to the caller, which uses the
    # chunk's tail to locate the consumed text.
    if not food or not chunk:
        return None
    return {"food": food, "chunk": chunk}

def stringjson2json(output_text: str):
    """Parse JSON out of a model reply, tolerating Markdown code fences.

    The JSON payload is looked for, in order of preference, inside a
    ```` ```json ```` fence (tag matched case-insensitively), inside a plain
    ```` ``` ```` fence, or as the whole text. A fence that is never closed
    is read up to the end of the text.

    Args:
        output_text: Raw model reply. Non-string values are reported and
            yield ``None``.

    Returns:
        The decoded JSON value, or ``None`` when the input is not a string
        or the payload is not valid JSON (the error is printed).
    """
    if not isinstance(output_text, str):
        print("JSON Parse Error:", f"expected a string, got {type(output_text).__name__}")
        return None

    fence = "```"
    tagged_fence = re.search(r"```json", output_text, re.IGNORECASE)
    if tagged_fence:
        start = tagged_fence.end()
    else:
        plain_fence = output_text.find(fence)
        start = plain_fence + len(fence) if plain_fence != -1 else None

    if start is None:
        json_str = output_text
    else:
        end = output_text.find(fence, start)
        # find() returns -1 for an unclosed fence; slicing with -1 would
        # silently drop the last character, so read to the end instead.
        json_str = output_text[start:end] if end != -1 else output_text[start:]

    try:
        return json.loads(json_str.strip())
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        print("JSON Parse Error:", e)
        return None

def text2feature(food_name: str, text: str) -> str:
    """Build the feature-extraction prompt for a single food chunk.

    The prompt embeds the chunk text, fixes the ``name`` field to
    ``food_name`` and asks for ``type``, ``ingredients``, ``description`` and
    ``where_to_eat`` as a JSON list containing one object.

    Args:
        food_name: Name of the food the chunk describes.
        text: The chunk text to extract features from.

    Returns:
        The prompt string (it starts and ends with a newline).
    """
    field_specs = "\n".join([
        "Extract the following fields:",
        f'- "name": """{food_name}"""',
        '- "type": one of ["pastry", "sandwich", "dessert", "drink", "seafood", "offal dish", "street food", "snack"]',
        '- "ingredients": list of key ingredients mentioned or clearly implied',
        '- "description": a concise, accurate summary (1–2 sentences)',
        '- "where_to_eat": list of named locations (neighborhoods, districts, streets) mentioned in the text where this food is commonly found',
    ])
    sections = [
        "You are a structured knowledge extraction assistant for a smart travel recommendation system.",
        "Below is a food-related mention extracted from a travel paragraph:",
        f'"""{text}"""',
        "Your task is to enrich the food item using only the information explicitly or implicitly present in the paragraph.",
        field_specs,
        "Return a valid JSON list containing one object. Do not guess or invent locations.",
    ]
    # Sections are separated by one blank line; the prompt is framed by newlines.
    return "\n" + "\n\n".join(sections) + "\n"


def validate_food_features(entity_list: list, paragraph: str) -> str:
    """Build the prompt that asks the model to validate extracted features.

    The prompt lists the checks to perform on ``type``, ``ingredients``,
    ``description`` and ``where_to_eat``, then embeds the source paragraph
    and the extracted entities serialized as indented, non-ASCII-escaped JSON.

    Args:
        entity_list: Extracted food entities (must be JSON-serializable).
        paragraph: The text the entities were extracted from.

    Returns:
        The prompt string (it starts and ends with a newline).
    """
    entities_json = json.dumps(entity_list, indent=2, ensure_ascii=False)
    checklist = (
        "For each entity, check:\n"
        '- Is the "type" appropriate based on the description?\n'
        '- Are the "ingredients" mentioned or strongly implied in the paragraph?\n'
        '- Is the "description" accurate, concise, and supported by the text?\n'
        '- Is the "where_to_eat" field consistent with place names mentioned in the paragraph? '
        "Remove hallucinated or vague locations.\n"
    )
    return (
        "\n"
        "You are an expert assistant reviewing enriched food data extracted from a travel paragraph.\n"
        "\n"
        "Your task is to validate and correct each food entity based on the paragraph provided.\n"
        "\n"
        + checklist +
        "\n"
        "Fix any hallucinated or missing values.\n"
        "\n"
        "Return only a valid JSON list with the corrected fields.\n"
        "\n"
        "Here is the paragraph:\n"
        f'"""{paragraph}"""\n'
        "\n"
        "Here is the extracted food entity:\n"
        f"{entities_json}\n"
    )


def modelRequest(prompt, temperature=0.5):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client`` to call the chat completions endpoint
    with the ``gpt-4o-mini`` model.

    Args:
        prompt: Content of the user message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The ``content`` of the first choice's message.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def process_food_text(text: str):
    """Run the four-step food extraction pipeline on one text.

    Steps (every step calls ``modelRequest`` one or more times):
      1. Coreference resolution of the whole text.
      2. Chunking: the model is asked for one food at a time; after each
         answer the consumed part is cut off the remaining text by locating
         the last 10 characters of the returned chunk.
      3. Feature extraction for every chunk.
      4. Validation of the extracted features against the chunk they were
         extracted from.

    Progress and failures are reported with ``print``.

    Args:
        text: Raw food description text.

    Returns:
        list: The validated feature objects, in chunk order. Chunks whose
        extraction or validation reply could not be parsed into a JSON
        object/list are skipped.
    """
    # Step 1: Coreference resolution
    coref_prompt = build_coref_prompt(text)
    resolved_text = modelRequest(coref_prompt, temperature=0.5)
    print("Step-1: Coreference resolution complete.")

    # Step 2: Chunking
    chunks = []
    # The model may return no content (None); treat that as nothing to chunk.
    remaining_text = resolved_text or ""
    # Stop as soon as nothing but whitespace is left instead of spending a
    # model call on an empty text.
    while remaining_text.strip():
        prompt = chunk_text_prompt(remaining_text)
        chunked_response = modelRequest(prompt, temperature=0.2)
        chunked_part = get_food_and_chunk(chunked_response)

        if not chunked_part or chunked_part["food"].lower() == "none":
            break
        # The same food twice in a row means the model is not advancing.
        if chunks and chunks[-1]["food"].lower() == chunked_part["food"].lower():
            break

        chunks.append(chunked_part)
        print(f"{chunked_part['food']} extracted.")

        # Locate the end of the consumed chunk through its last 10 characters.
        # An empty tail would match at position 0 and advance the loop by a
        # single character per model call, so it also ends the chunking.
        chunk_tail = chunked_part["chunk"][-10:]
        tail_start = remaining_text.find(chunk_tail) if chunk_tail else -1
        if tail_start == -1:
            break
        # +1 skips the single separator character that follows the chunk.
        remaining_text = remaining_text[tail_start + len(chunk_tail) + 1:]
    print("Step-2: Chunking complete.")

    # Step 3: Feature Extraction
    # Keep every feature list together with the chunk it came from so that a
    # failed extraction cannot shift the pairing used in step 4.
    extracted = []
    for chunk in chunks:
        food_name = chunk["food"]
        feature_prompt = text2feature(food_name, chunk["chunk"])
        feature_response = modelRequest(feature_prompt, temperature=0.2)
        parsed = stringjson2json(feature_response)
        if isinstance(parsed, dict):
            # The model answered with a bare object instead of a list.
            parsed = [parsed]
        if isinstance(parsed, list) and parsed:
            extracted.append((chunk, parsed))
        else:
            print(f"Feature extraction failed for {food_name}")
    print("Step-3: Feature extraction complete.")

    # Step 4: Validation
    validated_features = []
    for chunk, to_validate in extracted:
        validation_prompt = validate_food_features(to_validate, chunk["chunk"])
        validation_response = modelRequest(validation_prompt, temperature=0.2)
        corrected = stringjson2json(validation_response)
        if isinstance(corrected, dict):
            # extend() on a dict would add its keys, not the object itself.
            corrected = [corrected]
        if isinstance(corrected, list) and corrected:
            validated_features.extend(corrected)
        else:
            print(f"Validation failed for {chunk['food']}")
    print("Step-4: Validation complete.")
    return validated_features



In [4]:
# Run the pipeline on every discovered text file and store the validated
# features next to it, using the same base name with a .json extension.
for file_name in file_names:
    with open(file_name, "r", encoding="utf-8") as source_file:
        input_text = source_file.read()

    validated_features = process_food_text(input_text)

    # Every entry of file_names ends in ".txt"; swap that suffix for ".json".
    with open(file_name[:-len(".txt")] + ".json", "w", encoding="utf-8") as output_file:
        json.dump(validated_features, output_file, indent=4, ensure_ascii=False)

    print(
        "-------------------------------",
        f"{file_name} HAS BEEN PROCESSED!",
        "-------------------------------",
        sep="\n",
    )


Step-1: Coreference resolution complete.
Boyoz extracted.
Gevrek extracted.
Kumru extracted.
Kokoreç extracted.
Midye extracted.
Söğüş extracted.
Lokma extracted.
Torpil extracted.
Şambali extracted.
Sübye Sherbet extracted.
İzmir Bomba extracted.
Step-2: Chunking complete.
Step-3: Feature extraction complete.
Step-4: Validation complete.
-------------------------------
food/izmir_foods.txt HAS BEEN PROCESSED!
-------------------------------
