In [1]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import json
import re
from collections import defaultdict, OrderedDict

# Read variables from a local .env file into the process environment, then
# build the shared OpenAI client from the OPENAI_API_KEY variable.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [2]:
# Folder holding the raw travel texts; every *.txt file directly inside it
# is picked up for processing.
folder_path = "sea"

# os.listdir() returns entries in arbitrary order, so sort them to keep the
# processing order (and the printed list) reproducible between runs.
file_names = [
    os.path.join(folder_path, entry)
    for entry in sorted(os.listdir(folder_path))
    if entry.endswith(".txt") and os.path.isfile(os.path.join(folder_path, entry))
]

print(file_names)


['sea/dikili_bays.txt', 'sea/izmir_beaches.txt', 'sea/izmir_beaches_summary.txt']


In [3]:
# Fields of a chunking response.  "Other part:" is optional in the chunk
# pattern so that a reply without that marker can still be parsed.
_PLACE_PATTERN = re.compile(r"Place:\s*(.+?)\s*Chunk:", re.DOTALL)
_CHUNK_PATTERN = re.compile(r"Chunk:\s*(.+?)\s*(?:Other part:|\Z)", re.DOTALL)

# Number of trailing chunk characters used to locate the chunk end in the text.
_CHUNK_TAIL_CHARS = 50
# Chunking stops once fewer than this many non-blank characters remain.
_MIN_REMAINING_CHARS = 5


def _extract_json_payload(output_text):
    """Parse the JSON contained in an LLM reply.

    The reply may wrap the JSON in a ```json fence, in a plain ``` fence, or
    contain the JSON on its own.

    Returns:
        The parsed Python object, or None (after printing the error) when the
        reply is empty or is not valid JSON.
    """
    if not isinstance(output_text, str) or not output_text.strip():
        print("JSON Parse Error:", "empty model response")
        return None

    payload_start = None
    for fence in ("```json", "```"):
        fence_index = output_text.find(fence)
        if fence_index != -1:
            payload_start = fence_index + len(fence)
            break

    if payload_start is None:
        json_str = output_text.strip()
    else:
        payload_end = output_text.find("```", payload_start)
        if payload_end == -1:
            # Unclosed fence: keep everything after the opening marker
            # instead of silently dropping the last character.
            payload_end = len(output_text)
        json_str = output_text[payload_start:payload_end].strip()

    try:
        return json.loads(json_str)
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        print("JSON Parse Error:", e)
        return None


def _as_entity_list(parsed) -> list:
    """Normalize a parsed JSON value into a list of entity dicts.

    A single dict is wrapped in a list; anything that is not a dict (or a list
    containing dicts) yields an empty list.
    """
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return []
    return [item for item in parsed if isinstance(item, dict)]


def _unique_in_order(values: list) -> list:
    """Drop duplicates while keeping first-seen order (works for unhashable items)."""
    unique = []
    for value in values:
        if value not in unique:
            unique.append(value)
    return unique


def _build_coref_prompt(text: str) -> str:
    """Build the Step-1 prompt asking the model to resolve coreferences in `text`."""
    return f"""
    Resolve all coreferences in the text below. Replace pronouns (he, she, it, there, this, etc.) 
    and indirect references with the appropriate named entities to make the text fully self-contained.

    example-1:
    Original: "Elon Musk was born in South Africa. There, he briefly attended classes at the University of Pretoria."
    After Coreference Resolution: "Elon Musk was born in South Africa. In South Africa, Elon Musk briefly attended classes at the University of Pretoria."

    example-2:
    Original: "İzmir is a bustling city on the Aegean coast. It is known for its vibrant culture and seaside promenades."
    After Coreference Resolution: "İzmir is a bustling city on the Aegean coast. İzmir is known for İzmir's vibrant culture and seaside promenades."

    example-3:
    Original: "Cappadocia is famous for its fairy chimneys. They were formed over thousands of years by volcanic activity and erosion."
    After Coreference Resolution: "Cappadocia is famous for Cappadocia's fairy chimneys. Cappadocia's fairy chimneys were formed over thousands of years by volcanic activity and erosion."

    example-4:
    Original: "Pamukkale is known for its white travertine terraces. They are formed by mineral-rich hot springs."
    After Coreference Resolution: "Pamukkale is known for Pamukkale's white travertine terraces. Pamukkale's white travertine terraces are formed by mineral-rich hot springs."

    Text:
    {text}

    Rewritten Text:
    """


def _build_chunk_prompt(text: str) -> str:
    """Build the Step-2 prompt asking for the first place-based chunk of `text`."""
    return f"""
    You are an expert knowledge extraction assistant specializing in travel content and knowledge graph construction.

    Your task is to help organize a long travel-related text by **chunking it into meaningful sections** based on shifts in topic or place. These shifts are usually marked by **headers/titles**, such as the name of a city, region, or town.

    Instructions:
    - Read the input text.
    - When you detect a shift in the topic (typically marked by a new place or region title), split the text.
    - For the first section before the next topic begins:
        - Assign the nearest title (place name) as the "Place".
        - Extract all relevant text until the next shift.
        - Return this section as the first chunk.
    - Leave the remaining text under "Other part" so we can continue chunking recursively.

    Do not return multiple chunks at once.
    Always return only the first chunk and the remaining part.

    Return your output in the following structured format:

    Place: <Place Name>
    Chunk: <Chunked Text>
    Other part: <Remaining Text>
    ---
    Here is the text:
    \"\"\"{text}\"\"\"
    """


def _build_ner_prompt(place: str, chunked_text: str) -> str:
    """Build the Step-3 prompt asking for canonical named entities of one chunk."""
    return f"""
    You are an expert assistant specializing in building travel knowledge graphs.

    Your task is to extract **all named entities** from the following paragraph related to the place: **"{place}"**.

    For each identified entity, return a JSON object with the following fields:
    - **"mention"**: the exact text as it appears in the paragraph.
    - **"label"**: the semantic type (choose from: LOCATION, LANDMARK, HISTORICAL_SITE, NATURAL_SITE, EVENT, CULTURAL_SITE).
    - **"canonical"**: the normalized, standardized, or modern name (e.g., internationally recognized or geolocated version).
    - **"note"** *(optional)*: include only if there's something notable about the entity such as:
      - historical origin (e.g., Greek, Ottoman),
      - potential ambiguity (e.g., multiple interpretations),
      - linguistic note (e.g., old name or foreign origin).

    Please ensure:
    - Each entity is related to **travel**, **history**, or **geography**.
    - Group output as a **JSON array** and return **only** the JSON, without extra explanation.

    ---
    Here is the paragraph related to "{place}":
    \"\"\"{chunked_text}\"\"\"
    """


def _build_feature_prompt(parsed_json: list, text: str) -> str:
    """Build the Step-4 prompt asking the model to enrich the entities of one chunk."""
    return f"""
You are a structured knowledge extraction assistant for a smart travel recommendation system.

Below is a list of identified entities (e.g., towns, beaches, ruins) extracted from a travel paragraph, including their labels and notes:

{json.dumps(parsed_json, indent=2)}

Here is the paragraph they were extracted from:

\"\"\"{text}\"\"\"

Your task is to enrich each entity using only the information explicitly or implicitly present in the paragraph.

For each entity, extract the following fields:
- "mention": the exact name from the list above
- "node_type": choose the most appropriate type from this controlled list:

  [City, Region, Town, Village, Beach, Bay, Island, Museum, HistoricalSite, ReligiousPlace, Market, NaturalPark, AncientCity, Monument, Castle, Temple, Tower, Bridge]

  Refer to the guide below to choose correctly:
  - City: major urban center with administrative or economic importance
  - Region: geographic or administrative area (e.g., peninsula, province)
  - Town: medium-sized settlement
  - Village: small, often rural settlement
  - Beach: coastal area suitable for swimming or recreation
  - Bay: curved coastal inlet or small cove (includes places ending with “-bükü”)
  - Island: landmass surrounded by water
  - Museum: institution showcasing cultural or historical artifacts
  - HistoricalSite: ruins, ancient structures, castles, or aqueducts
  - ReligiousPlace: places of worship such as churches, mosques, or temples
  - Market: traditional, open-air, or specialty shopping area
  - NaturalPark: protected or scenic natural area
  - AncientCity: historic archaeological city or urban ruin
  - Monument: statue, memorial, or commemorative landmark
  - Castle: large fortified structure, often historical
  - Temple: structure for religious or spiritual rituals
  - Tower: tall structure such as a lighthouse or watchtower
  - Bridge: architectural structure spanning a natural feature

Do not invent new types. Be precise and pick the closest match based on the paragraph context.

Also extract:
- "what_to_do": list of activities possible at or near this location (e.g., swim, photograph, explore, relax)
- "best_for": ideal traveler types (e.g., couples, families, nature lovers, cultural tourists)
- "special_features": unique aspects or highlights (e.g., history, architecture, scenic views)
- "tags": short, descriptive keywords (e.g., beach, ancient ruins, sunset view, bazaar)
- "description": 1–2 sentence summary describing the place to a traveler
- "located_in": the **smallest known region mentioned** in the paragraph (e.g., if a beach is located in a village called “Sığacık” which is part of Seferihisar, use `"Sığacık"`. If there is no clue, then add the relevant location about the place)
Respond with a **valid JSON list**, with one object per entity in the same order.

Example output format:

[
  {{
    "mention": "Blue Lagoon",
    "node_type": "Beach",
    "what_to_do": ["swimming", "snorkeling", "boat tours"],
    "best_for": ["families", "nature lovers", "couples"],
    "special_features": ["turquoise waters", "protected natural area", "mountain backdrop"],
    "tags": ["beach", "lagoon", "scenic", "family-friendly"],
    "description": "Blue Lagoon is a serene coastal destination with clear turquoise waters and a calm environment, ideal for swimming and relaxation.",
    "located_in": "Ölüdeniz" 
 }}
]
"""


def _build_validation_prompt(entity_list: list, paragraph: str) -> str:
    """Build the Step-5 prompt asking the model to review and correct the enriched entities."""
    return f"""
    You are an expert assistant reviewing enriched travel entities extracted from the paragraph below.

    Check each entity for:
    - Accuracy of the "node_type"
    - Whether the fields ("what_to_do", "best_for", "special_features", "tags", "description") are supported by or inferred from the paragraph

    If any entity contains unsupported, incorrect, or hallucinated values, **correct them**.

    Return your answer as a **valid JSON array**, with the corrected entities. Do not include any explanation or review. Only output the corrected list.

    Here is the paragraph:
    \"\"\"{paragraph}\"\"\"

    Here are the enriched entities to validate:
    {json.dumps(entity_list, indent=2)}
    """


def _parse_chunk_response(chunked_response):
    """Extract {"place", "chunk"} from a chunking reply, or None if a field is missing."""
    response_text = chunked_response or ""
    place_match = _PLACE_PATTERN.search(response_text)
    chunk_match = _CHUNK_PATTERN.search(response_text)
    if place_match is None or chunk_match is None:
        return None
    return {
        "place": place_match.group(1).strip(),
        "chunk": chunk_match.group(1).strip(),
    }


def _split_into_chunks(resolved_text: str, request) -> list:
    """Repeatedly ask the model for the next place-based chunk of `resolved_text`.

    `request(prompt, temperature)` must return the model reply as a string.
    After each reply the end of the returned chunk is located in the source
    text and the text after it is sent in the next request.

    Returns:
        A list of {"place": str, "chunk": str} dicts in text order.
    """
    chunks = []
    offset = 0
    while len(resolved_text[offset:].strip()) >= _MIN_REMAINING_CHARS:
        remaining_text = resolved_text[offset:].lstrip()
        chunked_part = _parse_chunk_response(
            request(_build_chunk_prompt(remaining_text), 0.2)
        )
        if chunked_part is None:
            print("Warning: chunking response did not contain 'Place:'/'Chunk:' fields; stopping.")
            break
        chunks.append(chunked_part)

        # Locate the chunk's tail with a literal search (the tail may contain
        # regex metacharacters) that starts at the current offset, so an
        # earlier identical passage can never be matched again.
        tail = chunked_part["chunk"][-_CHUNK_TAIL_CHARS:]
        tail_start = resolved_text.find(tail, offset) if tail else -1
        if tail_start == -1:
            unprocessed = len(resolved_text) - offset
            print(
                "Warning: could not locate the end of chunk "
                f"'{chunked_part['place']}' in the text; "
                f"up to {unprocessed} characters may be left unchunked."
            )
            break
        # A non-empty tail found at or after `offset` always moves the offset forward.
        offset = tail_start + len(tail)
    return chunks


def _merge_features(entries: list) -> list:
    """Merge entity dicts that share the same "mention".

    List values are concatenated and string values collected per field, then
    de-duplicated in first-seen order.  "description" values are joined into
    one string; every other field becomes a list.  When a mention occurs more
    than once, the "node_type" of its last occurrence is kept.

    Returns:
        A list of OrderedDicts starting with "mention" and "node_type".
    """
    grouped = OrderedDict()
    mention_to_node_type = {}
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        mention = entry.get("mention")
        if not isinstance(mention, str) or not mention:
            # Without a mention there is nothing to group the entry under.
            continue
        if "node_type" in entry:
            mention_to_node_type[mention] = entry["node_type"]
        fields = grouped.setdefault(mention, defaultdict(list))
        for key, value in entry.items():
            if key in ("mention", "node_type"):
                continue
            if isinstance(value, list):
                fields[key].extend(value)
            elif isinstance(value, str):
                fields[key].append(value)

    merged_features = []
    for mention, fields in grouped.items():
        ordered = OrderedDict()
        ordered["mention"] = mention
        ordered["node_type"] = mention_to_node_type.get(mention)
        for key, values in fields.items():
            unique_values = _unique_in_order(values)
            if key == "description":
                # Merge all descriptions into a single text.
                ordered[key] = " ".join(v for v in unique_values if isinstance(v, str))
            else:
                ordered[key] = unique_values
        merged_features.append(ordered)
    return merged_features


def process_file_for_features(filepath: str, model: str = "gpt-4o-mini", llm_client=None):
    """Run the six-step LLM extraction pipeline on one text file.

    Steps: coreference resolution, place-based chunking, canonical NER,
    feature extraction, validation of the extracted features, and merging of
    entities that share a mention.  Progress is printed after every step.

    Args:
        filepath: Path of the UTF-8 text file to process.
        model: Chat model name passed to the completions API.
        llm_client: Object exposing `chat.completions.create(...)`; when None
            the module-level `client` is used.

    Returns:
        A list of OrderedDicts, one per distinct entity mention.
    """
    active_client = client if llm_client is None else llm_client

    def request(prompt, temperature=0.5):
        """Send a single-turn user prompt and return the reply text."""
        response = active_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
        return response.choices[0].message.content

    with open(filepath, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # Step 1: Coreference resolution
    resolved_text = request(_build_coref_prompt(raw_text), 0.5) or ""
    print("Step-1: Coreference resolution has been done.")

    # Step 2: Chunking
    chunks = _split_into_chunks(resolved_text, request)
    print("Step-2: Chunking has been done.")

    # Step 3: Canonical NER (one entity list per chunk; empty if parsing failed)
    canonical_entities = []
    for chunk in chunks:
        canonical_response = request(_build_ner_prompt(chunk["place"], chunk["chunk"]), 0.2)
        canonical_entities.append(_as_entity_list(_extract_json_payload(canonical_response)))
        print(f'{chunk["place"]} has been done.')
    print("Step-3: Canonical NER has been done.")

    # Step 4: Feature Extraction
    last_features = []
    for entities, chunk in zip(canonical_entities, chunks):
        if not entities:
            # Nothing to enrich for this chunk; skip the request.
            last_features.append([])
            continue
        feature_response = request(_build_feature_prompt(entities, chunk["chunk"]), 0.2)
        last_features.append(_as_entity_list(_extract_json_payload(feature_response)))
    print("Step-4: Feature Extraction has been done.")

    # Step 5: Validation
    validated_features = []
    for features, chunk in zip(last_features, chunks):
        if not features:
            continue
        validation_response = request(_build_validation_prompt(features, chunk["chunk"]), 0.2)
        validated = _as_entity_list(_extract_json_payload(validation_response))
        if not validated:
            # Keep the unvalidated features rather than losing the chunk.
            print(f'Warning: validation output for {chunk["place"]} was unusable; keeping unvalidated features.')
            validated = features
        validated_features.extend(validated)
    print("Step-5: Validation of Extracted Features has been done.")

    # Step 6: Merging the same Features
    merged_features = _merge_features(validated_features)
    print("Step-6: Merging the same Features has been done.")
    return merged_features

In [4]:
# Process every discovered text file and store its merged features as JSON
# next to the source file ("<name>.txt" -> "<name>.json").
for file_name in file_names:
    validated_features = process_file_for_features(file_name)
    # splitext drops the extension whatever its length, instead of relying on
    # a hard-coded four-character suffix.
    output_path = os.path.splitext(file_name)[0] + ".json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(validated_features, f, indent=4, ensure_ascii=False)
    print("-------------------------------")
    print(f"{file_name} HAS BEEN PROCESSED!")
    print("-------------------------------")

Step-1: Coreference resolution has been done.
Step-2: Chunking has been done.
Dikili has been done.
Step-3: Canonical NER has been done.
Step-4: Feature Extraction has been done.
Step-5: Validation of Extracted Features has been done.
Step-6: Merging the same Features has been done.
-------------------------------
sea/dikili_bays.txt HAS BEEN PROCESSED!
-------------------------------
Step-1: Coreference resolution has been done.
Step-2: Chunking has been done.
Çeşme has been done.
Seferihisar has been done.
Foça has been done.
Urla has been done.
Alaçatı has been done.
Dikili has been done.
Karaburun has been done.
Step-3: Canonical NER has been done.
Step-4: Feature Extraction has been done.
Step-5: Validation of Extracted Features has been done.
Step-6: Merging the same Features has been done.
-------------------------------
sea/izmir_beaches.txt HAS BEEN PROCESSED!
-------------------------------
Step-1: Coreference resolution has been done.
Step-2: Chunking has been done.
Izmir ha