In [44]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import json
import re
import time
import random
import requests
from langchain.tools import DuckDuckGoSearchRun
search = DuckDuckGoSearchRun()

import googlemaps
from deep_translator import GoogleTranslator

# Load variables from a local .env file into the process environment so the
# API keys below are never hard-coded in the notebook.
load_dotenv()
# Google Maps key read from the GOOGLE_MAPS environment variable (None if unset);
# used as the default api_key of google_maps() and reverse_geocode().
api_key_google = os.getenv("GOOGLE_MAPS")
# Shared OpenAI client; modelRequest() sends every LLM call through it.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [21]:
# Collect the path of every JSON file in the input folder.
# os.listdir returns entries in arbitrary order, so the listing is sorted:
# any code that indexes into file_names then sees the same order on every
# machine and every run.
folder_path = "refined_all_data"
file_names = sorted(
    folder_path + "/" + f
    for f in os.listdir(folder_path)
    if os.path.isfile(folder_path + "/" + f) and f.endswith(".json")
)
print(file_names)

['refined_all_data/all_museums_normalized.json', 'refined_all_data/dikili_bays.json', 'refined_all_data/izmir_beaches.json', 'refined_all_data/izmir_beaches_summary.json', 'refined_all_data/izmir_foods.json', 'refined_all_data/museum_fees_cleaned.json']


In [22]:
file_names

['refined_all_data/all_museums_normalized.json',
 'refined_all_data/dikili_bays.json',
 'refined_all_data/izmir_beaches.json',
 'refined_all_data/izmir_beaches_summary.json',
 'refined_all_data/izmir_foods.json',
 'refined_all_data/museum_fees_cleaned.json']

In [23]:
#String Response to JSON file conversion
def stringjson2json(output_text: str):
    """
    Extract and parse JSON from an LLM string response.

    The response may wrap the payload in a markdown code fence (```json ... ```
    or a bare ``` ... ```) and may add commentary around it. The first fenced
    block is used when one is present; otherwise the whole text is parsed.

    Args:
        output_text: Raw text returned by the model (or read from a file).

    Returns:
        - The parsed Python object (list, dict, ...) if successful
        - None if the input is not text or does not contain valid JSON; the
          failure is printed rather than raised
    """
    try:
        # Prefer a ```json block, then fall back to any fenced block.
        for fence in ("```json", "```"):
            start = output_text.find(fence)
            if start == -1:
                continue
            start += len(fence)
            end = output_text.find("```", start)
            # A truncated response may never close the fence. find() then
            # returns -1, which as a slice bound would silently drop the last
            # character of the payload, so read to the end of the text instead.
            if end == -1:
                end = len(output_text)
            json_str = output_text[start:end].strip()
            break
        else:
            # No fence at all: treat the whole response as the JSON payload.
            json_str = output_text.strip()

        return json.loads(json_str)

    except (ValueError, TypeError, AttributeError, RecursionError) as e:
        print("JSON Parse Error:", e)
        return None

In [24]:
#get json file and convert string into json
def file2json(filepath):
    """Read a UTF-8 text file and parse its contents with stringjson2json.

    Returns whatever stringjson2json returns: the parsed object, or None when
    the file content cannot be parsed as JSON.
    """
    with open(filepath, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return stringjson2json(contents)

In [308]:
# Load by explicit file name: a position in a directory listing shifts as soon
# as a file is added to or removed from the folder.
museums        = file2json(folder_path + "/all_museums_normalized.json")
museum_tickets = file2json(folder_path + "/museum_fees_cleaned.json")

In [26]:
# Load by explicit file name: a position in a directory listing shifts as soon
# as a file is added to or removed from the folder.
dikili         = file2json(folder_path + "/dikili_bays.json")
beaches1       = file2json(folder_path + "/izmir_beaches.json")
beaches2       = file2json(folder_path + "/izmir_beaches_summary.json")
foods          = file2json(folder_path + "/izmir_foods.json")

In [27]:
def modelRequest(prompt,temperature=0.5):
    """Send a single-turn user prompt to gpt-4o-mini and return the reply text.

    Args:
        prompt: Text sent as the only user message.
        temperature: Sampling temperature forwarded to the API (default 0.5).

    Returns:
        The message content of the first choice in the response.
    """
    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

## Adding Museum Fees into Museums

In [309]:
def get_ticket_translation_prompt(tickets: list) -> str:
    """Build the LLM prompt that translates museum ticket entries to English.

    Args:
        tickets: Ticket-price dictionaries with the fields described in the
            prompt text (city, museum, local_tourist, foreigner, status).

    Returns:
        The prompt text with the entries embedded as a fenced JSON block.
    """
    # Embed real JSON (double quotes, null) rather than the Python repr, so the
    # data matches the ```json label and the `null` wording in rule 5.
    # ensure_ascii=False keeps Turkish characters readable for the model;
    # default=str keeps values that str() could render from raising.
    tickets_json = json.dumps(tickets, ensure_ascii=False, indent=2, default=str)
    return f"""
You are a professional translator and data formatter.

You are given a list of Turkish museum ticket price entries. Each dictionary has the fields:
- `city` (e.g., "izmir")
- `museum` (e.g., "izmir bergama akropol orenyeri")
- `local_tourist` (e.g., "muzekart ile girilebilir.", "ücretsiz", etc.)
- `foreigner` (e.g., "€ 15", "ücretsiz", etc.)
- `status` (e.g., "kapali", or a description about combined tickets)

Your task:
1. Translate all Turkish fields into **English**.
2. Use accurate terms:
   - "muzekart ile girilebilir." → "Accessible with Müzekart"
   - "ücretsiz" → "Free"
   - "kapali" → "Closed"
3. Keep the data structure the same.
4. Preserve the euro sign (e.g., "€ 15" → "€15").
5. For empty or null fields, leave them as `null`.
Here is the data to translate:
```json
{tickets_json}
```
Return the translated list as valid JSON in English.
"""


In [310]:
# Translate the Turkish ticket entries to English with the LLM.
# A low temperature is passed to modelRequest for this formatting task.
translation_prompt = get_ticket_translation_prompt(museum_tickets)
model_response = modelRequest(translation_prompt,temperature=0.3)
# stringjson2json returns None when the reply cannot be parsed as JSON.
museum_ticket_translated = stringjson2json(model_response)

In [311]:
def get_museum_matching_prompt(museum_names: list, museum_ticket_names: list) -> str:
    """Build the LLM prompt that pairs ticket-list museum names with museum names.

    Args:
        museum_names: Museum names the prompt presents as English.
        museum_ticket_names: Museum names the prompt presents as Turkish.

    Returns:
        The prompt text. It asks for a JSON list of {"turkish", "english"}
        objects, with "NO_MATCH" as the english value for unmatched names.
    """
    # Embed both lists as JSON instead of the Python repr so quoting is
    # consistent with the JSON the model is asked to return.
    names_json = json.dumps(museum_names, ensure_ascii=False, default=str)
    ticket_names_json = json.dumps(museum_ticket_names, ensure_ascii=False, default=str)
    return f"""
You are an expert in cultural heritage data integration.

## Task:
You are given two lists:
- `museum_names`: English museum names
- `museum_ticket_name`: Turkish museum names

Your goal is to match each Turkish museum name to its most likely English equivalent from the `museum_names` list.

## Rules:
- Only return matches if you are confident.
- If no confident match is found, write `"NO_MATCH"` for that Turkish name.
- Return the result as a list of dictionaries like this:
  ```json
  [
    {{ "turkish": "<turkish name>", "english": "<english name or NO_MATCH>" }},
    {{ "turkish": "<turkish name>", "english": "<english name or NO_MATCH>" }},
    {{ "turkish": "<turkish name>", "english": "<english name or NO_MATCH>" }}
  ]
  ```

## Data:
museum_names = {names_json}

museum_ticket_name = {ticket_names_json}

Please return the matched list in JSON format.
"""


In [312]:
# Names to be matched: one list from the museum records, one from the ticket list.
museum_names = [i["name"] for i in museums]
# NOTE(review): these names come from the *translated* ticket list, yet the
# matching prompt labels them as Turkish — confirm the translation step leaves
# the `museum` field untouched.
museum_ticket_name = [i["museum"] for i in museum_ticket_translated]


# Ask the LLM to pair the two name lists, then parse its JSON reply
# (None when the reply cannot be parsed).
museum_prompt = get_museum_matching_prompt(museum_names,museum_ticket_name)
model_response = modelRequest(museum_prompt,temperature=0.3)
museums_match_response = stringjson2json(model_response)

In [313]:
def normalize(text):
    """Return a canonical form of a name so equal names compare equal.

    Lowercases, folds both Turkish i-variants ("İ", "ı") to a plain "i",
    replaces the en dash and the curly apostrophe with their ASCII forms, and
    collapses every run of whitespace (including non-breaking spaces) to a
    single space with no leading or trailing whitespace.

    Args:
        text: The name to normalize.

    Returns:
        The normalized string.
    """
    # Fold "İ" before lowercasing: str.lower() turns it into "i" followed by a
    # combining dot (U+0307), which never compares equal to a plain "i".
    text = text.replace("İ", "i").lower().replace("ı", "i")
    text = text.replace("–", "-").replace("’", "'")
    # split() treats "\xa0" as whitespace and drops leading/trailing runs, so
    # this covers the strip and collapses runs of any length (a single
    # replace("  ", " ") would leave "a   b" as "a  b").
    return " ".join(text.split())

In [314]:
# Attach ticket prices to every museum and tag each one as a Museum node.
# Prices stay None unless a matched ticket entry is found.
for museum in museums:
    museum.update({
        "local_price": None,
        "foreigner_price": None,
        "node_type": 'Museum',
    })

    for translation in museums_match_response:
        # Skip name pairs whose English side is not this museum.
        if normalize(translation["english"]) != normalize(museum["name"]):
            continue
        turkish_name = translation["turkish"]

        for turkish_museum_info in museum_ticket_translated:
            # Skip ticket entries that belong to a different museum.
            if normalize(turkish_museum_info["museum"]) != normalize(turkish_name):
                continue
            museum["local_price"] = turkish_museum_info.get("local_tourist")
            museum["foreigner_price"] = turkish_museum_info.get("foreigner")

In [315]:
museums[1]

{'file': 'museum_10.html',
 'name': 'Izmir Birgi Cakiraga Mansion',
 'rating': '4.2',
 'description': 'Masterpiece of Ottoman Civil Architecture\n\nBirgi is a peaceful, green and pretty neighborhood stretching along both banks of a stream on the southern slopes of Mt Tmolos (Bozdaglar). The Cakiraga Mansion in Birgi was commissioned by Cakiroglu Serif Ali Agha, a merchant from Birgi, in the late eighteenth century. The three-story mansion is renowned with its panoramas depicting Istanbul and Izmir. It is an unparalleled example of Ottoman civil architecture in the Aegean region. Authentic wall painting and woodwork on the exterior and interior walls of Cakiraga Mansion in Birgi enchant its visitors.\n\n',
 'museum_id': '2083',
 'address': 'Birgi Mahallesi Sht. Gurol Madan Caddesi Odemis/IZMIR',
 'email': 'odemismuzesi@ktb.gov.tr',
 'phone': '+90 (232) 531-5205',
 'opening_hours': {'summer': '08:30-19:00(Monday : Close)',
  'winter': '08:30-17:30(Monday : Close)',
  'box_office_closing'

### Feature Extraction for Museums

In [316]:
import json

def extract_museum_features_prompt(museum_data):
    """Build the LLM prompt that extracts semantic features for each museum.

    Args:
        museum_data: JSON-serializable list of museum records.

    Returns:
        The prompt text. It asks for a raw JSON array with one object per
        museum holding `mention`, `what_to_do`, `best_for`,
        `special_features` and `tags`.
    """
    # ensure_ascii=False keeps non-ASCII names readable instead of \\uXXXX escapes.
    museums_json = json.dumps(museum_data, ensure_ascii=False, indent=2)
    # The reply is parsed as JSON, so the prompt asks for JSON (a Python
    # literal with single quotes would not parse).
    return f"""
You are an expert travel assistant AI.

Your task is to extract **semantic features** from a list of museums. For each museum, return the following fields:
- `what_to_do`: Activities a visitor can do there (e.g., "learn history", "explore exhibitions")
- `best_for`: The type of visitors who would most enjoy it (e.g., "families", "art lovers", "students")
- `special_features`: Notable or unique characteristics (e.g., "interactive exhibits", "rare artifacts", "modern design")
- `tags`: General topic or style tags (e.g., "art", "history", "science", "architecture")

Instructions:
- Use fields such as `description`, `facilities`, and `name` to guide your answers.
- For each museum, return:
  - `mention` (the museum's `name` from the input data, copied unchanged)
  - `what_to_do`
  - `best_for`
  - `special_features`
  - `tags`
- All values must be lowercase and formatted as lists of strings.
- Do not remove or skip any museum.
- Do not include markdown or code formatting (no triple backticks, no Python labels).
- Just return a **raw JSON array of objects** with those five fields.

Here is the input data:

{museums_json}
"""


In [321]:
# Ask the LLM for semantic features of every museum and parse its reply.
prompt = extract_museum_features_prompt(museums)
response = modelRequest(prompt, temperature=0.2)
# None when the reply cannot be parsed as JSON.
museum_features = stringjson2json(response)

In [322]:
#getting standardized museum features
def standardize_museum_features_prompt(museum_features):
    """Build the LLM prompt that unifies the wording of extracted museum features.

    Args:
        museum_features: JSON-serializable list of museum feature objects.

    Returns:
        The prompt text: fixed instructions, the data as indented JSON, and a
        closing request for the standardized list.
    """
    serialized = json.dumps(museum_features, indent=2)
    instructions = """
You are a data normalization assistant.

Given a list of museum objects and their semantic fields (`what_to_do`, `best_for`, `special_features`, and `tags`), your task is to standardize the values inside each field.

Instructions:
1. Normalize phrasing across all fields (e.g., "explore ancient ruins", "explore archaeological artifacts" → "explore ancient sites").
2. Use short, lowercase, meaningful phrases.
3. Replace synonyms and unify inconsistent forms.
4. Keep the original structure: keep the same list of dictionaries and keep each museum’s `mention` unchanged.
5. Only transform the **values inside** `what_to_do`, `best_for`, `special_features`, and `tags`.
6. Do not remove or add fields. Do not include markdown or code blocks.

Here is the input data:

"""
    closing = """

Return the modified list with standardized features.
"""
    return instructions + serialized + closing


In [323]:
# Second LLM pass: unify the wording of the extracted features.
prompt = standardize_museum_features_prompt(museum_features)
response = modelRequest(prompt, temperature=0.2)
# None when the reply cannot be parsed as JSON.
standardized_museums = stringjson2json(response)

In [347]:
# Lookup table: lowercased, stripped museum mention -> its four feature lists.
museum_feature_map = {
    entry["mention"].lower().strip(): {
        field: entry[field]
        for field in ("what_to_do", "best_for", "special_features", "tags")
    }
    for entry in standardized_museums
}


In [353]:
museum_feature_map

{'museum izmir culture and arts factory': {'what_to_do': ['learn history',
   'explore exhibitions',
   'participate in workshops',
   'enjoy outdoor activities'],
  'best_for': ['families', 'art lovers', 'students'],
  'special_features': ['interactive exhibits',
   'historical artifacts',
   'cultural workshops'],
  'tags': ['art', 'history', 'culture']},
 'izmir birgi cakiraga mansion': {'what_to_do': ['explore architecture',
   'admire wall paintings',
   'learn about Ottoman architecture'],
  'best_for': ['history enthusiasts', 'architecture lovers'],
  'special_features': ['authentic wall painting', 'panoramic views'],
  'tags': ['architecture', 'history']},
 'izmir cesme museum': {'what_to_do': ['explore archaeological exhibits',
   'learn about military history',
   'view historical artifacts'],
  'best_for': ['history enthusiasts', 'archaeology enthusiasts'],
  'special_features': ['historical castle', 'thematic exhibitions'],
  'tags': ['history', 'archaeology']},
 'izmir nav

In [355]:
def normalize_name(name: str) -> str:
    """Lowercase a name, trim it, and squeeze every whitespace run to one space."""
    words = name.lower().split()
    return " ".join(words)

In [359]:
# Re-key the feature map with normalize_name so lookups ignore case and spacing.
museum_feature_map = {
    normalize_name(name): features
    for name, features in museum_feature_map.items()
}

# Copy the matched feature lists into each museum; report names with no match.
for museum in museums:
    key = normalize_name(museum["name"])
    features = museum_feature_map.get(key)
    if not features:
        print(f"No match for: {museum['name']}")
        continue
    museum.update(features)


In [361]:
museums[1]

{'file': 'museum_10.html',
 'name': 'Izmir Birgi Cakiraga Mansion',
 'rating': '4.2',
 'description': 'Masterpiece of Ottoman Civil Architecture\n\nBirgi is a peaceful, green and pretty neighborhood stretching along both banks of a stream on the southern slopes of Mt Tmolos (Bozdaglar). The Cakiraga Mansion in Birgi was commissioned by Cakiroglu Serif Ali Agha, a merchant from Birgi, in the late eighteenth century. The three-story mansion is renowned with its panoramas depicting Istanbul and Izmir. It is an unparalleled example of Ottoman civil architecture in the Aegean region. Authentic wall painting and woodwork on the exterior and interior walls of Cakiraga Mansion in Birgi enchant its visitors.\n\n',
 'museum_id': '2083',
 'address': 'Birgi Mahallesi Sht. Gurol Madan Caddesi Odemis/IZMIR',
 'email': 'odemismuzesi@ktb.gov.tr',
 'phone': '+90 (232) 531-5205',
 'opening_hours': {'summer': '08:30-19:00(Monday : Close)',
  'winter': '08:30-17:30(Monday : Close)',
  'box_office_closing'

## Adding Geo Information

In [362]:
def google_maps(place_name,api_key=api_key_google):
    """Geocode a place with Google Maps after translating its name to Turkish.

    The name is translated with GoogleTranslator (source auto-detected, target
    'tr'), ", Izmir" is appended, and the resulting query is geocoded.

    Args:
        place_name: Name of the place to look up.
        api_key: Google Maps API key; defaults to api_key_google.

    Returns:
        Whatever googlemaps.Client.geocode returns for the query.
    """
    translator = GoogleTranslator(source='auto', target='tr')
    translated = translator.translate(place_name)

    maps_client = googlemaps.Client(key=api_key)
    query = f"{translated}, Izmir"
    return maps_client.geocode(query)

In [363]:
def reverse_geocode(geo_string, api_key=api_key_google):
    """Reverse-geocode a "lat,lng" string with Google Maps.

    Args:
        geo_string: Comma-separated text; the first two parts are used as
            latitude and longitude.
        api_key: Google Maps API key; defaults to api_key_google.

    Returns:
        The first reverse-geocoding result, or the string "No address found"
        when the lookup returns nothing.
    """
    parts = geo_string.split(",")
    position = (parts[0], parts[1])

    maps_client = googlemaps.Client(key=api_key)
    matches = maps_client.reverse_geocode(position)

    return matches[0] if matches else "No address found"

In [364]:
def parse_google_address(geocode_result):
    """Flatten the first geocoding result into a small location dictionary.

    Args:
        geocode_result: Sequence of geocoding results; only the first is read.

    Returns:
        {} when geocode_result is empty or None; otherwise a dict with
        "latitude", "longitude", "district", "city", "province" and "country".
        Area fields stay None when no matching address component exists.
    """
    if not geocode_result:
        return {}

    top = geocode_result[0]
    components = top.get("address_components", [])
    coords = top["geometry"]["location"]
    location_info = dict(
        latitude=coords["lat"],
        longitude=coords["lng"],
        district=None,
        city=None,
        province=None,
        country=None,
    )

    # Checked in this order; the first rule that matches a component wins.
    rules = (
        (("administrative_area_level_2",), "district"),   # e.g., Konak
        (("administrative_area_level_1",), "province"),   # e.g., İzmir
        (("country",), "country"),
        (("locality", "administrative_area_level_3"), "city"),
    )

    for component in components:
        kinds = component.get("types", [])
        for type_names, field in rules:
            if any(type_name in kinds for type_name in type_names):
                location_info[field] = component["long_name"]
                break

    return location_info


In [365]:
def get_most_specific_area(place_name: str, place_info: dict) -> tuple:
    """
    Return the most specific known area for a place, plus its coordinates.

    Areas are tried in the order district > city > province; the first one
    that is set and differs from `place_name` is used.

    Args:
        place_name: The name that was geocoded.
        place_info: Location dictionary as produced by parse_google_address.

    Returns:
        A tuple (name, [latitude, longitude]). `name` is None when no area
        qualifies.

    Raises:
        KeyError: If place_info has no "latitude" or "longitude", e.g. the
            empty dict parse_google_address returns for a failed lookup.
    """
    # Start from None so the name is always bound, even when every area level
    # is missing or equal to the place name itself.
    name = None
    for level in ("district", "city", "province"):
        candidate = place_info.get(level)
        if candidate and candidate != place_name:
            name = candidate
            break

    return name, [place_info["latitude"], place_info["longitude"]]

In [366]:
def get_address(place_name):
    """Look up a place in İzmir with Nominatim and return its display name.

    Sends one search request for "<place_name>, izmir", restricted to Turkey
    and to a single result.

    Args:
        place_name: Name of the place to search for.

    Returns:
        The "display_name" of the first result, or None when the request
        fails, the server answers with a non-200 status, or nothing is found.
        Failures are printed rather than raised.
    """
    url = "https://nominatim.openstreetmap.org/search"
    headers = {
        'User-Agent': 'izmir-chatbot/1.0 (mertunknown45@gmail.com)'
    }
    params = {
        'q': f"{place_name}, izmir",
        'format': 'json',
        'limit': 1,
        'countrycodes': 'tr'
    }
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Error fetching address for {place_name}: HTTP {response.status_code}")
            return None
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error fetching address for {place_name}: {e}")
        return None

    # An empty list means "no match": report it as such instead of letting
    # data[0] raise and be logged as a fetch error.
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        print(f"No address found for {place_name}")
        return None
    return data[0].get("display_name")
            

In [367]:
def get_town_name(data):
    """Return the district name of a geocoding result, else its province.

    Args:
        data: A geocoding result with an "address_components" list. Any
            non-dict value (such as the "No address found" string that
            reverse_geocode returns for a failed lookup) yields None.

    Returns:
        The long_name of the administrative_area_level_2 component when one
        exists, otherwise that of the administrative_area_level_1 component,
        otherwise None.
    """
    if not isinstance(data, dict):
        return None

    components = data.get("address_components") or []
    # Scan once per level so the district wins regardless of component order.
    for level in ("administrative_area_level_2", "administrative_area_level_1"):
        for component in components:
            if level in component.get("types", []):
                return component.get("long_name")
    return None

### Museum locations:

In [368]:
# Reverse-geocode each museum's comma-separated coordinate string and store the
# resulting district (or province) name under "located_in".
for ent in museums:
    address = reverse_geocode(ent["coordinates"])
    ent["located_in"] = get_town_name(address)

In [369]:
museums[1]

{'file': 'museum_10.html',
 'name': 'Izmir Birgi Cakiraga Mansion',
 'rating': '4.2',
 'description': 'Masterpiece of Ottoman Civil Architecture\n\nBirgi is a peaceful, green and pretty neighborhood stretching along both banks of a stream on the southern slopes of Mt Tmolos (Bozdaglar). The Cakiraga Mansion in Birgi was commissioned by Cakiroglu Serif Ali Agha, a merchant from Birgi, in the late eighteenth century. The three-story mansion is renowned with its panoramas depicting Istanbul and Izmir. It is an unparalleled example of Ottoman civil architecture in the Aegean region. Authentic wall painting and woodwork on the exterior and interior walls of Cakiraga Mansion in Birgi enchant its visitors.\n\n',
 'museum_id': '2083',
 'address': 'Birgi Mahallesi Sht. Gurol Madan Caddesi Odemis/IZMIR',
 'email': 'odemismuzesi@ktb.gov.tr',
 'phone': '+90 (232) 531-5205',
 'opening_hours': {'summer': '08:30-19:00(Monday : Close)',
  'winter': '08:30-17:30(Monday : Close)',
  'box_office_closing'

### Other locations:

In [140]:
# Fallback coordinates given to foods whose place is just "İzmir".
# Presumably [latitude, longitude] of the city centre — TODO confirm the source.
centre_coordinates = [38.423734, 27.142826]

In [127]:
# Geocode every Dikili entry as "<mention>,<first located_in value>" and store
# [latitude, longitude] under "coordinates".
# NOTE(review): same steps as get_coordinates(); the returned area name
# (`district`) is not used here.
for ent in dikili:
    place_name = ent["mention"]+","+ ent["located_in"][0]
    info = parse_google_address(google_maps(place_name))
    district,geo_location=get_most_specific_area(place_name,info)
    ent["coordinates"] = geo_location

In [132]:
def get_coordinates(places_list):
    """Geocode every place in the list and store the result under "coordinates".

    Each place is looked up as "<mention>,<first located_in value>". The
    entries are modified in place and the same list object is returned.

    Args:
        places_list: List of place dictionaries with "mention" and
            "located_in" (a sequence whose first item is used).

    Returns:
        places_list, with "coordinates" set to [latitude, longitude] on
        every entry.
    """
    for place in places_list:
        query = place["mention"] + "," + place["located_in"][0]
        geocoded = parse_google_address(google_maps(query))
        # Only the coordinates are needed; the area name is discarded.
        _, lat_lng = get_most_specific_area(query, geocoded)
        place["coordinates"] = lat_lng
    return places_list

In [135]:
# Geocode the beach entries. get_coordinates mutates the list in place and
# returns the same object, so the reassignment keeps the name unchanged.
beaches1= get_coordinates(beaches1)

In [136]:
# Geocode the beach-summary entries. get_coordinates mutates the list in place
# and returns the same object, so the reassignment keeps the name unchanged.
beaches2= get_coordinates(beaches2)

In [143]:
# Give every food a coordinate based on the first place in "where_to_eat".
for ent in foods:
    place_name = ent["where_to_eat"][0]
    # A bare "İzmir" is not geocoded; the fixed centre coordinates are used.
    if place_name == "İzmir":
        ent["coordinates"]= centre_coordinates
    else: 
        place_name+=",İzmir"
        info = parse_google_address(google_maps(place_name))
        # Only the coordinates are kept; the area name (`district`) is unused.
        district,geo_location=get_most_specific_area(place_name,info)
        ent["coordinates"] = geo_location

In [154]:
# Tag every food entry with its node type.
for ent in foods:
    ent["node_type"]="Food"

## Normalization of Turkish Characters

In [191]:
# Turkish letters mapped to their closest ASCII letters; built once instead of
# on every normalize_name call.
_TURKISH_TO_ASCII = str.maketrans("çğıöşüÇĞİÖŞÜ", "cgiosuCGIOSU")


def normalize_name(name: str) -> str:
    """Replace Turkish letters with ASCII ones, then lowercase and strip the text."""
    return name.translate(_TURKISH_TO_ASCII).lower().strip()


def normalize_entities_in_place(entities):
    """Normalize every string value of every entity with normalize_name.

    String values are normalized directly; list values are normalized item by
    item, leaving non-string items untouched. The "node_type" and
    "coordinates" fields are never changed, and other value types (dicts,
    numbers, None) are left as they are.

    Args:
        entities: List of entity dictionaries; may be empty.

    Returns:
        The same list object, with its dictionaries modified in place.
    """
    skipped_keys = {"node_type", "coordinates"}

    for ent in entities:
        # Walk each entity's own keys. Taking the key set from entities[0]
        # fails on an empty list and raises KeyError as soon as one entity
        # lacks a field the first one has.
        for key, value in list(ent.items()):
            if key in skipped_keys:
                continue
            if isinstance(value, str):
                ent[key] = normalize_name(value)
            elif isinstance(value, list):
                ent[key] = [normalize_name(v) if isinstance(v, str) else v for v in value]

    return entities


In [198]:
# Normalize the text of every dataset. normalize_entities_in_place changes
# string and list-of-string fields in place (everything except "node_type" and
# "coordinates") and returns the same list, so the reassignments keep the names.
dikili   = normalize_entities_in_place(dikili)
beaches1 = normalize_entities_in_place(beaches1)
beaches2 = normalize_entities_in_place(beaches2)
foods    = normalize_entities_in_place(foods)
museums  = normalize_entities_in_place(museums)

### Renaming `name` to `mention`

In [373]:
# Rename the "name" field of every museum to "mention".
# The membership check makes the cell safe to re-run: once "name" is gone, a
# second run is a no-op instead of a KeyError.
for ent in museums:
    if "name" in ent:
        ent["mention"] = ent.pop("name")

In [222]:
# Rename the "name" field of every food to "mention".
# The membership check makes the cell safe to re-run: once "name" is gone, a
# second run is a no-op instead of a KeyError.
for ent in foods:
    if "name" in ent:
        ent["mention"] = ent.pop("name")

### Normalization of Features

In [377]:
def build_dikili_feature_map_prompt(data):
    """Build the LLM prompt that asks for a feature-normalization mapping.

    Args:
        data: JSON-serializable list of places with their feature fields.

    Returns:
        The prompt text. It asks for a single JSON object with one nested
        raw-value -> canonical-value dictionary for each of `what_to_do`,
        `best_for`, `special_features` and `tags`.
    """
    # ensure_ascii=False keeps non-ASCII text readable instead of \\uXXXX escapes.
    places_json = json.dumps(data, ensure_ascii=False, indent=2)
    # The reply is parsed as JSON and the example below is one JSON object, so
    # instruction 3 asks for exactly that rather than for Python code.
    return f"""
You are an expert data normalization assistant.

Given a list of places and their features, your task is to generate a **normalization mapping dictionary** to standardize the following fields:
- `what_to_do`
- `best_for`
- `special_features`
- `tags`

Instructions:
1. Identify synonyms, spelling variations, plural/singular forms, or inconsistent phrasing.
2. Create a dictionary for each field, where:
   - Keys = raw values found in the data
   - Values = their canonical, normalized form (lowercase, short, consistent)
3. Return the output as **valid JSON**: a single object with one nested dictionary per field.
4. Do **not** wrap your output in markdown (no triple backticks or language markers).

Here is the input data:

{places_json}

Return the mapping dictionary like this:

{{
  "what_to_do": {{
    "explore nearby islands": "explore",
    "boat tours": "boat tour",
    "swimming": "swim"
  }},
  "best_for": {{
    "hotel guests": "guests",
    "nature lovers": "nature lovers"
  }},
  "special_features": {{
    "green nature": "nature",
    "deep blue sea": "sea"
  }},
  "tags": {{
    "swimming": "swim",
    "family-friendly": "families"
  }}
}}
"""


In [381]:
# One combined list for the feature-normalization step. Concatenation builds a
# new list but reuses the same dictionaries, so in-place changes made through
# all_data are also visible in dikili, beaches1 and beaches2.
all_data = dikili + beaches1 + beaches2

In [382]:
# Ask the LLM for a raw-value -> canonical-value mapping per feature field.
prompt = build_dikili_feature_map_prompt(all_data)
response = modelRequest(prompt, temperature=0.2)
# None when the reply cannot be parsed as JSON.
mapping_dict = stringjson2json(response)

In [408]:
mapping_dict

{'what_to_do': {'explore': 'explore',
  'relax': 'relax',
  'swim': 'swim',
  'sunbathe': 'sunbathe',
  'watch sunset': 'watch sunset',
  'camp': 'camp',
  'enjoy cooler waters': 'enjoy cooler waters',
  'hike': 'hike',
  'dine': 'dine',
  'experience local culture': 'experience local culture',
  'photograph': 'photograph',
  'learn about history': 'learn about history',
  'enjoy thermal springs': 'enjoy thermal springs',
  'snorkeling': 'snorkeling',
  'windsurfing': 'windsurfing',
  'rent a car': 'rent a car',
  'stay in boutique hotels': 'stay in boutique hotels',
  'explore traditional architecture': 'explore traditional architecture',
  'visit vineyards': 'visit vineyards',
  'visit wineries': 'visit wineries',
  'enjoy family activities': 'enjoy family activities',
  'explore historical sites': 'explore historical sites',
  'enjoy nightlife': 'enjoy nightlife',
  'explore slow cities': 'explore slow cities',
  'learn about local culture': 'learn about local culture',
  'watch': '

In [384]:
def normalize_features(entry, mapping_dict):
    """Replace an entry's feature values with their canonical forms, in place.

    For each of "what_to_do", "best_for", "special_features" and "tags" that
    the entry has, every value is lowercased and stripped, looked up in that
    field's mapping, and the field becomes the sorted list of distinct
    results. Values without a mapping keep their lowercased, stripped form.

    Args:
        entry: Place dictionary to update.
        mapping_dict: {field: {raw value: canonical value}}. A missing field,
            or None for the whole mapping, is treated as an empty mapping.

    Returns:
        None; `entry` is modified in place.
    """
    for field in ("what_to_do", "best_for", "special_features", "tags"):
        if field not in entry:
            continue
        # The mapping comes from an LLM reply, so a field may be absent.
        field_map = (mapping_dict or {}).get(field) or {}
        canonical = set()
        for val in entry[field]:
            raw = val.lower().strip()
            mapped = field_map.get(raw, raw)
            # Ignore non-string targets (e.g. null); mixing them into the set
            # would make sorted() fail.
            canonical.add(mapped if isinstance(mapped, str) else raw)
        entry[field] = sorted(canonical)

In [393]:
# Canonicalize the feature lists of every place. normalize_features edits each
# dictionary in place and returns nothing.
for place in all_data:
    normalize_features(place, mapping_dict)

### Saving Json Files

In [394]:
folder = "normalized_data"
# open(..., "w") does not create missing directories, so make sure the output
# folder exists before any file is written to it.
os.makedirs(folder, exist_ok=True)

In [395]:
# Write the Dikili entries as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters unescaped in the file.
file_path=folder+"/dikili.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(dikili, f, ensure_ascii=False, indent=4)

In [396]:
# Write the beach entries as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters unescaped in the file.
file_path=folder+"/beaches1.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(beaches1, f, ensure_ascii=False, indent=4)

In [397]:
# Write the beach-summary entries as indented UTF-8 JSON; ensure_ascii=False
# keeps non-ASCII characters unescaped in the file.
file_path=folder+"/beaches2.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(beaches2, f, ensure_ascii=False, indent=4)

In [398]:
# Write the food entries as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters unescaped in the file.
file_path=folder+"/foods.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(foods, f, ensure_ascii=False, indent=4)

In [399]:
# Write the museum entries as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters unescaped in the file.
file_path=folder+"/museums.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(museums, f, ensure_ascii=False, indent=4)

In [403]:
dikili[0].keys()

dict_keys(['mention', 'node_type', 'what_to_do', 'best_for', 'special_features', 'tags', 'description', 'located_in', 'coordinates'])

In [404]:
beaches1[0].keys()

dict_keys(['mention', 'node_type', 'what_to_do', 'best_for', 'special_features', 'tags', 'description', 'located_in', 'coordinates'])

In [405]:
beaches2[0].keys()

dict_keys(['mention', 'node_type', 'what_to_do', 'best_for', 'special_features', 'tags', 'description', 'located_in', 'coordinates'])

In [406]:
museums[0].keys()

dict_keys(['file', 'rating', 'description', 'museum_id', 'address', 'email', 'phone', 'opening_hours', 'facilities', 'images', 'coordinates', 'local_price', 'foreigner_price', 'node_type', 'what_to_do', 'best_for', 'special_features', 'tags', 'located_in', 'mention'])

In [407]:
foods[0].keys()

dict_keys(['type', 'ingredients', 'description', 'where_to_eat', 'coordinates', 'node_type', 'mention'])