In [192]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import json
import re
import time
import random
import requests
from collections import defaultdict
from tqdm import tqdm


# Load variables from a .env file into the process environment before reading any keys.
load_dotenv()
# Google Maps key, read from the GOOGLE_MAPS environment variable (None if unset).
api_key_google = os.getenv("GOOGLE_MAPS")
# OpenAI client used by modelRequest; the key comes from OPENAI_API_KEY.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
# Collect the path of every .json file in the normalized-data folder.
# os.listdir returns entries in arbitrary order, so the result is sorted to make the
# list (and the printed output) identical on every run and every machine.
folder_path = "normalized_data"

file_names = sorted(
    folder_path + "/" + f
    for f in os.listdir(folder_path)
    if os.path.isfile(folder_path + "/" + f) and f.endswith(".json")
)
print(file_names)

['normalized_data/beaches1.json', 'normalized_data/beaches2.json', 'normalized_data/dikili.json', 'normalized_data/foods.json', 'normalized_data/museums.json']


In [170]:
# List the JSON files already written to the knowledge-graph output folder.
# Dedicated variable names are used so the `folder_path` / `file_names` values built from
# normalized_data are not overwritten, and a folder that has not been created yet yields
# an empty list instead of raising FileNotFoundError.
kg_folder_path = "kg_database_data"
kg_file_names = []
if os.path.isdir(kg_folder_path):
    kg_file_names = sorted(
        kg_folder_path + "/" + f
        for f in os.listdir(kg_folder_path)
        if os.path.isfile(kg_folder_path + "/" + f) and f.endswith(".json")
    )
print(kg_file_names)

['kg_database_data/foods.json', 'kg_database_data/merged_places.json', 'kg_database_data/museums.json']


In [3]:
#String Response to JSON file conversion
def stringjson2json(output_text: str):
    """
    Extract and parse JSON from an LLM string output that may be wrapped in a
    markdown code fence or surrounded by commentary.

    The first ```json fence is preferred; otherwise the first ``` fence of any kind is
    used; otherwise the whole text is parsed. A fence that is never closed (e.g. a
    truncated response) is read up to the end of the text.

    Returns:
        - The parsed Python object (list/dict/...) if successful
        - None if parsing fails (the error is printed)
    """
    try:
        json_str = output_text
        for fence in ("```json", "```"):
            start = output_text.find(fence)
            if start == -1:
                continue
            start += len(fence)
            end = output_text.find("```", start)
            if end == -1:
                # No closing fence: str.find returned -1, which as a slice bound would
                # silently drop the last character, so read to the end instead.
                end = len(output_text)
            json_str = output_text[start:end]
            break

        return json.loads(json_str.strip())

    except Exception as e:
        # Best-effort helper: report the problem and let the caller handle None.
        print("JSON Parse Error:", e)
        return None

In [4]:
#get json file and convert string into json
def file2json(filepath):
    """Read `filepath` as UTF-8 text and return it parsed by stringjson2json (None on parse failure)."""
    with open(filepath, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return stringjson2json(contents)

In [5]:
# Load every dataset by its explicit file name rather than by position in a directory
# listing, whose order and contents can change between runs.
beaches1 = file2json("normalized_data/beaches1.json")
beaches2 = file2json("normalized_data/beaches2.json")
dikili = file2json("normalized_data/dikili.json")
foods = file2json("normalized_data/foods.json")
museums = file2json("normalized_data/museums.json")

### Merging Places

In [6]:
# Concatenate the three place datasets into one list
places = beaches1 + beaches2
places = places + dikili

In [7]:
# Tally how often each mention occurs in `places`; a count above 1 marks a duplicate
adict = {}
for ent in places:
    mention = ent["mention"]
    if mention in adict:
        adict[mention] += 1
    else:
        adict[mention] = 1

In [8]:
# Display the per-mention occurrence counts computed from `places`
adict

{'cesme': 2,
 'ilica beach': 2,
 'altinkum beach': 2,
 'ayayorgi bay': 1,
 'kocakari beach': 1,
 'pirlanta beach': 1,
 'sakizli bay': 1,
 'boyalik bay': 1,
 'cesme castle': 1,
 'caravanserai': 1,
 'seferihisar': 2,
 'sigacik': 1,
 'cittaslow': 1,
 'teos': 1,
 'temple of dionysus': 1,
 'sigacik castle': 1,
 'akkum beach': 2,
 'akarca beach': 1,
 'foca': 2,
 'siren rocks': 1,
 'eski foca': 2,
 'yeni foca': 2,
 'persian tomb monument': 1,
 'fatih mosque': 1,
 'phrygian hill': 1,
 'foca aqueduct': 1,
 'hanedan beach': 1,
 'sazlica beaches': 1,
 'canak bay': 1,
 'voodoo beach': 1,
 'urla': 2,
 'izmir': 2,
 'karantina island': 1,
 'klazomenai ancient city': 1,
 'kostem olive oil museum': 1,
 'historical barbaros village': 1,
 'bademler village': 1,
 'malgaca market': 1,
 'altinkoy beach': 1,
 'demircili beach': 1,
 'melengec beach': 1,
 'deniz yildizi beach': 1,
 'bodrum bay': 1,
 'cesmealti beach': 1,
 'yassica island': 1,
 'sand sea beach': 1,
 'ozbek akkum beach': 1,
 'gulbahce beach': 1,

In [9]:
#merging places
def merge_places(places):
    """
    Collapse entries that share the same (case-insensitive) 'mention' into one record.

    For duplicates, relative to the first entry seen:
      - the list fields what_to_do / best_for / special_features / tags are concatenated
        and de-duplicated while keeping first-seen order (so results are reproducible);
      - the longest description wins;
      - located_in / coordinates are filled in only if the kept record has none;
      - node_type becomes 'Place' when the duplicates disagree.

    The input dicts are not modified (each kept record is a shallow copy).

    Args:
        places: iterable of dicts, each with at least a 'mention' string.

    Returns:
        list of merged dicts, in order of first appearance.
    """
    list_fields = ('what_to_do', 'best_for', 'special_features', 'tags')
    merged = {}

    for ent in places:
        key = ent['mention'].lower()

        if key not in merged:
            merged[key] = ent.copy()
            continue

        existing = merged[key]

        # `or []` also covers fields that are present but set to None.
        for field in list_fields:
            combined = (existing.get(field) or []) + (ent.get(field) or [])
            # dict.fromkeys de-duplicates while preserving order (a set would not).
            existing[field] = list(dict.fromkeys(combined))

        # Keep the longest description (a missing/None description counts as empty)
        if len(ent.get('description') or '') > len(existing.get('description') or ''):
            existing['description'] = ent['description']

        # Prefer non-empty located_in and coordinates
        if not existing.get('located_in') and ent.get('located_in'):
            existing['located_in'] = ent['located_in']
        if not existing.get('coordinates') and ent.get('coordinates'):
            existing['coordinates'] = ent['coordinates']

        # node_type should remain consistent; if not, default to 'Place'
        if existing.get('node_type') != ent.get('node_type'):
            existing['node_type'] = 'Place'

    return list(merged.values())

In [10]:
# De-duplicate `places` by mention; the input list itself is left untouched
merged_places = merge_places(places)

In [11]:
# Re-count mentions after merging; every count is expected to be 1 now
adict = {}
for ent in merged_places:
    mention = ent["mention"]
    adict[mention] = 1 if mention not in adict else adict[mention] + 1

In [12]:
# Display the per-mention counts for the merged list
adict

{'cesme': 1,
 'ilica beach': 1,
 'altinkum beach': 1,
 'ayayorgi bay': 1,
 'kocakari beach': 1,
 'pirlanta beach': 1,
 'sakizli bay': 1,
 'boyalik bay': 1,
 'cesme castle': 1,
 'caravanserai': 1,
 'seferihisar': 1,
 'sigacik': 1,
 'cittaslow': 1,
 'teos': 1,
 'temple of dionysus': 1,
 'sigacik castle': 1,
 'akkum beach': 1,
 'akarca beach': 1,
 'foca': 1,
 'siren rocks': 1,
 'eski foca': 1,
 'yeni foca': 1,
 'persian tomb monument': 1,
 'fatih mosque': 1,
 'phrygian hill': 1,
 'foca aqueduct': 1,
 'hanedan beach': 1,
 'sazlica beaches': 1,
 'canak bay': 1,
 'voodoo beach': 1,
 'urla': 1,
 'izmir': 1,
 'karantina island': 1,
 'klazomenai ancient city': 1,
 'kostem olive oil museum': 1,
 'historical barbaros village': 1,
 'bademler village': 1,
 'malgaca market': 1,
 'altinkoy beach': 1,
 'demircili beach': 1,
 'melengec beach': 1,
 'deniz yildizi beach': 1,
 'bodrum bay': 1,
 'cesmealti beach': 1,
 'yassica island': 1,
 'sand sea beach': 1,
 'ozbek akkum beach': 1,
 'gulbahce beach': 1,

In [13]:
# Remove the "aegean sea" entry from the merged list.
# `del ent` inside a for-loop only unbinds the loop variable and leaves the list unchanged,
# so the list is rebuilt without the unwanted entry instead.
merged_places = [ent for ent in merged_places if ent["mention"] != "aegean sea"]

In [14]:
# Display the merged place records for visual inspection
merged_places

[{'mention': 'cesme',
  'node_type': 'Town',
  'what_to_do': ['dine', 'explore', 'swim', 'relax', 'enjoy nightlife'],
  'best_for': ['water sports lovers',
   'cultural tourists',
   'nature lovers',
   'couples',
   'families',
   'nightlife enthusiasts'],
  'special_features': ['holiday destination',
   'beach clubs',
   'nightlife',
   'historical',
   'beaches',
   'stunning natural beauty'],
  'tags': ['town',
   'holiday destination',
   'beach',
   'scenic',
   'families',
   'nightlife',
   'holiday',
   'historical'],
  'description': 'cesme is a popular holiday destination in izmir, known for its stunning beaches, historical sites, and vibrant dining options, making it ideal for a summer getaway. cesme is a nearby town connected to alacati, known for its beautiful coastline and summer tourism.',
  'located_in': ['izmir'],
  'coordinates': [38.3228017, 26.3027607]},
 {'mention': 'ilica beach',
  'node_type': 'Beach',
  'what_to_do': ['relax',
   'swim',
   'sunbathe',
   'phot

### Combining Features (special_features, tags, what_to_do)

In [15]:
def modelRequest(prompt, temperature=0.5, model="gpt-4o-mini"):
    """
    Send `prompt` as a single user message to the chat-completions API and return the reply text.

    Args:
        prompt: text of the user message.
        temperature: sampling temperature forwarded to the API.
        model: chat model name; defaults to the model this notebook has always used.

    Returns:
        The content string of the first returned choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    return response.choices[0].message.content

In [16]:
# Combine the place records with the museum records into one working list.
# The resulting list holds the same dict objects as `places` and `museums`, so later
# in-place edits made through either name are visible through the other.
# NOTE(review): this is built from the raw `places` list, so the de-duplicated result of
# merge_places(places) that was assigned to merged_places earlier is discarded here —
# confirm this is intended.
merged_places=places + museums

In [17]:
# One list per entry for each of the three term fields (so each result is a list of lists)
what_to_do = [ent["what_to_do"] for ent in merged_places]
special_features = [ent["special_features"] for ent in merged_places]
tags = [ent["tags"] for ent in merged_places]

In [18]:
def map_terms_to_concepts_prompt(what_to_do, special_features, tags):
    """
    Build the LLM prompt that asks for a flat JSON map from every original term to a
    lowercase one-word concept.

    The three arguments are interpolated into the prompt text as-is (via their string
    representation), so any printable object is accepted.

    Returns:
        The complete prompt string.
    """
    return f"""
You are helping clean up location metadata for a travel knowledge graph.

You are given three lists related to a place:
- **What to Do**: {what_to_do}
- **Special Features**: {special_features}
- **Tags**: {tags}

Your task is to normalize all of these into lowercase, **one-word** semantic concepts.

Return a **JSON object** where:
- Each **original term** is a key (string)
- Each **cleaned one-word concept** is the value (string)

Do **not** group by category. Just output a flat map from all terms.

### Example Output:
{{
  "holiday destination": "holiday",
  "those looking for fun": "fun",
  "swimming": "swimming",
  "explore": "exploring",
  "historical": "history"
}}


Return the final term-to-concept mapping as valid JSON:
"""


In [19]:
# Build the normalization prompt from the collected term lists
prompt = map_terms_to_concepts_prompt(what_to_do, special_features, tags)
# Ask the model with a low temperature (0.2)
response = modelRequest(prompt, temperature=0.2)
# Parse the reply into a term -> concept mapping; stringjson2json returns None when parsing fails
merged_properties = stringjson2json(response)

In [21]:
# Replace each entity's raw terms by the de-duplicated list of mapped concepts.
# A list of the keys containing the terms we need to process.
keys_to_process = ["what_to_do", "special_features", "tags"]

for ent in merged_places:
    # If the raw term fields are already gone (this cell was run before and the fields were
    # removed afterwards), keep the existing 'properties' instead of resetting them to [].
    if not any(key in ent for key in keys_to_process):
        ent.setdefault("properties", [])
        continue

    # A dict is used as an ordered set: it drops duplicate concepts but keeps
    # first-seen order, so the stored list is the same on every run.
    concepts = {}

    for key in keys_to_process:
        # `or []` covers both a missing key and an empty/None value.
        for term in ent.get(key) or []:
            # Terms the mapping does not know are skipped.
            if term in merged_properties:
                concepts[merged_properties[term]] = None

    ent["properties"] = list(concepts)

In [22]:
# Remove the raw term fields now that 'properties' holds the mapped concepts.
# pop(..., None) makes the cell safe to re-run and tolerant of entries lacking a field,
# where `del` would raise KeyError.
for ent in merged_places:
    for key in ("what_to_do", "special_features", "tags"):
        ent.pop(key, None)

In [49]:
# Unify the shape of located_in: some values are a list (["izmir"]), some a plain string ("izmir").
# Only list values are unwrapped. Indexing a string with [0] would silently keep just its
# first character ("izmir" -> "i"), also when this cell is run a second time.
for ent in places:
    located = ent.get("located_in")
    if isinstance(located, (list, tuple)):
        # An empty list becomes an empty string so later string handling still works.
        ent["located_in"] = located[0] if located else ""

In [167]:
# Write the merged place records to disk.
# The output folder is created first so a fresh checkout does not fail with FileNotFoundError.
file_path = "kg_database_data/merged_places.json"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(merged_places, f, ensure_ascii=False, indent=4)

In [168]:
# Write the museum records to disk.
# The output folder is created first so a fresh checkout does not fail with FileNotFoundError.
file_path = "kg_database_data/museums.json"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(museums, f, ensure_ascii=False, indent=4)

In [169]:
# Write the food records to disk.
# The output folder is created first so a fresh checkout does not fail with FileNotFoundError.
file_path = "kg_database_data/foods.json"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(foods, f, ensure_ascii=False, indent=4)

## Creating Knowledge Graph

In [57]:
def normalize_name(name: str) -> str:
    """
    Return `name` with Turkish letters replaced by ASCII look-alikes, surrounding
    whitespace removed and every word title-cased (e.g. "selçuk" -> "Selcuk").
    """
    turkish_letters = "çğıöşüÇĞİÖŞÜ"
    ascii_letters = "cgiosuCGIOSU"
    ascii_name = name.translate(str.maketrans(turkish_letters, ascii_letters))
    lowered = ascii_name.lower()
    return lowered.strip().title()

In [58]:
# Normalize each museum's located_in name to ASCII title case, e.g. selçuk -> Selcuk
for ent in museums:
    raw_location = ent["located_in"]
    ent["located_in"] = normalize_name(raw_location)

In [59]:
from langchain_openai import OpenAIEmbeddings
from langchain_neo4j import Neo4jGraph

In [60]:
# Initialize the embedding client (the graph client is created in a later cell).
# The API key is read from OPENAI_API_KEY; the embedding model is text-embedding-ada-002.
embedding_provider = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-ada-002"
)

In [177]:
# text -> embedding vector. The insert functions embed the same audience / concept /
# facility strings for many places; caching avoids repeating those API requests.
_embedding_cache = {}


def get_embedding(text: str):
    """
    Return the embedding vector of `text` from `embedding_provider.embed_query`.

    Each distinct text is sent to the provider only once per kernel session; later calls
    are answered from an in-memory cache. A fresh list is returned on every call so that
    callers cannot alter the cached vector.
    """
    if text not in _embedding_cache:
        _embedding_cache[text] = embedding_provider.embed_query(text)
    return list(_embedding_cache[text])

In [62]:
# Neo4j connection used by the insert_* functions; URI and credentials are read from the
# NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD environment variables.
graph = Neo4jGraph(
    url=os.getenv('NEO4J_URI'),
    username=os.getenv('NEO4J_USERNAME'),
    password=os.getenv('NEO4J_PASSWORD')
)

In [220]:
def insert_museum(graph, places):
    """
    Write every museum found in `places` to the graph.

    Entries whose capitalized node_type is not "Museum" are skipped. For each museum the
    function MERGEs a Museum node keyed by name, sets its scalar properties and its
    description embedding, and then MERGEs these relationships:
      (Museum)-[:LOCATED_IN]->(City|Town), (Museum)-[:BEST_FOR]->(Audience),
      (Museum)-[:HAS_CONCEPT]->(Concept), (Museum)-[:HAS_FACILITY]->(Facility).

    Args:
        graph: object exposing query(cypher, params).
        places: iterable of dicts with at least "mention" and "node_type".
    """
    for place in tqdm(places):
        if place["node_type"].capitalize() != "Museum":
            continue

        name = place["mention"].lower().capitalize()
        located_in = place.get("located_in")
        best_for = place.get("best_for") or []
        properties = place.get("properties") or []
        facilities = place.get("facilities") or []
        description = place.get("description", "")

        embedding_description = get_embedding(description)

        # Coordinates arrive either as a [lat, lon] pair or as a "lat, lon" string.
        # Missing or malformed values are stored as a null point instead of aborting the run.
        coords = place.get("coordinates")
        if isinstance(coords, str):
            coords = coords.split(",")
        try:
            lat, lon = float(coords[0]), float(coords[1])
        except (TypeError, ValueError, IndexError, KeyError):
            lat, lon = None, None

        rating = place.get("rating")

        # Contact details
        address = place.get("address", "")
        email = place.get("email", "")
        phone = place.get("phone", "")

        # Opening hours are flattened into one property per season.
        # `or {}` also covers an explicit None value.
        opening_hours_data = place.get("opening_hours") or {}
        opening_hours_summer = opening_hours_data.get("summer", "")
        opening_hours_winter = opening_hours_data.get("winter", "")

        images = place.get("images", [])
        local_price = place.get("local_price", "")
        foreigner_price = place.get("foreigner_price", "")

        # Merge Museum node with flattened properties
        graph.query("""
        MERGE (m:Museum {name: $name})
        SET m.description = $description,
            m.coordinates = point({latitude: $lat, longitude: $lon}),
            m.rating = $rating,
            m.address = $address,
            m.email = $email,
            m.phone = $phone,
            m.opening_hours_summer = $opening_hours_summer,
            m.opening_hours_winter = $opening_hours_winter,
            m.images = $images,
            m.local_price = $local_price,
            m.foreigner_price = $foreigner_price
        WITH m
        CALL db.create.setNodeVectorProperty(m, 'descriptionEmbedding', $embedding_description)
        """, {
            "name": name,
            "description": description,
            "lat": lat,
            "lon": lon,
            "rating": rating,
            "address": address,
            "email": email,
            "phone": phone,
            "opening_hours_summer": opening_hours_summer,
            "opening_hours_winter": opening_hours_winter,
            "images": images,
            "local_price": local_price,
            "foreigner_price": foreigner_price,
            "embedding_description": embedding_description
        })

        # LOCATED_IN relationship.
        # Izmir is stored under the City label, every other parent under Town — the same
        # rule insert_places and insert_foods use, so all of them attach to the same nodes.
        if located_in:
            loc_label = "City" if located_in == "Izmir" else "Town"
            graph.query(f"""
            MERGE (l:{loc_label} {{name: $parent}})
            MERGE (m:Museum {{name: $name}})
            MERGE (m)-[:LOCATED_IN]->(l)
            """, {"name": name, "parent": located_in})

        # BEST_FOR → Audience
        for audience in best_for:
            embedding_audience = get_embedding(audience)

            graph.query("""
            MERGE (a:Audience {name: $audience})
            WITH a
            CALL db.create.setNodeVectorProperty(a, 'audienceEmbedding', $embedding)
            MATCH (m:Museum {name: $name})
            MERGE (m)-[:BEST_FOR]->(a)
            """, {"audience": audience.lower().capitalize(), "name": name, "embedding": embedding_audience})

        # HAS_CONCEPT → Concept (from properties)
        # NOTE(review): the embedding is stored as 'nameEmbedding' here while insert_places
        # stores Concept embeddings as 'conceptEmbedding' — confirm which property the
        # vector index expects.
        for concept in properties:
            embedding_concept = get_embedding(concept)

            graph.query("""
            MERGE (c:Concept {name: $concept})
            WITH c
            CALL db.create.setNodeVectorProperty(c, 'nameEmbedding', $embedding)
            MATCH (m:Museum {name: $name})
            MERGE (m)-[:HAS_CONCEPT]->(c)
            """, {"concept": concept.lower().capitalize(), "name": name, "embedding": embedding_concept})

        # HAS_FACILITY → Facility
        for facility in facilities:
            embedding_facility = get_embedding(facility)

            graph.query("""
            MERGE (f:Facility {name: $facility})
            WITH f
            CALL db.create.setNodeVectorProperty(f, 'facilityEmbedding', $embedding)
            MATCH (m:Museum {name: $name})
            MERGE (m)-[:HAS_FACILITY]->(f)
            """, {"facility": facility.lower().capitalize(), "name": name, "embedding": embedding_facility})
                


In [221]:
def insert_places(graph,places):
    """
    Write every non-museum entry of `places` to the graph.

    For each entry a node labelled with its title-cased node_type is MERGEd by name and
    given a description, a coordinates point and a description embedding. Entries other
    than City nodes are linked with LOCATED_IN to their parent (City for Izmir, Town
    otherwise). All entries get BEST_FOR -> Audience and HAS_CONCEPT -> Concept links.

    Museum entries are skipped: insert_museum writes them, using a different name casing,
    so merging them here as well would create a second node for the same museum.

    Args:
        graph: object exposing query(cypher, params).
        places: iterable of dicts with at least "mention" and "node_type".
    """
    for place in tqdm(places):
        label = place["node_type"].title()
        if label == "Museum":
            continue

        name = place["mention"].title()
        description = place.get("description", "")
        embedding_description = get_embedding(description)

        # Coordinates may be a [lat, lon] pair or a "lat, lon" string. Indexing the string
        # form directly would read single characters, so it is split first. Anything
        # unusable is stored as a null point.
        coordinates = place.get("coordinates")
        if isinstance(coordinates, str):
            coordinates = coordinates.split(",")
        try:
            lat, lon = float(coordinates[0]), float(coordinates[1])
        except (TypeError, ValueError, IndexError, KeyError):
            lat, lon = None, None

        # located_in may be missing, None, a plain string or a one-element list.
        located_in = place.get("located_in") or ""
        if isinstance(located_in, (list, tuple)):
            located_in = located_in[0] if located_in else ""
        located_in = str(located_in).title()

        best_for = place.get("best_for") or []
        properties = place.get("properties") or []

        # Merge the node with flattened properties. The label cannot be a query
        # parameter in Cypher, so it is injected through the f-string.
        graph.query(f"""
        MERGE (p:{label} {{name: $name}})
        SET p.description = $description,
            p.coordinates = point({{latitude: $lat, longitude: $lon}})
        WITH p
        CALL db.create.setNodeVectorProperty(p, 'descriptionEmbedding', $embedding)
        """, {
            "name": name,
            "description": description,
            "lat": lat,
            "lon": lon,
            "embedding": embedding_description
        })

        # LOCATED_IN relationship (City nodes get no parent link)
        if label != "City" and located_in:
            loc_label = "City" if located_in == "Izmir" else "Town"  # only city is Izmir
            graph.query(f"""
            MERGE (p:{loc_label} {{name: $parent}})
            MERGE (m:{label} {{name: $name}})
            MERGE (m)-[:LOCATED_IN]->(p)
            """, {
                "name": name,
                "parent": located_in
            })

        # BEST_FOR → Audience
        for audience in best_for:
            embedding_audience = get_embedding(audience)

            graph.query(f"""
            MERGE (a:Audience {{name: $audience}})
            WITH a
            CALL db.create.setNodeVectorProperty(a, 'audienceEmbedding', $embedding)
            MATCH (p:{label} {{name: $name}})
            MERGE (p)-[:BEST_FOR]->(a)
            """, {
                "audience": audience.lower().capitalize(),
                "name": name,
                "embedding": embedding_audience
            })

        # HAS_CONCEPT → Concept (from properties)
        for concept in properties:
            embedding_concept = get_embedding(concept)

            graph.query(f"""
            MERGE (c:Concept {{name: $concept}})
            WITH c
            CALL db.create.setNodeVectorProperty(c, 'conceptEmbedding', $embedding)
            MATCH (p:{label} {{name: $name}})
            MERGE (p)-[:HAS_CONCEPT]->(c)
            """, {
                "concept": concept.lower().capitalize(),
                "name": name,
                "embedding": embedding_concept
            })

In [222]:
def insert_foods(graph, foods):
    """
    Write every food in `foods` to the graph.

    For each entry a Food node is MERGEd by name with description, coordinates and a
    description embedding, then linked with:
      (Food)-[:TYPE_OF]->(FoodType), (Food)-[:HAS_INGREDIENT]->(Ingredient),
      (Food)-[:POPULAR_IN]->(City|Town).

    Args:
        graph: object exposing query(cypher, params).
        foods: iterable of dicts with at least "mention".
    """
    for food in tqdm(foods):
        name = food["mention"].lower().capitalize()
        # A missing or empty type simply means no FoodType link is created.
        food_type = (food.get("type") or "").lower().capitalize()
        description = food.get("description", "")
        ingredients = food.get("ingredients") or []
        where_to_eat = food.get("where_to_eat") or []

        # Coordinates may be a [lat, lon] pair or a "lat, lon" string; unusable values
        # are stored as a null point.
        coordinates = food.get("coordinates")
        if isinstance(coordinates, str):
            coordinates = coordinates.split(",")
        try:
            lat, lon = float(coordinates[0]), float(coordinates[1])
        except (TypeError, ValueError, IndexError, KeyError):
            lat, lon = None, None

        embedding_description = get_embedding(description)

        # Create/Merge the Food node
        graph.query("""
            MERGE (f:Food {name: $name})
            SET f.description = $description,
                f.coordinates = point({latitude: $lat, longitude: $lon})
            WITH f
            CALL db.create.setNodeVectorProperty(f, 'descriptionEmbedding', $embedding)
        """, {
            "name": name,
            "description": description,
            "lat": lat,
            "lon": lon,
            "embedding": embedding_description
        })

        # Create/Merge the FoodType node and link it
        if food_type:
            embedding_foodtype = get_embedding(food_type)

            graph.query("""
                MERGE (ft:FoodType {name: $food_type})
                WITH ft
                CALL db.create.setNodeVectorProperty(ft, 'foodTypeEmbedding', $embedding)
                MATCH (f:Food {name: $name})
                MERGE (f)-[:TYPE_OF]->(ft)
            """, {
                "food_type": food_type,
                "name": name,
                "embedding": embedding_foodtype
            })

        # Create/Merge Ingredient nodes and link them
        for ingredient in ingredients:
            ingredient_name = ingredient.lower().capitalize()

            # Embed the ingredient itself (not the food type) so each Ingredient node
            # carries a vector describing its own name.
            embedding_ingredient = get_embedding(ingredient_name)

            graph.query("""
                MERGE (i:Ingredient {name: $ingredient_name})
                WITH i
                CALL db.create.setNodeVectorProperty(i, 'ingredientEmbedding', $embedding)
                MATCH (f:Food {name: $name})
                MERGE (f)-[:HAS_INGREDIENT]->(i)
            """, {
                "ingredient_name": ingredient_name,
                "name": name,
                "embedding": embedding_ingredient
            })

        # Create/Merge location nodes (where_to_eat) and link them.
        # Names are title-cased like the Town/City names written by insert_places, so a
        # multi-word location ("eski foca") attaches to the existing node instead of
        # creating a differently-cased duplicate.
        for location in where_to_eat:
            location_name = location.title()
            loc_node = "City" if location_name == "Izmir" else "Town"
            graph.query(f"""
                MERGE (l:{loc_node} {{name: $location_name}})
                WITH l
                MATCH (f:Food {{name: $name}})
                MERGE (f)-[:POPULAR_IN]->(l)
            """, {
                "location_name": location_name,
                "name": name
            })

In [None]:
# NOTE(review): setup_vector_indexes is not defined in the part of the notebook shown here —
# confirm it is defined in a cell that runs before this one.
setup_vector_indexes(graph)

In [None]:
# Insert the Museum-typed entries found in `places`.
# NOTE(review): `places` is built from the beach/dikili datasets only, while the museum
# records are part of `merged_places` — confirm which list is intended here.
insert_museum(graph, places)

In [219]:
# Write the place nodes and their relationships; calls the embedding API for every entry
insert_places(graph,merged_places)

100%|████████████████████████████████████████████████████████████████████████████████| 135/135 [05:30<00:00,  2.45s/it]


In [208]:
# Write the food nodes with their type, ingredient and location links
insert_foods(graph, foods)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:27<00:00,  2.49s/it]


In [227]:
# Spot-check the first record after the property mapping
merged_places[0]

{'mention': 'cesme',
 'node_type': 'Town',
 'best_for': ['couples', 'cultural tourists', 'families', 'nature lovers'],
 'description': 'cesme is a popular holiday destination in izmir, known for its stunning beaches, historical sites, and vibrant dining options, making it ideal for a summer getaway. cesme is a nearby town connected to alacati, known for its beautiful coastline and summer tourism.',
 'located_in': 'izmir',
 'coordinates': [38.3228017, 26.3027607],
 'properties': ['scenery',
  'relaxing',
  'town',
  'beach',
  'holiday',
  'exploring',
  'dining',
  'swimming']}

In [226]:
# Count how many records there are per node_type
adict = {}
for ent in merged_places:
    node_type = ent["node_type"]
    adict.setdefault(node_type, 0)
    adict[node_type] += 1
adict

{'Town': 20,
 'Beach': 34,
 'Bay': 19,
 'Castle': 3,
 'HistoricalSite': 4,
 'CulturalSite': 1,
 'AncientCity': 4,
 'Temple': 1,
 'Monument': 1,
 'ReligiousPlace': 2,
 'NaturalPark': 4,
 'City': 2,
 'Island': 4,
 'Museum': 28,
 'Village': 5,
 'Market': 1,
 'Tower': 1,
 'Landmark': 1}

In [223]:
# Display the place records for visual inspection
places

[{'mention': 'cesme',
  'node_type': 'Town',
  'best_for': ['couples', 'cultural tourists', 'families', 'nature lovers'],
  'description': 'cesme is a popular holiday destination in izmir, known for its stunning beaches, historical sites, and vibrant dining options, making it ideal for a summer getaway. cesme is a nearby town connected to alacati, known for its beautiful coastline and summer tourism.',
  'located_in': 'izmir',
  'coordinates': [38.3228017, 26.3027607],
  'properties': ['scenery',
   'relaxing',
   'town',
   'beach',
   'holiday',
   'exploring',
   'dining',
   'swimming']},
 {'mention': 'ilica beach',
  'node_type': 'Beach',
  'best_for': ['beach lovers', 'couples', 'families', 'nature lovers'],
  'description': 'ilica beach is one of the most beautiful beaches in alacati, offering soft sands and clear waters perfect for swimming and relaxation. ilica beach is renowned for its stunning scenery and is one of the most beautiful beaches in cesme, perfect for swimming an

In [211]:
def insert_places(graph, places):
    """
    Write every non-museum entry of `places` to the graph.

    This cell re-defines insert_places, so on a top-to-bottom run it is the definition
    in effect. It is therefore kept behaviourally aligned with the other definition in
    this notebook: title-cased names, embeddings on every node, and relationships for
    all non-museum entries.

    For each entry a node labelled with its title-cased node_type is MERGEd by name and
    given a description, a coordinates point and a description embedding. Entries other
    than City nodes are linked with LOCATED_IN to their parent (City for Izmir, Town
    otherwise). All entries get BEST_FOR -> Audience and HAS_CONCEPT -> Concept links.
    Museum entries are skipped because insert_museum writes them.

    Args:
        graph: object exposing query(cypher, params).
        places: iterable of dicts with at least "mention" and "node_type".
    """
    for place in tqdm(places):
        label = place["node_type"].title()
        if label == "Museum":
            continue

        name = place["mention"].title()
        description = place.get("description", "")
        embedding_description = get_embedding(description)

        # Coordinates may be a [lat, lon] pair or a "lat, lon" string; anything
        # unusable (including a missing value) is stored as a null point.
        coordinates = place.get("coordinates")
        if isinstance(coordinates, str):
            coordinates = coordinates.split(",")
        try:
            lat, lon = float(coordinates[0]), float(coordinates[1])
        except (TypeError, ValueError, IndexError, KeyError):
            lat, lon = None, None

        # located_in may be missing, None, a plain string or a one-element list.
        located_in = place.get("located_in") or ""
        if isinstance(located_in, (list, tuple)):
            located_in = located_in[0] if located_in else ""
        located_in = str(located_in).title()

        best_for = place.get("best_for") or []
        properties = place.get("properties") or []

        # Merge the node with flattened properties. `WITH p` is required so the node
        # variable is still in scope for the procedure call that follows.
        graph.query(f"""
        MERGE (p:{label} {{name: $name}})
        SET p.description = $description,
            p.coordinates = point({{latitude: $lat, longitude: $lon}})
        WITH p
        CALL db.create.setNodeVectorProperty(p, 'descriptionEmbedding', $embedding)
        """, {
            "name": name,
            "description": description,
            "lat": lat,
            "lon": lon,
            "embedding": embedding_description
        })

        # LOCATED_IN relationship (City nodes get no parent link)
        if label != "City" and located_in:
            loc_label = "City" if located_in == "Izmir" else "Town"  # only city is Izmir
            graph.query(f"""
            MERGE (p:{loc_label} {{name: $parent}})
            MERGE (m:{label} {{name: $name}})
            MERGE (m)-[:LOCATED_IN]->(p)
            """, {
                "name": name,
                "parent": located_in
            })

        # BEST_FOR → Audience
        for audience in best_for:
            embedding_audience = get_embedding(audience)

            graph.query(f"""
            MERGE (a:Audience {{name: $audience}})
            WITH a
            CALL db.create.setNodeVectorProperty(a, 'audienceEmbedding', $embedding)
            MATCH (p:{label} {{name: $name}})
            MERGE (p)-[:BEST_FOR]->(a)
            """, {
                "audience": audience.lower().capitalize(),
                "name": name,
                "embedding": embedding_audience
            })

        # HAS_CONCEPT → Concept (from properties)
        for concept in properties:
            embedding_concept = get_embedding(concept)

            graph.query(f"""
            MERGE (c:Concept {{name: $concept}})
            WITH c
            CALL db.create.setNodeVectorProperty(c, 'conceptEmbedding', $embedding)
            MATCH (p:{label} {{name: $name}})
            MERGE (p)-[:HAS_CONCEPT]->(c)
            """, {
                "concept": concept.lower().capitalize(),
                "name": name,
                "embedding": embedding_concept
            })

In [194]:
# Write the museum nodes; insert_museum ignores every entry whose node_type is not Museum
insert_museum(graph, merged_places)

100%|████████████████████████████████████████████████████████████████████████████████| 135/135 [01:26<00:00,  1.55it/s]


In [214]:
# Write the place nodes using the insert_places definition from the cell above
insert_places(graph, merged_places)

In [147]:
def insert_museum(graph, places):
    """Write every Museum entry of ``places`` to the graph.

    For each entry whose capitalized ``node_type`` is ``Museum`` this MERGEs a
    ``Museum`` node keyed by the normalized ``mention``, sets its flattened
    properties and then MERGEs its relationships:

    * ``LOCATED_IN``   -> ``City`` when the parent is Izmir, otherwise ``Town``
    * ``BEST_FOR``     -> ``Audience`` (one per item of ``best_for``)
    * ``HAS_CONCEPT``  -> ``Concept`` (one per item of ``properties``)
    * ``HAS_FACILITY`` -> ``Facility`` (one per item of ``facilities``)

    Entries with any other node type are skipped without touching the graph.

    Args:
        graph: Graph client exposing ``query(cypher, params)``.
        places: Iterable of dicts. ``mention`` and ``node_type`` are required;
            all other keys are optional and may be missing or ``None``.

    Raises:
        KeyError: If a record lacks ``node_type``, or a museum record lacks
            ``mention``.
        ValueError: If a museum's ``coordinates`` cannot be parsed as two floats.
    """
    for place in places:
        label = place["node_type"].capitalize()
        if label != "Museum":
            # Other node types are not written by this function.
            continue

        name = place["mention"].lower().capitalize()
        description = place.get("description", "")
        # Normalize the parent exactly like node names so that "izmir" and
        # "Izmir" resolve to the same node instead of creating duplicates.
        located_in = (place.get("located_in") or "").lower().capitalize()
        best_for = place.get("best_for") or []
        properties = place.get("properties") or []
        facilities = place.get("facilities") or []
        rating = place.get("rating")

        # Coordinates arrive either as a "lat, lon" string or as a two-item
        # sequence; anything empty becomes (None, None).
        coords = place.get("coordinates")
        if isinstance(coords, str) and coords.strip():
            lat, lon = [float(part.strip()) for part in coords.split(",")]
        elif coords and not isinstance(coords, str):
            lat, lon = float(coords[0]), float(coords[1])
        else:
            # Cypher's point() yields null for null arguments, so the
            # coordinates property is simply left unset in that case.
            lat, lon = None, None

        # Contact details are read directly from the place record.
        address = place.get("address", "")
        email = place.get("email", "")
        phone = place.get("phone", "")

        # Nested opening hours are flattened into separate node properties.
        opening_hours_data = place.get("opening_hours") or {}
        opening_hours_summer = opening_hours_data.get("summer", "")
        opening_hours_winter = opening_hours_data.get("winter", "")
        box_office_closing = opening_hours_data.get("box_office_closing", "")

        images = place.get("images", [])  # list of strings, stored as-is
        local_price = place.get("local_price", "")
        foreigner_price = place.get("foreigner_price", "")

        # Merge the Museum node with its flattened properties.
        graph.query("""
        MERGE (m:Museum {name: $name})
        SET m.description = $description,
            m.coordinates = point({latitude: $lat, longitude: $lon}),
            m.rating = $rating,
            m.address = $address,
            m.email = $email,
            m.phone = $phone,
            m.opening_hours_summer = $opening_hours_summer,
            m.opening_hours_winter = $opening_hours_winter,
            m.box_office_closing = $box_office_closing,
            m.images = $images,
            m.local_price = $local_price,
            m.foreigner_price = $foreigner_price
        """, {
            "name": name,
            "description": description,
            "lat": lat,
            "lon": lon,
            "rating": rating,
            "address": address,
            "email": email,
            "phone": phone,
            "opening_hours_summer": opening_hours_summer,
            "opening_hours_winter": opening_hours_winter,
            "box_office_closing": box_office_closing,
            "images": images,
            "local_price": local_price,
            "foreigner_price": foreigner_price
        })

        # LOCATED_IN relationship
        if located_in:
            # Izmir is the only City node; every other parent is a Town.
            loc_label = "City" if located_in == "Izmir" else "Town"
            graph.query(f"""
            MERGE (l:{loc_label} {{name: $parent}})
            MERGE (m:Museum {{name: $name}})
            MERGE (m)-[:LOCATED_IN]->(l)
            """, {"name": name, "parent": located_in})

        # BEST_FOR -> Audience
        for audience in best_for:
            graph.query("""
            MERGE (a:Audience {name: $audience})
            WITH a
            MATCH (m:Museum {name: $name})
            MERGE (m)-[:BEST_FOR]->(a)
            """, {"audience": audience.lower().capitalize(), "name": name})

        # HAS_CONCEPT -> Concept (from properties)
        for concept in properties:
            graph.query("""
            MERGE (c:Concept {name: $concept})
            WITH c
            MATCH (m:Museum {name: $name})
            MERGE (m)-[:HAS_CONCEPT]->(c)
            """, {"concept": concept.lower().capitalize(), "name": name})

        # HAS_FACILITY -> Facility
        for facility in facilities:
            graph.query("""
            MERGE (f:Facility {name: $facility})
            WITH f
            MATCH (m:Museum {name: $name})
            MERGE (m)-[:HAS_FACILITY]->(f)
            """, {"facility": facility.lower().capitalize(), "name": name})
            
            

In [213]:
def insert_places(graph, places):
    """Write every non-museum entry of ``places`` to the graph.

    Each entry is MERGEd as a node whose label is the capitalized
    ``node_type`` (City, Town, Bay, Beach, ...) keyed by the normalized
    ``mention``, with ``description`` and ``coordinates`` set on it.

    Entries other than the City node additionally get:

    * ``LOCATED_IN``  -> ``City`` when the parent is Izmir, otherwise ``Town``
    * ``BEST_FOR``    -> ``Audience`` (one per item of ``best_for``)
    * ``HAS_CONCEPT`` -> ``Concept`` (one per item of ``properties``)

    Museum entries are skipped.

    Args:
        graph: Graph client exposing ``query(cypher, params)``.
        places: Iterable of dicts. ``mention`` and ``node_type`` are required;
            all other keys are optional and may be missing or ``None``.

    Raises:
        KeyError: If a record lacks ``mention`` or ``node_type``.
    """
    for place in places:
        name = place["mention"].lower().capitalize()
        label = place["node_type"].capitalize()
        if label == "Museum":
            # Museum records are not written by this function.
            continue

        description = place.get("description", "")
        # Coordinates may be absent; point() then evaluates to null in Cypher.
        coordinates = place.get("coordinates")
        lat, lon = (coordinates[0], coordinates[1]) if coordinates else (None, None)

        # Merge the node itself. The label has to be interpolated because
        # Cypher does not accept labels as query parameters.
        graph.query(f"""
        MERGE (p:{label} {{name: $name}})
        SET p.description = $description,
            p.coordinates = point({{latitude: $lat, longitude: $lon}})
        """, {
            "name": name,
            "description": description,
            "lat": lat,
            "lon": lon
        })

        if label == "City":
            # The City (Izmir) node only carries its own properties.
            continue

        # "located_in" may be missing or explicitly None.
        located_in = (place.get("located_in") or "").lower().capitalize()
        best_for = place.get("best_for") or []
        properties = place.get("properties") or []

        # LOCATED_IN relationship
        if located_in:
            # Izmir is the only City node; every other parent is a Town.
            loc_label = "City" if located_in == "Izmir" else "Town"
            graph.query(f"""
            MERGE (p:{loc_label} {{name: $parent}})
            MERGE (m:{label} {{name: $name}})
            MERGE (m)-[:LOCATED_IN]->(p)
            """, {
                "name": name,
                "parent": located_in
            })

        # BEST_FOR -> Audience
        for audience in best_for:
            graph.query(f"""
            MERGE (a:Audience {{name: $audience}})
            WITH a
            MATCH (p:{label} {{name: $name}})
            MERGE (p)-[:BEST_FOR]->(a)
            """, {
                "audience": audience.lower().capitalize(),
                "name": name
            })

        # HAS_CONCEPT -> Concept (from properties)
        for concept in properties:
            graph.query(f"""
            MERGE (c:Concept {{name: $concept}})
            WITH c
            MATCH (p:{label} {{name: $name}})
            MERGE (p)-[:HAS_CONCEPT]->(c)
            """, {
                "concept": concept.lower().capitalize(),
                "name": name
            })

In [149]:
def insert_foods(graph, foods):
    """Write every food record to the graph.

    Each record is MERGEd as a ``Food`` node keyed by the normalized
    ``mention`` with ``description`` and ``coordinates`` set, and then linked:

    * ``TYPE_OF``        -> ``FoodType`` (only when a type is given)
    * ``HAS_INGREDIENT`` -> ``Ingredient`` (one per item of ``ingredients``)
    * ``POPULAR_IN``     -> ``City`` for Izmir, otherwise ``Town``
      (one per item of ``where_to_eat``)

    Args:
        graph: Graph client exposing ``query(cypher, params)``.
        foods: Iterable of dicts. ``mention`` is required; ``type``,
            ``description``, ``coordinates``, ``ingredients`` and
            ``where_to_eat`` are optional and may be missing or ``None``.

    Raises:
        KeyError: If a record lacks ``mention``.
    """
    for food in foods:
        name = food["mention"].lower().capitalize()
        # A missing or None type yields "", which skips the TYPE_OF link below.
        food_type = (food.get("type") or "").lower().capitalize()
        description = food.get("description", "")
        coordinates = food.get("coordinates")
        lat, lon = (coordinates[0], coordinates[1]) if coordinates else (None, None)
        ingredients = food.get("ingredients") or []
        where_to_eat = food.get("where_to_eat") or []

        # Create/Merge the Food node
        graph.query("""
            MERGE (f:Food {name: $name})
            SET f.description = $description,
                f.coordinates = point({latitude: $lat, longitude: $lon})
        """, {
            "name": name,
            "description": description,
            "lat": lat,
            "lon": lon
        })

        # Create/Merge the FoodType node and link it
        if food_type:
            graph.query("""
                MERGE (ft:FoodType {name: $food_type})
                WITH ft
                MATCH (f:Food {name: $name})
                MERGE (f)-[:TYPE_OF]->(ft)
            """, {
                "food_type": food_type,
                "name": name
            })

        # Create/Merge Ingredient nodes and link them
        for ingredient in ingredients:
            graph.query("""
                MERGE (i:Ingredient {name: $ingredient_name})
                WITH i
                MATCH (f:Food {name: $name})
                MERGE (f)-[:HAS_INGREDIENT]->(i)
            """, {
                "ingredient_name": ingredient.lower().capitalize(),
                "name": name
            })

        # Create/Merge location nodes (where_to_eat) and link them
        for location in where_to_eat:
            location_name = location.lower().capitalize()
            # Izmir is stored as a City node; any other location as a Town.
            loc_node = "City" if location_name == "Izmir" else "Town"
            graph.query(f"""
                MERGE (l:{loc_node} {{name: $location_name}})
                WITH l
                MATCH (f:Food {{name: $name}})
                MERGE (f)-[:POPULAR_IN]->(l)
            """, {
                "location_name": location_name,
                "name": name
            })

In [153]:
### Connect every Town/Bay/Beach node that has no outgoing LOCATED_IN relationship to the Izmir (City) node

def connect_unlocated_places_to_izmir(graph):
    """
    Attach orphaned place nodes to the 'Izmir' City node.

    Runs a single Cypher statement that merges the Izmir City node, selects
    Town, Bay and Beach nodes lacking an outgoing LOCATED_IN relationship,
    links each of them to Izmir and returns how many were selected. The
    outcome (or any error raised along the way) is reported with print();
    nothing is returned.

    Args:
        graph: Graph client whose ``query`` method executes the Cypher text.
    """
    print("Connecting unlocated places to Izmir City node...")

    count_key = 'PlacesConnectedToIzmir'
    link_query = """
    // 1. Ensure the 'Izmir' City node exists (idempotent)
    MERGE (izmir:City {name: 'Izmir'})

    // 2. Find all Town (or other relevant labels) nodes that do NOT have an outgoing LOCATED_IN relationship
    WITH izmir
    MATCH (p)
    WHERE (p:Town OR p:Bay OR p:Beach) // Adjust labels as needed for your schema
      AND NOT (p)-[:LOCATED_IN]->()

    // 3. MERGE the LOCATED_IN relationship from these places to Izmir
    MERGE (p)-[:LOCATED_IN]->(izmir)

    RETURN count(p) AS PlacesConnectedToIzmir
    """

    try:
        # A successful run is expected to yield a list holding one record
        # that carries the count under `count_key`.
        rows = graph.query(link_query)
        has_count = (
            rows
            and isinstance(rows, list)
            and len(rows) > 0
            and count_key in rows[0]
        )
        if not has_count:
            print("Query executed, but no count returned or result format unexpected.")
            print(f"Raw query result: {rows}")
        else:
            connected_count = rows[0][count_key]
            print(f"Successfully connected {connected_count} places to Izmir City node.")
    except Exception as err:
        # Failures are reported rather than propagated.
        print(f"An error occurred while connecting unlocated places: {err}")

In [154]:
# Write the Museum records of merged_places (nodes plus their relationships) to the graph.
insert_museum(graph, merged_places)

In [155]:
# Write the Food nodes and their type / ingredient / location links to the graph.
insert_foods(graph,foods)

In [156]:
# Write the non-museum place records of merged_places to the graph.
insert_places(graph, merged_places)

In [157]:
# Link Town/Bay/Beach nodes that have no LOCATED_IN relationship to the Izmir City node.
connect_unlocated_places_to_izmir(graph)

Connecting unlocated places to Izmir City node...
Successfully connected 10 places to Izmir City node.


In [167]:
# Persist merged_places as UTF-8 JSON (non-ASCII characters kept verbatim,
# 4-space indentation). Opening with "w" overwrites any existing file.
file_path="kg_database_data/merged_places.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(merged_places, f, ensure_ascii=False, indent=4)

In [168]:
# Persist museums as UTF-8 JSON (non-ASCII characters kept verbatim,
# 4-space indentation). Opening with "w" overwrites any existing file.
file_path="kg_database_data/museums.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(museums, f, ensure_ascii=False, indent=4)

In [169]:
# Persist foods as UTF-8 JSON (non-ASCII characters kept verbatim,
# 4-space indentation). Opening with "w" overwrites any existing file.
file_path="kg_database_data/foods.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(foods, f, ensure_ascii=False, indent=4)

In [171]:
# Display the last record of merged_places as a quick sanity check.
merged_places[-1]

{'file': 'museum_9.html',
 'rating': '4.9',
 'description': "after high reliefs of zeus altar were found on 1865 within the byzantine wall located at acropolis, official excavations were initiated by carl humann and alexander conze in 1878. at first, unearthed artifacts were being kept in the store at the depot of excavation house. this form has been one of anatolia's first depot museums. storage becomes insufficient during the time. in 1924, the artifacts were transferred to another building temporarily which will be used as public center later. then osman bayatli became the director for that temporary museum. osman bayatli here creates a new collection consisting of ethnographic works at the same time. this new museum building was not enough during the time to the increasing number of artifact and density. new attempts are made on 1932 for new museum needed and new museum building was started to be constructed on 1933. german architects bruno meyer and harold hanson were inspired by 