# Restaurant Graph Pipeline (fz dataset)

This notebook builds the full pipeline for the restaurant (fz) dataset:
- Clean raw ARFF and extract `area_code`
- Construct similarity edges based on `(addr, city)`
- Build graphs: constraint graph `S` and instance graph `G`
- Produce cleaned ground truth `G_opt` with no violations
- Persist artifacts for downstream perturbation/repair experiments

Parameters (tunable below): address distance threshold, file paths, and optional Neo4j import.

In [25]:
# Setup and Parameters

# Imports
import os
from pathlib import Path
from datetime import datetime

import pandas as pd

# Edit distance
try:
    import Levenshtein
except Exception as e:
    # Chain the original failure explicitly so the real cause of the import
    # problem stays attached to the error the user sees.
    raise RuntimeError("python-Levenshtein is required; install via pyproject or pip.") from e


def string_distance(a, b):
    """Return the Levenshtein edit distance between two values.

    Both arguments are coerced with ``str()`` before comparison, so
    non-string values are compared by their string representation.

    Args:
        a: First value.
        b: Second value.

    Returns:
        The result of ``Levenshtein.distance`` on the two coerced strings.
    """
    return Levenshtein.distance(str(a), str(b))

# NetworkX is optional: later cells fall back to plain dict/set structures
# whenever NX_AVAILABLE is False.
NX_AVAILABLE = True
try:
    import networkx as nx
except Exception:
    NX_AVAILABLE = False

# Tunable parameters
# Two addresses in the same city are linked when their edit distance is
# strictly below this value (the similarity cell compares with `<`).
ADDRESS_DISTANCE_THRESHOLD = 7

ARFF_PATH = Path("datasets/restaurant/fz.arff")
OUTPUT_DIR = Path("datasets/temp")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # safe to re-run: existing dir is fine

print("NetworkX available:", NX_AVAILABLE)
print("ARFF path:", ARFF_PATH)
print("Output dir:", OUTPUT_DIR)

NetworkX available: True
ARFF path: datasets\restaurant\fz.arff
Output dir: datasets\temp


In [26]:
# Load ARFF into DataFrame
from io import StringIO

def arff_to_dataframe(filepath: Path) -> pd.DataFrame:
    """Parse a simple ARFF file into a DataFrame.

    The header section is scanned for ``@attribute`` lines; the second
    whitespace-separated token of each becomes a column name. Every non-blank
    line after the ``@data`` directive is treated as one CSV row.

    Directives are recognised only at the start of a (stripped) line, so a
    ``%`` comment or a data value that merely contains the text
    ``@attribute`` / ``@data`` (e.g. ``bob@data.org``) is not misread as a
    directive. Lines starting with ``%`` are skipped as comments.

    Args:
        filepath: Path of the ARFF file, read as UTF-8.

    Returns:
        DataFrame with one column per ``@attribute`` and one row per data line.

    Raises:
        ValueError: If an ``@attribute`` line has no name or the file has no
            ``@data`` section.
    """
    column_names = []
    data_rows = []
    in_data = False
    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.rstrip('\n')
            stripped = line.strip()
            # Blank lines and '%' comment lines carry no schema or data.
            if not stripped or stripped.startswith('%'):
                continue
            if in_data:
                # Past @data everything is a row, even if it contains '@'.
                data_rows.append(line)
                continue
            directive = stripped.lower()
            if directive.startswith("@attribute"):
                tokens = stripped.split()
                if len(tokens) < 2:
                    raise ValueError(f"Malformed @attribute line: {line!r}")
                column_names.append(tokens[1])
            elif directive.startswith("@data"):
                in_data = True
    if not in_data:
        raise ValueError(f"No @data section found in ARFF file: {filepath}")

    csv_string = ','.join(column_names) + '\n' + ''.join(row + '\n' for row in data_rows)
    # NOTE(review): quoted values that follow ", " (comma + space) appear to keep
    # their quote characters with this parser configuration; downstream code
    # strips them where needed. Confirm before changing the parsing options.
    return pd.read_csv(StringIO(csv_string), quotechar='"')

# Load the raw ARFF into the working frame `df`; the trailing expression
# renders the first three rows as the cell output.
df = arff_to_dataframe(ARFF_PATH)
print(f"Loaded {len(df)} rows; columns: {list(df.columns)}")
df.head(3)

Loaded 864 rows; columns: ['name', 'addr', 'city', 'phone', 'type', 'class']


Unnamed: 0,name,addr,city,phone,type,class
0,arnie morton's of chicago,"""435 s. la cienega blv.""","""los angeles""","""310/246-1501""","""american""",'0'
1,arnie morton's of chicago,"""435 s. la cienega blvd.""","""los angeles""","""310-246-1501""","""steakhouses""",'0'
2,art's delicatessen,"""12224 ventura blvd.""","""studio city""","""818/762-1221""","""american""",'1'


In [27]:
# Extract area code and basic cleaning

def extract_area_code(phone):
    """Return the leading three-digit area code of a phone value, or None.

    The value is converted to text, surrounding whitespace and quote
    characters are stripped, and the separators ``-``, ``/``, space, ``(``,
    ``)`` and ``.`` are removed. If the first three remaining characters are
    digits they are returned as a string.

    Parenthesised and dotted formats such as ``(310) 246-1501`` and
    ``310.246.1501`` are therefore accepted. Anything that still does not
    start with three digits (for example a ``+1`` country prefix) yields
    None rather than a guessed code.

    Args:
        phone: Raw phone value; may be None/NaN.

    Returns:
        Three-character digit string, or None when no area code is found.
    """
    if phone is None or pd.isna(phone):
        return None
    s = str(phone).strip().strip('"').strip("'")
    # Drop common phone separators in one pass.
    digits = s.translate(str.maketrans('', '', '-/ ().'))
    if len(digits) >= 3 and digits[:3].isdigit():
        return digits[:3]
    return None

# Fail fast if the columns this pipeline relies on are absent
expected_cols = {"name", "phone", "addr", "city"}
missing = expected_cols.difference(df.columns)
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

# Derive the node label (area code) from the phone column
df["area_code"] = df["phone"].apply(extract_area_code)

# Quick sanity summary of the extraction
print("Area code extraction:")
print("  total:", len(df))
print("  with area_code:", df["area_code"].notna().sum())
print("  unique area_codes:", df["area_code"].nunique())

df.loc[:, ["name", "phone", "area_code"]].head(5)

Area code extraction:
  total: 864
  with area_code: 864
  unique area_codes: 11


Unnamed: 0,name,phone,area_code
0,arnie morton's of chicago,"""310/246-1501""",310
1,arnie morton's of chicago,"""310-246-1501""",310
2,art's delicatessen,"""818/762-1221""",818
3,art's deli,"""818-762-1221""",818
4,hotel bel-air,"""310/472-1211""",310


In [28]:
# Build similarity pairs based on (addr, city)

similarity_edges = []
N = len(df)
print(f"Building similarity pairs over {N} restaurants...")

# Pull the two compared columns out once. Indexing `df.iloc[...]` inside the
# pair loop builds a new row object per access, which dominates the runtime
# of this O(N^2) scan; plain lists give the same values far more cheaply.
addr_values = df["addr"].tolist()
city_values = df["city"].tolist()

for i in range(N):
    city_i = city_values[i]
    addr_i = addr_values[i]
    for j in range(i + 1, N):
        # Only restaurants in the same city are candidates.
        if city_i != city_values[j]:
            continue
        d = string_distance(addr_i, addr_values[j])
        # Strict comparison: a distance equal to the threshold is NOT linked.
        if d < ADDRESS_DISTANCE_THRESHOLD:
            similarity_edges.append((i, j))

print("Similarity summary:")
print("  total edges:", len(similarity_edges))
if N:
    print("  avg degree:", (2*len(similarity_edges))/N)
print("Sample edges:", similarity_edges[:10])

Building similarity pairs over 864 restaurants...
Similarity summary:
  total edges: 5307
  avg degree: 12.284722222222221
Sample edges: [(0, 1), (0, 22), (0, 242), (1, 22), (1, 242), (2, 3), (2, 37), (2, 706), (3, 37), (3, 706)]


In [29]:
# Persist cleaned temp outputs (raw graph inputs)

# Timestamped file names so repeated runs never overwrite each other
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
restaurants_path = OUTPUT_DIR / f"restaurants_{ts}.txt"
similarities_path = OUTPUT_DIR / f"restaurant_similarities_{ts}.txt"

# Fallback: derive area_code here if the extraction cell was skipped,
# so the writer below cannot hit a KeyError.
if "area_code" not in df.columns:
    def _extract_area_code_local(phone):
        """Return the first three digits of a phone value, or None."""
        if pd.isna(phone) or phone is None:
            return None
        s = str(phone).strip().strip('"').strip("'")
        digits = s.replace('-', '').replace('/', '').replace(' ', '')
        has_area_code = len(digits) >= 3 and digits[:3].isdigit()
        return digits[:3] if has_area_code else None
    df["area_code"] = df["phone"].apply(_extract_area_code_local)

# Node table: tab-separated, one restaurant per line, header first
with open(restaurants_path, "w", encoding="utf-8") as f:
    f.write("id\tname\tarea_code\taddr\tcity\n")
    f.writelines(
        f"{idx}\t{row['name']}\t{row['area_code']}\t{row['addr']}\t{row['city']}\n"
        for idx, row in df.iterrows()
    )

# Edge list: one "(i,j)" pair per line
with open(similarities_path, "w", encoding="utf-8") as f:
    f.writelines(f"({i},{j})\n" for i, j in similarity_edges)

print("Saved:")
print("  ", restaurants_path)
print("  ", similarities_path)

Saved:
   datasets\temp\restaurants_20260120-131834.txt
   datasets\temp\restaurant_similarities_20260120-131834.txt


In [30]:
# Build graphs (S, G) and clean to ground truth (G_opt)

# Constraint graph S has one node per area code and only self-loops, i.e. an
# edge in G is allowed only between restaurants with the same area code.
# Instance graph G has one node per restaurant row (label = area_code) and
# one edge per similarity pair.
labels = sorted(df["area_code"].dropna().unique().tolist())

if NX_AVAILABLE:
    S = nx.Graph()
    S.add_nodes_from(labels)
    S.add_edges_from(zip(labels, labels))

    G = nx.Graph()
    for idx, row in df.iterrows():
        G.add_node(
            int(idx),
            label=row["area_code"],
            name=row["name"],
            addr=row["addr"],
            city=row["city"],
        )
    G.add_edges_from((int(u), int(v)) for u, v in similarity_edges if u != v)
else:
    # Adjacency-by-label mapping used by the constraint check
    S = {ac: {ac} for ac in labels}

    # Lightweight stand-ins for the NetworkX graph: attribute dict per node,
    # and a set of (low, high) index pairs for the undirected edges.
    G_nodes = {}
    for idx, row in df.iterrows():
        G_nodes[int(idx)] = {
            "label": row["area_code"],
            "name": row["name"],
            "addr": row["addr"],
            "city": row["city"],
        }
    G_edges = {
        tuple(sorted((int(u), int(v))))
        for (u, v) in similarity_edges
        if u != v
    }

# Violation check: neighbors must share same area_code

def has_edge_in_S(lu, lv):
    """Tell whether constraint graph S permits an edge between labels lu and lv.

    Uses the NetworkX graph when available, otherwise the adjacency-by-label
    dict built as the fallback representation of S.
    """
    if not NX_AVAILABLE:
        allowed = S.get(lu, set())
        return lv in allowed
    return S.has_edge(lu, lv)

# Helpers to inspect violations with respect to the CURRENT graph state.
# A violation is an edge whose two endpoint labels are not connected in S.
if NX_AVAILABLE:
    def first_violation(Gx):
        """Return the first violating edge (u, v) of Gx, or None if Gx is clean."""
        for u, v in Gx.edges():
            if not has_edge_in_S(Gx.nodes[u].get("label"), Gx.nodes[v].get("label")):
                return (u, v)
        return None

    def count_violations(Gx):
        """Return the number of violating edges currently in Gx."""
        return sum(
            1
            for u, v in Gx.edges()
            if not has_edge_in_S(Gx.nodes[u].get("label"), Gx.nodes[v].get("label"))
        )
else:
    def first_violation(_unused):
        """Return the first violating edge in the module-level G_edges, or None.

        The argument is ignored; it exists so the call shape matches the
        NetworkX variant. Edges are scanned in sorted order: G_edges is a set,
        and iterating it directly would make the choice of which violation is
        repaired first (and therefore which node is dropped) depend on set
        iteration order.
        """
        for u, v in sorted(G_edges):
            if not has_edge_in_S(G_nodes[u]["label"], G_nodes[v]["label"]):
                return (u, v)
        return None

    def count_violations_non_nx():
        """Return the number of violating edges in the module-level G_edges."""
        return sum(
            1
            for u, v in G_edges
            if not has_edge_in_S(G_nodes[u]["label"], G_nodes[v]["label"])
        )

# Cleaning: repeatedly take the first violating edge and remove whichever
# endpoint has the lower degree (ties drop `u`), until no violation remains.
removed = set()
if NX_AVAILABLE:
    G_opt = G.copy()  # G itself stays untouched as the "before" graph
    viol_before = count_violations(G_opt)
    while True:
        pair = first_violation(G_opt)
        if pair is None:
            break
        u, v = pair
        # Both endpoints come from the current edge set, so they always exist.
        # (A "skip if missing" guard here could never make progress and would
        # loop forever, so none is used.)
        drop = u if G_opt.degree[u] <= G_opt.degree[v] else v
        removed.add(drop)
        G_opt.remove_node(drop)
    viol_after = count_violations(G_opt)
else:
    # Non-NX cleaning prunes G_edges in place; it holds the cleaned edge set
    # afterwards, so capture the original size first for the report below.
    viol_before = count_violations_non_nx()
    n_edges_before = len(G_edges)
    remaining_nodes = set(G_nodes.keys())
    while True:
        pair = first_violation(None)
        if pair is None:
            break
        u, v = pair
        deg_u = sum(1 for e in G_edges if u in e)
        deg_v = sum(1 for e in G_edges if v in e)
        drop = u if deg_u <= deg_v else v
        removed.add(drop)
        remaining_nodes.discard(drop)
        G_edges = {e for e in G_edges if drop not in e}
    viol_after = count_violations_non_nx()

print("Graph stats (before cleaning):")
if NX_AVAILABLE:
    print(f"  |V|={len(G.nodes)}, |E|={len(G.edges)}")
else:
    # Use the captured count: G_edges has already been pruned at this point.
    print(f"  |V|={len(G_nodes)}, |E|={n_edges_before}")

print("Cleaning results:")
print("  violations before:", viol_before)
print("  violations after:", viol_after)
print("  removed nodes:", len(removed))
if NX_AVAILABLE:
    print(f"  G_opt |V|={len(G_opt.nodes)}, |E|={len(G_opt.edges)}")
else:
    print(f"  G_opt |V|={len(remaining_nodes)}, |E|={len(G_edges)}")

Graph stats (before cleaning):
  |V|=864, |E|=5307
Cleaning results:
  violations before: 32
  violations after: 0
  removed nodes: 15
  G_opt |V|=849, |E|=5245


In [18]:
# Persist cleaned ground truth artifacts

# Separate timestamp so cleaned artifacts are distinguishable from raw ones
ts2 = datetime.now().strftime("%Y%m%d-%H%M%S")
clean_restaurants_path = OUTPUT_DIR / f"restaurants_cleaned_{ts2}.txt"
clean_similarities_path = OUTPUT_DIR / f"restaurant_similarities_cleaned_{ts2}.txt"

# Surviving nodes and edges in sorted order, whichever graph backend was used
if not NX_AVAILABLE:
    nodes_iter = sorted(remaining_nodes)
    edges_iter = sorted(G_edges)
else:
    nodes_iter = sorted(G_opt.nodes)
    edges_iter = sorted((min(u, v), max(u, v)) for (u, v) in G_opt.edges())

# Row lookup keyed by integer id, used to write node attributes
by_id = {}
for idx, row in df.iterrows():
    by_id[int(idx)] = row

with open(clean_restaurants_path, "w", encoding="utf-8") as f:
    f.write("id\tname\tarea_code\taddr\tcity\n")
    for rid in nodes_iter:
        r = by_id.get(int(rid))
        # Ids without a matching row are skipped
        if r is not None:
            f.write(f"{rid}\t{r['name']}\t{r['area_code']}\t{r['addr']}\t{r['city']}\n")

with open(clean_similarities_path, "w", encoding="utf-8") as f:
    f.writelines(f"({u},{v})\n" for u, v in edges_iter)

print("Saved cleaned ground truth:")
print("  ", clean_restaurants_path)
print("  ", clean_similarities_path)

Saved cleaned ground truth:
   datasets\temp\restaurants_cleaned_20260120-130842.txt
   datasets\temp\restaurant_similarities_cleaned_20260120-130842.txt


## Import into Neo4j

If you have a local Neo4j, set credentials in `.env` and run the next cell to import the raw graph to two DBs:
- Constraint DB (with uniqueness constraint and optional visualization self-loops)
- Instance DB (no uniqueness constraint — useful for injecting perturbations)


In [31]:
# Neo4j helpers (optional)

import pathlib
from dotenv import load_dotenv

try:
    from neo4j import GraphDatabase
    NEO4J_AVAILABLE = True
except Exception:
    NEO4J_AVAILABLE = False

# Import the raw (uncleaned) graph into two Neo4j databases. Everything below
# runs only when the neo4j driver imported successfully.
if not NEO4J_AVAILABLE:
    print("neo4j driver not available; skip this section or install 'neo4j'.")
else:
    # Connection settings come from a .env file in the current working directory;
    # override=True lets the file win over variables already set in the process.
    env_path = pathlib.Path.cwd() / ".env"
    load_dotenv(dotenv_path=env_path, override=True)

    def _strip_quotes(v):
        """Strip surrounding whitespace and quote characters; None passes through."""
        return None if v is None else v.strip().strip('"').strip("'")

    URI = _strip_quotes(os.getenv("NEO4J_URI"))
    USERNAME = _strip_quotes(os.getenv("NEO4J_USERNAME"))
    PASSWORD = _strip_quotes(os.getenv("NEO4J_PASSWORD"))
    # Database names fall back to fixed defaults when unset or empty.
    CONSTRAINT_DB = _strip_quotes(os.getenv("NEO4J_CONSTRAINT_DB")) or "restaurants-constraint"
    INSTANCE_DB = _strip_quotes(os.getenv("NEO4J_INSTANCE_DB")) or "restaurants-instance"

    print("Neo4j URI:", URI)
    AUTH = (USERNAME, PASSWORD)

    def clear_database(driver, database):
        """Delete every node (and its relationships) in `database`."""
        driver.execute_query("MATCH (n) DETACH DELETE n", database_=database)

    def setup_database(driver, database):
        """Create the uniqueness constraint on Restaurant.id if it does not exist."""
        driver.execute_query(
            """
            CREATE CONSTRAINT restaurant_id_unique IF NOT EXISTS
            FOR (r:Restaurant) REQUIRE r.id IS UNIQUE
            """,
            database_=database,
        )

    def import_data(driver, restaurants_list, similarities_list, database):
        """Upsert Restaurant nodes, then SIMILAR relationships, into `database`.

        restaurants_list: dicts with keys id, name, area_code, addr, city.
        similarities_list: pairs of restaurant ids; pair[0] -> pair[1] becomes
        a SIMILAR relationship. MERGE makes both steps safe to repeat.
        """
        driver.execute_query(
            """
            UNWIND $restaurants AS r
            MERGE (n:Restaurant {id: r.id})
            SET n.name = r.name, n.area_code = r.area_code, n.addr = r.addr, n.city = r.city
            """,
            restaurants=restaurants_list,
            database_=database,
        )
        driver.execute_query(
            """
            UNWIND $pairs AS pair
            MATCH (a:Restaurant {id: pair[0]})
            MATCH (b:Restaurant {id: pair[1]})
            MERGE (a)-[:SIMILAR]->(b)
            """,
            pairs=similarities_list,
            database_=database,
        )

    def visualize_constraint_graph(driver, database):
        """Rebuild a CONSTRAINT self-loop on every Restaurant node in `database`.

        Existing CONSTRAINT relationships are deleted first so repeated runs
        do not accumulate stale ones.
        """
        driver.execute_query(
            "MATCH (:Restaurant)-[rel:CONSTRAINT]->() DELETE rel",
            database_=database,
        )
        driver.execute_query(
            "MATCH (r:Restaurant) MERGE (r)-[:CONSTRAINT]->(r)",
            database_=database,
        )

    # Prepare payloads from current in-memory data.
    # Note that ids are sent as strings, so Restaurant.id is a string property.
    restaurants_payload = []
    for idx, row in df.iterrows():
        restaurants_payload.append({
            "id": str(idx),
            "name": row["name"],
            "area_code": row["area_code"],
            "addr": row["addr"],
            "city": row["city"],
        })
    similarities_payload = [(str(u), str(v)) for (u, v) in similarity_edges]

    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()
        print("Connected to Neo4j.")

        # Constraint DB (canonical): wiped, given the uniqueness constraint,
        # loaded, then decorated with CONSTRAINT self-loops.
        clear_database(driver, CONSTRAINT_DB)
        setup_database(driver, CONSTRAINT_DB)
        import_data(driver, restaurants_payload, similarities_payload, CONSTRAINT_DB)
        visualize_constraint_graph(driver, CONSTRAINT_DB)

        # Instance DB (sandbox): wiped and loaded only; no constraint is created
        # and no self-loops are added here.
        clear_database(driver, INSTANCE_DB)
        import_data(driver, restaurants_payload, similarities_payload, INSTANCE_DB)

    print("Import complete.")

Neo4j URI: neo4j://127.0.0.1:7687
Connected to Neo4j.
Import complete.


## Validation & Next Steps

- Files produced for experiments:
  - `datasets/temp/restaurants_*.txt`
  - `datasets/temp/restaurant_similarities_*.txt`
  - `datasets/temp/restaurants_cleaned_*.txt`
  - `datasets/temp/restaurant_similarities_cleaned_*.txt`

- Neo4j checks (Browser):
```cypher
// Counts
MATCH (r:Restaurant) RETURN count(r) AS restaurants;
MATCH (:Restaurant)-[:SIMILAR]->(:Restaurant) RETURN count(*) AS edges;

// NULL-aware violations (use in both instance and constraint DBs)
MATCH (a:Restaurant)-[:SIMILAR]->(b:Restaurant)
WHERE a.area_code IS NULL OR b.area_code IS NULL OR a.area_code <> b.area_code
RETURN count(*) AS violations;
```

- Notes:
  - If you cleaned only the instance DB, query that DB in Neo4j Desktop.
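  - The canonical (constraint) DB is the default cleaning target: the "Clean to ground truth entirely in Neo4j" cell sets `TARGET_DB = CONSTRAINT_DB`. To clean the sandbox instead, set `TARGET_DB = INSTANCE_DB` in that cell and rerun it.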

- Next: add perturbation generators and repair strategies using `G_opt` as ground truth.
- Threshold tuning: adjust `ADDRESS_DISTANCE_THRESHOLD` to control graph density.

## Neo4j-only Path (no NetworkX)

This section performs similarity edge creation (optional, via APOC) and the iterative cleaning to ground truth entirely in Neo4j using Cypher. The Python code here only orchestrates queries; no NetworkX is used.

- Build `:SIMILAR` edges in DB using APOC Levenshtein (optional)
- Iteratively remove a low-degree endpoint from any cross-area-code edge until none remain
- Export cleaned nodes/edges to datasets/temp

In [11]:
# Build SIMILAR edges in Neo4j with APOC (optional)
# Requires: Restaurants already imported (see Neo4j helpers above)

try:
    from neo4j import GraphDatabase
    NEO4J_AVAILABLE = True
except Exception:
    NEO4J_AVAILABLE = False

# Create SIMILAR relationships inside Neo4j using APOC instead of Python.
# Runs only when the neo4j driver imported successfully.
if not NEO4J_AVAILABLE:
    print("neo4j driver not available; skip this section or install 'neo4j'.")
else:
    import pathlib
    from dotenv import load_dotenv
    # Connection settings are re-read from .env so this cell does not depend on
    # the helper cell having been run.
    env_path = pathlib.Path.cwd() / ".env"
    load_dotenv(dotenv_path=env_path, override=True)

    def _strip_quotes(v):
        """Strip surrounding whitespace and quote characters; None passes through."""
        return None if v is None else v.strip().strip('"').strip("'")

    URI = _strip_quotes(os.getenv("NEO4J_URI"))
    USERNAME = _strip_quotes(os.getenv("NEO4J_USERNAME"))
    PASSWORD = _strip_quotes(os.getenv("NEO4J_PASSWORD"))
    INSTANCE_DB = _strip_quotes(os.getenv("NEO4J_INSTANCE_DB")) or "restaurants-instance"

    # Same strict "< threshold" rule as the Python similarity loop.
    threshold = ADDRESS_DISTANCE_THRESHOLD

    # apoc.periodic.iterate: the first query streams every Restaurant as `a`;
    # the second pairs it with each same-city `b` whose address is within the
    # threshold and merges a SIMILAR relationship. `a.id < b.id` keeps one
    # direction per pair; the import cell stores ids as strings, so this is a
    # string comparison. The doubled backslashes put literal "\n" escapes inside
    # the inner Cypher string.
    apoc_stmt = (
        "CALL apoc.periodic.iterate(\n"
        "  'MATCH (a:Restaurant) RETURN a',\n"
        "  'MATCH (b:Restaurant)\\n"
        "   WHERE a.id < b.id AND a.city = b.city\\n"
        "     AND apoc.text.levenshteinDistance(a.addr, b.addr) < $threshold\\n"
        "   MERGE (a)-[:SIMILAR]->(b)',\n"
        "  {batchSize:1000, parallel:false, params:{threshold:$threshold}}\n"
        ")"
    )

    with GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD)) as driver:
        driver.verify_connectivity()
        try:
            driver.execute_query(apoc_stmt, threshold=threshold, database_=INSTANCE_DB)
            print(f"SIMILAR edges built in database '{INSTANCE_DB}' via APOC (threshold={threshold}).")
        except Exception as e:
            # Best-effort step: any failure is reported and the notebook continues,
            # since edges may already exist from the Python import.
            print("APOC-based edge creation failed. Ensure APOC is installed and enabled.")
            print("Error:", e)
            print("You can skip this step if edges were already imported earlier.")

APOC-based edge creation failed. Ensure APOC is installed and enabled.
Error: {neo4j_code: Neo.ClientError.Procedure.ProcedureNotFound} {message: There is no procedure with the name `apoc.periodic.iterate` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.} {gql_status: 42001} {gql_status_description: error: syntax error or access rule violation - invalid syntax}
You can skip this step if edges were already imported earlier.


In [None]:
# Clean to ground truth entirely in Neo4j (iterative Cypher)

from neo4j import GraphDatabase
import pathlib
from dotenv import load_dotenv

# Connection settings are re-read from .env so this cell is self-contained.
env_path = pathlib.Path.cwd() / ".env"
load_dotenv(dotenv_path=env_path, override=True)

def _strip_quotes(v):
    """Strip surrounding whitespace and quote characters; None passes through."""
    return None if v is None else v.strip().strip('"').strip("'")

URI = _strip_quotes(os.getenv("NEO4J_URI"))
USERNAME = _strip_quotes(os.getenv("NEO4J_USERNAME"))
PASSWORD = _strip_quotes(os.getenv("NEO4J_PASSWORD"))
INSTANCE_DB = _strip_quotes(os.getenv("NEO4J_INSTANCE_DB")) or "restaurants-instance"
CONSTRAINT_DB = _strip_quotes(os.getenv("NEO4J_CONSTRAINT_DB")) or "restaurants-constraint"

# Choose which DB to clean: ground-truth usually lives in CONSTRAINT_DB.
# Set TARGET_DB to INSTANCE_DB if you want the sandbox cleaned instead.
TARGET_DB = CONSTRAINT_DB  # default to canonical DB for cleaning

with GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD)) as driver:
    driver.verify_connectivity()

    def get_one_violation(tx):
        """Return one violating SIMILAR edge as {'u': id, 'v': id}, or None.

        NULL-aware: a missing area_code on either endpoint, or unequal
        area_codes, counts as a violation. The query has no ORDER BY, so
        which violation is returned is up to the database.
        """
        res = tx.run(
            """
            MATCH (a:Restaurant)-[:SIMILAR]->(b:Restaurant)
            WHERE a.area_code IS NULL OR b.area_code IS NULL OR a.area_code <> b.area_code
            RETURN a.id AS u, b.id AS v
            LIMIT 1
            """
        ).data()
        return res[0] if res else None

    def degree_of(tx, rid):
        """Return the number of relationships attached to Restaurant `rid`.

        Counted with OPTIONAL MATCH + count() rather than size((r)--()):
        passing a pattern expression to size() is rejected by Neo4j 5, while
        this form is plain Cypher. A node with no relationships, or a missing
        node, yields 0.
        """
        res = tx.run(
            """
            MATCH (r:Restaurant {id:$id})
            OPTIONAL MATCH (r)-[rel]-()
            RETURN count(rel) AS d
            """,
            id=rid,
        ).data()
        return (res[0]["d"] if res else 0)

    def delete_node(tx, rid):
        """Delete Restaurant `rid` together with all of its relationships."""
        tx.run("MATCH (r:Restaurant {id:$id}) DETACH DELETE r", id=rid)

    removed = 0
    with driver.session(database=TARGET_DB) as session:
        print(f"Cleaning Neo4j DB '{TARGET_DB}' to ground truth...")
        while True:
            vpair = session.execute_read(get_one_violation)
            if not vpair:
                break
            u, v = vpair["u"], vpair["v"]
            du = session.execute_read(degree_of, u)
            dv = session.execute_read(degree_of, v)
            # Same rule as the in-memory cleaning: drop the lower-degree
            # endpoint, and drop `u` on a tie.
            drop = u if du <= dv else v
            session.execute_write(delete_node, drop)
            removed += 1

    print(f"Neo4j cleaning complete on '{TARGET_DB}'. Removed {removed} nodes.")

In [None]:
# Export cleaned graph from Neo4j to files (no NetworkX)

from pathlib import Path
from datetime import datetime
from neo4j import GraphDatabase
import pathlib
from dotenv import load_dotenv

# Connection settings are re-read from .env so this cell is self-contained.
env_path = pathlib.Path.cwd() / ".env"
load_dotenv(dotenv_path=env_path, override=True)

def _strip_quotes(v):
    """Strip surrounding whitespace and quote characters; None passes through."""
    return None if v is None else v.strip().strip('"').strip("'")

def _id_sort_key(rid):
    """Sort key that orders integer-like ids numerically and others as text.

    Restaurant ids are imported as strings, so comparing them directly would
    put "10" before "9".
    """
    text = str(rid)
    try:
        return (0, int(text), "")
    except ValueError:
        return (1, 0, text)

URI = _strip_quotes(os.getenv("NEO4J_URI"))
USERNAME = _strip_quotes(os.getenv("NEO4J_USERNAME"))
PASSWORD = _strip_quotes(os.getenv("NEO4J_PASSWORD"))
INSTANCE_DB = _strip_quotes(os.getenv("NEO4J_INSTANCE_DB")) or "restaurants-instance"
CONSTRAINT_DB = _strip_quotes(os.getenv("NEO4J_CONSTRAINT_DB")) or "restaurants-constraint"

# Export the database that was actually cleaned. The cleaning cell records its
# choice in TARGET_DB (default: CONSTRAINT_DB); if that cell has not been run
# in this kernel, fall back to the same default rather than exporting the
# uncleaned instance DB.
EXPORT_DB = globals().get("TARGET_DB") or CONSTRAINT_DB

out_ts = datetime.now().strftime("%Y%m%d-%H%M%S")
out_nodes = OUTPUT_DIR / f"restaurants_cleaned_neo4j_{out_ts}.txt"
out_edges = OUTPUT_DIR / f"restaurant_similarities_cleaned_neo4j_{out_ts}.txt"

with GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD)) as driver:
    driver.verify_connectivity()
    print(f"Exporting from Neo4j DB '{EXPORT_DB}'...")
    with driver.session(database=EXPORT_DB) as session:
        # Nodes: fetched unsorted and ordered here, numerically by id.
        recs = session.run(
            """
            MATCH (r:Restaurant)
            RETURN r.id AS id, r.name AS name, r.area_code AS area_code, r.addr AS addr, r.city AS city
            """
        ).data()
        recs.sort(key=lambda row: _id_sort_key(row["id"]))
        with open(out_nodes, "w", encoding="utf-8") as f:
            f.write("id\tname\tarea_code\taddr\tcity\n")
            for row in recs:
                f.write(f"{row['id']}\t{row['name']}\t{row['area_code']}\t{row['addr']}\t{row['city']}\n")

        # Edges: fetch every SIMILAR relationship and normalise in Python.
        # Filtering with `a.id < b.id` in Cypher would compare strings and
        # silently drop pairs such as "9" -> "10".
        e_recs = session.run(
            """
            MATCH (a:Restaurant)-[:SIMILAR]->(b:Restaurant)
            RETURN a.id AS u, b.id AS v
            """
        ).data()
        edge_pairs = set()
        for row in e_recs:
            if row["u"] == row["v"]:
                continue  # self-loops are not similarity edges
            low, high = sorted((row["u"], row["v"]), key=_id_sort_key)
            edge_pairs.add((low, high))  # set membership de-duplicates a<->b
        ordered_pairs = sorted(
            edge_pairs,
            key=lambda pair: (_id_sort_key(pair[0]), _id_sort_key(pair[1])),
        )
        with open(out_edges, "w", encoding="utf-8") as f:
            for u, v in ordered_pairs:
                f.write(f"({u},{v})\n")

print("Exported cleaned Neo4j graph:")
print("  ", out_nodes)
print("  ", out_edges)