In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install -q func-timeout neo4j_graphrag neo4j

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
from neo4j_graphrag.schema import get_structured_schema
from func_timeout import func_timeout, FunctionTimedOut
from neo4j.exceptions import AuthError, Neo4jError
from neo4j import GraphDatabase
import pandas as pd
import subprocess
import requests
import json
import time
import re
import os

**Khởi động Ollama**

In [None]:
# Start the Ollama server in the background.
# Its output is discarded instead of being piped: nothing in this notebook reads
# the pipes, and an unread PIPE can fill up and block the child process.
ollama_process = subprocess.Popen(['ollama', 'serve'],
                                   stdout=subprocess.DEVNULL,
                                   stderr=subprocess.DEVNULL)

print("Đang khởi động Ollama server...")

# Poll the HTTP endpoint (up to ~30 attempts) instead of trusting a fixed sleep.
# As before, any HTTP response counts as "the server is up".
server_ready = False
for _ in range(30):
    try:
        requests.get('http://localhost:11434', timeout=2)
        server_ready = True
        break
    except requests.exceptions.RequestException:
        time.sleep(1)

if server_ready:
    print("✓ Ollama server đã sẵn sàng!")
else:
    print("✗ Lỗi khi khởi động server")

Đang khởi động Ollama server...
✓ Ollama server đã sẵn sàng!


**Pull model**

In [None]:
!ollama pull qwen2.5-coder:14b

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?2

In [None]:
# Cell mới - Kiểm tra model
!ollama list

NAME                 ID              SIZE      MODIFIED               
qwen2.5-coder:14b    9ec8897f747e    9.0 GB    Less than a second ago    


**Load data**

In [None]:
# Input test set and the checkpoint CSV, both on the mounted Google Drive.
test_path = '/content/drive/MyDrive/T2C_qwen214b_bs_schema/text2cypher_test.csv'
checkpoint_path = '/content/drive/MyDrive/T2C_qwen214b_bs_schema/qwen214b_bs_schema.csv'
# "utf-8-sig" strips a leading BOM if the CSV has one.
test_df = pd.read_csv(test_path, encoding="utf-8-sig")
print(f"✓ Loaded test data shape: {test_df.shape}")

✓ Loaded test data shape: (4833, 6)


**Neo4j setup**

In [None]:
# Bolt URI shared by every database alias used below.
URI = "neo4j+s://demo.neo4jlabs.com:7687"

# List of distinct, non-null database aliases referenced by the test set
unique_aliases = test_df["database_reference_alias"].dropna().unique().tolist()
DATABASE_ALIASES = unique_aliases

# Per-alias caches: open drivers, structured schemas and sampled property examples
DRIVERS_BY_ALIAS = {}
SCHEMAS_BY_ALIAS = {}
EXAMPLES_BY_ALIAS = {}

**Helper driver**

In [None]:
def extract_alias(alias: str):
    """Derive the (user, password) pair for a database alias.

    Every occurrence of the ``neo4jlabs_demo_db_`` marker is removed from
    ``alias``; the remaining text is returned twice, once as the user name and
    once as the password.
    """
    credential = alias.replace("neo4jlabs_demo_db_", "")
    return (credential,) * 2

def get_driver(alias):
    """Return the cached Neo4j driver for ``alias``, creating it on first use.

    A new driver is built against ``URI`` with the credentials produced by
    ``extract_alias`` and stored in ``DRIVERS_BY_ALIAS`` for later calls.
    """
    try:
        return DRIVERS_BY_ALIAS[alias]
    except KeyError:
        pass

    username, password = extract_alias(alias)
    DRIVERS_BY_ALIAS[alias] = GraphDatabase.driver(URI, auth=(username, password))
    return DRIVERS_BY_ALIAS[alias]

def reset_driver(alias):
    """Drop the cached driver for ``alias`` (if any) and return a fresh one.

    Errors raised while closing the old driver are printed and otherwise
    ignored; the cache entry is removed either way.
    """
    if alias not in DRIVERS_BY_ALIAS:
        return get_driver(alias)

    try:
        DRIVERS_BY_ALIAS[alias].close()
    except Exception as close_err:
        print(f"Error closing driver: {close_err}")
    del DRIVERS_BY_ALIAS[alias]

    # Build a brand-new driver for this alias
    return get_driver(alias)

print("✓ Neo4j helper loaded")

✓ Neo4j helper loaded


**Helper lấy example**

In [None]:
def safe_ident(name):
    """Quote ``name`` as a backtick-delimited Cypher identifier.

    Backticks inside the name are doubled so the quoted identifier cannot be
    terminated early. Without this, a label or property containing a backtick
    would produce a broken (or injectable) query.

    Args:
        name: Label, relationship type or property name; converted with ``str``.

    Returns:
        The identifier wrapped in backticks.
    """
    return "`" + str(name).replace("`", "``") + "`"

def infer_type(value):
    """Map a sampled Python value to a coarse schema type name.

    Returns ``"BOOL"``, ``"INT"`` or ``"FLOAT"`` for the matching Python
    types and ``"STRING"`` for everything else, including ``None``.
    """
    # bool is checked before int because bool is a subclass of int.
    type_names = ((bool, "BOOL"), (int, "INT"), (float, "FLOAT"))
    for py_type, type_name in type_names:
        if isinstance(value, py_type):
            return type_name
    return "STRING"

def is_valid_example(value, max_length=15):
    """Decide whether a sampled value is short and readable enough to show.

    A value is rejected when it is ``None``, when its string form is longer
    than ``max_length``, or when it is a string that is the literal "null"
    (any casing), a hex run longer than 30 characters, or a base64-looking run
    longer than 40 characters. The last two rules only matter when
    ``max_length`` is raised above those thresholds.
    """
    if value is None or len(str(value)) > max_length:
        return False

    # Non-string values only have to pass the length rule.
    if not isinstance(value, str):
        return True

    if value.lower() == "null":
        return False

    looks_like_long_hex = len(value) > 30 and re.fullmatch(r"[0-9a-fA-F]+", value)
    looks_like_long_b64 = len(value) > 40 and re.fullmatch(r"[0-9A-Za-z+/=]+", value)
    return not (looks_like_long_hex or looks_like_long_b64)

def get_sample(tx, label, prop_name, limit=1):
    """Fetch up to ``limit`` non-null values of a node property.

    Runs a ``MATCH`` over nodes with the given label inside transaction ``tx``
    and returns the ``value`` column of the result as a list.
    """
    node_label = safe_ident(label)
    prop = safe_ident(prop_name)

    cypher = (
        f"MATCH (n:{node_label}) WHERE n.{prop} IS NOT NULL "
        f"RETURN n.{prop} AS value LIMIT {limit}"
    )
    return [record["value"] for record in tx.run(cypher)]

def get_relationship_sample(tx, rel_type, prop_name, limit=1):
    """Fetch up to ``limit`` non-null values of a relationship property.

    Runs a ``MATCH`` over relationships of the given type inside transaction
    ``tx`` and returns the ``value`` column of the result as a list.
    """
    rel = safe_ident(rel_type)
    prop = safe_ident(prop_name)

    cypher = (
        f"MATCH ()-[r:{rel}]->() WHERE r.{prop} IS NOT NULL "
        f"RETURN r.{prop} AS value LIMIT {limit}"
    )
    return [record["value"] for record in tx.run(cypher)]

def find_mentioned_nodes(query_text, all_node_labels):
    """Return the node labels that appear as whole tokens in ``query_text``.

    Matching is case-insensitive. A label counts as mentioned when it is not
    directly preceded or followed by a word character. Look-arounds are used
    instead of ``\\b`` because ``\\b`` never matches next to a label that
    itself starts or ends with a non-word character (e.g. ``C++``).

    NOTE(review): because matching is case-insensitive, a label that equals a
    Cypher keyword in lower case (e.g. ``Order`` vs ``ORDER BY``) is reported
    whenever that keyword occurs in the text.

    Args:
        query_text: Text to scan (a question or a Cypher statement).
        all_node_labels: Iterable of candidate label strings.

    Returns:
        A set with the labels found in ``query_text``.
    """
    mentioned = set()
    query_lower = query_text.lower()

    for label in all_node_labels:
        # Escape the label so special characters are matched literally.
        pattern = r'(?<!\w)' + re.escape(label.lower()) + r'(?!\w)'
        if re.search(pattern, query_lower):
            mentioned.add(label)

    return mentioned

**Hàm tạo sẵn schema kèm example**

In [None]:
def _sample_examples_for_alias(alias, props_by_owner, sampler, kind):
    """Sample one example value per property for every label / relationship type.

    Args:
        alias: Database alias used to look up (and, on AuthError, reset) the driver.
        props_by_owner: Mapping ``owner -> [{"property": name, ...}, ...]`` taken
            from the structured schema.
        sampler: Transaction function (``get_sample`` or
            ``get_relationship_sample``) called as ``sampler(tx, owner, prop, 1)``.
        kind: ``"node"`` or ``"rel"``; only used in the AuthError log message.

    Returns:
        ``{owner: {prop_name: example_or_None}}``; values rejected by
        ``is_valid_example`` are stored as ``None``.
    """
    collected = {}
    sess = get_driver(alias).session()
    try:
        for owner, props in props_by_owner.items():
            owner_examples = {}
            for p in props:
                prop_name = p.get("property")
                if not prop_name:
                    continue
                try:
                    vals = sess.execute_read(sampler, owner, prop_name, 1)
                except AuthError as e:
                    print(f"AuthError sampling {kind} {owner}.{prop_name} for {alias}: {e}")
                    # reset_driver() closes the driver this session came from, so
                    # the session is replaced too; otherwise every later
                    # iteration would keep using the closed driver's session.
                    sess.close()
                    sess = reset_driver(alias).session()
                    vals = sess.execute_read(sampler, owner, prop_name, 1)
                example = vals[0] if vals else None
                owner_examples[prop_name] = example if is_valid_example(example) else None
            collected[owner] = owner_examples
    finally:
        sess.close()
    return collected


def example_alias(alias):
    """Load the schema of ``alias`` and pre-compute example values for it.

    The structured schema is fetched with ``get_structured_schema`` (retried
    once with a fresh driver on ``AuthError``) and stored in
    ``SCHEMAS_BY_ALIAS``. One example per node and relationship property is
    then sampled and stored in ``EXAMPLES_BY_ALIAS`` as
    ``{"nodes": {...}, "rels": {...}}``.

    Returns:
        Tuple ``(schema, examples)`` – the same objects that were cached.
    """
    driver = get_driver(alias)

    # Fetch the schema
    try:
        schema = get_structured_schema(driver, is_enhanced=False)
    except AuthError as e:
        print(f"AuthError when getting schema for {alias}: {e}")
        driver = reset_driver(alias)
        schema = get_structured_schema(driver, is_enhanced=False)

    SCHEMAS_BY_ALIAS[alias] = schema

    examples = {
        "nodes": _sample_examples_for_alias(
            alias, schema.get("node_props", {}), get_sample, "node"),
        "rels": _sample_examples_for_alias(
            alias, schema.get("rel_props", {}), get_relationship_sample, "rel"),
    }

    EXAMPLES_BY_ALIAS[alias] = examples
    return SCHEMAS_BY_ALIAS[alias], EXAMPLES_BY_ALIAS[alias]

In [None]:
# Pre-compute schema and examples for every alias. A failure for one alias is
# printed and does not stop the remaining aliases from being processed.
for alias in DATABASE_ALIASES:
    try:
        example_alias(alias)
    except Exception as e:
        print(f"Failed to precompute {alias}: {e}")

**Format schema sang json**

In [None]:
def convert_schema_json_format(schema, precomputed_examples, alias, node_labels_to_include=None):
    """Build the unified ``{"nodes": ..., "relationships": ...}`` schema dict.

    Args:
        schema: Structured schema with ``node_props``, ``rel_props`` and
            ``relationships`` entries.
        precomputed_examples: ``{"nodes": {...}, "rels": {...}}`` as produced by
            ``example_alias``, or ``None``.
        alias: Database alias; used only if an example has to be sampled.
        node_labels_to_include: Labels to keep. ``None`` keeps every label in
            the schema. A set is sorted so the output order is deterministic.

    Returns:
        Dict with ``nodes`` (label -> list of property descriptions) and
        ``relationships`` (list of start/type/end/properties dicts). Only
        relationships whose start and end labels are both included are kept.

    Notes:
        * A property that is present in ``precomputed_examples`` is never
          re-queried, even when its cached example is ``None`` (sampled but
          rejected). Only properties missing from the cache are sampled.
        * The database session is opened lazily and replaced after an
          ``AuthError``, because ``reset_driver`` closes the old driver.
    """
    node_schema = schema.get("node_props", {})
    rel_schema = schema.get("rel_props", {})

    if node_labels_to_include is None:
        node_labels_to_include = list(node_schema.keys())
    elif isinstance(node_labels_to_include, (set, frozenset)):
        # Set iteration order is arbitrary; sort for a reproducible prompt.
        node_labels_to_include = sorted(node_labels_to_include)
    included_labels = set(node_labels_to_include)

    cached = precomputed_examples or {}
    ex_nodes = cached.get("nodes", {})
    ex_rels = cached.get("rels", {})

    # Mutable holder so the nested helpers can swap the session after a reset.
    session_box = {"session": None}

    def sample_on_demand(kind, sampler, owner, prop_name):
        """Query one example value for a property that is not in the cache."""
        if session_box["session"] is None:
            session_box["session"] = get_driver(alias).session()
        try:
            vals = session_box["session"].execute_read(sampler, owner, prop_name, 1)
        except AuthError as e:
            print(f"AuthError in convert_schema ({kind}) for {alias}: {e}")
            session_box["session"].close()
            session_box["session"] = reset_driver(alias).session()
            vals = session_box["session"].execute_read(sampler, owner, prop_name, 1)
        example = vals[0] if vals else None
        return example if is_valid_example(example) else None

    def describe_props(kind, sampler, owner, props, cache):
        """Turn schema property entries into property/type/example dicts."""
        owner_cache = cache.get(owner, {})
        described = []
        for p in props:
            prop_name = p.get("property")
            if not prop_name:
                continue

            # Prefer the pre-computed example; sample only when it is missing.
            if prop_name in owner_cache:
                example = owner_cache[prop_name]
            else:
                example = sample_on_demand(kind, sampler, owner, prop_name)

            example_str = str(example) if example is not None else None
            dtype = infer_type(example) if example_str else "STRING"

            described.append({
                "property": prop_name,
                "type": dtype,
                "example": example_str
            })
        return described

    unified_schema = {
        "nodes": {},
        "relationships": []
    }

    try:
        # Convert nodes
        for label in node_labels_to_include:
            unified_schema["nodes"][label] = describe_props(
                "node", get_sample, label, node_schema.get(label, []), ex_nodes)

        # Convert relationships
        for rel_info in schema.get("relationships", []):
            rel_type = rel_info.get("type")
            start_label = rel_info.get("start")
            end_label = rel_info.get("end")

            if start_label in included_labels and end_label in included_labels:
                unified_schema["relationships"].append({
                    "start": start_label,
                    "type": rel_type,
                    "end": end_label,
                    "properties": describe_props(
                        "rel", get_relationship_sample, rel_type,
                        rel_schema.get(rel_type, []), ex_rels)
                })
    finally:
        if session_box["session"] is not None:
            session_box["session"].close()

    return unified_schema

**Hàm schema linking**

In [None]:
def filter_schema_by_query(query_text, alias):
    """Return the unified schema of ``alias`` restricted to mentioned labels.

    The full schema is returned when the database has three node labels or
    fewer, or when no label is found in ``query_text``. Otherwise only the
    mentioned labels (and relationships between them) are kept.

    Raises:
        ValueError: if no schema has been cached for ``alias``.
    """
    if alias not in SCHEMAS_BY_ALIAS:
        raise ValueError(f"Schema not found for alias: {alias}")

    schema = SCHEMAS_BY_ALIAS[alias]
    examples = EXAMPLES_BY_ALIAS.get(alias)
    labels = list(schema.get("node_props", {}).keys())

    # None means "keep everything"; it is the outcome for small schemas and
    # for queries that mention no known label.
    selected = None
    if len(labels) > 3:
        selected = find_mentioned_nodes(query_text, labels) or None

    return convert_schema_json_format(schema, examples, alias, selected)

**Format schema sang markdown**

In [None]:
def convert_schema_markdown_format(schema_dict):
    """Render a unified schema dict as a markdown outline.

    Produces a ``### Nodes`` section listing every label with its properties
    and a ``### Relationships`` section listing every ``(start)-[:TYPE]->(end)``
    pattern with its properties. Properties with a truthy ``example`` get an
    ``Example: "..."`` suffix. Returns ``None`` for an empty/falsy input.
    """
    if not schema_dict:
        return None

    def render_props(props):
        # One indented bullet per property; same layout for nodes and rels.
        rendered = []
        for entry in props:
            bullet = f"  - `{entry['property']}`: {entry['type']}"
            sample = entry.get("example")
            if sample:
                bullet += f" Example: \"{sample}\""
            rendered.append(bullet)
        return rendered

    lines = ["### Nodes"]
    for label, props in schema_dict.get("nodes", {}).items():
        lines.append(f"- **{label}**")
        lines.extend(render_props(props))

    # Leading newline leaves a blank line between the two sections.
    lines.append("\n### Relationships")

    rels = schema_dict.get("relationships", [])
    if rels:
        for rel in rels:
            start, rel_type, end = rel["start"], rel["type"], rel["end"]
            lines.append(f"- **({start})-[:{rel_type}]->({end})**")
            lines.extend(render_props(rel.get("properties", [])))
    else:
        lines.append("- No relationships found")

    return "\n".join(lines).strip()

**Full schema format**

In [None]:
def get_full_schema_formatted(alias):
    """Return the complete schema of ``alias`` rendered as markdown.

    Raises:
        ValueError: if no schema has been cached for ``alias``.
    """
    if alias not in SCHEMAS_BY_ALIAS:
        raise ValueError(f"Schema not found for alias: {alias}")

    # Passing None as the label filter keeps every node label.
    unified = convert_schema_json_format(
        SCHEMAS_BY_ALIAS[alias],
        EXAMPLES_BY_ALIAS.get(alias),
        alias,
        None,
    )
    return convert_schema_markdown_format(unified)

**Chạy schema linking 1 dòng**

In [None]:
def one_row_filter(query_text, alias):
    """Run schema linking for one row and return the result as markdown.

    Filters the schema of ``alias`` by the labels mentioned in ``query_text``
    and formats it. Any exception is printed and turned into ``None``, as is
    an empty filtered schema.
    """
    try:
        linked = filter_schema_by_query(query_text, alias)
        return convert_schema_markdown_format(linked) if linked else None
    except Exception as err:
        print(f"Error processing {alias}: {err}")
        return None

**Tạo prompt**

In [None]:
def prompt(question, schema):
    """Build the chat messages (system + user) for Cypher generation.

    Returns a two-element list of ``{"role", "content"}`` dicts: a fixed system
    instruction followed by a user message embedding ``schema`` and
    ``question``.
    """
    system_text = (
        "Task: Generate a Cypher statement to query a graph database. "
        "Instructions: Use only the provided relationship types and properties in the schema. "
        "Do not use any other relationship types or properties that are not provided in the schema. "
        "Do not include any explanations or apologies in your responses. "
        "Do not respond to any questions that ask anything other than constructing a Cypher statement. "
        "Do not include any text except the generated Cypher statement."
    )

    user_text = "\n".join([
        "Generate Cypher statement to query a graph database. "
        "Use only the provided relationship types and properties in the schema.",
        f"Schema: {schema}",
        f"Question: {question}",
        "Cypher output:",
    ])

    return [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]

print("✓ prompt loaded")

✓ prompt loaded


**Tạo cypher**

In [None]:
def _clean_cypher_response(content):
    """Reduce a raw model reply to a single-line Cypher statement.

    Steps:
      1. If the reply contains a fenced code block, keep only its content.
      2. Drop everything before the first ``MATCH`` keyword (any casing).
      3. Collapse all whitespace to single spaces and strip a trailing ``;``.

    NOTE(review): step 2 discards any text in front of the first MATCH, so a
    statement that legitimately starts with another clause followed by MATCH
    (e.g. ``OPTIONAL MATCH``) loses that prefix. Kept as in the original logic.
    """
    fenced = re.search(r'```(?:cypher)?\s*(.*?)```', content, re.DOTALL | re.IGNORECASE)
    if fenced:
        content = fenced.group(1).strip()

    # Truly case-insensitive: the previous pattern only matched the two
    # spellings "MATCH" and "match" and missed e.g. "Match".
    first_match = re.search(r'\bMATCH\b', content, re.IGNORECASE)
    if first_match:
        content = content[first_match.start():]

    content = re.sub(r'\s+', ' ', content.strip())
    return content.rstrip(';').strip()


def generate_cypher(question, schema):
    """Ask the local Ollama model for a Cypher statement.

    Args:
        question: Natural-language question.
        schema: Schema text that is embedded in the prompt.

    Returns:
        The cleaned, single-line Cypher statement, or the string ``"error"``
        when the API answers with a non-200 status or any exception occurs
        while calling it or processing the reply.
    """
    messages = prompt(question, schema)

    try:
        # Call the Ollama chat API
        response = requests.post(
            'http://localhost:11434/api/chat',
            json={
                "model": "qwen2.5-coder:14b",
                "messages": messages,
                "stream": False,
                "options": {
                    "temperature": 0.1,
                    "top_p": 0.9,
                    "num_predict": 256
                }
            }
        )

        if response.status_code != 200:
            print(f"[ERROR] API returned status {response.status_code}")
            return "error"

        result = response.json()
        return _clean_cypher_response(result['message']['content'])

    except Exception as e:
        print(f"[ERROR] Generation failed: {e}")
        return "error"

print("✓ generate_cypher loaded")

✓ generate_cypher loaded


In [None]:
def generate_cypher2(question, alias=None, schema=None, timeout=900):
    """Generate Cypher in two passes with schema linking in between.

    Without a usable ``alias`` (missing, NaN or blank) a single generation is
    run against the given ``schema``; if ``schema`` is ``None`` as well the
    result is ``("error", None)``.

    With an alias: generate once against the full schema, link the schema
    using the labels mentioned in that first Cypher, then generate again with
    the linked schema. If the second generation fails, the first Cypher is
    returned. The whole sequence is limited to ``timeout`` seconds.

    Returns:
        Tuple ``(cypher, schema_used)``; ``("time_error", None)`` on timeout.
    """
    alias_missing = pd.isna(alias) or alias is None or str(alias).strip() == ''
    if alias_missing:
        # Single pass only; no schema linking is possible without an alias.
        if schema is None:
            return ("error", schema)
        return (generate_cypher(question, schema), schema)

    def _generate():
        full_schema = get_full_schema_formatted(alias)

        # Pass 1: full schema
        first_cypher = generate_cypher(question, full_schema)
        if first_cypher == "error":
            return (first_cypher, None)

        # Schema linking from the first Cypher (done exactly once);
        # fall back to the full schema when linking yields nothing.
        linked_schema = one_row_filter(first_cypher, alias)
        if linked_schema is None:
            linked_schema = full_schema

        # Pass 2: linked schema; keep the first answer if this one fails.
        second_cypher = generate_cypher(question, linked_schema)
        chosen = first_cypher if second_cypher == "error" else second_cypher
        return (chosen, linked_schema)

    try:
        return func_timeout(timeout, _generate)
    except FunctionTimedOut:
        return ("time_error", None)

**Dòng mẫu**

In [None]:
# Lấy test case đầu tiên
# Smoke test on the first row of the test set: print the question, the
# generated Cypher, the expected Cypher and the schema used for generation.
first_row = test_df.iloc[0]
test_question = first_row['question']
test_schema = first_row['schema']
test_alias = first_row['database_reference_alias']

print("="*80)
print("TEST QUESTION:")
print("="*80)
print(test_question)

# ============================================================================
# Test 1: Cypher
# ============================================================================
print("\n" + "="*80)
print("TEST 1: Cypher")
print("="*80)
result = generate_cypher2(test_question, test_alias, test_schema, 900)
print(result[0])

# ============================================================================
# Test 2: Expected Cypher
# ============================================================================
print("\n" + "="*80)
# Banner fixed: this section was mislabelled "TEST 3".
print("TEST 2: Expected Cypher")
print("="*80)
if 'cypher' in test_df.columns:
    print(first_row['cypher'])
else:
    print("N/A")

# ============================================================================
# Test 3: Sub Schema
# ============================================================================
print("\n" + "="*80)
print("TEST 3: Sub Schema")
print("="*80)
if result[1] is not None:
    print(result[1])
else:
    print("N/A")

TEST QUESTION:
Identify the 5 suppliers with the highest average unit price of products supplied.

TEST 1: Cypher
MATCH (s:Supplier)-[:SUPPLIES]->(p:Product) RETURN s.supplierID, s.companyName, AVG(p.unitPrice) AS avgUnitPrice ORDER BY avgUnitPrice DESC LIMIT 5

TEST 3: Expected Cypher
MATCH (s:Supplier)-[:SUPPLIES]->(p:Product) WITH s, avg(p.unitPrice) AS avgUnitPrice ORDER BY avgUnitPrice DESC LIMIT 5 RETURN s.companyName AS Supplier, avgUnitPrice AS AverageUnitPrice

TEST 3: Sub Schema
### Nodes
- **Order**
  - `shipName`: STRING
  - `requiredDate`: STRING
  - `shipCity`: STRING Example: "Reims"
  - `employeeID`: STRING Example: "5"
  - `shipPostalCode`: STRING Example: "51100"
  - `shippedDate`: STRING
  - `freight`: STRING Example: "32.38"
  - `orderDate`: STRING
  - `orderID`: STRING Example: "10248"
  - `shipAddress`: STRING
  - `customerID`: STRING Example: "VINET"
  - `shipCountry`: STRING Example: "France"
  - `shipVia`: STRING Example: "3"
  - `shipRegion`: STRING
- **Suppli

**Reset server mỗi 500 dòng**

In [None]:
def restart_ollama_server():
    """Stop the running ``ollama serve`` process and start a new one.

    The global ``ollama_process`` is terminated (killed if it does not exit
    within 10 seconds) and replaced by a freshly started process. The function
    then polls the HTTP endpoint for up to ~30 attempts before returning.

    Changes compared with the previous version:
      * no bare ``except``; only the expected timeout / OS errors are handled;
      * the killed process is waited for so it does not linger as a zombie;
      * server output goes to DEVNULL, since an unread PIPE can fill up and
        block the server;
      * readiness is polled instead of assumed after a fixed sleep.
    """
    global ollama_process

    print("[RESTART] Đang restart Ollama server...")

    # Stop the old process (if one was ever started in this kernel)
    previous = globals().get('ollama_process')
    if previous is not None:
        try:
            previous.terminate()
            previous.wait(timeout=10)
        except subprocess.TimeoutExpired:
            previous.kill()
            previous.wait()
        except OSError as stop_err:
            print(f"[RESTART] Không thể dừng process cũ: {stop_err}")

    time.sleep(3)

    # Start a new server
    ollama_process = subprocess.Popen(['ollama', 'serve'],
                                       stdout=subprocess.DEVNULL,
                                       stderr=subprocess.DEVNULL)

    # Any HTTP response means the server accepts requests again.
    server_ready = False
    for _ in range(30):
        try:
            requests.get('http://localhost:11434', timeout=2)
            server_ready = True
            break
        except requests.exceptions.RequestException:
            time.sleep(1)

    if server_ready:
        print("[RESTART] Ollama server đã khởi động lại!")
    else:
        print("[RESTART] Cảnh báo: Ollama server chưa phản hồi sau khi restart")

In [None]:
def _is_pending_result(value):
    """Return True when a ``cypher_generated`` cell holds no result yet.

    Missing values, blank strings and the stringified placeholders ``'nan'``,
    ``'None'`` and ``'<NA>'`` all count as "not processed".
    """
    if pd.isna(value):
        return True
    text = str(value)
    return text.strip() == '' or text in ('nan', 'None', '<NA>')


def _count_outcomes(series):
    """Return ``(success, error, timeout)`` counts for a result series."""
    pending = series.map(_is_pending_result).astype(bool)
    error = (series == 'error')
    timed_out = (series == 'time_error')
    success = ~(pending | error | timed_out)
    return int(success.sum()), int(error.sum()), int(timed_out.sum())


def run_batch(
    test_df,
    checkpoint_path,
    timeout=900,
    log_interval=100,
    restart_interval=500
):
    """
    Run batch Cypher generation with checkpointing and periodic server restarts.

    Every row whose ``cypher_generated`` cell is still pending (missing, blank,
    ``'nan'``, ``'None'`` or ``'<NA>'``) is processed with ``generate_cypher2``.
    The checkpoint CSV is written every ``log_interval`` processed rows, before
    every server restart (every ``restart_interval`` processed rows), at the
    end, and also when the loop is interrupted by an exception.

    Args:
        test_df: DataFrame with ``question``, ``database_reference_alias`` and
            ``schema`` columns; used only when no checkpoint file exists.
        checkpoint_path: CSV path used to load and store progress.
        timeout: Per-row timeout in seconds passed to ``generate_cypher2``.
        log_interval: Number of processed rows between progress logs.
        restart_interval: Number of processed rows between server restarts.

    Returns:
        The DataFrame including the ``cypher_generated`` column.
    """
    # ==========================================================================
    # STEP 1: load the checkpoint or create a new one
    if os.path.exists(checkpoint_path):
        print(f"[CHECKPOINT] Tìm thấy file checkpoint: {checkpoint_path}")
        df = pd.read_csv(checkpoint_path, encoding="utf-8-sig")
        df['cypher_generated'] = df['cypher_generated'].astype(str)
        print(f"[CHECKPOINT] Đã load {len(df)} dòng từ checkpoint")

        pending_mask = df['cypher_generated'].map(_is_pending_result).astype(bool)
        processed_count = int((~pending_mask).sum())
        print(f"[CHECKPOINT] Đã xử lý: {processed_count}/{len(df)} dòng")

    else:
        print(f"[CHECKPOINT] Không tìm thấy checkpoint, tạo mới từ test_df")
        # Positional access below relies on a 0..n-1 index.
        df = test_df.copy().reset_index(drop=True)
        # Start with empty strings. Casting pd.NA with astype(str) can yield the
        # literal '<NA>', which the old "already processed" check accepted as a
        # result, so a fresh run skipped every row.
        df['cypher_generated'] = ''

        df.to_csv(checkpoint_path, index=False, encoding='utf-8-sig')
        print(f"[CHECKPOINT] Đã tạo file checkpoint: {checkpoint_path}")

    # ==========================================================================
    # STEP 2: process the rows that have no result yet
    total_rows = len(df)
    batch_start_idx = 0
    processed_since_last_log = 0
    processed_since_last_restart = 0
    actual_processed = 0

    print(f"\n{'='*80}")
    print(f"BẮT ĐẦU XỬ LÝ - Tổng số dòng: {total_rows}")
    print(f"Timeout per query: {timeout}s")
    print(f"Restart mỗi: {restart_interval} dòng")
    print(f"{'='*80}\n")

    restart_ollama_server()
    start_time = time.time()

    try:
        for idx in range(total_rows):
            if not _is_pending_result(df.at[idx, 'cypher_generated']):
                continue

            # ------------------------------------------------------------------
            # Row without a result: generate it
            actual_processed += 1
            print(f"[Processing] Dòng {idx}...", end=" ", flush=True)

            try:
                question = df.at[idx, 'question']
                alias = df.at[idx, 'database_reference_alias']
                schema = df.at[idx, 'schema']

                result = generate_cypher2(question, alias, schema, timeout)
                cypher_result = result[0]

                df.at[idx, 'cypher_generated'] = cypher_result

                if cypher_result == "error":
                    print("ERROR")
                elif cypher_result == "time_error":
                    print("TIME ERROR")
                else:
                    print("SUCCESS")

            except Exception as e:
                print(f"ERROR - {str(e)}")
                df.at[idx, 'cypher_generated'] = "error"

            # Counted whether the row succeeded or failed.
            processed_since_last_log += 1
            processed_since_last_restart += 1

            # ------------------------------------------------------------------
            # Restart the Ollama server every `restart_interval` processed rows
            if processed_since_last_restart >= restart_interval:
                df.to_csv(checkpoint_path, index=False, encoding='utf-8-sig')
                print(f"[CHECKPOINT] Đã lưu trước khi restart")

                restart_ollama_server()

                processed_since_last_restart = 0
                processed_since_last_log = 0

            # ------------------------------------------------------------------
            # Log statistics and save the checkpoint
            elif processed_since_last_log >= log_interval:
                df.to_csv(checkpoint_path, index=False, encoding='utf-8-sig')

                elapsed_time = time.time() - start_time

                # Estimate from the rows processed in this run and the rows
                # that are still pending in the whole frame.
                avg_time_per_row = elapsed_time / actual_processed
                remaining_to_process = int(
                    df['cypher_generated'].map(_is_pending_result).astype(bool).sum()
                )
                estimated_time = avg_time_per_row * remaining_to_process

                # Outcome counts for the rows covered since the previous log.
                batch_df = df.iloc[batch_start_idx:idx+1]
                batch_success, batch_error, batch_timeout = _count_outcomes(
                    batch_df['cypher_generated'])

                print(f"\n{'='*80}")
                print(f"[LOG] Dòng {batch_start_idx}-{idx}")
                print(f"{'='*80}")
                print(f"Thành công:     {batch_success}")
                print(f"Error:          {batch_error}")
                print(f"Timeout Error:  {batch_timeout}")
                print(f"Tổng xử lý:     {batch_success + batch_error + batch_timeout}")
                print(f"Tiến độ:        {idx + 1}/{total_rows} ({(idx + 1)/total_rows*100:.2f}%)")
                print(f"Thời gian:      {elapsed_time/60:.2f} phút")
                print(f"Ước tính còn:   {estimated_time/60:.2f} phút")
                print(f"[CHECKPOINT] Đã lưu sau {processed_since_last_log} dòng")
                print(f"{'='*80}\n")

                batch_start_idx = idx + 1
                processed_since_last_log = 0

    finally:
        # ======================================================================
        # Final checkpoint; also runs when the loop is interrupted, so rows
        # processed since the last save are not lost.
        if processed_since_last_log > 0:
            df.to_csv(checkpoint_path, index=False, encoding='utf-8-sig')
            print(f"[CHECKPOINT] Đã lưu {processed_since_last_log} dòng cuối cùng")

    # ==========================================================================
    # Final summary
    total_time = time.time() - start_time

    final_success, final_error, final_timeout = _count_outcomes(df['cypher_generated'])

    def pct(count):
        # Avoid ZeroDivisionError for an empty frame.
        return count / total_rows * 100 if total_rows else 0.0

    print(f"\n{'='*80}")
    print(f"HOÀN THÀNH")
    print(f"{'='*80}")
    print(f"Tổng số dòng:        {total_rows}")
    print(f"Thành công:          {final_success} ({pct(final_success):.2f}%)")
    print(f"Error:               {final_error} ({pct(final_error):.2f}%)")
    print(f"Timeout Error:       {final_timeout} ({pct(final_timeout):.2f}%)")
    print(f"Tổng thời gian:      {total_time/60:.2f} phút")
    print(f"{'='*80}")

    return df

print("✓ run_batch fixed loaded - Dòng rỗng sẽ được xử lý lại")

✓ run_batch fixed loaded - Dòng rỗng sẽ được xử lý lại


In [None]:
# Run (or resume) the batch: progress is logged and saved every 50 processed
# rows and the Ollama server is restarted every 250 processed rows.
result_df = run_batch(
    test_df=test_df,
    checkpoint_path=checkpoint_path,
    timeout=900,
    log_interval=50,
    restart_interval=250
)

[CHECKPOINT] Tìm thấy file checkpoint: /content/drive/MyDrive/T2C_qwen214b_bs_schema/qwen214b_bs_schema.csv
[CHECKPOINT] Đã load 4833 dòng từ checkpoint
[CHECKPOINT] Đã xử lý: 4250/4833 dòng

BẮT ĐẦU XỬ LÝ - Tổng số dòng: 4833
Timeout per query: 900s
Restart mỗi: 250 dòng

[RESTART] Đang restart Ollama server...
[RESTART] Ollama server đã khởi động lại!
[Processing] Dòng 4250... SUCCESS
[Processing] Dòng 4251... SUCCESS
[Processing] Dòng 4252... SUCCESS
[Processing] Dòng 4253... SUCCESS
[Processing] Dòng 4254... SUCCESS
[Processing] Dòng 4255... SUCCESS
[Processing] Dòng 4256... SUCCESS
[Processing] Dòng 4257... SUCCESS
[Processing] Dòng 4258... SUCCESS
[Processing] Dòng 4259... SUCCESS
[Processing] Dòng 4260... SUCCESS
[Processing] Dòng 4261... SUCCESS
[Processing] Dòng 4262... SUCCESS
[Processing] Dòng 4263... SUCCESS
[Processing] Dòng 4264... SUCCESS
[Processing] Dòng 4265... SUCCESS
[Processing] Dòng 4266... SUCCESS
[Processing] Dòng 4267... SUCCESS
[Processing] Dòng 4268... SUCCES