In [1]:
%%capture 
!pip install neo4j
!pip install func_timeout
!pip install rouge-score
!pip install evaluate

In [3]:
from neo4j.exceptions import CypherSyntaxError, Neo4jError, ServiceUnavailable
from pandas.testing import assert_series_equal, assert_frame_equal
from func_timeout import func_timeout, FunctionTimedOut
from rouge_score import rouge_scorer
from collections import Counter
from neo4j import GraphDatabase
import pandas as pd
import numpy as np
import re
import os

from evaluate import load
# Load the `evaluate` BLEU metric once; calculate_metrics() reuses this object
# for every row instead of reloading it.
bleu_metric = load("bleu")

E0000 00:00:1766153856.110683      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766153856.171806      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766153856.683866      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766153856.683902      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766153856.683905      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766153856.683907      55 computation_placer.cc:177] computation placer already registered. Please check linka

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [4]:
import logging
# Only let messages of level ERROR or higher through the "neo4j" logger.
logging.getLogger("neo4j").setLevel(logging.ERROR)

import warnings
# Ignore every warning whose category is UserWarning (or a subclass).
warnings.filterwarnings('ignore', category=UserWarning)

# **Load Data**

In [5]:
# CSV read as evaluation input (also re-read inside run_evaluation()).
INPUT_CSV_PATH = "/kaggle/input/test1row-v2/test.csv"
# CSV that run_evaluation() writes results/checkpoints to and reloads if it already exists.
OUTPUT_CSV_PATH = "/kaggle/working/test.csv" 
# A checkpoint is written whenever the 1-based row number is a multiple of this value.
CHECKPOINT_INTERVAL = 100

# Connection URI passed to GraphDatabase.driver() for every alias.
URI = "neo4j+s://demo.neo4jlabs.com:7687"
# Cache of driver objects keyed by database alias; filled by get_driver(),
# emptied by close_all_drivers().
DRIVERS_BY_ALIAS = {}

In [6]:
# Sanity check: read the input CSV (utf-8-sig strips a leading BOM) and report its row count.
# NOTE(review): df_input is not referenced by the cells visible below; run_evaluation()
# reads the file again on its own.
df_input = pd.read_csv(INPUT_CSV_PATH, encoding="utf-8-sig")
print(f"Total rows: {len(df_input)}")

Total rows: 1


# **Driver helper**

In [7]:
def extract_driver(alias):
    """Derive the (username, password) pair for a demo-database alias.

    The alias must start with ``neo4jlabs_demo_db_``; whatever follows that
    prefix is returned twice, once as the username and once as the password.

    Args:
        alias: Database alias string, e.g. ``"neo4jlabs_demo_db_movies"``.

    Returns:
        tuple[str, str]: ``(name, name)`` where ``name`` is the alias without
        its leading prefix.

    Raises:
        ValueError: If ``alias`` does not start with the expected prefix.
    """
    prefix = "neo4jlabs_demo_db_"
    if not alias.startswith(prefix):
        raise ValueError(f"Unsupported database alias: {alias}")

    # Slice off only the leading prefix. str.replace() would also delete any
    # later occurrence of the prefix inside the database name itself.
    name = alias[len(prefix):]
    return name, name

def get_driver(alias):
    """Return the driver for ``alias``, creating and caching it on first use.

    Drivers are stored in the module-level ``DRIVERS_BY_ALIAS`` dict so each
    alias is connected at most once until ``close_all_drivers()`` runs.

    Args:
        alias: Database alias understood by ``extract_driver``.

    Returns:
        The driver object produced by ``GraphDatabase.driver``.

    Raises:
        ValueError: If ``alias`` is empty/NaN, or ``extract_driver`` rejects it.
        Exception: Anything raised while constructing the driver propagates
            unchanged.
    """
    if not alias or pd.isna(alias):
        raise ValueError("Database alias cannot be empty")

    # Fast path: reuse an existing driver when this alias was seen before.
    try:
        return DRIVERS_BY_ALIAS[alias]
    except KeyError:
        pass

    user, secret = extract_driver(alias)
    new_driver = GraphDatabase.driver(URI, auth=(user, secret))
    DRIVERS_BY_ALIAS[alias] = new_driver
    return new_driver

def close_all_drivers():
    """Close every cached driver and empty ``DRIVERS_BY_ALIAS``.

    A failure while closing one driver is reported with a ``[WARN]`` line and
    does not stop the remaining drivers from being closed.
    """
    # Work on a snapshot so the cache is only mutated once, by clear() below.
    snapshot = list(DRIVERS_BY_ALIAS.items())
    for name, handle in snapshot:
        try:
            handle.close()
        except Exception as err:
            print(f"[WARN] Error closing driver for {name}: {err}")

    DRIVERS_BY_ALIAS.clear()
    print("[INFO] All drivers closed.")

# **Validate/Execution Helper**

In [8]:
def explain_cypher(cypher, database_alias, timeout_s=30.0):
    """Check whether the database accepts ``cypher`` by running ``EXPLAIN`` on it.

    Args:
        cypher: Query text. Non-strings (``None``, NaN from pandas), the
            sentinel ``"error"`` and blank strings are rejected up front.
        database_alias: Alias passed to ``get_driver``.
        timeout_s: Transaction timeout in seconds applied to the EXPLAIN call.

    Returns:
        bool: ``True`` if the EXPLAIN statement ran without raising, ``False``
        otherwise. Any exception (including connection problems) yields
        ``False``.
    """
    if not isinstance(cypher, str) or cypher == "error" or not cypher.strip():
        return False

    if pd.isna(database_alias) or not database_alias:
        return False

    try:
        # Imported here so the block stays self-contained; neo4j is already a
        # dependency of this notebook.
        from neo4j import Query

        driver = get_driver(database_alias)
        with driver.session() as session:
            # Keyword arguments to session.run() are Cypher *parameters*, so a
            # bare `timeout=` never limited anything. A Query object carries
            # the real transaction timeout (in seconds).
            result = session.run(Query(f"EXPLAIN {cypher}", timeout=timeout_s))
            # Drain the result so any server-side error surfaces inside this try.
            result.consume()
        return True
    except Exception:
        return False

def execute_cypher(cypher, database_alias, timeout_s=30.0):
    """Run ``cypher`` against the database behind ``database_alias``.

    Args:
        cypher: Query text. Non-strings (``None``, NaN from pandas), the
            sentinel ``"error"`` and blank strings are rejected up front.
        database_alias: Alias passed to ``get_driver``.
        timeout_s: Transaction timeout in seconds for the query.

    Returns:
        tuple: ``(success, records, error)`` where
            * ``success`` is a bool,
            * ``records`` is the list returned by ``Result.data()`` on success,
              otherwise ``None``,
            * ``error`` is ``None`` on success, the literal ``"TIMEOUT"`` when
              the failure looks like a timeout, or the exception text.
    """
    if not isinstance(cypher, str) or cypher == "error" or not cypher.strip():
        return False, None, "Empty query"

    if pd.isna(database_alias) or not database_alias:
        return False, None, "Invalid database alias"

    try:
        # Imported here so the block stays self-contained; neo4j is already a
        # dependency of this notebook.
        from neo4j import Query

        driver = get_driver(database_alias)
        with driver.session() as session:
            # Keyword arguments to session.run() are Cypher *parameters*, so a
            # bare `timeout=` never limited anything. A Query object carries
            # the real transaction timeout (in seconds).
            result = session.run(Query(cypher, timeout=timeout_s))
            records = result.data()
        return True, records, None

    except Exception as e:
        message = str(e)
        lowered = message.lower()
        # Also look at the error code when the exception exposes one.
        code = str(getattr(e, "code", "") or "").lower()
        if 'timeout' in lowered or 'timed out' in lowered or 'timedout' in code:
            return False, None, "TIMEOUT"
        return False, None, message

# **Comparison Helper**

In [9]:
def normalize_cypher_result(data, cypher_query=None):
    """Turn raw query records into a canonical DataFrame for comparison.

    Steps, in order: build a DataFrame, drop duplicate rows, coerce columns to
    numeric where possible, round float columns to 6 decimals, lower-case and
    strip object (string) columns, order columns alphabetically, sort rows
    unless the query text contains ORDER BY or LIMIT, and replace missing
    values with ``-99999``.

    Args:
        data: Records (e.g. a list of dicts) or a DataFrame. ``None`` and an
            empty list give an empty DataFrame.
        cypher_query: Optional query text; only inspected for the keywords
            ``ORDER BY`` and ``LIMIT`` (case-insensitive).

    Returns:
        pandas.DataFrame: The normalised frame with a fresh 0..n-1 index.
    """
    if data is None or (isinstance(data, list) and len(data) == 0):
        return pd.DataFrame()

    try:
        df = pd.DataFrame(data)
    except Exception:
        return pd.DataFrame()

    if df.empty:
        return df

    try:
        df = df.drop_duplicates()
    except TypeError:
        # Cells holding dicts/lists (returned nodes, collect() results) are
        # unhashable and make drop_duplicates() raise. Deduplicate on their
        # string form instead so such results can still be compared.
        duplicate_mask = df.astype(str).duplicated()
        df = df.loc[~duplicate_mask].copy()

    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Column is not numeric; leave it untouched.
            pass

    numeric_cols = df.select_dtypes(include=['float64', 'float32', 'float16']).columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].round(6)

    str_cols = df.select_dtypes(include=['object']).columns
    for col in str_cols:
        df[col] = df[col].astype(str).str.strip().str.lower()

    has_order_by = False
    has_limit = False
    if cypher_query:
        has_order_by = bool(re.search(r'\bORDER BY\b', cypher_query, re.IGNORECASE))
        has_limit = bool(re.search(r'\bLIMIT\b', cypher_query, re.IGNORECASE))

    df = df.reindex(sorted(df.columns), axis=1)

    # Row order is kept as returned when the query has ORDER BY or LIMIT;
    # otherwise rows are sorted so order differences do not matter.
    if not has_order_by and not has_limit:
        try:
            # Sort on string representations so mixed-type columns are comparable.
            sort_df = df.copy()
            for col in sort_df.columns:
                sort_df[col] = sort_df[col].astype(str)
            sort_df = sort_df.sort_values(by=list(sort_df.columns))
            df = df.reindex(sort_df.index)
            df = df.reset_index(drop=True)
        except Exception:
            # Best effort: fall back to the original row order.
            pass

    df = df.fillna(-99999)
    df = df.reset_index(drop=True)

    return df

def compare_values_match(df_pred, df_gt):
    """Compare two result frames cell by cell, ignoring column names.

    Columns of each frame are put in alphabetical order by name, then the raw
    value arrays are compared with ``numpy.array_equal``.

    Args:
        df_pred: DataFrame for the predicted query.
        df_gt: DataFrame for the ground-truth query.

    Returns:
        tuple[bool, str]: ``(matched, detail)`` where ``detail`` is one of
        ``'different_shape'``, ``'both_empty'``, ``'values_match'``,
        ``'different_results'`` or ``'comparison_error_<message prefix>'``.
    """
    if df_pred.shape != df_gt.shape:
        return False, 'different_shape'

    if df_pred.empty and df_gt.empty:
        return True, 'both_empty'

    try:
        pred_matrix = df_pred.reindex(columns=sorted(df_pred.columns)).values
        gt_matrix = df_gt.reindex(columns=sorted(df_gt.columns)).values

        if np.array_equal(pred_matrix, gt_matrix):
            return True, 'values_match'
        return False, 'different_results'

    except Exception as err:
        return False, f'comparison_error_{str(err)[:30]}'

def smart_subset_df(df_sub, df_super, query_sub=None, query_super=None):
    """Check whether every column of ``df_sub`` appears among ``df_super``'s columns.

    Phase 1 pairs each ``df_sub`` column with the first not-yet-used
    ``df_super`` column whose sorted values are equal (approximate comparison,
    dtype and name ignored). Phase 2 normalises both the subset and the paired
    superset columns and requires the resulting frames to be equal.

    Args:
        df_sub: Frame with the candidate subset of columns.
        df_super: Frame expected to contain those columns.
        query_sub: Query text forwarded to ``normalize_cypher_result`` for ``df_sub``.
        query_super: Query text forwarded to ``normalize_cypher_result`` for the
            paired ``df_super`` columns.

    Returns:
        tuple[bool, list, str]: ``(is_subset, matched_columns, detail)``.
        ``matched_columns`` is empty for the early failures and holds the paired
        ``df_super`` column names once phase 1 succeeded.
    """
    if df_sub.empty:
        return False, [], 'empty_subset'
    if df_super.empty:
        return False, [], 'empty_superset'

    df_sub = df_sub.copy()
    remaining = df_super.copy()
    paired_columns = []

    for sub_name in df_sub.columns:
        for super_name in remaining.columns:
            try:
                sub_sorted = df_sub[sub_name].sort_values().reset_index(drop=True)
                super_sorted = remaining[super_name].sort_values().reset_index(drop=True)
                assert_series_equal(
                    sub_sorted,
                    super_sorted,
                    check_dtype=False,
                    check_names=False,
                    check_exact=False,
                )
                paired_columns.append(super_name)
                # A superset column can be paired with only one subset column.
                remaining = remaining.drop(columns=[super_name])
                break
            except (AssertionError, ValueError, TypeError):
                continue
        else:
            # Inner loop finished without a break: nothing matched this column.
            return False, [], f'no_match_for_column_{sub_name}'

    try:
        sub_normalized = normalize_cypher_result(df_sub, query_sub)
        super_selected = df_super[paired_columns].copy()
        super_selected.columns = df_sub.columns
        super_normalized = normalize_cypher_result(super_selected, query_super)
        assert_frame_equal(
            sub_normalized,
            super_normalized,
            check_dtype=False,
            check_exact=False,
        )
        return True, paired_columns, 'subset_match_success'
    except (AssertionError, ValueError, TypeError) as err:
        return False, paired_columns, f'subset_values_mismatch_{str(err)[:30]}'

def _deep_result_size(obj):
    """Approximate the total in-memory size, in bytes, of nested result data.

    ``sys.getsizeof`` on its own measures only the outer container, so this
    walks dicts, lists, tuples and sets and adds up every distinct object once.
    """
    import sys

    total = 0
    seen = set()
    pending = [obj]
    while pending:
        current = pending.pop()
        if id(current) in seen:
            continue
        seen.add(id(current))
        total += sys.getsizeof(current)
        if isinstance(current, dict):
            pending.extend(current.keys())
            pending.extend(current.values())
        elif isinstance(current, (list, tuple, set, frozenset)):
            pending.extend(current)
    return total


def check_execution_accuracy(pred_cypher, gt_cypher, database_alias,
                             max_rows=10000, max_bytes=100 * 1024 * 1024):
    """Execute the predicted and ground-truth queries and compare their results.

    Args:
        pred_cypher: Predicted query text.
        gt_cypher: Ground-truth query text.
        database_alias: Alias forwarded to ``execute_cypher``.
        max_rows: Result sets with more rows than this are not compared.
        max_bytes: Result sets whose approximate size exceeds this many bytes
            are not compared.

    Returns:
        tuple: ``(is_correct, None, None, category, detail)``. The second and
        third slots are always ``None``. ``category`` is one of
        ``'values_match'``, ``'value_subset_match'``, ``'different_results'``,
        ``'execution_timeout'``, ``'execution_error'`` or ``'can_not_execute'``;
        ``detail`` is a short free-text explanation.
    """
    try:
        pred_success, pred_data, pred_error = execute_cypher(pred_cypher, database_alias)

        if pred_error == "TIMEOUT":
            return False, None, None, 'execution_timeout', 'pred_query_timeout'

        gt_success, gt_data, gt_error = execute_cypher(gt_cypher, database_alias)

        if gt_error == "TIMEOUT":
            return False, None, None, 'execution_timeout', 'gt_query_timeout'

        if not pred_success or not gt_success:
            return False, None, None, 'execution_error', 'query_execution_failed'

        if pred_data and len(pred_data) > max_rows:
            return False, None, None, 'can_not_execute', f'pred_too_large_{len(pred_data)}_rows'
        if gt_data and len(gt_data) > max_rows:
            return False, None, None, 'can_not_execute', f'gt_too_large_{len(gt_data)}_rows'

        # Measure the records themselves, not just the list that holds them;
        # a shallow sys.getsizeof() could never reach the byte limit.
        pred_size = _deep_result_size(pred_data) if pred_data else 0
        gt_size = _deep_result_size(gt_data) if gt_data else 0

        if pred_size > max_bytes:
            return False, None, None, 'can_not_execute', f'pred_memory_{pred_size//1024//1024}MB'
        if gt_size > max_bytes:
            return False, None, None, 'can_not_execute', f'gt_memory_{gt_size//1024//1024}MB'

        df_pred = pd.DataFrame(pred_data) if pred_data else pd.DataFrame()
        df_gt = pd.DataFrame(gt_data) if gt_data else pd.DataFrame()

        df_pred_norm = normalize_cypher_result(df_pred, pred_cypher)
        df_gt_norm = normalize_cypher_result(df_gt, gt_cypher)

        is_values_match, detail = compare_values_match(df_pred_norm, df_gt_norm)

        if is_values_match:
            return True, None, None, 'values_match', detail

        # Fallback: the prediction may return fewer columns than the ground
        # truth while agreeing on the ones it does return.
        if len(df_pred_norm.columns) < len(df_gt_norm.columns) and len(df_pred_norm) == len(df_gt_norm):
            is_subset, _, subset_detail = smart_subset_df(
                df_pred_norm, df_gt_norm, pred_cypher, gt_cypher
            )

            if is_subset:
                return True, None, None, 'value_subset_match', f'subset_{subset_detail}'

        return False, None, None, 'different_results', 'no_match'

    except Exception as e:
        return False, None, None, 'execution_error', f'unexpected_{str(e)[:30]}'

# **Metrics Helper**

In [10]:
def normalize_cypher_metrics(cypher: str) -> str:
    """Lightly normalise a Cypher string for the text-similarity metrics.

    Single-quoted spans become double-quoted, the text is lower-cased,
    trailing semicolons are removed and every whitespace run collapses to one
    space.

    Args:
        cypher: Query text; any falsy value yields ``""``.

    Returns:
        str: The normalised query.
    """
    if not cypher:
        return ""

    text = re.sub(r"'([^']*)'", r'"\1"', cypher.strip()).lower()
    text = text.rstrip(';')
    return re.sub(r'\s+', ' ', text).strip()

def normalize_cypher_exectmatch(cypher):
    """Aggressively normalise a Cypher string for exact-match comparison.

    Same steps as the light normalisation (quote conversion, lower-casing,
    trailing-semicolon removal) except that ALL whitespace is deleted rather
    than collapsed to a single space. Text inside string literals is
    lower-cased and stripped of whitespace as well.

    Args:
        cypher: Query text; any falsy value yields ``""``.

    Returns:
        str: The whitespace-free, lower-cased query.
    """
    if not cypher:
        return ""

    # Single quotes -> double quotes, then lower-case.
    text = re.sub(r"'([^']*)'", r'"\1"', cypher.strip()).lower()
    # Drop trailing semicolons and the whitespace they leave behind.
    text = text.rstrip(';').strip()
    # Delete every whitespace character.
    return re.sub(r'\s+', '', text)

def exact_match(pred: str, gt: str) -> bool:
    """Return True when both queries are equal after exact-match normalisation."""
    canonical_pred = normalize_cypher_exectmatch(pred)
    canonical_gt = normalize_cypher_exectmatch(gt)
    return canonical_pred == canonical_gt

In [11]:
def calculate_metrics(pred_cypher, gt_cypher):
    """Compute text-similarity metrics between a predicted and a reference query.

    Args:
        pred_cypher: Predicted query, or one of the sentinels ``"error"`` /
            ``"time_error"``.
        gt_cypher: Reference query.

    Returns:
        dict: Keys ``exact_match``, ``bleu_score``, ``rouge_l_score`` and
        ``token_f1``. All values are 0.0 when either input is empty or the
        prediction is a sentinel. The BLEU value is multiplied by 100; ROUGE-L
        F-measure and token F1 are stored as returned, each rounded to 4
        decimals. If a metric raises, the error is printed and the values
        computed so far are returned.
    """
    metrics = dict.fromkeys(
        ('exact_match', 'bleu_score', 'rouge_l_score', 'token_f1'), 0.0
    )

    if pred_cypher in ["error", "time_error"] or not pred_cypher or not gt_cypher:
        return metrics

    try:
        # Exact match uses the aggressive (whitespace-free) normalisation.
        metrics['exact_match'] = 1.0 if exact_match(pred_cypher, gt_cypher) else 0.0

        # The remaining metrics work on the lightly normalised text.
        pred_norm = normalize_cypher_metrics(pred_cypher)
        gt_norm = normalize_cypher_metrics(gt_cypher)

        # Tokens are either word runs or single punctuation characters.
        token_pattern = r'\w+|[^\w\s]'
        pred_tokens = re.findall(token_pattern, pred_norm)
        gt_tokens = re.findall(token_pattern, gt_norm)

        # BLEU via the preloaded metric object, scaled by 100.
        if pred_norm.strip() and gt_norm.strip():
            bleu_result = bleu_metric.compute(
                predictions=[pred_norm],
                references=[[gt_norm]],  # one list of references per prediction
                max_order=4,
                smooth=True
            )
            metrics['bleu_score'] = round(bleu_result['bleu'] * 100, 4)

        # ROUGE-L F-measure on the lightly normalised strings.
        rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
        rouge_result = rouge.score(gt_norm, pred_norm)
        metrics['rouge_l_score'] = round(rouge_result['rougeL'].fmeasure, 4)

        # Token-level F1 from the multiset overlap of both token lists.
        if pred_tokens and gt_tokens:
            overlap = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
            if overlap > 0:
                precision = overlap / len(pred_tokens)
                recall = overlap / len(gt_tokens)
                metrics['token_f1'] = round(
                    2 * precision * recall / (precision + recall), 4
                )

    except Exception as e:
        print(f"Error calculating metrics: {e}")  # kept for debugging

    return metrics

# **Evaluate a single row**

In [12]:
def _evaluate_logic(row):
    """Evaluate one input row and print a one-line summary for it.

    Reads ``cypher_generated``, ``cypher`` and ``database_reference_alias``
    from ``row`` and uses ``row.name`` as the row id in the printed line.

    Args:
        row: A pandas Series (one row of the input frame).

    Returns:
        dict: ``{'eval_result': <category>, **text_metrics}`` where the
        category is ``llm_error``, ``llm_timeout``, ``no_database_alias``,
        ``invalid_syntax``, ``execution_error`` or whatever category
        ``check_execution_accuracy`` reports.
    """
    pred_cypher = row['cypher_generated']
    gt_cypher = row['cypher']
    database_alias = row['database_reference_alias']
    question_id = row.name

    text_metrics = calculate_metrics(pred_cypher, gt_cypher)
    metrics_str = (
        f"EM: {text_metrics['exact_match']:.2f} | "
        f"BLEU: {text_metrics['bleu_score']:.3f} | "
        f"ROUGE-L: {text_metrics['rouge_l_score']:.3f} | "
        f"Token-F1: {text_metrics['token_f1']:.3f}"
    )

    def finish(category):
        # Single place that prints the row summary and builds the result dict.
        print(f"[ROW {question_id}] Result: {category} | {metrics_str}")
        return {'eval_result': category, **text_metrics}

    # Sentinel values written upstream when query generation failed.
    if pred_cypher == "error":
        return finish('llm_error')

    if pred_cypher == "time_error":
        return finish('llm_timeout')

    # Rows without a usable database alias cannot be executed.
    if pd.isna(database_alias) or not database_alias or str(database_alias).strip() == "":
        return finish('no_database_alias')

    try:
        if not explain_cypher(pred_cypher, database_alias):
            return finish('invalid_syntax')
    except Exception:
        return finish('invalid_syntax')

    try:
        _, _, _, exec_category, _ = check_execution_accuracy(
            pred_cypher, gt_cypher, database_alias
        )
        return finish(exec_category)
    except Exception:
        return finish('execution_error')

def evaluate_func(row):
    """Run ``_evaluate_logic`` on ``row`` under a 300-second limit.

    Args:
        row: A pandas Series (one row of the input frame).

    Returns:
        dict: The result of ``_evaluate_logic``; if it times out or raises, a
        dict with ``eval_result`` set to ``'evaluation_timeout'`` or
        ``'evaluation_error'`` and every metric set to 0.0.
    """
    def fallback(label):
        # Same keys and order as a regular result, with zeroed metrics.
        return {
            'eval_result': label,
            'exact_match': 0.0,
            'bleu_score': 0.0,
            'rouge_l_score': 0.0,
            'token_f1': 0.0
        }

    try:
        return func_timeout(300, _evaluate_logic, args=(row,))
    except FunctionTimedOut:
        return fallback('evaluation_timeout')
    except Exception:
        return fallback('evaluation_error')

# **Run batch evaluation**

In [13]:
def run_evaluation():
    """Evaluate every row of the input CSV, resuming from a checkpoint if possible.

    Reads ``INPUT_CSV_PATH``; if ``OUTPUT_CSV_PATH`` exists and matches the
    input's row count, rows whose ``eval_result`` is already filled are
    skipped. Results are written to ``OUTPUT_CSV_PATH`` every
    ``CHECKPOINT_INTERVAL`` rows and once more when the loop ends, including
    when it ends because of an exception or interrupt. All cached drivers are
    closed afterwards.

    Returns:
        pandas.DataFrame: The input rows plus the columns ``eval_result``,
        ``exact_match``, ``bleu_score``, ``rouge_l_score`` and ``token_f1``.
    """
    result_columns = ['eval_result', 'exact_match', 'bleu_score', 'rouge_l_score', 'token_f1']

    print(f"[INFO] Loading data from: {INPUT_CSV_PATH}")
    df = pd.read_csv(INPUT_CSV_PATH, encoding='utf-8-sig')
    print(f"[INFO] Total rows: {len(df)}")

    # Load the checkpoint only when it belongs to this input.
    df_result = None
    if os.path.exists(OUTPUT_CSV_PATH):
        checkpoint = pd.read_csv(OUTPUT_CSV_PATH, encoding='utf-8-sig')
        if len(checkpoint) == len(df) and 'eval_result' in checkpoint.columns:
            df_result = checkpoint
        else:
            # A stale file (other dataset, other row count) would misalign
            # rows or break positional lookups below.
            print("[WARN] Existing output does not match the input; starting from scratch.")

    if df_result is None:
        df_result = df.copy()
        for col in result_columns:
            df_result[col] = None
    else:
        for col in result_columns:
            if col not in df_result.columns:
                df_result[col] = None
        # An all-empty eval_result column is read back as float64, but the
        # labels stored in it are strings.
        df_result['eval_result'] = df_result['eval_result'].astype(object)

    # Rows that already carry a result are skipped.
    processed_mask = df_result['eval_result'].notna()

    try:
        for idx in range(len(df)):
            if processed_mask.iloc[idx]:
                continue

            result = evaluate_func(df.iloc[idx])

            for key, value in result.items():
                df_result.at[idx, key] = value

            if (idx + 1) % CHECKPOINT_INTERVAL == 0:
                df_result.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8-sig')
    finally:
        try:
            # Persist whatever was computed, even if the loop was interrupted,
            # so the next run can resume instead of redoing recent rows.
            df_result.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8-sig')
        finally:
            close_all_drivers()

    return df_result

In [15]:
def print_summary_report(df):
    """Print an aggregate report for an evaluated results frame.

    Three sections are printed: sample counts, execution accuracy over the
    rows whose ``eval_result`` is not ``'no_database_alias'``, and averaged
    text metrics over all rows that have every metric filled in.

    Args:
        df: DataFrame with an ``eval_result`` column and, optionally, the
            metric columns ``exact_match``, ``bleu_score``, ``rouge_l_score``
            and ``token_f1``.

    Returns:
        None. Everything is written to stdout.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    def banner(title):
        # Blank line, rule, title, rule.
        print("\n" + heavy_rule)
        print(title)
        print(heavy_rule)

    total_samples = len(df)

    # Split rows by whether they had a database alias.
    df_with_db = df[df['eval_result'] != 'no_database_alias']
    total_with_db = len(df_with_db)
    total_no_db = total_samples - total_with_db

    # ----- Sample counts -----
    banner("EVALUATION SUMMARY")
    print(f"Total Samples             : {total_samples}")
    print(f"With Database Alias       : {total_with_db}")
    print(f"No Database Alias         : {total_no_db}")

    # ----- 1. Execution accuracy (rows with a database alias only) -----
    banner("EXECUTION ACCURACY (alias)")

    if total_with_db > 0:
        counts = df_with_db['eval_result'].value_counts()

        # (printed label, eval_result value) in display order.
        labelled_categories = (
            ("Correct - Exact Match     ", 'values_match'),
            ("Correct - Subset Match    ", 'value_subset_match'),
            ("Incorrect (diff results)  ", 'different_results'),
            ("Invalid Syntax            ", 'invalid_syntax'),
            ("LLM Error                 ", 'llm_error'),
            ("LLM Timeout               ", 'llm_timeout'),
            ("Execution Timeout         ", 'execution_timeout'),
            ("Evaluation Timeout        ", 'evaluation_timeout'),
            ("Execution Error           ", 'execution_error'),
            ("Evaluation Error          ", 'evaluation_error'),
            ("Cannot Execute (large/OOM)", 'can_not_execute'),
        )
        tally = {key: counts.get(key, 0) for _, key in labelled_categories}

        print(f"Total rows evaluated      : {total_with_db}")
        print(light_rule)

        for label, key in labelled_categories:
            n_rows = tally[key]
            print(f"{label}: {n_rows:4d}/{total_with_db} ({n_rows/total_with_db*100:5.1f}%)")

        # Headline accuracy counts exact and subset matches as correct.
        correct_total = tally['values_match'] + tally['value_subset_match']
        exec_accuracy = correct_total / total_with_db * 100
        print(light_rule)
        print(f"EXECUTION ACCURACY        : {correct_total:4d}/{total_with_db} ({exec_accuracy:5.2f}%)")

        # Cross-check that the listed categories account for every row.
        total_counted = sum(tally.values())
        verdict = '✅' if total_counted == total_with_db else '❌'
        print(f"[VERIFY] Sum of categories: {total_counted}/{total_with_db} {verdict}")

    else:
        print("No rows with database_alias found")

    # ----- 2. Text-based metrics (all rows, with or without alias) -----
    banner("TEXT-BASED METRICS (alias + no alias)")

    metric_cols = ['exact_match', 'bleu_score', 'rouge_l_score', 'token_f1']

    if not all(col in df.columns for col in metric_cols):
        print("Metrics columns not found in the dataframe")
        return

    valid_metrics_df = df.dropna(subset=metric_cols)
    if len(valid_metrics_df) == 0:
        print("No valid metrics found in the dataframe")
        return

    exact_match_mean = valid_metrics_df['exact_match'].mean()
    bleu_mean = valid_metrics_df['bleu_score'].mean()
    rouge_mean = valid_metrics_df['rouge_l_score'].mean()
    token_f1_mean = valid_metrics_df['token_f1'].mean()
    exact_match_count = (valid_metrics_df['exact_match'] == 1.0).sum()

    print(f"Total rows with metrics   : {len(valid_metrics_df)}/{total_samples}")
    print(light_rule)
    print(f"Exact Match               : {exact_match_count}/{len(valid_metrics_df)} ({exact_match_mean*100:6.2f}%)")
    print(f"BLEU Score (avg)          : {bleu_mean:.4f}")
    print(f"ROUGE-L Score (avg)       : {rouge_mean:.4f}")
    print(f"Token-F1 (avg)            : {token_f1_mean:.4f}")

    # Per-category averages for a fixed list of categories.
    print("\n" + light_rule)
    print("BREAKDOWN BY CATEGORY:")
    print(light_rule)
    for category in ['values_match', 'value_subset_match', 'different_results', 'invalid_syntax', 'no_database_alias']:
        category_df = valid_metrics_df[valid_metrics_df['eval_result'] == category]
        if len(category_df) == 0:
            continue
        cat_em = category_df['exact_match'].mean()
        cat_bleu = category_df['bleu_score'].mean()
        cat_rouge = category_df['rouge_l_score'].mean()
        cat_token_f1 = category_df['token_f1'].mean()
        print(f"{category:25s} (n={len(category_df):4d}): EM={cat_em:.3f}, BLEU={cat_bleu:.4f}, ROUGE-L={cat_rouge:.4f}, Token-F1={cat_token_f1:.4f}")

In [16]:
# Run the evaluation over the whole input file, then print the aggregate report
# for the returned results frame.
df_results = run_evaluation()
print_summary_report(df_results)

[INFO] Loading data from: /kaggle/input/test1row-v2/test.csv
[INFO] Total rows: 1
[ROW 0] Result: values_match | EM: 1.00 | BLEU: 100.000 | ROUGE-L: 1.000 | Token-F1: 1.000
[INFO] All drivers closed.

EVALUATION SUMMARY
Total Samples             : 1
With Database Alias       : 1
No Database Alias         : 0

EXECUTION ACCURACY (alias)
Total rows evaluated      : 1
------------------------------------------------------------
Correct - Exact Match     :    1/1 (100.0%)
Correct - Subset Match    :    0/1 (  0.0%)
Incorrect (diff results)  :    0/1 (  0.0%)
Invalid Syntax            :    0/1 (  0.0%)
LLM Error                 :    0/1 (  0.0%)
LLM Timeout               :    0/1 (  0.0%)
Execution Timeout         :    0/1 (  0.0%)
Evaluation Timeout        :    0/1 (  0.0%)
Execution Error           :    0/1 (  0.0%)
Evaluation Error          :    0/1 (  0.0%)
Cannot Execute (large/OOM):    0/1 (  0.0%)
------------------------------------------------------------
EXECUTION ACCURACY        