In [1]:
# Cell 1: Imports
import json # Used for saving results
import os # For checking file existence
import re # Used for tolerant parsing of evaluator scores
from datetime import datetime

import pandas as pd
from openai import OpenAI

from autoddg import AutoDDG
from autoddg.evaluation import BaseEvaluator # Evaluation base class
from autoddg.utils import get_sample

# Cell 2: Configuration & Client Initialization
# --- LLM Config ---
# Endpoint, credential placeholder and model names shared by the generation
# client and the evaluator (base_url points at localhost:11434 — presumably a
# local Ollama server exposing an OpenAI-compatible API; confirm for your setup).
MODEL_CONFIG = dict(
    base_url="http://localhost:11434/v1",
    api_key="ollama",
    model_name="llama3",
    evaluation_model_name="llama3",
)

# --- Experiment Config ---
# Dataset under test, its CSV and companion paper, and the CSV that collects
# one row per logged run.
DATASET_NAME = "CODE-15%: a large scale annotated dataset of 12-lead ECGs"
DATA_FILE, PAPER_FILE = (
    "../src/autoddg/related/data/code-15.csv",
    "../src/autoddg/related/papers/code15.pdf",
)
RESULTS_FILE = "autoddg_experiment_results.csv"

# --- Evaluation class: BaseEvaluator wired to the endpoint in MODEL_CONFIG ---
class Eval(BaseEvaluator):
    """Evaluator that talks to the endpoint described by ``MODEL_CONFIG``."""

    def __init__(self, model_name: str = MODEL_CONFIG["evaluation_model_name"]):
        """Build a dedicated OpenAI client and hand it to ``BaseEvaluator``.

        Args:
            model_name: Model used for scoring; defaults to the
                ``evaluation_model_name`` entry of ``MODEL_CONFIG``.
        """
        super().__init__(
            client=OpenAI(
                base_url=MODEL_CONFIG["base_url"],
                api_key=MODEL_CONFIG["api_key"],
            ),
            model_name=model_name,
        )

# Initialize Core Tools
# The generation client is created here; Eval() constructs its own client from
# the same MODEL_CONFIG entries for scoring.
client = OpenAI(
    base_url=MODEL_CONFIG["base_url"],
    api_key=MODEL_CONFIG["api_key"],
)
auto_ddg = AutoDDG(model_name=MODEL_CONFIG["model_name"], client=client)
auto_ddg.set_evaluator(Eval())

In [2]:
# Cell 3: Load Data and Profile
# Reads the full CSV named by DATA_FILE. The whole frame goes to
# profile_dataframe(); the pair returned by get_sample() feeds the other steps.
df = pd.read_csv(DATA_FILE)
# sample_size=100 is hard-coded. NOTE(review): whether get_sample() is seeded /
# deterministic is not visible from this notebook — confirm before comparing runs.
sample_df, dataset_sample = get_sample(df, sample_size=100)

# Run ALL core profiling steps
print("--- Running Core Profiling ---")
# NOTE: structural_profile is not referenced again in the cells visible here;
# only basic_profile is passed on to describe_dataset().
basic_profile, structural_profile = auto_ddg.profile_dataframe(df)
semantic_profile = auto_ddg.analyze_semantics(sample_df)
# The second positional argument is deliberately None. NOTE(review): presumably an
# optional pre-existing description — confirm against AutoDDG.generate_topic.
data_topic = auto_ddg.generate_topic(DATASET_NAME, None, dataset_sample)

print("Profiling Complete.")

--- Running Core Profiling ---


  data = data.astype(object).fillna('').astype(str)


Profiling Complete.


In [3]:
# Cell 4: Define Multiple Related Work Prompts
#
# Each template carries two placeholders, {dataset_name} and {paper_text}.
# The templates are the experimental variable compared in the augmented runs,
# so their wording must not be edited casually.

# V0: original, long-form prompt (asks for roughly 300-400 words)
PROMPT_V0_ORIGINAL = """
You are a **Dataset Description Synthesis Expert**. Your task is to extract and synthesize research context *specifically about the dataset* for a search engine description.

Extract key research context about the dataset: **{dataset_name}**.

**INSTRUCTIONS:**
Your summary MUST cover and integrate the following key research aspects:

1. **Research Domain and Applications:** What field or discipline uses this dataset, and what specific research questions or problems does it address?

2. **Dataset Usage and Findings:** How did researchers practically use this dataset (e.g., analyses, experiments, modeling), and what were the key results or findings derived from it?

3. **Characteristics and Provenance:** Describe how the data was collected or generated, any unique value it provides, and any notable preprocessing or curation steps mentioned.

4. **Limitations and Challenges:** Summarize any limitations, challenges, biases, or caveats researchers identified while using this data.

**OUTPUT FORMAT:** Synthesize all the extracted information into **one cohesive, natural-language paragraph** (approximately 300-400 words) that describes the research context of the dataset. **DO NOT** use bullet points, section headings (like "Title," "Abstract," "Results," etc.), or lists. The output must be ready to be inserted directly into the final dataset description.

**RESEARCH PAPER TEXT:**
{paper_text}
"""

# V1: tightened variant of V0 (asks for roughly 100-150 words, plainer markup)
PROMPT_V1_REVISED = """
You are a concise synthesis expert for a dataset search engine. Your ONLY goal is to extract factual context about the dataset's usage, findings, and limitations from the provided text and convert it into a single, cohesive, non-conversational paragraph.

Extract key research context about the dataset: **{dataset_name}**.

INSTRUCTIONS:
Your summary MUST cover and integrate the following key research aspects:
1. Research Domain and Applications.
2. Dataset Usage and Findings.
3. Characteristics and Provenance.
4. Limitations and Challenges.

OUTPUT FORMAT: Synthesize all the extracted information into **one cohesive, natural-language paragraph** (approximately 100-150 words). DO NOT use bullet points, section headings, or lists.

RESEARCH PAPER TEXT:
{paper_text}
"""

# Name -> template mapping; insertion order is the order the experiments run in.
RELATED_WORK_PROMPTS = dict(
    V0_Original=PROMPT_V0_ORIGINAL,
    V1_Revised=PROMPT_V1_REVISED,
)

In [4]:
# Cell 5: Score parsing and CSV logging (metrics: Completeness, Conciseness, Readability)

# Characters LLMs commonly wrap around metric names / scores (markdown emphasis).
_SCORE_MARKUP_CHARS = ' \t*_`'
# A list marker in front of a metric name: "- ", "• ", "# ", "> ", "1. ", "2) ".
_SCORE_LIST_MARKER = re.compile(r'^(?:[-•#>]+|\d+[.)])\s*')
# An integer score, optionally written as "9.0", "9/10", "9 out of 10", or
# followed by one trailing punctuation mark. Non-integer scores such as "9.5"
# deliberately do NOT match, so they are skipped rather than truncated.
_SCORE_VALUE = re.compile(
    r'([+-]?\d+)(?:\.0+)?(?:\s*(?:/|out of)\s*\d+)?\s*[.,;]?'
)


def _normalize_metric_name(raw_key: str) -> str:
    """Lower-case a metric name and drop surrounding markdown / list markers."""
    name = raw_key.strip().strip(_SCORE_MARKUP_CHARS)
    name = _SCORE_LIST_MARKER.sub('', name)
    return name.strip(_SCORE_MARKUP_CHARS).lower()


def _parse_score_value(raw_value: str):
    """Return the integer score in ``raw_value``, or ``None`` if there is none."""
    text = raw_value.strip()
    try:
        # Fast path: a bare integer such as "9".
        return int(text)
    except ValueError:
        pass
    # Tolerant path: "9/10", "**9**", "8.", "9.0", ...
    match = _SCORE_VALUE.fullmatch(text.strip(_SCORE_MARKUP_CHARS))
    return int(match.group(1)) if match else None


def parse_scores(raw_score_text: str) -> dict:
    """Parse 'Metric: Score' lines from evaluator output into a dictionary.

    Only lines containing a colon whose right-hand side is an integer score
    are kept; everything else (preamble, headings, blank lines) is ignored.
    Besides a bare integer, the score may be wrapped in markdown emphasis or
    written as ``9/10``, ``9 out of 10``, ``9.0`` or with one trailing
    punctuation mark. Metric names are lower-cased and stripped of leading
    bullets / numbering and markdown emphasis, so ``**Completeness:** 9`` and
    ``- Completeness: 9`` both yield the key ``'completeness'``.

    Args:
        raw_score_text: Free-text evaluator output. ``None`` or an empty
            string yields an empty dictionary.

    Returns:
        Mapping of normalized metric name to integer score. If a metric
        appears more than once, the last occurrence wins.
    """
    scores = {}
    if not raw_score_text:
        return scores

    for line in raw_score_text.strip().split('\n'):
        if ':' not in line:
            continue
        # Split on the first colon only, so values containing ':' stay intact.
        key, value = line.split(':', 1)
        score = _parse_score_value(value)
        if score is None:
            # Not a score line (e.g. "Evaluation Form (scores ONLY):").
            continue
        scores[_normalize_metric_name(key)] = score
    return scores

def log_result(
    prompt_name,
    description_type,
    description,
    raw_scores, # Now accepts the raw text string
    file_path=RESULTS_FILE
):
    """Append one experiment result as a row of the results CSV.

    Args:
        prompt_name: Name of the related-work prompt used for this run
            (the baseline cell passes "N/A").
        description_type: Label for how the description was produced,
            e.g. "Vanilla_AutoDDG" or "Augmented_AutoDDG".
        description: Generated description text; newlines are replaced by
            spaces so the text stays on one CSV line. ``None`` is logged as
            an empty string.
        raw_scores: Raw evaluator output; parsed with ``parse_scores``.
        file_path: CSV file to append to. The header row is written when the
            file does not exist yet or exists but is empty.

    Side effects:
        Appends to ``file_path`` and prints a confirmation line. If any of
        the three core metrics cannot be parsed, a warning is printed and the
        metric is logged as 0 (unchanged CSV schema); the parsed dictionary in
        ``Raw_Scores_JSON`` shows which metrics were really present.
    """
    parsed_scores = parse_scores(raw_scores)

    # Extract the three core metrics for CSV columns. A missing metric is
    # still written as 0, but no longer silently.
    core_metrics = ('completeness', 'conciseness', 'readability')
    missing_metrics = [name for name in core_metrics if name not in parsed_scores]
    if missing_metrics:
        print(
            f"WARNING: could not parse {', '.join(missing_metrics)} from evaluator "
            f"output for {description_type} (prompt {prompt_name}); logging 0."
        )
    completeness = parsed_scores.get('completeness', 0)
    conciseness = parsed_scores.get('conciseness', 0)
    readability = parsed_scores.get('readability', 0)

    # Storing the full dictionary for detail (helpful if the LLM sometimes adds a new metric)
    raw_scores_json = json.dumps(parsed_scores)

    # One timestamp for the whole row, so Test_ID and Evaluation_Date agree.
    # NOTE: Test_ID only encodes the time of day, so it is not unique across
    # days; combine it with Evaluation_Date when a unique key is needed.
    logged_at = datetime.now()

    new_row = {
        'Test_ID': f"{description_type}-{logged_at.strftime('%H%M%S')}",
        'Dataset_Name': DATASET_NAME,
        'Prompt_Type': prompt_name,
        'Description_Source': description_type,
        'Description_Text': (description or '').replace('\n', ' '),
        'Completeness_Score': completeness,
        'Conciseness_Score': conciseness,
        'Readability_Score': readability,
        'Raw_Scores_JSON': raw_scores_json,
        'Evaluation_Date': logged_at.strftime('%Y-%m-%d %H:%M:%S'),
    }

    df_new = pd.DataFrame([new_row])

    # A header is needed for a new file AND for an existing zero-byte file;
    # checking existence alone would leave an empty file without a header row.
    header_needed = (not os.path.exists(file_path)) or os.path.getsize(file_path) == 0

    # Append to CSV
    df_new.to_csv(file_path, mode='a', header=header_needed, index=False)
    print(f"Logged {description_type} with Prompt {prompt_name} to {file_path}")

In [5]:
# Cell 6: Run and Log Baseline (Vanilla) Description

print("\n--- Running Baseline (Vanilla) Test ---")

# Reference run: sample, basic profile, semantic profile and topic are all
# enabled; only the related-work profile is switched off.
prompt_baseline, description_baseline = auto_ddg.describe_dataset(
    use_related_profile=False,  # Vanilla
    dataset_sample=dataset_sample,
    use_profile=True,
    dataset_profile=basic_profile,
    use_semantic_profile=True,
    semantic_profile=semantic_profile,
    use_topic=True,
    data_topic=data_topic,
)

# The evaluator output is free text; log_result() extracts the metric lines.
baseline_scores = auto_ddg.evaluate_description(description_baseline)
print(f"Baseline Scores: {baseline_scores}")

log_result(
    prompt_name="N/A",
    description_type="Vanilla_AutoDDG",
    description=description_baseline,
    raw_scores=baseline_scores,
)


--- Running Baseline (Vanilla) Test ---
Baseline Scores: Here are my scores for the given dataset description based on the Evaluation Criteria:

Evaluation Form (scores ONLY):

Completeness: 9
Conciseness: 8
Readability: 9
Logged Vanilla_AutoDDG with Prompt N/A to autoddg_experiment_results.csv


In [6]:
# Cell 7: Run and Log Augmented Descriptions for Multiple Prompts

# One full extract -> describe -> evaluate -> log pass per related-work prompt,
# in the insertion order of RELATED_WORK_PROMPTS.
for prompt_name in RELATED_WORK_PROMPTS:
    extraction_prompt = RELATED_WORK_PROMPTS[prompt_name]
    print(f"\n--- Running Augmented Test with Prompt: {prompt_name} ---")

    # Step A: build the related-work profile from the paper with this prompt
    related_profile = auto_ddg.analyze_related(
        dataset_name=DATASET_NAME,
        pdf_path=PAPER_FILE,
        max_pages=10,
        extraction_prompt=extraction_prompt,
    )
    # Show only the first 150 characters of the summary as a quick sanity check.
    print(f"Related Work Summary: {related_profile['summary'][:150]}...")

    # Step B: same inputs as the baseline run, plus the related-work profile
    prompt_augmented, description_augmented = auto_ddg.describe_dataset(
        use_related_profile=True,  # Augmented
        related_profile=related_profile,
        dataset_sample=dataset_sample,
        use_profile=True,
        dataset_profile=basic_profile,
        use_semantic_profile=True,
        semantic_profile=semantic_profile,
        use_topic=True,
        data_topic=data_topic,
    )

    # Step C: score the description and append the outcome to the results CSV
    augmented_scores = auto_ddg.evaluate_description(description_augmented)
    print(f"Augmented Scores ({prompt_name}): {augmented_scores}")

    log_result(
        prompt_name=prompt_name,
        description_type="Augmented_AutoDDG",
        description=description_augmented,
        raw_scores=augmented_scores,
    )

print("\nAll experiments complete. Results saved to:", RESULTS_FILE)

Ignoring wrong pointing object 43 0 (offset 0)



--- Running Augmented Test with Prompt: V0_Original ---
Reading PDF from: ../src/autoddg/related/papers/code15.pdf
Successfully extracted text from 10 pages (total: 10 pages)
Total characters extracted: 55673
Extracting related work profile for dataset: CODE-15%: a large scale annotated dataset of 12-lead ECGs
Sending 57116 characters to LLM...
Successfully extracted profile (1563 characters)
Related Work Summary: As a Dataset Description Synthesis Expert, I will extract and synthesize the research findings related to the comparison of dataset description synthe...


Ignoring wrong pointing object 43 0 (offset 0)


Augmented Scores (V0_Original): Here are my scores for the given dataset description based on the Evaluation Criteria:

Evaluation Form (scores ONLY):

Completeness: 9
Conciseness: 8
Readability: 9
Logged Augmented_AutoDDG with Prompt V0_Original to autoddg_experiment_results.csv

--- Running Augmented Test with Prompt: V1_Revised ---
Reading PDF from: ../src/autoddg/related/papers/code15.pdf
Successfully extracted text from 10 pages (total: 10 pages)
Total characters extracted: 55673
Extracting related work profile for dataset: CODE-15%: a large scale annotated dataset of 12-lead ECGs
Sending 56479 characters to LLM...
Successfully extracted profile (941 characters)
Related Work Summary: The synthesis expert has extracted the following factual context:

* The dataset search engine is comparing the performance of different machine learn...
Augmented Scores (V1_Revised): Here are the scores:

Evaluation Form (scores ONLY):

Completeness: 9
Conciseness: 8
Readability: 7
Logged Augmented_