In [2]:
import pandas as pd
from openai import OpenAI
from autoddg import AutoDDG
from autoddg.utils import get_sample
import os
from autoddg.related.related import RelatedWorkProfiler

In [17]:

# OpenAI-compatible backends this notebook can talk to. Both configurations
# are kept live so that switching is a one-line change to ACTIVE_BACKEND
# instead of commenting blocks of code in and out.
BACKEND_CONFIGS = {
    # Local Ollama server exposing an OpenAI-compatible endpoint.
    "ollama": {
        "base_url": "http://localhost:11434/v1",
        "api_key": "ollama",  # Dummy key - Ollama doesn't check it
        "model_name": "llama3",  # Just the model name, no prefix
    },
    # Hosted OpenRouter endpoint; the key is read from the environment so it
    # is never stored in the notebook.
    "openrouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "api_key": os.getenv("OPENROUTER_API_KEY"),
        # Alternative model: "mistralai/mistral-7b-instruct:free"
        "model_name": "google/gemini-2.0-flash-exp:free",
    },
}

# Name of the backend used by every later cell (must be a key of BACKEND_CONFIGS).
ACTIVE_BACKEND = "ollama"
MODEL_CONFIG = BACKEND_CONFIGS[ACTIVE_BACKEND]

# Fail here with a clear message rather than later inside the first LLM call.
if not MODEL_CONFIG["api_key"]:
    raise RuntimeError(
        f"No API key configured for backend '{ACTIVE_BACKEND}'; "
        "set the corresponding environment variable before running this cell."
    )

# Create client
client = OpenAI(
    api_key=MODEL_CONFIG["api_key"],
    base_url=MODEL_CONFIG["base_url"],
)

In [18]:
# Build the AutoDDG pipeline on top of the shared client, using the model
# named in MODEL_CONFIG.
auto_ddg = AutoDDG(client=client, model_name=MODEL_CONFIG["model_name"])

In [19]:
# Read the full CSV, then draw a sample (sample_size=100). get_sample returns
# two objects: `sample_df` feeds the semantic analysis below, while
# `dataset_sample` is passed to the topic and description steps.
df = pd.read_csv("../src/autoddg/related/data/code-15.csv")
sample_df, dataset_sample = get_sample(df, sample_size=100)

# Step 1 - profile the complete dataframe (not just the sample).
basic_profile, structural_profile = auto_ddg.profile_dataframe(df)

# Step 2 - semantic analysis runs on the sampled frame only.
semantic_profile = auto_ddg.analyze_semantics(sample_df)

# Step 3 - derive a topic from the dataset title and the sample.
# NOTE(review): the second positional argument is left as None; presumably an
# optional original description - confirm against AutoDDG.generate_topic.
data_topic = auto_ddg.generate_topic(
    "CODE-15%: a large scale annotated dataset of 12-lead ECGs",
    None,
    dataset_sample,
)

  data = data.astype(object).fillna('').astype(str)


In [20]:
# Prompt template passed to auto_ddg.analyze_related as `extraction_prompt`.
# It contains two placeholders: {dataset_name} and {paper_text}.
# NOTE(review): presumably AutoDDG fills them via str.format - confirm; if so,
# any other literal braces added to this text would need to be escaped.
REVISED_RELATED_WORK_PROMPT = """
You are a **Dataset Description Synthesis Expert**. Your task is to extract and synthesize research context *specifically about the dataset* for a search engine description.

Extract key research context about the dataset: **{dataset_name}**.

**INSTRUCTIONS:**
Your summary MUST cover and integrate the following key research aspects:

1. **Research Domain and Applications:** What field or discipline uses this dataset, and what specific research questions or problems does it address?

2. **Dataset Usage and Findings:** How did researchers practically use this dataset (e.g., analyses, experiments, modeling), and what were the key results or findings derived from it?

3. **Characteristics and Provenance:** Describe how the data was collected or generated, any unique value it provides, and any notable preprocessing or curation steps mentioned.

4. **Limitations and Challenges:** Summarize any limitations, challenges, biases, or caveats researchers identified while using this data.

**OUTPUT FORMAT:** Synthesize all the extracted information into **one cohesive, natural-language paragraph** (approximately 300-400 words) that describes the research context of the dataset. **DO NOT** use bullet points, section headings (like "Title," "Abstract," "Results," etc.), or lists. The output must be ready to be inserted directly into the final dataset description.

**RESEARCH PAPER TEXT:**
{paper_text}
"""

In [21]:
# Step 4: Analyze related work
# Build the related-work profile from the paper PDF (capped at 10 pages) using
# the custom extraction prompt.
# NOTE(review): in the recorded run ~57k characters were sent to the model and
# the printed summary cites PTB-XL and uses headings and numbered lists, which
# the prompt forbids. Possibly the input exceeds the local model's context
# window - confirm, and consider lowering max_pages.
related_profile = auto_ddg.analyze_related(
    dataset_name="CODE-15%: a large scale annotated dataset of 12-lead ECGs",
    pdf_path="../src/autoddg/related/papers/code15.pdf",
    max_pages=10,
    extraction_prompt=REVISED_RELATED_WORK_PROMPT,
)

Ignoring wrong pointing object 43 0 (offset 0)


Reading PDF from: ../src/autoddg/related/papers/code15.pdf
Successfully extracted text from 10 pages (total: 10 pages)
Total characters extracted: 55673
Extracting related work profile for dataset: CODE-15%: a large scale annotated dataset of 12-lead ECGs
Sending 57116 characters to LLM...
Successfully extracted profile (1866 characters)


In [22]:
# Print the 'summary' entry of the related-work profile returned by analyze_related.
print(related_profile['summary'])

As a Dataset Description Synthesis Expert, I will extract and synthesize the research findings related to the comparison of deep learning models for atrial fibrillation risk prediction.

**Research Question:** What are the key findings on the performance of different deep learning models in predicting atrial fibrillation risk from electrocardiogram (ECG) signals?

**Methodology:** The study used a large publicly available ECG dataset, PTB-XL, to train and evaluate various deep learning models for atrial fibrillation risk prediction. The models included ResNet, Inception-ResNet, DenseNet, and MobileNet.

**Key Findings:**

1. **Model Performance:** The study found that the best-performing model was the Inception-ResNet, with an area under the receiver operating characteristic curve (AUC) of 0.92. The ResNet model had a slightly lower AUC of 0.89.
2. **Comparison of Models:** The study compared the performance of different deep learning models and found that the Inception-ResNet outperfo

In [8]:
# Step 5: Generate description WITH related work
prompt, description = auto_ddg.describe_dataset(
    # Context passed to the description step.
    dataset_sample=dataset_sample,
    dataset_profile=basic_profile,
    semantic_profile=semantic_profile,
    data_topic=data_topic,
    related_profile=related_profile,  # the dict returned by analyze_related
    # Switches: every context block is enabled for this run.
    use_profile=True,
    use_semantic_profile=True,
    use_topic=True,
    use_related_profile=True,
)

In [None]:
from autoddg.evaluation import BaseEvaluator

class Eval(BaseEvaluator):
    """
    Evaluate descriptions with a model served over an OpenAI-compatible API.

    The defaults target a local Ollama server (``http://localhost:11434/v1``)
    running ``llama3``.

    Args:
        openrouter_api_key: API key handed to the OpenAI client. The parameter
            name is kept for backward compatibility with existing callers; it
            is used for whichever endpoint ``base_url`` points at.
        model_name: Name of the model used for evaluation.
        base_url: Base URL of the OpenAI-compatible endpoint.
    """
    def __init__(
        self,
        openrouter_api_key: str = "ollama",
        model_name: str = "llama3",
        base_url: str = "http://localhost:11434/v1",
    ):
        # The client is created here and handed to BaseEvaluator together
        # with the model name.
        client = OpenAI(api_key=openrouter_api_key, base_url=base_url)
        super().__init__(client=client, model_name=model_name)

In [None]:
# Baseline (without related work)
prompt_baseline, description_baseline = auto_ddg.describe_dataset(
    # Same context as the related-work run, minus the related profile.
    dataset_sample=dataset_sample,
    dataset_profile=basic_profile,
    semantic_profile=semantic_profile,
    data_topic=data_topic,
    # Switches: related work is the only block turned OFF.
    use_profile=True,
    use_semantic_profile=True,
    use_topic=True,
    use_related_profile=False,
)


KeyboardInterrupt: 

In [None]:
# Show the baseline description (generated with use_related_profile=False).
print("Baseline:", description_baseline)
# Disabled: `description_with_related` is not assigned in any visible cell; the
# description generated with related work is bound to `description`.
# print("\nWith Related Work:", description_with_related)

In [11]:
# Show the description generated with the related-work profile enabled.
print(description)

This dataset contains electrocardiogram (ECG) recordings and associated clinical features for patients with heart disease or other cardiovascular conditions. The dataset includes 345779 unique exam IDs, 233770 patient IDs, and various demographic and clinical features such as age, sex, and ECG characteristics. The data spans from 0 to 4275948.0 in terms of exam ID coverage and from 0 to 84.0 in terms of age. The dataset is primarily used for research purposes, including the development and evaluation of machine learning models for predicting atrial fibrillation risk from 12-lead ECGs.

The dataset has a semantic profile that includes entity types such as patient IDs, exam IDs, and ECG characteristics, which are linked to temporal data such as age, timey, and 1dAVb. The dataset also contains spatial data in the form of normal_ecg and trace_file features. The primary topic of this dataset is "ECG Heart Disease," capturing the essence of electrocardiogram recordings and their association 

In [12]:
# Register the evaluator, passing the same placeholder key ("ollama").
evaluator = Eval(openrouter_api_key="ollama")
auto_ddg.set_evaluator(evaluator)

# Score the description generated with related work.
augmented_score = auto_ddg.evaluate_description(description)
print("Score of the general description:", augmented_score)

# Baseline scoring stays disabled: the baseline cell was interrupted in the
# recorded run, so `description_baseline` may not exist.
# baseline_score = auto_ddg.evaluate_description(description_baseline)
# print("Score of the search-focused description:", baseline_score)

Score of the general description: Completeness: 8
Conciseness: 6
Readability: 8
