# Quick Start

In [None]:
import os

import pandas as pd
from openai import OpenAI

from autoddg import (
    DatasetDescriptionGenerator,
    DatasetTopicGenerator,
    SearchFocusedDescription,
    SemanticProfiler,
)
from autoddg.data_process import dataset_profiler
from autoddg.evaluate import GPTEvaluator
from autoddg.utils import get_sample

## Initialization of the OpenAI Client

In [None]:
# Read the API key from the OPENAI_API_KEY environment variable so the real
# secret never has to be pasted into (and saved with) the notebook. The
# placeholder stays as a fallback for readers who prefer to edit it inline.
my_api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
client = OpenAI(api_key=my_api_key)
# Model name handed to the profiler and generators in the following cells.
model_name = "gpt-4o-mini"

## Context Preparation

In [None]:
# Dataset metadata: title and original description
title = "Renal Cell Carcinoma"
original_description = (
    "This study reports a large-scale proteogenomic analysis of ccRCC "
    "to discern the functional impact of genomic alterations and provides "
    "evidence for rational treatment selection stemming from ccRCC pathobiology"
)

# Read the CSV file and draw a reduced sample from it
csv_file = "clark_dataset.csv"
reduced_sample_size = 100
csv_df = pd.read_csv(csv_file)
sample_df, dataset_sample = get_sample(csv_df, sample_size=reduced_sample_size)

# Semantic profiler built on the OpenAI client
semantic_profiler = SemanticProfiler(model_name=model_name, client=client)

# Profile the full frame, analyze the sample with the semantic profiler, then
# merge both semantic parts into one newline-separated string
basic_profile, semantic_profile_part1 = dataset_profiler(csv_df)
semantic_profile_part2 = semantic_profiler.analyze_dataframe(sample_df)
semantic_profile = "\n".join([semantic_profile_part1, semantic_profile_part2])

# Derive the dataset topic from the title, original description and sample
data_topic_generator = DatasetTopicGenerator(model_name=model_name, client=client)
data_topic = data_topic_generator.generate_topic(
    title,
    original_description,
    dataset_sample,
)

Semantic Type Analyzer initialized with model: gpt-4o-mini


  data = data.astype(object).fillna('').astype(str)


Dataset Topic Generator initialized with model: gpt-4o-mini


## Dataset Description Generation

In [4]:
# Build the general description from the dataset sample, enriched with the
# basic profile, the semantic profile and the dataset topic
description_generator = DatasetDescriptionGenerator(model_name=model_name, client=client)
_, description = description_generator.generate_description(
    dataset_sample=dataset_sample,
    use_profile=True,
    dataset_profile=basic_profile,
    use_semantic_profile=True,
    semantic_profile=semantic_profile,
    use_topic=True,
    data_topic=data_topic,
)

# Expand that description into its search-focused variant
sfd_model = SearchFocusedDescription(model_name=model_name, client=client)
_, search_focused_description = sfd_model.expand_description(
    topic=data_topic,
    initial_description=description,
)

Dataset Description Generator initialized with model: gpt-4o-mini, temperature: 0.0, description words: 100
Search Focused Description initialized with model: gpt-4o-mini


In [5]:
# Display the generated general-purpose dataset description
description

"This dataset focuses on Renal Cell Carcinoma, providing valuable insights into the characteristics and clinical profiles of patients diagnosed with this type of cancer. It includes 110 unique cases, each identified by a Case_ID, and captures essential demographic and clinical information such as tumor status (Tumor/Normal), gender, age, body mass index (BMI), race, and self-identified ethnicity. The dataset also details tumor-specific attributes, including the tumor site, size, focality, histologic type, grade, and pathological stage, allowing for comprehensive analysis of the disease.\n\nThe age of patients ranges from 0 to 84 years, with BMI values spanning from 0 to 68.59. The dataset features a variety of tumor sites and sizes, with measurements in centimeters, and includes classifications for tumor grade and stage, which are crucial for understanding disease progression and treatment options. \n\nFrom a semantic perspective, the dataset serves multiple functions: it classifies bi

In [6]:
# Display the search-focused expansion of the description
search_focused_description

"Dataset Overview:\n- This dataset focuses on Renal Cell Carcinoma, providing valuable insights into the characteristics and clinical profiles of patients diagnosed with this type of cancer. It includes 110 unique cases, each identified by a Case_ID, and captures essential demographic and clinical information such as tumor status (Tumor/Normal), gender, age, body mass index (BMI), race, and self-identified ethnicity. The dataset also details tumor-specific attributes, including the tumor site, size, focality, histologic type, grade, and pathological stage, allowing for comprehensive analysis of the disease.\n\nThe age of patients ranges from 0 to 84 years, with BMI values spanning from 0 to 68.59. The dataset features a variety of tumor sites and sizes, with measurements in centimeters, and includes classifications for tumor grade and stage, which are crucial for understanding disease progression and treatment options. \n\nFrom a semantic perspective, the dataset serves multiple functi

## Quality Evaluation

In [7]:
# Score both descriptions with the GPT-based evaluator.
# GPTEvaluator is imported in the notebook's top import cell, so every
# dependency is declared (and fails fast) in one place.
llm_evaluator = GPTEvaluator(my_api_key)
gpt_score = llm_evaluator.evaluate(description)
gpt_score_sfd = llm_evaluator.evaluate(search_focused_description)

print("Score of the general description:", gpt_score)
print("Score of the search focused description:", gpt_score_sfd)

Score of the general description: Completeness: 9, Conciseness: 9, Readability: 9
Score of the search focused description: Completeness: 9, Conciseness: 8, Readability: 9
