# Refactor and Test Dataset Loading Logic

This notebook tests the refactored `load_dataset_tool.py` which now includes all dataset loading logic, removing the dependency on `datasets.py`.

In [1]:
from pathlib import Path
import pandas as pd
from pydantic import BaseModel, ConfigDict, Field
from stats_compass_core.registry import registry
from stats_compass_core.results import DataFrameLoadResult
from stats_compass_core.state import DataFrameState

# Define path to datasets directory (simulating the package structure)
# We assume we are running this from the root of the repo
_DATASETS_DIR = Path("stats_compass_core/datasets")
print(f"Datasets directory: {_DATASETS_DIR.absolute()}")
print(f"Exists: {_DATASETS_DIR.exists()}")

Datasets directory: /Users/tunjiogunbiyi/Dev/stats-compass-core/stats_compass_core/datasets
Exists: True


In [2]:
def _list_available_datasets() -> list[str]:
    """List available sample datasets."""
    if not _DATASETS_DIR.exists():
        return []
    return [f.stem for f in _DATASETS_DIR.glob("*.csv")]

print(f"Available datasets: {_list_available_datasets()}")

Available datasets: ['TATASTEEL', 'Housing', 'Bukayo_Saka_7322']


In [3]:
class LoadDatasetInput(BaseModel):
    """Input schema for load_dataset tool."""
    model_config = ConfigDict(populate_by_name=True)

    name: str = Field(
        description=f"Name of the dataset to load. Available: {', '.join(_list_available_datasets())}",
    )
    set_active: bool = Field(
        default=True,
        description="Whether to set this as the active DataFrame"
    )

print("Input schema defined successfully.")

Input schema defined successfully.


In [4]:
def load_dataset(state: DataFrameState, params: LoadDatasetInput) -> DataFrameLoadResult:
    """
    Load a built-in sample dataset into the session state.
    """
    try:
        file_path = _DATASETS_DIR / f"{params.name}.csv"
        
        if not file_path.exists():
            raise FileNotFoundError(f"Dataset file not found: {file_path}")

        # Load the dataset
        df = pd.read_csv(file_path)
        
        # Add to state
        state.set_dataframe(df, name=params.name, operation="load_dataset", set_active=params.set_active)
        
        return DataFrameLoadResult(
            name=params.name,
            rows=len(df),
            columns=list(df.columns),
            dtypes={col: str(dtype) for col, dtype in df.dtypes.items()},
            head=df.head().to_dict(orient="records"),
            is_active=params.set_active,
        )
        
    except FileNotFoundError:
        available = ", ".join(_list_available_datasets())
        raise ValueError(f"Dataset '{params.name}' not found. Available datasets: {available}")
    except Exception as e:
        raise RuntimeError(f"Failed to load dataset '{params.name}': {str(e)}")

print("load_dataset function defined.")

load_dataset function defined.


In [5]:
# Test loading a valid dataset
state = DataFrameState()
params = LoadDatasetInput(name="Housing")

try:
    result = load_dataset(state, params)
    print(f"Successfully loaded dataset: {result.name}")
    print(f"Rows: {result.rows}, Columns: {len(result.columns)}")
    print(f"Head: {result.head[0]}")
except Exception as e:
    print(f"Error: {e}")

Error: Failed to load dataset 'Housing': 5 validation errors for DataFrameLoadResult
success
  Field required [type=missing, input_value={'name': 'Housing', 'rows...d'}], 'is_active': True}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
dataframe_name
  Field required [type=missing, input_value={'name': 'Housing', 'rows...d'}], 'is_active': True}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
source
  Field required [type=missing, input_value={'name': 'Housing', 'rows...d'}], 'is_active': True}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
shape
  Field required [type=missing, input_value={'name': 'Housing', 'rows...d'}], 'is_active': True}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
message
  Field required [type=missing, input_value={'name': 'Housing', 'rows...d'}], 'is_active': True}, input

In [6]:
# Test error handling for invalid dataset
try:
    invalid_params = LoadDatasetInput(name="NonExistentDataset")
    load_dataset(state, invalid_params)
except ValueError as e:
    print(f"Caught expected error: {e}")
except Exception as e:
    print(f"Caught unexpected error: {type(e).__name__}: {e}")

Caught expected error: Dataset 'NonExistentDataset' not found. Available datasets: TATASTEEL, Housing, Bukayo_Saka_7322
