# Check Adult datasets
This notebook loads `AdultDataset` and `AdultCensusDataset` from the repository and runs quick sanity checks: shapes, dtypes, NaN checks and a small sample preview. Run this inside the project (preferably via `uv run`).

In [None]:
# Cell 1: Imports and path setup
import sys
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# ensure repo root is on path
repo_root = Path.cwd().resolve().parents[1]  # notebooks/.. -> repo root
sys.path.insert(0, str(repo_root))

from counterfactuals.datasets.file_dataset import FileDataset  # noqa

In [2]:
# Cell 2: Load datasets
print("Loading AdultDataset...")
adult = FileDataset(config_path="../config/datasets/adult.yaml")
print("Loading AdultCensusDataset...")
adult_census = FileDataset(config_path="../config/datasets/adult_census.yaml")
print("Loaded datasets")

Loading AdultDataset...
Loading AdultCensusDataset...
Loaded datasets


  raw_data[self.config.target] = raw_data[self.config.target].replace(


In [7]:
feature_transformer = ColumnTransformer(
    [
        ("MinMaxScaler", MinMaxScaler(), adult.numerical_features_indices),
        (
            "OneHotEncoder",
            OneHotEncoder(sparse_output=False),
            adult.categorical_features_indices,
        ),
    ],
)

In [8]:
X_train_transformed = feature_transformer.fit_transform(adult.X_train)
X_test_transformed = feature_transformer.transform(adult.X_test)

In [9]:
X_train_transformed

array([[0.05479452, 0.39175258, 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.12328767, 0.18556701, 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.54794521, 0.59793814, 0.        , ..., 1.        , 0.        ,
        1.        ],
       ...,
       [0.87671233, 0.08247423, 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.04109589, 0.34020619, 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.10958904, 0.39175258, 0.        , ..., 1.        , 0.        ,
        1.        ]])

In [10]:
# Cell 3: Quick checks for AdultDataset
def check_dataset(ds, name):
    print(f"--- {name} ---")
    print("X shape:", ds.X.shape)
    print("y shape:", ds.y.shape)
    print("X dtype:", ds.X.dtype)
    print("y dtype:", ds.y.dtype)
    # print("Has NaNs in X?", np.isnan(ds.X).any())
    # print("Has NaNs in y?", np.isnan(ds.y).any())
    print("Sample X (first 3 rows):")
    print(ds.X[:3])
    print()


check_dataset(adult, "AdultDataset")
check_dataset(adult_census, "AdultCensusDataset")

--- AdultDataset ---
X shape: (1000, 8)
y shape: (1000,)
X dtype: object
y dtype: int64
Sample X (first 3 rows):
[[27 38 'Private' 'Some-college' 'Divorced' 'White-Collar' 'White'
  'Female']
 [45 40 'Government' 'HS-grad' 'Married' 'White-Collar' 'White' 'Female']
 [29 55 'Private' 'Bachelors' 'Married' 'White-Collar' 'Other' 'Male']]

--- AdultCensusDataset ---
X shape: (30000, 12)
y shape: (30000,)
X dtype: object
y dtype: int64
Sample X (first 3 rows):
[[27 0 0 38 ' Private' ' Some-college' ' Divorced' ' Adm-clerical'
  ' Not-in-family' ' White' ' Female' ' United-States']
 [45 0 0 40 ' State-gov' ' HS-grad' ' Married-civ-spouse'
  ' Exec-managerial' ' Wife' ' White' ' Female' ' United-States']
 [29 0 0 55 ' Private' ' Bachelors' ' Married-civ-spouse'
  ' Exec-managerial' ' Husband' ' Black' ' Male' ' United-States']]



## How to run
Prefer running this notebook with the project's environment. Example (from repo root):
```
uv run jupyter nbconvert --to notebook --execute notebooks/check_adult_datasets.ipynb --output -
```
Or open the notebook with Jupyter in the environment created by `uv` and run cells interactively.