In [1]:

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

from oodt.data.loaders import CSVDataset
from oodt.pipelines.pipeline_builder import OODPipeline
from oodt.metrics.metrics import MetricsEvaluator
from oodt.shifts.concept.feature_stratification import FeatureStratificationShift
from oodt.shifts.concept.mf_kmeans import MFKMeansShift
from oodt.splitting.splitter import TrainTestSplitter
from oodt.shifts.base import BaseShiftStrategy
from oodt.utils.utils import get_project_path

# -------------------------------------------------------------
# Dataset: "synt" partitions read from CSV files
# -------------------------------------------------------------

# Directory holding the partition files, resolved against get_project_path().
dataset_dir = get_project_path() / Path("datasets", "partitions", "synt")

# One CSV file per partition, keyed by partition name.
paths = {
    partition: dataset_dir / f"{partition}.csv"
    for partition in ("source", "target")
}

dataset = CSVDataset(path=paths, target_col="y", name="synt_dataset")
dataset.load()

# Data and target as exposed by the loaded dataset (target column is "y").
X, y = dataset.data, dataset.target

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# -------------------------------------------------------------
# Experiment components
# -------------------------------------------------------------

# Shift strategy: MFKMeansShift over the "var" entry, producing two partitions.
shift_strategy = MFKMeansShift(mf_name=["var"], n_partitions=2, random_state=42)

# Splitter settings collected in one place, then unpacked into the constructor.
splitter_config = dict(
    partitions={},        # injected by pipeline
    mode=2,               # train = ID, test = ID + OOD
    train_ratio=0.7,
    test_ratio=0.3,
    id_partitions=[0],    # partition 0 is treated as in-distribution
    ood_partitions=[1],   # partition 1 is treated as out-of-distribution
    stratify=True,
    random_state=42,
)
splitter = TrainTestSplitter(**splitter_config)

# Classifier evaluated by the pipeline.
model = LogisticRegression(n_jobs=-1, max_iter=1000)

# Metrics evaluator configured for a classification task.
metrics = MetricsEvaluator(task="classification")

In [3]:
# -------------------------------------------------------------
# Experiment run
# -------------------------------------------------------------

# Wire the previously built components into the pipeline by keyword.
pipeline_parts = {
    "model": model,
    "shift_strategy": shift_strategy,
    "splitter": splitter,
    "metrics": metrics,
}
pipeline = OODPipeline(**pipeline_parts)

# Run the pipeline on the loaded data; `result` is read by the results cell.
result = pipeline.run(X, y)

InvalidParameterError("The 'n_components' parameter of CCA must be an int in the range [1, inf). Got 0 instead.").
InvalidParameterError("The 'n_components' parameter of CCA must be an int in the range [1, inf). Got 0 instead.").
InvalidParameterError("The 'n_components' parameter of CCA must be an int in the range [1, inf). Got 0 instead.").
InvalidParameterError("The 'n_components' parameter of CCA must be an int in the range [1, inf). Got 0 instead.").
InvalidParameterError("The 'n_components' parameter of CCA must be an int in the range [1, inf). Got 0 instead.").
InvalidParameterError("The 'n_components' parameter of CCA must be an int in the range [1, inf). Got 0 instead.").
InvalidParameterError("The 'n_components' parameter of CCA must be an int in the range [1, inf). Got 0 instead.").
InvalidParameterError("The 'n_components' parameter of CCA must be an int in the range [1, inf). Got 0 instead.").
InvalidParameterError("The 'n_components' parameter of CCA must be an int in the

Converged after 11 iterations


  id_indices = id_indices.append(self.partitions[pid])
  ood_indices = ood_indices.append(self.partitions[pid])


In [4]:
# -------------------------------------------------------------
# Results report
# -------------------------------------------------------------

# (heading, attribute on result.metrics) pairs, printed in this order.
# Each attribute is looked up only after its heading has been printed.
metric_sections = (
    ("ID metrics:", "id_metrics"),
    ("\nOOD metrics:", "ood_metrics"),
    ("\nGlobal OOD metrics:", "global_metrics"),
)
for heading, attribute in metric_sections:
    print(heading)
    print(getattr(result.metrics, attribute))

# For each metadata entry, print its key and the length of its value.
print("\nMetadata:")
for key, entries in result.metadata.items():
    print(key, len(entries))

ID metrics:
{'accuracy': 0.9367088607594937}

OOD metrics:
{'accuracy': 0.844311377245509}

Global OOD metrics:
{'auroc': 0.6791859319336011, 'fpr@95tpr': 0.9240506329113924}

Metadata:
train_id_indices 619
train_ood_indices 0
test_id_indices 79
test_ood_indices 334
