# ⚖️ Semantic Oversampling

Generate synthetic minority-class records using `loclean.oversample`.

**Use case:** Your dataset has 8 "healthy" patients but only 2 with "hypertension". The LLM generates semantically plausible synthetic hypertension records to balance the classes.

In [None]:
import polars as pl
from pydantic import BaseModel, Field

import loclean

## Define schema and create imbalanced data

In [None]:
# Pydantic model describing a single patient row. Its four fields carry the
# same names as the columns of the example DataFrame in this notebook, and the
# class itself is passed to `loclean.oversample` through the `schema=` argument.
#
# NOTE(review): pydantic exposes the class docstring and every `description=`
# below as schema metadata, so this text is presumably what the LLM sees when
# generating rows — confirm against the loclean docs before rewording any of it.
class PatientRecord(BaseModel):
    """Schema for synthetic patient records."""

    # Required integer (`...` = no default), validated to the inclusive
    # range 0–120 by the `ge`/`le` constraints.
    age: int = Field(..., ge=0, le=120, description="Patient age")
    # Required free-form string; the "systolic/diastolic" shape is only
    # suggested by the description, it is not validated.
    blood_pressure: str = Field(
        ..., description="Blood pressure reading, e.g. '120/80'"
    )
    # Required string; the three allowed levels are named in the description
    # only, so any string passes validation.
    cholesterol: str = Field(..., description="Cholesterol level: Low, Normal, or High")
    # Required string holding the class label (the notebook's data uses
    # "healthy" and "hypertension").
    diagnosis: str = Field(..., description="Medical diagnosis label")

In [None]:
# Toy cohort written one patient per row so each record can be read at a
# glance: 8 rows are labelled "healthy" and 2 are labelled "hypertension".
df = pl.DataFrame(
    [
        # (age, blood_pressure, cholesterol, diagnosis)
        (45, "120/80", "Normal", "healthy"),
        (52, "140/90", "High", "healthy"),
        (38, "130/85", "Normal", "healthy"),
        (61, "150/95", "High", "hypertension"),
        (55, "128/82", "Normal", "healthy"),
        (42, "135/88", "Normal", "healthy"),
        (35, "118/76", "Low", "healthy"),
        (67, "155/100", "High", "hypertension"),
        (48, "125/80", "Normal", "healthy"),
        (50, "138/92", "Normal", "healthy"),
    ],
    schema=["age", "blood_pressure", "cholesterol", "diagnosis"],
    orient="row",
)

# Show the class imbalance before any synthetic rows are added.
print("Class distribution (before):")
print(df["diagnosis"].value_counts())

# A bare trailing expression makes the notebook render the frame as a table.
df

## Generate synthetic minority records

In [None]:
# Ask loclean for synthetic rows of the minority class. The input frame has
# 2 "hypertension" rows against 8 "healthy" ones, so 6 extra rows would even
# the classes out at 8 each.
result = loclean.oversample(
    df,
    # Shape every generated record must follow.
    schema=PatientRecord,
    # Which class to generate more of.
    target_value="hypertension",
    target_col="diagnosis",
    # How many rows to add in total.
    n=6,
    # NOTE(review): presumably the number of records requested per LLM
    # call — confirm against the loclean documentation.
    batch_size=3,
)

# Row counts before/after, then the per-class tally on the returned frame.
print(f"Rows: {len(df)} → {len(result)} (+{len(result) - len(df)} synthetic)")
print("\nClass distribution (after):")
print(result["diagnosis"].value_counts())

# A bare trailing expression makes the notebook render the frame as a table.
result