In [34]:
"""
Synthetic Immigration Case Data Generator

Purpose
-------
Generate a dashboard-ready synthetic dataset representing immigration cases for analysis and visualization.

Output
------
A pandas DataFrame with 3,000 rows and exactly 10 attributes:
- CaseID (PK): ID-0001 ... ID-3000
- Attorney: Attorney 1-7
- Paralegal: Paralegal 1-5
- Legal Assistant: Legal Assistant 1-3
- Status: Open / Pending / Closed (30% / 28% / 42%)
- Open Date: Jan 2020 - Dec 2025 (trend + seasonality in monthly intake)
- Pending Date: present for Pending/Closed; >= Open Date; within range
- Close Date: present for Closed; >= Pending Date; within range
- Jurisdiction: USCIS / Immigration Court / BIA (conditional on Case Type)
- Case Type: fixed list with jurisdiction constraints
"""

'\nSynthetic Immigration Case Data Generator\n\nPurpose\n-------\nGenerate a dashboard-ready synthetic dataset representing immigration cases for analysis and visualization.\n\nOutput\n------\nA pandas DataFrame with 3,000 rows and exactly 10 attributes:\n- CaseID (PK): ID-0001 ... ID-3000\n- Attorney: Attorney 1-7\n- Paralegal: Paralegal 1-5\n- Legal Assistant: Legal Assistant 1-3\n- Status: Open / Pending / Closed (30% / 28% / 42%)\n- Open Date: Jan 2020 - Dec 2025 (trend + seasonality in monthly intake)\n- Pending Date: present for Pending/Closed; >= Open Date; within range\n- Close Date: present for Closed; >= Pending Date; within range\n- Jurisdiction: USCIS / Immigration Court / BIA (conditional on Case Type)\n- Case Type: fixed list with jurisdiction constraints\n'

In [35]:
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
import numpy as np
import pandas as pd

In [36]:
# =============================================================================
# CONFIG — DOMAIN CONSTANTS
# =============================================================================

def _roster(role: str, headcount: int) -> list[str]:
    """Return the labels "<role> 1" ... "<role> <headcount>" in ascending order."""
    return [f"{role} {number}" for number in range(1, headcount + 1)]


# Staff pools that cases are assigned from.
ATTORNEYS = _roster("Attorney", 7)
PARALEGALS = _roster("Paralegal", 5)
LEGAL_ASSISTANTS = _roster("Legal Assistant", 3)

# Single source of truth for case rules: each case type paired with the
# jurisdictions it may be filed in. The order of this table fixes the order of
# CASE_CONFIG's keys and therefore of CASE_TYPES.
_ALLOWED_JURISDICTIONS = (
    ("Asylum", ("USCIS", "Immigration Court", "BIA")),
    ("Cancellation of removal", ("Immigration Court", "BIA")),
    ("Motion to reopen", ("Immigration Court", "BIA")),
    ("BIA appeal", ("BIA",)),
    ("Naturalization", ("USCIS",)),
    ("Adjustment of Status", ("USCIS",)),
    ("Relative Petition", ("USCIS",)),
    ("U Visa", ("USCIS",)),
    ("I-601", ("USCIS",)),
    ("I-601A", ("USCIS",)),
    ("Employment authorization", ("USCIS",)),
    ("VAWA", ("USCIS",)),
    ("Humanitarian reinstatement", ("USCIS",)),
    ("Humanitarian parole", ("USCIS",)),
    ("I-90", ("USCIS",)),
    ("I-751", ("USCIS",)),
)

# Every entry gets its own list object, so mutating one case type's
# jurisdictions never affects another's.
CASE_CONFIG: dict[str, dict[str, Any]] = {
    case_type: {"jurisdictions": list(allowed)}
    for case_type, allowed in _ALLOWED_JURISDICTIONS
}

CASE_TYPES = [*CASE_CONFIG]

# Case-type mix, keyed by case type so every weight stays attached to its
# label even if CASE_CONFIG is reordered or extended. The values are relative
# weights; they are normalized below so CASE_TYPE_W sums to 1.0.
_CASE_TYPE_MIX: dict[str, float] = {
    "Asylum": 0.10,
    "Cancellation of removal": 0.06,
    "Motion to reopen": 0.06,
    "BIA appeal": 0.03,
    "Naturalization": 0.08,
    "Adjustment of Status": 0.12,
    "Relative Petition": 0.12,
    "U Visa": 0.05,
    "I-601": 0.05,
    "I-601A": 0.05,
    "Employment authorization": 0.10,
    "VAWA": 0.05,
    "Humanitarian reinstatement": 0.04,
    "Humanitarian parole": 0.03,
    "I-90": 0.04,
    "I-751": 0.02,
}

# Fail loudly instead of silently sampling with misaligned probabilities.
if set(_CASE_TYPE_MIX) != set(CASE_TYPES):
    raise ValueError(
        "Case-type weights and CASE_TYPES disagree: "
        f"missing weights for {sorted(set(CASE_TYPES) - set(_CASE_TYPE_MIX))}, "
        f"unknown case types {sorted(set(_CASE_TYPE_MIX) - set(CASE_TYPES))}"
    )

# Probability vector aligned element-for-element with CASE_TYPES.
CASE_TYPE_W = np.array([_CASE_TYPE_MIX[case_type] for case_type in CASE_TYPES], dtype=float)

if (CASE_TYPE_W < 0).any() or CASE_TYPE_W.sum() <= 0:
    raise ValueError("Case-type weights must be non-negative with a positive total")

CASE_TYPE_W = CASE_TYPE_W / CASE_TYPE_W.sum()

In [37]:
# =============================================================================
# SPEC — RUNTIME PARAMETERS
# =============================================================================

@dataclass(frozen=True)
class GeneratorSpec:
    """
    Immutable runtime parameters for one synthetic-dataset run.

    Values are checked on construction so that an impossible configuration is
    reported as a ValueError up front rather than as an assertion failure deep
    inside the generation pipeline.

    Raises
    ------
    ValueError
        If n is negative, a status share lies outside [0, 1], base_2020 or
        annual_growth is negative, or start is later than end.
    """

    # Number of cases (rows) to generate.
    n: int = 3000
    # Seed for the random number generators.
    seed: int = 42
    # First and last allowed calendar dates, parsed with pd.Timestamp.
    start: str = "2020-01-01"
    end: str = "2025-12-31"
    # Target shares of each status; the generator requires them to sum to 1.0.
    pct_open: float = 0.30
    pct_pending: float = 0.28
    pct_closed: float = 0.42
    # Mean monthly intake level for calendar year 2020, before seasonality.
    base_2020: float = 35.0
    # Multiplicative growth of the intake level per year elapsed since 2020.
    annual_growth: float = 1.07

    def __post_init__(self) -> None:
        if self.n < 0:
            raise ValueError(f"n must be non-negative, got {self.n}")

        for field_name in ("pct_open", "pct_pending", "pct_closed"):
            share = getattr(self, field_name)
            if not 0.0 <= share <= 1.0:
                raise ValueError(f"{field_name} must be within [0, 1], got {share}")

        for field_name in ("base_2020", "annual_growth"):
            level = getattr(self, field_name)
            if level < 0:
                raise ValueError(f"{field_name} must be non-negative, got {level}")

        if pd.Timestamp(self.start) > pd.Timestamp(self.end):
            raise ValueError(f"start ({self.start}) must not be later than end ({self.end})")

In [38]:
# =============================================================================
# HELPERS — TIME-SERIES INTAKE (TREND + SEASONALITY)
# =============================================================================

def month_intake_counts(
    periods: pd.PeriodIndex,
    n_cases: int,
    seed: int = 42,
    base_2020: float = 35.0,
    annual_growth: float = 1.07,
) -> np.ndarray:
    """
    Generate monthly intake counts with trend + seasonality and scale them so
    the total number of cases equals n_cases exactly.

    Parameters
    ----------
    periods : monthly periods to allocate cases to, in output order.
    n_cases : total number of cases the returned counts must add up to.
    seed : seed for the local random generator used for the Poisson draws.
    base_2020 : mean monthly intake for calendar year 2020 before seasonality.
    annual_growth : multiplicative change of the mean per year since 2020.

    Returns
    -------
    Integer array with one non-negative count per period, summing to n_cases.

    Raises
    ------
    ValueError
        If n_cases is negative, or if periods is empty while n_cases > 0.
    """
    if n_cases < 0:
        raise ValueError(f"n_cases must be non-negative, got {n_cases}")

    if len(periods) == 0:
        if n_cases == 0:
            return np.zeros(0, dtype=int)
        raise ValueError("Cannot allocate cases to an empty period range")

    rng = np.random.default_rng(seed)

    # Seasonality multipliers (Jan..Dec)
    seasonality = np.array([1.05, 0.95, 1.00, 1.01, 1.05, 0.98, 0.92, 0.95, 1.03, 1.06, 1.02, 0.99])

    # One scalar draw per period, in period order, so results for a given seed
    # do not depend on how numpy batches array-valued draws.
    raw: list[int] = []
    for p in periods:
        years_since_2020 = p.year - 2020
        trend_level = base_2020 * (annual_growth ** years_since_2020)
        lam = trend_level * seasonality[p.month - 1]
        raw.append(int(rng.poisson(lam)))

    raw_arr = np.array(raw, dtype=int)

    # Scale to approximately n_cases; max(..., 1) guards an all-zero draw.
    scale = n_cases / max(int(raw_arr.sum()), 1)
    scaled = np.round(raw_arr * scale).astype(int)

    # Fix rounding drift to match n_cases exactly. Months are visited largest
    # first; the outer loop keeps cycling so that a drift larger than the
    # number of months (e.g. when every raw draw was zero) is still absorbed.
    diff = n_cases - int(scaled.sum())
    if diff != 0:
        order = np.argsort(scaled)[::-1]
        step = 1 if diff > 0 else -1
        remaining = abs(diff)
        while remaining > 0:
            for i in order:
                if remaining == 0:
                    break
                if step < 0 and scaled[i] == 0:
                    continue  # never push a month below zero
                scaled[i] += step
                remaining -= 1

    assert int(scaled.sum()) == n_cases and (scaled >= 0).all()
    return scaled

In [39]:
# =============================================================================
# HELPERS — DATE SAMPLING
# =============================================================================

def sample_date_after(base: pd.Series, end: pd.Timestamp, rng: np.random.Generator) -> pd.Series:
    """
    Shift every date in `base` forward by a random whole number of days without
    passing `end`.

    When at least one day separates a base date from `end`, the shift is at
    least one day; when there is no room the date is returned unchanged.
    One scalar draw is taken from `rng` per row, in row order.
    """
    # Days available between each base date and the end of the window (never negative).
    room = (end - base).dt.days.clip(lower=0)
    upper_bounds = room.to_numpy()
    # Lower bound is 1 wherever there is room to move, otherwise 0.
    lower_bounds = (room >= 1).astype(int).to_numpy()

    draws = [
        int(rng.integers(int(low), int(high) + 1))  # +1 makes the upper bound inclusive
        for low, high in zip(lower_bounds, upper_bounds)
    ]
    day_shift = np.array(draws, dtype=int)

    return base + pd.to_timedelta(day_shift, unit="D")

def heavy_tail_weights(k: int, skew: float) -> np.ndarray:
    """
    Build k weights proportional to 1 / rank**skew (rank = 1..k) and normalize
    them so they sum to 1.
    """
    unnormalized = np.fromiter(
        (1 / (rank ** skew) for rank in range(1, k + 1)),
        dtype=float,
    )
    total = unnormalized.sum()
    return unnormalized / total

In [40]:
# =============================================================================
# CLASS — GENERATOR (PUBLIC API: generate())
# =============================================================================

class SyntheticCaseDataGenerator:
    """
    Builds the synthetic case table described by a GeneratorSpec.

    The only public entry point is generate(); the private methods are the
    pipeline steps it runs in a fixed order. Every step draws from self.rng,
    so the order of the steps is part of what makes a seed reproducible.
    """

    # Column order of the frame returned by generate().
    _COLUMNS = [
        "CaseID",
        "Attorney",
        "Paralegal",
        "Legal Assistant",
        "Status",
        "Open Date",
        "Pending Date",
        "Close Date",
        "Jurisdiction",
        "Case Type",
    ]

    def __init__(self, spec: "GeneratorSpec") -> None:
        self.spec = spec
        self.rng = np.random.default_rng(spec.seed)
        self.start_date = pd.Timestamp(spec.start)
        self.end_date = pd.Timestamp(spec.end)

    # -------------------------------------------------------------------------
    # Public API
    # -------------------------------------------------------------------------
    def generate(self) -> pd.DataFrame:
        """
        Inputs: uses the GeneratorSpec provided to __init__.
        Behavior: runs the full pipeline end-to-end.
        Output: validated pandas DataFrame with fixed schema.

        The random generator is re-seeded from the spec on every call, so
        calling generate() repeatedly on one instance returns the same data.
        """
        self.rng = np.random.default_rng(self.spec.seed)

        open_dates = self._generate_open_dates()

        df = pd.DataFrame({"Open Date": open_dates}).sort_values("Open Date").reset_index(drop=True)

        # IDs follow chronological order of the open date.
        df["CaseID"] = [f"ID-{i:04d}" for i in range(1, len(df) + 1)]

        df = self._assign_status(df)
        df = self._assign_lifecycle_dates(df)
        df = self._assign_case_type(df)
        df = self._assign_jurisdiction(df)
        df = self._assign_staff(df)

        # Final column ordering (exact 10 attributes)
        df = df[self._COLUMNS]

        self._validate(df)
        return df

    # -------------------------------------------------------------------------
    # Pipeline steps (private)
    # -------------------------------------------------------------------------
    def _generate_open_dates(self) -> pd.DatetimeIndex:
        """
        Draw one open date per case: monthly volumes come from
        month_intake_counts, and days are uniform within each month.

        The first and last month are trimmed to the configured window so a
        start/end that does not fall on a month boundary never yields a date
        outside [start_date, end_date].
        """
        window_first = self.start_date.normalize()
        window_last = self.end_date.normalize()

        periods = pd.period_range(self.start_date.to_period("M"), self.end_date.to_period("M"), freq="M")
        # NOTE(review): month_intake_counts is given the same seed that
        # self.rng was created with; kept as-is so existing outputs for a
        # given seed do not change.
        counts = month_intake_counts(
            periods=periods,
            n_cases=self.spec.n,
            seed=self.spec.seed,               # keep intake reproducible
            base_2020=self.spec.base_2020,
            annual_growth=self.spec.annual_growth,
        )

        open_dates: list[pd.Timestamp] = []
        for p, c in zip(periods, counts):
            if c == 0:
                continue
            month_start = max(p.start_time.normalize(), window_first)
            month_end = min(p.end_time.normalize(), window_last)
            span_days = (month_end - month_start).days
            day_offsets = self.rng.integers(0, span_days + 1, size=c)
            open_dates.extend((month_start + pd.to_timedelta(day_offsets, unit="D")).to_list())

        open_dates = pd.to_datetime(open_dates)
        assert len(open_dates) == self.spec.n
        return open_dates

    def _assign_status(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add a shuffled "Status" column whose Open/Pending/Closed counts follow
        the spec's proportions.

        Raises ValueError when the three proportions do not sum to 1.0.
        """
        n = len(df)

        # Sanity check: proportions should sum to 1
        total = self.spec.pct_open + self.spec.pct_pending + self.spec.pct_closed
        if not np.isclose(total, 1.0):
            raise ValueError(f"Status proportions must sum to 1.0, got {total}")

        n_open = int(round(n * self.spec.pct_open))
        n_pending = int(round(n * self.spec.pct_pending))
        n_closed = n - n_open - n_pending  # remainder to avoid rounding drift

        status = np.array(["Open"] * n_open + ["Pending"] * n_pending + ["Closed"] * n_closed, dtype=object)
        self.rng.shuffle(status)
        df["Status"] = status
        return df

    def _assign_lifecycle_dates(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add "Pending Date" (for Pending and Closed cases) and "Close Date"
        (for Closed cases); all other cells stay NaT.
        """
        df["Pending Date"] = pd.NaT
        df["Close Date"] = pd.NaT

        mask_pending_or_closed = df["Status"].isin(["Pending", "Closed"])
        df.loc[mask_pending_or_closed, "Pending Date"] = sample_date_after(
            df.loc[mask_pending_or_closed, "Open Date"],
            self.end_date,
            self.rng,
        )

        # Close dates are sampled from the pending date onward, so they must
        # be generated after the pending dates above.
        mask_closed = df["Status"].eq("Closed")
        df.loc[mask_closed, "Close Date"] = sample_date_after(
            df.loc[mask_closed, "Pending Date"],
            self.end_date,
            self.rng,
        )

        return df

    def _assign_case_type(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add a "Case Type" column sampled from CASE_TYPES with probabilities CASE_TYPE_W."""
        df["Case Type"] = self.rng.choice(CASE_TYPES, size=len(df), p=CASE_TYPE_W)
        return df

    def _assign_jurisdiction(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add a "Jurisdiction" column, chosen uniformly among the jurisdictions
        CASE_CONFIG allows for each row's case type.
        """
        df["Jurisdiction"] = None

        # Iterate in CASE_CONFIG order so the sequence of random draws is fixed.
        for ct, cfg in CASE_CONFIG.items():
            allowed = cfg["jurisdictions"]
            mask = df["Case Type"].eq(ct)
            if mask.any():
                df.loc[mask, "Jurisdiction"] = self.rng.choice(allowed, size=int(mask.sum()))

        return df

    def _assign_staff(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add "Attorney", "Paralegal" and "Legal Assistant" columns, each sampled
        with heavy_tail_weights so lower-numbered staff receive more cases.
        """
        df["Attorney"] = self.rng.choice(ATTORNEYS, size=len(df), p=heavy_tail_weights(len(ATTORNEYS), 1.15))
        df["Paralegal"] = self.rng.choice(PARALEGALS, size=len(df), p=heavy_tail_weights(len(PARALEGALS), 1.05))
        df["Legal Assistant"] = self.rng.choice(
            LEGAL_ASSISTANTS, size=len(df), p=heavy_tail_weights(len(LEGAL_ASSISTANTS), 1.00)
        )
        return df

    # -------------------------------------------------------------------------
    # Validation (private)
    # -------------------------------------------------------------------------
    def _validate(self, df: pd.DataFrame) -> None:
        """
        Assert the invariants of the generated frame: schema, row count, key
        uniqueness, date ordering and range, status/date consistency, and
        jurisdiction rules. Raises AssertionError on the first violation.
        """
        # Schema and size
        assert list(df.columns) == self._COLUMNS, "unexpected columns or column order"
        assert len(df) == self.spec.n, "row count does not match spec.n"

        # PK uniqueness
        assert df["CaseID"].is_unique, "CaseID is not unique"

        # Date ranges
        assert df["Open Date"].between(self.start_date, self.end_date).all(), "Open Date outside range"

        # Pending >= Open where present, and not past the end of the window
        m_pd = df["Pending Date"].notna()
        assert (df.loc[m_pd, "Pending Date"] >= df.loc[m_pd, "Open Date"]).all(), "Pending Date before Open Date"
        assert (df.loc[m_pd, "Pending Date"] <= self.end_date).all(), "Pending Date outside range"

        # Close >= Pending where present, and not past the end of the window
        m_cd = df["Close Date"].notna()
        assert (df.loc[m_cd, "Close Date"] >= df.loc[m_cd, "Pending Date"]).all(), "Close Date before Pending Date"
        assert (df.loc[m_cd, "Close Date"] <= self.end_date).all(), "Close Date outside range"

        # Status consistency
        is_open = df["Status"] == "Open"
        is_pending = df["Status"] == "Pending"
        is_closed = df["Status"] == "Closed"
        assert (df.loc[is_open, "Pending Date"].isna()).all()
        assert (df.loc[is_open, "Close Date"].isna()).all()
        assert (df.loc[is_pending, "Pending Date"].notna()).all()
        assert (df.loc[is_pending, "Close Date"].isna()).all()
        assert (df.loc[is_closed, "Pending Date"].notna()).all()
        assert (df.loc[is_closed, "Close Date"].notna()).all()

        # Jurisdiction constraints: check each distinct pair once, not every row
        seen_pairs = set(zip(df["Case Type"].to_numpy(), df["Jurisdiction"].to_numpy()))
        for ct, j in seen_pairs:
            assert str(ct) in CASE_CONFIG, f"unknown case type {ct!r}"
            assert str(j) in CASE_CONFIG[str(ct)]["jurisdictions"], (
                f"jurisdiction {j!r} not allowed for case type {ct!r}"
            )

In [41]:
# =============================================================================
# SCRIPT ENTRY POINT
# =============================================================================

def generate_synthetic_cases(
    n: int = 3000,
    seed: int = 42,
    start: str = "2020-01-01",
    end: str = "2025-12-31",
) -> pd.DataFrame:
    """
    One-call shortcut: build a GeneratorSpec from the four arguments (every
    other spec field keeps its default) and return the generated DataFrame.
    """
    generator = SyntheticCaseDataGenerator(
        GeneratorSpec(n=n, seed=seed, start=start, end=end)
    )
    return generator.generate()

if __name__ == "__main__":
    # Generate the default dataset, show a summary and a preview, then save it.
    df = generate_synthetic_cases(n=3000, seed=42)
    # DataFrame.info() prints its own report and returns None, so it must not
    # be wrapped in print() (that emitted a stray "None" line).
    df.info()
    print(df.head())
    df.to_csv("synthetic_immigration_data.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   CaseID           3000 non-null   object        
 1   Attorney         3000 non-null   object        
 2   Paralegal        3000 non-null   object        
 3   Legal Assistant  3000 non-null   object        
 4   Status           3000 non-null   object        
 5   Open Date        3000 non-null   datetime64[ns]
 6   Pending Date     2100 non-null   datetime64[ns]
 7   Close Date       1260 non-null   datetime64[ns]
 8   Jurisdiction     3000 non-null   object        
 9   Case Type        3000 non-null   object        
dtypes: datetime64[ns](3), object(7)
memory usage: 234.5+ KB
None
    CaseID    Attorney    Paralegal    Legal Assistant   Status  Open Date  \
0  ID-0001  Attorney 2  Paralegal 4  Legal Assistant 2  Pending 2020-01-02   
1  ID-0002  Attorney 1  Paralegal 3