In [3]:
import pandas as pd
from sklearn.utils import resample

# --- Toy appointment dataset: 10 rows, 8 with NO_SHOW == 0 and 2 with NO_SHOW == 1 ---
train_data = pd.DataFrame(
    {
        "RowID": list(range(1, 11)),
        "Age": [25, 43, 31, 50, 29, 37, 61, 45, 33, 54],
        "SMS": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
        "DaysTillAppt": [2, 1, 5, 3, 7, 4, 2, 6, 9, 8],
        "NO_SHOW": [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
    }
)

# Split on the label column: NO_SHOW == 0 is the majority class,
# NO_SHOW == 1 is the minority class. Original index labels are preserved.
no_show_label = train_data["NO_SHOW"]
train_majority = train_data.loc[no_show_label == 0]
train_minority = train_data.loc[no_show_label == 1]

# Report both class sizes so the imbalance is visible before resampling.
print("Majority count:", train_majority.shape[0])
print("Minority count:", train_minority.shape[0])

# -----------------------------
# ✅ Upsampling with replace=True
# -----------------------------
# Sampling WITH replacement lets the few minority rows be drawn repeatedly
# until the minority class is as large as the majority class.
target_size = len(train_majority)
train_minority_upsampled = resample(
    train_minority,
    replace=True,
    n_samples=target_size,
    random_state=123,  # fixed seed so the drawn rows are reproducible
)

# Stack both classes and renumber the combined rows 0..n-1.
train_upsampled = pd.concat(
    [train_majority, train_minority_upsampled],
    ignore_index=True,
)

print("\n--- Upsampled (replace=True) ---")
print("Upsampled class counts:\n", train_upsampled["NO_SHOW"].value_counts())
print("\nUpsampled rows (showing Source RowIDs in minority samples):")
# Printed from the resampled minority frame (not the concatenated one), so the
# original index labels are kept and repeated labels reveal the duplicates.
print(train_minority_upsampled[["RowID", "Age", "SMS", "DaysTillAppt", "NO_SHOW"]])

# -----------------------------
# ❌ Upsampling with replace=False (will error)
# -----------------------------
# Without replacement each row can be drawn at most once, so requesting more
# rows than the minority class holds cannot be satisfied.
oversized_request = len(train_majority)  # 8 requested, only 2 minority rows exist
try:
    train_minority_no_replace = resample(
        train_minority,
        replace=False,
        n_samples=oversized_request,
        random_state=123,
    )
except ValueError as err:
    # The failure IS the demonstration: show the message instead of crashing the cell.
    print("\n--- Attempt with replace=False ---")
    print("Error:", err)

# Optional: what replace=False CAN do (max = len(minority))
# Sampling without replacement is valid as long as n_samples does not exceed
# the number of available rows; every minority row is then drawn at most once,
# so the result can never contain duplicates.
train_minority_no_replace_ok = resample(
    train_minority,
    replace=False,                  # must be False: this is the no-replacement demo
    n_samples=len(train_minority),  # <= 2
    random_state=123
)

print("\n--- replace=False valid example (n_samples=2) ---")
print(train_minority_no_replace_ok[["RowID", "Age", "SMS", "DaysTillAppt", "NO_SHOW"]])


Majority count: 8
Minority count: 2

--- Upsampled (replace=True) ---
Upsampled class counts:
 NO_SHOW
0    8
1    8
Name: count, dtype: int64

Upsampled rows (showing Source RowIDs in minority samples):
   RowID  Age  SMS  DaysTillAppt  NO_SHOW
8      9   33    1             9        1
9     10   54    0             8        1
8      9   33    1             9        1
8      9   33    1             9        1
8      9   33    1             9        1
8      9   33    1             9        1
8      9   33    1             9        1
9     10   54    0             8        1

--- Attempt with replace=False ---
Error: Cannot sample 8 out of arrays with dim 2 when replace is False

--- replace=False valid example (n_samples=2) ---
   RowID  Age  SMS  DaysTillAppt  NO_SHOW
8      9   33    1             9        1
9     10   54    0             8        1
