# In [None]:
# import pandas as pd
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split

# raw = pd.read_csv("data.csv")  # dates are strings, no schema checks
# dim_region = pd.read_csv("region_lookup.csv")

# 1) Reading data with no checks

# Messy: pd.read_csv("data.csv") and dates left as plain text.
# Why it’s a problem: If columns change type (e.g., numbers come in as strings) or the date format shifts, the code happily carries on and gives you wrong results.
# Fix: Parse dates strictly and enforce a schema so bad feeds fail fast.
# What I did: Used pd.to_datetime(..., errors="raise") and a Pandera schema with type and range checks (no allowed-values check is defined yet; one could be added with Check.isin, e.g. on region).
import pandas as pd
import numpy as np

# --- AQUA: Data & Evidence — schema & joins ---
import pandera as pa
from pandera import Column, Check

# Contract for the incoming feed: column types, value ranges and nullability.
# Validation runs right after loading so a malformed feed stops the run here.
schema = pa.DataFrameSchema({
    "id": Column(int, Check.gt(0), nullable=False),
    # event_time is converted with pd.to_datetime *before* validation, so the
    # column is datetime64 by the time the schema sees it; declaring it as
    # `object` would reject every successfully parsed frame.
    # coerce=True only normalises the datetime unit to nanoseconds.
    # NOTE(review): assumes timestamps are timezone-naive - confirm against the feed.
    "event_time": Column("datetime64[ns]", nullable=False, coerce=True),
    # read_csv infers int64 for a numeric column that happens to contain only
    # whole numbers; coerce=True lets such a feed satisfy the float contract
    # while non-numeric text still fails.
    "age": Column(float, Check.in_range(0,120), coerce=True),
    "bmi": Column(float, Check.gt(0), nullable=True, coerce=True),
    "region": Column(str, nullable=True),
    "spend": Column(float, Check.ge(0), nullable=False, coerce=True)
})

# Strict date parsing (errors="raise") happens first; lazy=True then makes
# pandera collect every schema violation before raising instead of stopping
# at the first one.
raw = schema.validate(
    pd.read_csv("data.csv").assign(
        event_time=lambda d: pd.to_datetime(d["event_time"], errors="raise")
    ),
    lazy=True
)
dim_region = pd.read_csv("region_lookup.csv")

#--

# # quick join
# df = raw.merge(dim_region, on="region", how="left")  # could explode rows

# 2) Blind join that might duplicate rows

# Messy: merge(... how="left") with no cardinality check.
# Why it’s a problem: If the lookup has duplicates, your dataset silently grows and skews counts and model training.
# Fix: Prove the join is one row per key.
# What I did: validate="m:1" and an assert on row counts so we catch row explosions immediately.
# Enforce an m:1 join to avoid row explosion.
# validate="m:1" makes pandas raise if `region` is duplicated in the lookup.
before = len(raw)
df = raw.merge(dim_region, on="region", how="left", validate="m:1")
# Explicit raise instead of `assert`: assert statements are removed when Python
# runs with -O, which would silently disable this guard. The exception type is
# kept as AssertionError so existing handling is unaffected.
if len(df) != before:
    raise AssertionError("Row explosion after join")

#--
# target made from future info (leakage risk)
# Kept for reference only, commented out like the other "before" snippets:
# it ran before the rows were sorted by id/event_time, so shift(-1) followed
# file order rather than time, and both the column and `y` are superseded by
# the time-aware target and the later `y = df["y"]` assignment.
# df["high_spend_next_week"] = df.groupby("id")["spend"].shift(-1) > 100
# y = df["high_spend_next_week"].astype(int)

# 3) Creating a “future” target then doing a random split

# Messy: shift(-1) to label “next week” behaviour, then train_test_split randomly.
# Why it’s a problem: You peek into the future to build the label but evaluate with a random split, letting future info leak into training. That inflates performance.
# Fix: Respect time.
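# What I did: Sort by person and time, keep the “next week” target, then split by date (train on earlier period, test on later period). Note that shift(-1) takes each id's next recorded row, so the label means “next week” only if every id has exactly one row per week.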
# --- AQUA: Methods & Assumptions — time-aware target & split ---
# Order each id's rows chronologically so shift(-1) looks at that id's next
# recorded event.
df = df.sort_values(["id","event_time"])
next_spend = df.groupby("id")["spend"].shift(-1)
# The last row of every id has no following event, so next_spend is NaN there.
# `NaN > 100` evaluates to False, which would silently label those rows as
# negatives; keep them as NaN so they can be dropped as "no target".
df["y"] = (next_spend > 100).astype(float).where(next_spend.notna())

#--
# # drop NAs
# df = df.dropna()

# 4) Dropping all missing values

# Messy: df.dropna() wipes any row with any missing field.
# Why it’s a problem: You can throw away lots of useful data and introduce bias. Also you haven’t said what to do when new data has gaps.
# Fix: Be surgical about missing data.
# What I did: Only drop rows that can’t have a target, and impute feature gaps inside the modelling pipeline so the imputer learns from the training folds only.
# drop rows without next-week target (end-of-series), and log the decision
# The mask is derived from the data itself (no following row for the same id)
# so the rows are removed even when `y` has already been cast to 0/1 and no
# longer carries NaN. Rows are expected to be sorted by id and event_time here.
has_next = df.groupby("id")["spend"].shift(-1).notna()
keep = has_next & df["y"].notna()
print(f"Dropping {int((~keep).sum())} of {len(df)} rows with no next-event target")
df = df.loc[keep]

#--
# # simple features
# X = df[["age", "bmi", "region", "spend"]]

# # scale and encode BEFORE split (leakage)
# scaler = StandardScaler()
# X[["age", "bmi", "spend"]] = scaler.fit_transform(X[["age", "bmi", "spend"]])


# 5) Preprocessing before the split

# Messy: scaler.fit_transform and OneHotEncoder().fit_transform on the full dataset.
# Why it’s a problem: The scaler and encoder learn from the test set too, which is another form of leakage.
# Fix: Put all preprocessing in a pipeline.
# What I did: Wrapped imputation, scaling and one-hot encoding in a ColumnTransformer inside a Pipeline, so they are fit only on the training folds during cross-validation.


# enc = OneHotEncoder()
# region_ohe = enc.fit_transform(X[["region"]]).toarray()
# X = pd.concat([X.drop(columns=["region"]), pd.DataFrame(region_ohe)], axis=1)


# 6) One-hot encoding that breaks on new categories

# Messy: Default OneHotEncoder with manual toarray() and concat.
# Why it’s a problem: New categories at prediction time can crash the model, and manual concatenation is brittle and error-prone.
# Fix: Make encoding robust and automated.
# What I did: OneHotEncoder(handle_unknown="ignore") inside the pipeline; no manual concatenation.


# # random split (not time-aware), no stratification
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# 7) Random split on time-based data

# Messy: train_test_split with default settings.
# Why it’s a problem: For temporal problems, random splits overstate performance because the model sees patterns from the future.
# Fix: Use time-aware validation.
# What I did: Train/test split by a date cutoff and used TimeSeriesSplit for cross-validation on the training period.

features = ["age","bmi","region","spend"]

# df is ordered by (id, event_time). TimeSeriesSplit cuts folds by row
# position, so the modelling frame must be in chronological order; otherwise
# the "time" folds would really be folds over ids. mergesort is stable, so
# rows sharing a timestamp keep their existing relative order.
ordered = df.sort_values("event_time", kind="mergesort")
X = ordered[features]
y = ordered["y"].astype(int)

# create a time-aware split: the latest ~20% of rows (by event_time) as test
cutoff = ordered["event_time"].quantile(0.8)
in_train = ordered["event_time"] <= cutoff
in_test = ordered["event_time"] > cutoff
X_train, y_train = X[in_train], y[in_train]
X_test,  y_test  = X[in_test],  y[in_test]

# --- AQUA: Leakage control — pipeline & proper CV ---
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.metrics import make_scorer, f1_score, average_precision_score, precision_recall_curve

# Column groups fed to the preprocessing branches below.
num = ["age","bmi","spend"]
cat = ["region"]

# Numeric branch: fill gaps with the training-fold median, then standardise.
numeric_steps = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])

# Categorical branch: region may be missing, so missing values get an explicit
# "missing" level before encoding rather than relying on how the encoder
# treats NaN. handle_unknown="ignore" encodes categories unseen during fit as
# all-zeros instead of raising at prediction time.
categorical_steps = Pipeline([
    ("imp", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

pre = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num),
        ("cat", categorical_steps, cat),
    ],
    remainder="drop"
)

# Preprocessing and classifier live in one estimator so every fit (CV fold,
# calibration fold, final fit) learns its imputation/scaling/encoding from
# that fit's training rows only.
pipe = Pipeline([
    ("prep", pre),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42))
])


# 8) Default classifier with no imbalance handling

# Messy: LogisticRegression() defaults and no seed.
# Why it’s a problem: If positives are rare, the model can ignore them and still look “good.” Results may also vary run-to-run.
# Fix: Make it stable and fairer to the minority class.
# What I did: class_weight="balanced", higher max_iter, and random_state=42.

# # default logistic regression
# clf = LogisticRegression()
# clf.fit(X_train, y_train)

# # evaluate with accuracy on imbalanced data
# pred = clf.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, pred))


# 9) Using accuracy on an imbalanced target

# Messy: accuracy_score as the only metric.
# Why it’s a problem: With a 95/5 split, predicting “always negative” gives 95% accuracy but zero value.
# Fix: Use the right metrics and show more than one view.
# What I did: Report F1 and PR-AUC (precision-recall AUC), plus a classification report and confusion matrix.

# time-aware CV on training only
# NOTE(review): TimeSeriesSplit cuts folds by row position, so X_train/y_train
# are assumed to be in chronological order - verify where they are built.
tscv = TimeSeriesSplit(n_splits=5)
# Built-in scorer names instead of make_scorer(..., needs_proba=True): that
# keyword is deprecated and removed in recent scikit-learn releases, while the
# string names are stable. "average_precision" scores the model's continuous
# output, i.e. the area under the precision-recall curve.
scorers = {
    "f1": "f1",
    "prauc": "average_precision",
}
cv_res = cross_validate(pipe, X_train, y_train, cv=tscv, scoring=scorers, return_train_score=False)
print("CV F1 mean±sd:", np.mean(cv_res["test_f1"]), np.std(cv_res["test_f1"]))
print("CV PR-AUC mean±sd:", np.mean(cv_res["test_prauc"]), np.std(cv_res["test_prauc"]))

# fit on full training period and calibrate
from sklearn.calibration import CalibratedClassifierCV
# Wrap the full preprocessing + classifier pipeline in a probability
# calibrator and fit it on the whole training period in one expression
# (fit returns the fitted calibrator itself).
# NOTE(review): an integer cv is a k-fold style split rather than a
# time-ordered one - confirm this is acceptable for calibration here.
cal = CalibratedClassifierCV(pipe, cv=3).fit(X_train, y_train)

# evaluation
from sklearn.metrics import classification_report, confusion_matrix
# --- pick the decision threshold without looking at the test period ---
# Tuning the threshold on (y_test, proba) and then scoring those same rows
# overstates performance, so the latest ~20% of the training period is used
# as a validation window instead.
train_times = df.loc[X_train.index, "event_time"]
val_start = train_times.quantile(0.8)
is_val = (train_times > val_start).to_numpy()

threshold = 0.5  # fallback when the validation window cannot support tuning
can_tune = (
    is_val.any()
    and (~is_val).any()
    and y_train[is_val].nunique() == 2
    and y_train[~is_val].nunique() == 2
)
if can_tune:
    # Same model recipe, fitted only on the part of training that precedes
    # the validation window.
    tuner = CalibratedClassifierCV(pipe, cv=3).fit(X_train[~is_val], y_train[~is_val])
    val_proba = tuner.predict_proba(X_train[is_val])[:, 1]
    prec, rec, thr = precision_recall_curve(y_train[is_val], val_proba)
    # prec/rec have one more element than thr (the final precision=1,
    # recall=0 point has no threshold), so drop it before indexing thr.
    f1_curve = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-9)
    # choose threshold that maximises F1 (or cost-weighted objective)
    best_idx = int(np.argmax(f1_curve))
    threshold = thr[best_idx]

# Final, untouched evaluation on the held-out later period.
proba = cal.predict_proba(X_test)[:,1]
y_pred = (proba >= threshold).astype(int)

print(classification_report(y_test, y_pred, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))



# 10) No cross-validation or uncertainty

# Messy: Single train/test split, one number.
# Why it’s a problem: One split can be lucky. You don’t know how variable performance is.
# Fix: Cross-validate and show variability.
# What I did: cross_validate with TimeSeriesSplit, printing mean and standard deviation across folds.

# 11) Untuned threshold and uncalibrated probabilities

# Messy: Use default 0.5 threshold and raw probabilities.
# Why it’s a problem: The probability scale might be off, and 0.5 rarely matches business costs.
# Fix: Calibrate and pick a threshold that suits the goal.
# What I did: CalibratedClassifierCV for well-scaled probabilities, then chose a threshold from the precision-recall curve to maximise F1 (or align to business costs).

# 12) Quiet assumptions and no audit trail

# Messy: Magic numbers, silent data decisions, no logging or checks.
# Why it’s a problem: Future you (or an auditor) cannot tell why numbers changed or whether data feeds broke.
# Fix: Make assumptions explicit and testable.
# What I did: Added a schema, explicit join validation, deterministic seeds, and clear, reproducible steps. In practice you’d also keep a short Data Quality log and a model card.

# 13) Performance and maintainability nits

# Messy: Manual dense one-hot arrays and possible pandas “SettingWithCopy” assignments.
# Why it’s a problem: It’s easy to create silent bugs and memory overhead.
# Fix: Let the pipeline manage sparse features and transformations cleanly, avoiding copy warnings and keeping code compact.