### Baseline model for Capstone competition

**Note:** I submitted this model on Kaggle, so I changed the code that was previously here to run on that platform, and then pasted the Kaggle version back here for consistency.

In [2]:
import pandas as pd, numpy as np, os

SAMPLE_PATH = "/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv"
PREDS_PATH = "/kaggle/input/submission-v1-baseline/submission_v1_baseline.csv"
OUT_PATH = "/kaggle/working/submission.csv"

# Column names tried (after the sample file's own names) when locating the ID
# and prediction columns inside the predictions file.
ID_ALIASES = ("row_id", "id", "ID", "Row_ID")
TARGET_ALIASES = ("rule_violation", "target", "prediction", "pred", "prob")


def require_inputs(*paths):
    """Raise FileNotFoundError naming every path in *paths* that does not exist.

    Checking up front reports all missing inputs at once instead of failing
    on the first ``read_csv`` call.
    """
    missing = [p for p in paths if not os.path.exists(p)]
    if missing:
        raise FileNotFoundError(
            f"Input file(s) not found: {missing}. "
            "Check that the corresponding data is available at these paths."
        )


def sample_columns(sample: pd.DataFrame):
    """Return ``(id_col, target_col)`` from a two-column sample submission.

    Raises:
        ValueError: if *sample* does not have exactly two columns.
    """
    cols = sample.columns.tolist()
    if len(cols) != 2:
        raise ValueError(
            f"Sample submission must have exactly 2 columns (id, target), got {cols}"
        )
    return cols[0], cols[1]


def find_column(columns, candidates):
    """Return the first name in *candidates* present in *columns*, else None."""
    return next((c for c in candidates if c in columns), None)


def build_submission(sample: pd.DataFrame, preds: pd.DataFrame) -> pd.DataFrame:
    """Build a frame with the sample's two columns, filled from *preds*.

    The prediction column is the first recognised name (the sample's target
    name, then ``TARGET_ALIASES``); if none matches, the first column that is
    not the ID column is used, so the ID column is never mistaken for
    predictions.

    If *preds* has a recognisable ID column, rows are aligned to the sample's
    IDs and order with a left merge (duplicate IDs keep their last row; IDs
    absent from *preds* yield NaN). Otherwise rows are mapped by position.

    Raises:
        ValueError: if no prediction column can be chosen, or if positional
            mapping is needed but the row counts differ.
    """
    id_col, target_col = sample_columns(sample)
    id_in_preds = find_column(preds.columns, [id_col, *ID_ALIASES])
    pred_col = find_column(preds.columns, [target_col, *TARGET_ALIASES])

    if pred_col is None:
        # No recognised name: take the first column that is not the ID.
        pred_col = next((c for c in preds.columns if c != id_in_preds), None)
        if pred_col is None:
            raise ValueError(
                f"No prediction column found in predictions file "
                f"(columns: {list(preds.columns)})."
            )

    if id_in_preds is not None:
        # Handle duplicated IDs defensively (take last).
        aligned = (
            preds[[id_in_preds, pred_col]]
            .drop_duplicates(subset=[id_in_preds], keep="last")
            .rename(columns={id_in_preds: id_col, pred_col: target_col})
        )
        return sample[[id_col]].merge(aligned, on=id_col, how="left")

    # No ID in the predictions file: length-based mapping (must match exactly).
    if len(preds) != len(sample):
        raise ValueError(f"Length mismatch: preds={len(preds)} vs sample={len(sample)} "
                         f"and no ID column found in your file.")
    sub = sample.copy()
    sub[target_col] = preds[pred_col].values
    return sub


def coerce_binary(values: pd.Series, ids=None) -> pd.Series:
    """Convert *values* to integer 0/1 labels.

    Non-numeric and missing entries become 0 after printing a warning that
    lists up to ten affected IDs (taken from *ids* when given, otherwise the
    index labels of *values*). Every remaining value is thresholded at 0.5:
    values >= 0.5 map to 1, everything else to 0. This leaves existing 0/1
    labels unchanged and clamps out-of-range scores.

    NOTE(review): hard 0/1 labels discard ranking information; if the
    competition is scored with a ranking metric such as AUC, submitting the
    raw probabilities may score better -- confirm the expected format.
    """
    vals = pd.to_numeric(values, errors="coerce")
    missing = vals.isna()
    if missing.any():
        # Print a few missing IDs to help debug.
        source = ids if ids is not None else vals.index.to_series()
        missing_ids = source.loc[missing].head(10).tolist()
        print(f"WARNING: {missing.sum()} NaNs after merge. Example missing IDs: {missing_ids}")
        # Safe fallback so a submission can still be produced.
        vals = vals.fillna(0)
    return (vals >= 0.5).astype(int)


def validate_submission(sub: pd.DataFrame, sample: pd.DataFrame) -> list:
    """Return a list of human-readable problems with *sub* (empty if valid).

    Checks column names, row count, ID set and order against *sample*, and
    that the target column is NaN-free, integer-typed and contains only 0/1.
    *sub* is not modified.
    """
    if list(sub.columns) != list(sample.columns):
        # The remaining checks index by column name, so stop here.
        return [f"Columns mismatch. Expected {list(sample.columns)}, got {list(sub.columns)}"]

    id_col, target_col = sample_columns(sample)
    errors = []

    if len(sub) != len(sample):
        errors.append(f"Row count mismatch. Expected {len(sample)}, got {len(sub)}")

    if not sub[id_col].equals(sample[id_col]):
        # Either the wrong set of IDs or the right set in the wrong order.
        sub_ids, sample_ids = set(sub[id_col]), set(sample[id_col])
        if sub_ids != sample_ids:
            miss = sorted(sample_ids - sub_ids)[:5]
            extra = sorted(sub_ids - sample_ids)[:5]
            errors.append(f"ID set differs. Missing sample IDs (first 5): {miss}; Extra (first 5): {extra}")
        else:
            errors.append("ID order differs from sample. Must match sample_submission order.")

    target = sub[target_col]
    if target.isna().any():
        errors.append("Target has NaNs.")
    u = set(np.unique(target.dropna()))
    if not u.issubset({0, 1}):
        errors.append(f"Target has invalid values: {sorted(list(u))[:10]} (must be 0/1).")
    if not np.issubdtype(target.dtype, np.integer):
        errors.append(f"Target dtype must be integer, got {target.dtype}.")
    return errors


# Notebook cells and scripts run with __name__ == "__main__", so this body
# executes exactly as a flat cell would and leaves `sample`, `preds`, `sub`,
# `id_col` and `target_col` defined as globals; the guard only keeps the file
# I/O from running when the helpers above are imported.
if __name__ == "__main__":
    # 1) Load
    require_inputs(SAMPLE_PATH, PREDS_PATH)
    sample = pd.read_csv(SAMPLE_PATH)
    id_col, target_col = sample_columns(sample)   # e.g. ['row_id','rule_violation']
    preds = pd.read_csv(PREDS_PATH)

    print("Sample:", sample.shape, list(sample.columns))
    print("Preds :", preds.shape,  list(preds.columns))

    # 2-3) Detect columns and build the submission aligned to sample IDs/order
    sub = build_submission(sample, preds)

    # 4) Coerce to valid {0,1} ints
    sub[target_col] = coerce_binary(sub[target_col], ids=sub[id_col])

    # 5) Strict validation
    errors = validate_submission(sub, sample)
    if errors:
        print("❌ Submission invalid:")
        for e in errors:
            print(" -", e)
        # Non-zero status so callers can tell the run failed.
        raise SystemExit(1)

    # 6) Save with EXACT name and confirm
    sub.to_csv(OUT_PATH, index=False)
    print("✅ Saved:", OUT_PATH, "| Shape:", sub.shape, "| Dtype:", sub[target_col].dtype)
    print(sub.head())
    print("File exists?", os.path.exists(OUT_PATH), "Size:", os.path.getsize(OUT_PATH))


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv'