In [1]:
# All-in-one: locate or upload data → save to data/creditcard.csv → print split table
# Reports the chronological 80/10/10 (train/validation/test) split results for the Capstone
%pip install -q pandas openpyxl

import pandas as pd, glob, os
from pathlib import Path

def load_or_upload() -> "tuple[pd.DataFrame, Path]":
    """Load the credit-card dataset from disk, or ask for an upload in Colab.

    Lookup order:
      1. ``data/creditcard.csv``
      2. any ``creditcard*.csv`` found by a recursive glob from the current
         working directory
    The first existing candidate is read with ``pd.read_csv``.

    If no candidate exists, the Colab upload widget is opened. The first
    uploaded file is read (Excel or CSV, chosen by file extension) and a CSV
    copy is written to ``data/creditcard.csv``.

    Returns:
        A ``(df, path)`` tuple: the loaded DataFrame and the path it was read
        from (or, after an upload, the path the CSV copy was saved to).

    Raises:
        FileNotFoundError: no local file was found and either ``google.colab``
            cannot be imported or the upload returned no files.
    """
    not_found_msg = "creditcard.csv not found. In Colab, run this cell to upload your file."

    # Try common locations
    candidates = [Path("data/creditcard.csv")] + [
        Path(p) for p in glob.glob("**/creditcard*.csv", recursive=True)
    ]
    for p in candidates:
        if p.exists():
            return pd.read_csv(p), p

    # If not found, ask for upload (Colab). Only the import is guarded, so a
    # genuine upload error is not disguised as "file not found".
    try:
        from google.colab import files
    except ImportError as err:
        raise FileNotFoundError(not_found_msg) from err

    uploaded = files.upload()  # pick .xlsx/.xls/.csv
    if not uploaded:
        # The dialog was dismissed without choosing a file.
        raise FileNotFoundError(not_found_msg)
    fname = next(iter(uploaded))

    # Read uploaded file
    if fname.lower().endswith(('.xlsx', '.xls')):
        # The context manager closes the workbook handle once parsing is done.
        with pd.ExcelFile(fname) as xl:
            # Pick the first sheet containing 'Class' if possible; otherwise
            # fall back to the first sheet in the workbook.
            sel = next(
                (s for s in xl.sheet_names
                 if "Class" in xl.parse(s, nrows=5).columns),
                None,
            )
            df = xl.parse(sheet_name=sel or xl.sheet_names[0])
    else:
        df = pd.read_csv(fname)

    Path("data").mkdir(parents=True, exist_ok=True)
    df.to_csv("data/creditcard.csv", index=False)
    return df, Path("data/creditcard.csv")

# Load the dataset (from disk, or via upload) and remember which file was used.
df, path = load_or_upload()

# Chronological 80/10/10 split (prevents look-ahead).
# Rows are ordered by 'Time' only when that column exists; otherwise the
# file's original row order is used as-is.
if 'Time' in df.columns:
    df = df.sort_values('Time').reset_index(drop=True)

# Positional cut points: first 80% -> train, next 10% -> validation, rest -> test.
# int() truncates, so any remainder rows land in the test split.
cut1, cut2 = int(len(df)*0.80), int(len(df)*0.90)
splits = {
    "Train (80%)": df.iloc[:cut1],
    "Validation (10%)": df.iloc[cut1:cut2],
    "Test (10%)": df.iloc[cut2:]
}

# One table row per split: name, row count, fraud count (Class == 1), fraud rate.
rows = []
for name, d in splits.items():
    n = len(d)
    f = int((d['Class'] == 1).sum())
    rate = 100 * f / max(n, 1)  # max(..., 1) avoids dividing by zero on an empty split
    rows.append([name, f"{n:,}", f"{f:,}", f"{rate:.3f}%"])

# Summary row over the whole dataset; uses the same zero-row guard as the
# per-split rows so an empty dataset prints 0.000% instead of crashing.
overall_n = len(df)
overall_f = int((df['Class'] == 1).sum())
overall_rate = 100 * overall_f / max(overall_n, 1)
rows.append(["Overall", f"{overall_n:,}", f"{overall_f:,}", f"{overall_rate:.3f}%"])

# Emit the result as a Markdown table (right-aligned numeric columns).
print("Data & Labels (temporal 80/10/10; ordered by Time)")
print("| Split | Rows | Fraud (count) | Fraud rate |")
print("|---|---:|---:|---:|")
for r in rows:
    print(f"| {r[0]} | {r[1]} | {r[2]} | {r[3]} |")

print("\nSaved/used file:", path)

Saving creditcard.csv to creditcard.csv
Data & Labels (temporal 80/10/10; ordered by Time)
| Split | Rows | Fraud (count) | Fraud rate |
|---|---:|---:|---:|
| Train (80%) | 227,845 | 417 | 0.183% |
| Validation (10%) | 28,481 | 53 | 0.186% |
| Test (10%) | 28,481 | 22 | 0.077% |
| Overall | 284,807 | 492 | 0.173% |

Saved/used file: data/creditcard.csv
