In [9]:
from pathlib import Path
import textwrap, shutil, json

# 1) directories: the project is scaffolded inside the current working directory
ROOT = Path.cwd()
PROJECT_DIRS = (
    "data/raw",
    "data/processed",
    "src/data",
    "src/features",
    "src/models",
    "tests",
    "artifacts",
    "models",
)
for rel_dir in PROJECT_DIRS:
    # parents/exist_ok make the cell safe to re-run on an existing tree
    ROOT.joinpath(rel_dir).mkdir(parents=True, exist_ok=True)

# 2) copy the raw CSV into data/raw (if it is found in one of the known places)
raw_csv_target = ROOT/"data/raw/UCI_Credit_Card.csv"
for cand in [ROOT/"UCI_Credit_Card.csv", Path(r"C:\Users\USER\Desktop\credits\UCI_Credit_Card.csv")]:
    # is_file() rather than exists(): a directory with that name must not be passed to copy2
    if cand.is_file():
        shutil.copy2(cand, raw_csv_target)
        break
else:
    # No candidate matched. Stay best-effort (do not abort the scaffolding), but say so
    # explicitly: without this file the DVC `prepare` stage fails later on.
    if not raw_csv_target.exists():
        print(f"WARNING: UCI_Credit_Card.csv not found; place it at {raw_csv_target} before running the pipeline.")

# 3) files
# Maps a project-relative path to the text written at that path. Each body starts with a
# newline and is flush-left; the writer below dedents and strips the leading blank line.
files = {
# pandera schema for the processed (feature-enriched) dataset
"src/data/validation.py": r'''
import pandas as pd
import pandera as pa
from pandera import Column, Check, DataFrameSchema

TARGET = "default.payment.next.month"

SCHEMA = DataFrameSchema(
    {
        "LIMIT_BAL": Column(pa.Float, Check.ge(0), coerce=True),
        "SEX":       Column(pa.Int, Check.isin([1,2]), coerce=True),
        "EDUCATION": Column(pa.Int, Check.isin([1,2,3,4]), coerce=True),   # 0/5/6 → 4
        "MARRIAGE":  Column(pa.Int, Check.isin([1,2,3]), coerce=True),     # 0 → 3
        "AGE":       Column(pa.Int, Check.between(18, 100), coerce=True),
        **{f"PAY_{k}": Column(pa.Int, Check.between(-2,9), coerce=True) for k in [0,2,3,4,5,6]},
        **{f"BILL_AMT{i}": Column(pa.Float, coerce=True) for i in range(1,7)},
        **{f"PAY_AMT{i}":  Column(pa.Float, Check.ge(0), coerce=True) for i in range(1,7)},
        "utilization1":   Column(pa.Float, nullable=True, coerce=True),
        "payment_ratio1": Column(pa.Float, nullable=True, coerce=True),
        "max_delay":      Column(pa.Int, Check.between(-2,9), coerce=True),
        TARGET: Column(pa.Int, Check.isin([0,1]), coerce=True),
    },
    checks=[Check(lambda df: 0.05 <= df[TARGET].mean() <= 0.5, error="Аномальная доля класса-1")],
)

def validate_csv(path: str):
    df = pd.read_csv(path)
    SCHEMA.validate(df, lazy=True)
    return True
''',

# DVC stage `prepare`: clean the raw CSV, split it, write the base summary
"src/data/make_dataset.py": r'''
from pathlib import Path
import json, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split

TARGET = "default.payment.next.month"

def clean_frame(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "ID" in df: df = df.drop(columns=["ID"])
    df["EDUCATION"] = df["EDUCATION"].replace({0:4,5:4,6:4})
    df["MARRIAGE"]  = df["MARRIAGE"].replace({0:3})
    money = [c for c in df if c.startswith(("BILL_AMT","PAY_AMT"))]
    lo = {c: df[c].quantile(0.01) for c in money}
    hi = {c: df[c].quantile(0.99) for c in money}
    for c in money: df[c] = np.clip(df[c], lo[c], hi[c])
    int_cols = ["SEX","EDUCATION","MARRIAGE","AGE","PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6",TARGET]
    for c in int_cols:
        s = pd.to_numeric(df[c], errors="coerce")
        mode_val = s.mode(dropna=True)[0] if not s.mode(dropna=True).empty else 0
        df[c] = s.fillna(mode_val).round().astype(int)
    df = df.drop_duplicates()
    return df

def main(raw_csv: str, out_dir: str):
    out = Path(out_dir); out.mkdir(parents=True, exist_ok=True)
    df = pd.read_csv(raw_csv)
    df = clean_frame(df)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[TARGET])
    train_df.to_csv(out/"train_base.csv", index=False)
    test_df.to_csv(out/"test_base.csv", index=False)
    summary = {
        "train_rows": int(len(train_df)), "test_rows": int(len(test_df)),
        "n_features_raw": int(train_df.shape[1]-1),
        "target_mean_train": float(train_df[TARGET].mean()),
        "target_mean_test": float(test_df[TARGET].mean()),
        "utilization1_p95_test": None
    }
    # Written as summary_base.json: the final summary.json belongs to the `features`
    # stage, and DVC does not allow two stages to declare the same output path.
    (out/"summary_base.json").write_text(json.dumps(summary, indent=2, ensure_ascii=False))
    print("Saved train/test and summary_base.json")

if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("raw_csv"); p.add_argument("out_dir")
    args = p.parse_args()
    main(args.raw_csv, args.out_dir)
''',

# DVC stage `features`: add engineered columns and produce the final summary.json
"src/features/build_features.py": r'''
from pathlib import Path
import json, pandas as pd

TARGET = "default.payment.next.month"

def add_basic_features(df: pd.DataFrame) -> pd.DataFrame:
    f = df.copy()
    f["utilization1"] = (f["BILL_AMT1"] / f["LIMIT_BAL"].clip(lower=1)).fillna(0)
    f["payment_ratio1"] = (f["PAY_AMT1"] / f["BILL_AMT1"].abs().clip(lower=1)).fillna(0)
    pay_cols = ["PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6"]
    f["max_delay"] = f[pay_cols].max(axis=1)
    return f

def main(proc_dir: str):
    d = Path(proc_dir)
    train = pd.read_csv(d/"train_base.csv"); test = pd.read_csv(d/"test_base.csv")
    train = add_basic_features(train); test = add_basic_features(test)
    train.to_csv(d/"train.csv", index=False); test.to_csv(d/"test.csv", index=False)
    # Read the summary produced by the `prepare` stage, extend it, and write the result
    # to summary.json, which is this stage's own metrics file.
    s = json.loads((d/"summary_base.json").read_text())
    s["n_features"] = int(train.shape[1]-1)
    s["utilization1_p95_test"] = float(test["utilization1"].quantile(0.95))
    (d/"summary.json").write_text(json.dumps(s, indent=2, ensure_ascii=False))
    print("Features added and summary updated.")

if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser(); p.add_argument("proc_dir")
    args = p.parse_args(); main(args.proc_dir)
''',

# DVC stage `train`: fit the pipeline, save model/metrics/ROC plot, log to MLflow
"src/models/train.py": r'''
from pathlib import Path
import json, pandas as pd, matplotlib.pyplot as plt
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, RocCurveDisplay
import mlflow, mlflow.sklearn

TARGET = "default.payment.next.month"
NUM = ["LIMIT_BAL","AGE","BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6",
       "PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6","utilization1","payment_ratio1","max_delay"]
CAT = ["SEX","EDUCATION","MARRIAGE","PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6"]

def build_pipeline():
    num_tf = Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())])
    cat_tf = Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("oh", OneHotEncoder(handle_unknown="ignore"))])
    pre = ColumnTransformer([("num", num_tf, NUM), ("cat", cat_tf, CAT)])
    # random_state is fixed so that re-running the stage reproduces the same model and metrics
    clf = GradientBoostingClassifier(n_estimators=230, learning_rate=0.06, max_depth=3, random_state=42)
    return Pipeline([("pre", pre), ("clf", clf)])

def main(proc_dir: str, model_path: str, metrics_path: str, roc_path: str):
    d = Path(proc_dir)
    train = pd.read_csv(d/"train.csv"); test = pd.read_csv(d/"test.csv")
    X_train, y_train = train.drop(columns=[TARGET]), train[TARGET]
    X_test,  y_test  = test.drop(columns=[TARGET]),  test[TARGET]
    pipe = build_pipeline(); pipe.fit(X_train, y_train)
    proba = pipe.predict_proba(X_test)[:,1]; pred = (proba>=0.5).astype(int)
    metrics = {"model":"gbdt",
               "test_auc": float(roc_auc_score(y_test, proba)),
               "test_f1": float(f1_score(y_test, pred)),
               "test_precision": float(precision_score(y_test, pred, zero_division=0)),
               "test_recall": float(recall_score(y_test, pred))}
    Path(model_path).parent.mkdir(parents=True, exist_ok=True); dump(pipe, model_path)
    Path(metrics_path).parent.mkdir(parents=True, exist_ok=True)
    Path(metrics_path).write_text(json.dumps(metrics, indent=2, ensure_ascii=False))
    RocCurveDisplay.from_predictions(y_test, proba); Path(roc_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout(); plt.savefig(roc_path); plt.close()
    mlflow.set_tracking_uri("file:./mlruns"); mlflow.set_experiment("CreditDefault_Prediction")
    with mlflow.start_run():
        for k,v in metrics.items():
            if k.startswith("test_"): mlflow.log_metric(k, v)
        mlflow.log_param("model","gbdt"); mlflow.log_artifact(roc_path)
        mlflow.sklearn.log_model(pipe, artifact_path="model")
    print("Saved model, metrics, roc.")

if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--proc_dir", default="data/processed")
    p.add_argument("--model_path", default="models/credit_default_model.pkl")
    p.add_argument("--metrics_path", default="metrics.json")
    p.add_argument("--roc_path", default="artifacts/roc.png")
    a = p.parse_args(); main(a.proc_dir, a.model_path, a.metrics_path, a.roc_path)
''',

# Negative tests: the schema must reject corrupted rows of the processed train set
"tests/test_data_anomalies.py": r'''
import pandas as pd
import pytest
from src.data.validation import SCHEMA

def test_schema_fails_on_out_of_range():
    df = pd.read_csv("data/processed/train.csv")
    bad = df.copy(); bad.loc[0, "SEX"] = 3
    with pytest.raises(Exception): SCHEMA.validate(bad, lazy=True)

def test_schema_fails_on_nan_in_int():
    df = pd.read_csv("data/processed/train.csv")
    bad = df.copy(); bad.loc[0, "AGE"] = None
    with pytest.raises(Exception): SCHEMA.validate(bad, lazy=True)
''',

# Root-level conftest so that `from src...` imports resolve when running plain `pytest`
"conftest.py": r'''
"""Root-level pytest configuration.

This file is intentionally empty apart from this docstring. In pytest's default
"prepend" import mode, a conftest.py in a directory without __init__.py makes
pytest insert that directory (the project root) into sys.path, which lets the
tests import the `src` package.
"""
''',

# Each output path is declared by exactly one stage (DVC rejects duplicated outputs)
"dvc.yaml": r'''
stages:
  prepare:
    cmd: python src/data/make_dataset.py data/raw/UCI_Credit_Card.csv data/processed
    deps:
      - src/data/make_dataset.py
      - data/raw/UCI_Credit_Card.csv
    outs:
      - data/processed/train_base.csv
      - data/processed/test_base.csv
      - data/processed/summary_base.json

  features:
    cmd: python src/features/build_features.py data/processed
    deps:
      - src/features/build_features.py
      - data/processed/train_base.csv
      - data/processed/test_base.csv
      - data/processed/summary_base.json
    outs:
      - data/processed/train.csv
      - data/processed/test.csv
    metrics:
      - data/processed/summary.json:
          cache: false

  train:
    cmd: python src/models/train.py --proc_dir data/processed --model_path models/credit_default_model.pkl --metrics_path metrics.json --roc_path artifacts/roc.png
    deps:
      - src/models/train.py
      - data/processed/train.csv
      - data/processed/test.csv
    outs:
      - models/credit_default_model.pkl
      - artifacts/roc.png
    metrics:
      - metrics.json:
          cache: false
''',

# `__pycache__/` has no leading slash so bytecode directories are ignored at every depth
".gitignore": r'''
/.venv/
/mlruns/
__pycache__/
/*.egg-info
/.dvc/cache/
/artifacts/
/models/*.pkl
.ipynb_checkpoints/
/data/processed/*.csv
''',
"requirements.txt": "pandas\nscikit-learn\npandera\nmlflow\nmatplotlib\njoblib\npytest\ndvc\n",
}

# Materialize every entry of `files` under ROOT.
for rel, content in files.items():
    p = ROOT/rel
    # Create the parent directory here so the writer does not depend on the separate
    # directory list staying in sync with the keys of `files`.
    p.parent.mkdir(parents=True, exist_ok=True)
    # dedent + lstrip drop the leading newline that follows each opening triple quote
    p.write_text(textwrap.dedent(content).lstrip(), encoding="utf-8")

print("Проект развернут.")


Проект развернут.
