In [16]:
%pip install -q dvc==3.53.2 joblib matplotlib 

from pathlib import Path
ROOT = Path.cwd() if Path.cwd().name=="credit" else Path.cwd().parent
ROOT


Note: you may need to restart the kernel to use updated packages.


WindowsPath('C:/Users/USER/Desktop/credit')

# Дерево папок + сырой датасет под DVC

In [18]:
import shutil, os

# создадим нужные папки
for p in ["data/raw", "data/processed", "models", "src/data", "src/models"]:
    (ROOT / p).mkdir(parents=True, exist_ok=True)

# положи исходный CSV в data/raw (если его там нет)
raw_csv = ROOT / "data" / "raw" / "UCI_Credit_Card.csv"
if not raw_csv.exists():
    # если файл лежит рядом с ноутбуком/где-то ещё — поправь путь ниже и скопируй
    possible = ROOT / "UCI_Credit_Card.csv"
    if possible.exists():
        shutil.copy2(possible, raw_csv)

raw_csv, raw_csv.exists()


(WindowsPath('C:/Users/USER/Desktop/credit/data/raw/UCI_Credit_Card.csv'),
 True)

## DVC init + добавим сырой CSV в DVC

In [20]:
import subprocess, sys

def sh(cmd):
    r = subprocess.run(cmd, cwd=ROOT, shell=True, text=True, capture_output=True)
    print(">", cmd, "\n", r.stdout or r.stderr); 
    return r

sh("dvc init -q")
# добавим сырой датасет под контроль DVC (в гит уйдёт .dvc-файл, сам csv — в .gitignore)
sh("dvc add data/raw/UCI_Credit_Card.csv")


> dvc init -q 
 
> dvc add data/raw/UCI_Credit_Card.csv 
 
To track the changes with git, run:

	git add 'data\raw\UCI_Credit_Card.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true



CompletedProcess(args='dvc add data/raw/UCI_Credit_Card.csv', returncode=0, stdout="\nTo track the changes with git, run:\n\n\tgit add 'data\\raw\\UCI_Credit_Card.csv.dvc'\n\nTo enable auto staging, run:\n\n\tdvc config core.autostage true\n", stderr='\\u280b Checking graph\n\n')

## Скрипт src/models/train.py

In [22]:
code = r"""
import json, argparse
from pathlib import Path
import joblib, pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

TARGET = "default.payment.next.month"

def make_preprocess(X):
    all_cols = X.columns.tolist()
    cat = [c for c in ["SEX","EDUCATION","MARRIAGE"] if c in all_cols] + [c for c in all_cols if c.startswith("PAY_")]
    cat = sorted(list(dict.fromkeys(cat)))
    num = [c for c in all_cols if c not in cat]

    num_tf = Pipeline([("imputer", SimpleImputer(strategy="median")),
                       ("scaler", StandardScaler())])
    cat_tf = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                       ("onehot", OneHotEncoder(handle_unknown="ignore"))])
    return ColumnTransformer([("num", num_tf, num), ("cat", cat_tf, cat)])

def main(train_path, test_path, model_out, metrics_out):
    train = pd.read_csv(train_path)
    test  = pd.read_csv(test_path)
    for df in (train, test):
        if "ID" in df.columns: df.drop(columns=["ID"], inplace=True)

    X_train, y_train = train.drop(columns=[TARGET]), train[TARGET]
    X_test,  y_test  = test.drop(columns=[TARGET]),  test[TARGET]

    pipe = Pipeline([
        ("preprocess", make_preprocess(X_train)),
        ("clf", GradientBoostingClassifier(learning_rate=0.1, n_estimators=150, random_state=42))
    ])
    pipe.fit(X_train, y_train)

    y_proba = pipe.predict_proba(X_test)[:,1]
    y_pred  = pipe.predict(X_test)

    metrics = {
        "model": "GradientBoostingClassifier",
        "model_params": {"learning_rate": 0.1, "n_estimators": 150},
        "roc_auc": float(roc_auc_score(y_test, y_proba)),
        "precision": float(precision_score(y_test, y_pred, zero_division=0)),
        "recall": float(recall_score(y_test, y_pred, zero_division=0)),
        "f1": float(f1_score(y_test, y_pred, zero_division=0))
    }

    model_out = Path(model_out); model_out.parent.mkdir(parents=True, exist_ok=True)
    metrics_out = Path(metrics_out); metrics_out.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, model_out)
    Path(metrics_out).write_text(json.dumps(metrics, indent=2), encoding="utf-8")

if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--train", required=True)
    p.add_argument("--test", required=True)
    p.add_argument("--model-out", required=True)
    p.add_argument("--metrics-out", required=True)
    args = p.parse_args()
    main(args.train, args.test, args.model_out, args.metrics_out)
"""
path = ROOT / "src" / "models" / "train.py"
path.write_text(code, encoding="utf-8"); path


WindowsPath('C:/Users/USER/Desktop/credit/src/models/train.py')

## Создаём dvc.yaml (2 стадии)

In [24]:
dvc_yaml = f"""
stages:
  prepare:
    cmd: python src/data/make_dataset.py data/raw/UCI_Credit_Card.csv data/processed/
    deps:
    - src/data/make_dataset.py
    - data/raw/UCI_Credit_Card.csv
    outs:
    - data/processed/train.csv
    - data/processed/test.csv

  train:
    cmd: python src/models/train.py --train data/processed/train.csv --test data/processed/test.csv --model-out models/credit_default_model.pkl --metrics-out models/metrics.json
    deps:
    - src/models/train.py
    - data/processed/train.csv
    - data/processed/test.csv
    outs:
    - models/credit_default_model.pkl
    metrics:
    - models/metrics.json:
        cache: false
"""
p = (ROOT / "dvc.yaml")
p.write_text(dvc_yaml, encoding="utf-8"); p


WindowsPath('C:/Users/USER/Desktop/credit/dvc.yaml')

In [32]:
# пересоберёт обе стадии, положит артефакты и метрики
sh("dvc repro")
print((ROOT/"models"/"metrics.json").read_text()[:250])


> dvc repro 
 'data\raw\UCI_Credit_Card.csv.dvc' didn't change, skipping
Running stage 'prepare':
> python src/data/make_dataset.py data/raw/UCI_Credit_Card.csv data/processed/

{
  "model": "GradientBoostingClassifier",
  "model_params": {
    "learning_rate": 0.1,
    "n_estimators": 150
  },
  "roc_auc": 0.7705572956671517,
  "precision": 0.6647230320699709,
  "recall": 0.3436322532027129,
  "f1": 0.45305514157973176
}


In [36]:
# подтягиваем удалённые коммиты и переносим свои поверх, затем пушим
sh("git fetch origin")
sh("git status -sb")
sh("git log --oneline --decorate --graph -5")

# главное действие: ребейз на origin/main
sh("git pull --rebase origin main")

# если ребейз прошёл без конфликтов — просто пушим
sh("git push")


> git fetch origin 
 
> git status -sb 
 ## main...origin/main [ahead 3, behind 2]
 M notebooks/04_dvc_pipeline.ipynb

> git log --oneline --decorate --graph -5 
 * de8cafa (HEAD -> main) build(dvc): init + dvc.yaml (prepare/train) + data/raw under DVC + reproducible training
* 09b89f4 data(dvc): track raw dataset with DVC
* daa47cb chore(gitignore): allow DVC .dvc files under data/
* 7eaf1c9 docs(mlflow): add screenshots (UI start, experiments list, RF_bal_400_d10 run)
* a69187a feat(model): baseline pipeline + GridSearch + metrics + ROC plot; chore: remove $null

> git pull --rebase origin main 
 error: cannot pull with rebase: You have unstaged changes.
error: Please commit or stash them.

> git push 
 To https://github.com/pero1x1/credit.git
 ! [rejected]        main -> main (non-fast-forward)
error: failed to push some refs to 'https://github.com/pero1x1/credit.git'
hint: Updates were rejected because the tip of your current branch is behind
hint: its remote counterpart. If you wa

CompletedProcess(args='git push', returncode=1, stdout='', stderr="To https://github.com/pero1x1/credit.git\n ! [rejected]        main -> main (non-fast-forward)\nerror: failed to push some refs to 'https://github.com/pero1x1/credit.git'\nhint: Updates were rejected because the tip of your current branch is behind\nhint: its remote counterpart. If you want to integrate the remote changes,\nhint: use 'git pull' before pushing again.\nhint: See the 'Note about fast-forwards' in 'git push --help' for details.\n")

In [38]:
# не забываем requirements и .gitignore обновить, если нужно
req = ROOT/"requirements.txt"
txt = req.read_text(encoding="utf-8") if req.exists() else ""
for line in ["dvc==3.53.2", "joblib"]:
    if line not in txt:
        txt += ("" if txt.endswith("\n") else "\n") + line + "\n"
req.write_text(txt, encoding="utf-8")

sh("git add -A")
sh('git commit -m "build(dvc): init + dvc.yaml (prepare/train) + data/raw under DVC + reproducible training"')
sh("git push")


> git add -A 

> git commit -m "build(dvc): init + dvc.yaml (prepare/train) + data/raw under DVC + reproducible training" 
 [main 391f9aa] build(dvc): init + dvc.yaml (prepare/train) + data/raw under DVC + reproducible training
 1 file changed, 98 insertions(+), 4 deletions(-)

> git push 
 To https://github.com/pero1x1/credit.git
 ! [rejected]        main -> main (non-fast-forward)
error: failed to push some refs to 'https://github.com/pero1x1/credit.git'
hint: Updates were rejected because the tip of your current branch is behind
hint: its remote counterpart. If you want to integrate the remote changes,
hint: use 'git pull' before pushing again.
hint: See the 'Note about fast-forwards' in 'git push --help' for details.



CompletedProcess(args='git push', returncode=1, stdout='', stderr="To https://github.com/pero1x1/credit.git\n ! [rejected]        main -> main (non-fast-forward)\nerror: failed to push some refs to 'https://github.com/pero1x1/credit.git'\nhint: Updates were rejected because the tip of your current branch is behind\nhint: its remote counterpart. If you want to integrate the remote changes,\nhint: use 'git pull' before pushing again.\nhint: See the 'Note about fast-forwards' in 'git push --help' for details.\n")