# Шаг 1. Подготовка данных для PD-модели

Что хочу сделать:
1) Загрузить датасет из data/raw.
2) Быстро глянуть структуру и целевую переменную.
3) Провалидировать данные (Pandera).
4) Разделить на train/test и сохранить в data/processed.
5) Сохранить тот же код отдельным скриптом `src/data/make_dataset.py` (для DVC).
6) Закоммитить и запушить изменения в репозиторий.


In [2]:
from pathlib import Path
import pandas as pd

# Project root: the current directory when the notebook is started from
# "credit" itself, otherwise its parent (e.g. when started from a subfolder).
ROOT = Path.cwd() if Path.cwd().name == "credit" else Path.cwd().parent
RAW = ROOT / "data" / "raw" / "UCI_Credit_Card.csv"

# Raise explicitly instead of using `assert`: assertions are stripped under
# `python -O`, which would lose the hint about where the file must be placed.
if not RAW.exists():
    raise FileNotFoundError(f"Положи файл сюда: {RAW}")

df = pd.read_csv(RAW)
# Final expression: the notebook displays the first rows and the frame shape.
df.head(3), df.shape


(   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
 0   1    20000.0    2          2         1   24      2      2     -1     -1   
 1   2   120000.0    2          2         2   26     -1      2      0      0   
 2   3    90000.0    2          2         2   34      0      0      0      0   
 
    ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
 0  ...        0.0        0.0        0.0       0.0     689.0       0.0   
 1  ...     3272.0     3455.0     3261.0       0.0    1000.0    1000.0   
 2  ...    14331.0    14948.0    15549.0    1518.0    1500.0    1000.0   
 
    PAY_AMT4  PAY_AMT5  PAY_AMT6  default.payment.next.month  
 0       0.0       0.0       0.0                           1  
 1    1000.0       0.0    2000.0                           1  
 2    1000.0    1000.0    5000.0                           0  
 
 [3 rows x 25 columns],
 (30000, 25))

## Короткий EDA (без графиков)
Посмотрю типы столбцов, пропуски и баланс классов.


In [7]:
# df.dtypes is already a Series indexed by column name; show the first ten.
df.dtypes.head(10)


ID             int64
LIMIT_BAL    float64
SEX            int64
EDUCATION      int64
MARRIAGE       int64
AGE            int64
PAY_0          int64
PAY_2          int64
PAY_3          int64
PAY_4          int64
dtype: object

In [9]:
# Per-column count of missing values, computed once and reused below.
na_counts = df.isna().sum()
# Display the ten columns with the most missing values and the overall total.
(na_counts.sort_values(ascending=False).head(10), na_counts.sum())


(ID           0
 BILL_AMT2    0
 PAY_AMT6     0
 PAY_AMT5     0
 PAY_AMT4     0
 PAY_AMT3     0
 PAY_AMT2     0
 PAY_AMT1     0
 BILL_AMT6    0
 BILL_AMT5    0
 dtype: int64,
 0)

In [34]:
# Name of the label column; later cells reuse `target`.
target = "default.payment.next.month"
target_values = df[target]
# Display absolute class counts and class shares rounded to three decimals.
(
    target_values.value_counts(),
    target_values.value_counts(normalize=True).round(3),
)


(default.payment.next.month
 0    23364
 1     6636
 Name: count, dtype: int64,
 default.payment.next.month
 0    0.779
 1    0.221
 Name: proportion, dtype: float64)

## Валидация Pandera
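Проверю базовые вещи: LIMIT_BAL >= 0, SEX ∈ {1, 2}, AGE ∈ [18, 100], target ∈ {0, 1}; для EDUCATION и MARRIAGE проверяется только целочисленный тип. Перечисленные столбцы приводятся к int64 (`coerce=True`).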


In [36]:
import pandera as pa
from pandera import Column, DataFrameSchema, Check

# Schema for the sanity-checked columns. coerce=True makes pandera cast each
# listed column to the declared int64 dtype before running its checks.
schema = DataFrameSchema(
    columns={
        "LIMIT_BAL": Column(pa.Int64, checks=Check.ge(0)),
        "SEX": Column(pa.Int64, checks=Check.isin([1, 2])),
        "EDUCATION": Column(pa.Int64),
        "MARRIAGE": Column(pa.Int64),
        "AGE": Column(pa.Int64, checks=Check.between(18, 100)),
        "default.payment.next.month": Column(pa.Int64, checks=Check.isin([0, 1])),
    },
    coerce=True,
)

# lazy=True asks pandera to collect all failures before raising.
df_valid = schema.validate(df, lazy=True)
# Final expression: the notebook displays the validated frame's shape.
df_valid.shape


(30000, 25)

## Train/Test split и сохранение
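Разделю данные в пропорции 80/20 со стратификацией по целевой переменной и фиксированным `random_state=42`, чтобы разбиение было воспроизводимым.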


In [38]:
from sklearn.model_selection import train_test_split

# Separate the label column from the feature columns.
y = df_valid[target]
X = df_valid.drop(columns=[target])

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Re-attach the label as the last column of each part.
train, test = (
    pd.concat([features, labels], axis=1)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Write both parts without the index column.
PROC = ROOT / "data" / "processed"
PROC.mkdir(parents=True, exist_ok=True)
train.to_csv(PROC / "train.csv", index=False)
test.to_csv(PROC / "test.csv", index=False)

# Final expression: the notebook displays both shapes.
train.shape, test.shape


((24000, 25), (6000, 25))

## Сохраню такой же код отдельным скриптом (для DVC)
Файл: `src/data/make_dataset.py`.


In [40]:
from textwrap import dedent

# Source of the standalone data-preparation script. It repeats the notebook
# steps (validate -> stratified split -> save) so they can be run from the
# command line. The literal is delimited by triple double quotes, so the
# embedded script is documented with `#` comments rather than docstrings.
code = dedent("""\
import argparse
from pathlib import Path

import pandas as pd
import pandera as pa
from pandera import Column, DataFrameSchema, Check
from sklearn.model_selection import train_test_split

TARGET = "default.payment.next.month"


def validate(df: pd.DataFrame) -> pd.DataFrame:
    # Cast the listed columns to int64 (coerce=True) and check their values.
    # lazy=True asks pandera to collect all failures before raising.
    schema = DataFrameSchema({
        "LIMIT_BAL": Column(pa.Int64, Check.ge(0)),
        "SEX": Column(pa.Int64, Check.isin([1, 2])),
        "EDUCATION": Column(pa.Int64),
        "MARRIAGE": Column(pa.Int64),
        "AGE": Column(pa.Int64, Check.between(18, 100)),
        TARGET: Column(pa.Int64, Check.isin([0, 1])),
    }, coerce=True)
    return schema.validate(df, lazy=True)


def main(raw_path: str, out_dir: str, test_size: float = 0.2, seed: int = 42):
    # Read the raw CSV, validate it and write a stratified train/test split
    # as train.csv and test.csv inside out_dir (created if missing).
    raw_path, out_dir = Path(raw_path), Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    df = pd.read_csv(raw_path)
    df = validate(df)
    X = df.drop(columns=[TARGET])
    y = df[TARGET]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )
    train = pd.concat([X_train, y_train], axis=1)
    test = pd.concat([X_test, y_test], axis=1)
    train_path = out_dir / "train.csv"
    test_path = out_dir / "test.csv"
    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)
    # Report the paths actually written rather than the default location;
    # as_posix() keeps the message identical to the old one for the defaults.
    print(f"Prepared: {train_path.as_posix()}, {test_path.as_posix()}")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--raw", default="data/raw/UCI_Credit_Card.csv")
    p.add_argument("--out", default="data/processed")
    p.add_argument("--test-size", type=float, default=0.2)
    p.add_argument("--seed", type=int, default=42)
    args = p.parse_args()
    main(args.raw, args.out, test_size=args.test_size, seed=args.seed)
""")

path = ROOT / "src" / "data" / "make_dataset.py"
# write_text does not create missing directories, so make sure src/data exists.
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(code, encoding="utf-8")
# Final expression: the notebook displays where the script was written.
path


WindowsPath('C:/Users/USER/Desktop/credit/src/data/make_dataset.py')

## Git: коммит и пуш этого шага


In [42]:
import subprocess, pathlib

def sh(cmd):
    """Run a shell command with ROOT as the working directory and echo its output.

    Args:
        cmd: Command line as a single string; it is passed to the system shell.

    Returns:
        The ``subprocess.CompletedProcess`` with captured ``stdout`` and
        ``stderr`` (both as text).
    """
    # NOTE: shell=True is tolerable only because every command passed here is
    # a literal written in this notebook; never feed it external input.
    r = subprocess.run(cmd, cwd=ROOT, shell=True, text=True, capture_output=True)
    # Print both streams: `r.stdout or r.stderr` would silently drop stderr
    # whenever stdout is non-empty.
    output = "".join(stream for stream in (r.stdout, r.stderr) if stream)
    print(">", cmd, "\n", output)
    if r.returncode != 0:
        # Make failures visible; the caller still gets `r` to inspect.
        print(f"! exit code {r.returncode}")
    return r

# Stage everything and commit; `sh` echoes each command and its output.
for command in (
    "git add -A",
    'git commit -m "feat(data): EDA + Pandera validation + train/test split + make_dataset.py"',
):
    sh(command)
# Kept as the final expression so the notebook displays the CompletedProcess.
sh("git push")


> git add -A 

> git commit -m "feat(data): EDA + Pandera validation + train/test split + make_dataset.py" 
 [main aff2a78] feat(data): EDA + Pandera validation + train/test split + make_dataset.py
 1 file changed, 138 insertions(+), 4 deletions(-)

> git push 
 To https://github.com/pero1x1/credit.git
   cab6e96..aff2a78  main -> main



CompletedProcess(args='git push', returncode=0, stdout='', stderr='To https://github.com/pero1x1/credit.git\n   cab6e96..aff2a78  main -> main\n')