In [1]:
# Install any missing packages straight from the notebook.
%pip install -q pandas pandera scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [1]:
# Set the project root and the data directories.
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# Project layout: every path is resolved against the directory the notebook runs in.
ROOT = Path.cwd()
RAW_DIR = ROOT / "data" / "raw"
PROC_DIR = ROOT / "data" / "processed"
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Canonical location of the raw dataset inside the project.
raw_csv = RAW_DIR / "UCI_Credit_Card.csv"

# Places to look for the dataset when data/raw does not hold a copy yet:
# next to the notebook first, then a machine-specific fallback path.
csv_candidates = [
    ROOT / "UCI_Credit_Card.csv",
    Path(r"C:\Users\USER\Desktop\credits\UCI_Credit_Card.csv")
]

# A copy that already sits in data/raw is used as-is, so re-running the cell
# keeps working after the original download has been moved or deleted.
if not raw_csv.exists():
    source_csv = next((c for c in csv_candidates if c.exists()), None)
    if source_csv is None:
        raise FileNotFoundError("Не нашёл UCI_Credit_Card.csv ни рядом с ноутбуком, ни по пути C:\\Users\\USER\\Desktop\\credits")
    raw_csv.write_bytes(source_csv.read_bytes())

raw_csv


WindowsPath('C:/Users/USER/Desktop/credits/data/raw/UCI_Credit_Card.csv')

In [3]:
# Name of the label column in this dataset.
TARGET = "default.payment.next.month"

# Load the raw file and print a quick overview: shape, first columns, missing counts.
df = pd.read_csv(raw_csv)
column_names = list(df.columns)
print("Форма:", df.shape)
print("Колонки:", column_names[:10], "... (всего", len(column_names), ")")
print("Пропуски (шт.):")
missing_counts = df.isna().sum()
display(missing_counts.to_frame("n_missing").T)

# Share of rows whose label equals 1.
positive_share = df[TARGET].mean()
print("Целевой класс (доля 1):", positive_share.round(4))

# Summary statistics, transposed, first 12 columns only.
display(df.describe().T.head(12))


Форма: (30000, 25)
Колонки: ['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4'] ... (всего 25 )
Пропуски (шт.):


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
n_missing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Целевой класс (доля 1): 0.2212


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,30000.0,15000.5,8660.398374,1.0,7500.75,15000.5,22500.25,30000.0
LIMIT_BAL,30000.0,167484.322667,129747.661567,10000.0,50000.0,140000.0,240000.0,1000000.0
SEX,30000.0,1.603733,0.489129,1.0,1.0,2.0,2.0,2.0
EDUCATION,30000.0,1.853133,0.790349,0.0,1.0,2.0,2.0,6.0
MARRIAGE,30000.0,1.551867,0.52197,0.0,1.0,2.0,2.0,3.0
AGE,30000.0,35.4855,9.217904,21.0,28.0,34.0,41.0,79.0
PAY_0,30000.0,-0.0167,1.123802,-2.0,-1.0,0.0,0.0,8.0
PAY_2,30000.0,-0.133767,1.197186,-2.0,-1.0,0.0,0.0,8.0
PAY_3,30000.0,-0.1662,1.196868,-2.0,-1.0,0.0,0.0,8.0
PAY_4,30000.0,-0.220667,1.169139,-2.0,-1.0,0.0,0.0,8.0


In [5]:
# Drop the ID column: it is not needed for training.
if "ID" in df.columns:
    df = df.drop(columns=["ID"])

# Hold out 20% of the rows as a test set, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=df[TARGET])
train_df, test_df = train_test_split(df, **split_options)

# Save the baseline splits before any features are added.
train_path = PROC_DIR / "train.csv"
test_path = PROC_DIR / "test.csv"
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)
print("Сохранил:", train_path, "|", test_path)


Сохранил: C:\Users\USER\Desktop\credits\data\processed\train.csv | C:\Users\USER\Desktop\credits\data\processed\test.csv


In [7]:
def add_basic_features(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with three derived columns.

    Added columns:
        utilization1: BILL_AMT1 divided by LIMIT_BAL (limit floored at 1),
            i.e. the September bill as a share of the credit limit.
        payment_ratio1: PAY_AMT1 divided by |BILL_AMT1| (floored at 1),
            i.e. the last payment as a share of the last bill.
        max_delay: row-wise maximum of the six PAY_* status columns.

    Missing ratios are replaced with 0. The input frame is not modified.
    """
    delay_columns = [f"PAY_{k}" for k in (0, 2, 3, 4, 5, 6)]
    # Floors of 1 keep the divisions away from zero denominators.
    limit_floor = frame["LIMIT_BAL"].clip(lower=1)
    bill_floor = frame["BILL_AMT1"].abs().clip(lower=1)
    return frame.assign(
        utilization1=frame["BILL_AMT1"].div(limit_floor).fillna(0),
        payment_ratio1=frame["PAY_AMT1"].div(bill_floor).fillna(0),
        max_delay=frame[delay_columns].max(axis=1),
    )

# Read both saved splits, add the derived columns, and overwrite the same CSV files.
train_path = PROC_DIR / "train.csv"
test_path = PROC_DIR / "test.csv"

train_aug = add_basic_features(pd.read_csv(train_path))
test_aug = add_basic_features(pd.read_csv(test_path))

for frame_aug, out_path in ((train_aug, train_path), (test_aug, test_path)):
    frame_aug.to_csv(out_path, index=False)
print("Фичи добавлены и сохранены.")

# Preview the new columns next to the target.
preview_cols = ["utilization1", "payment_ratio1", "max_delay", TARGET]
train_aug.filter(preview_cols).head()


Фичи добавлены и сохранены.


Unnamed: 0,utilization1,payment_ratio1,max_delay,default.payment.next.month
0,1.011069,0.092724,3,0
1,0.0,53.0,1,0
2,1.0171,0.000197,2,0
3,0.940359,0.043987,0,1
4,0.0,0.0,1,0


In [9]:
import pandera as pa
from pandera import Column, Check

# Column-name families that share one rule each.
pay_status_names = [f"PAY_{k}" for k in (0, 2, 3, 4, 5, 6)]
bill_names = [f"BILL_AMT{i}" for i in range(1, 7)]
pay_amount_names = [f"PAY_AMT{i}" for i in range(1, 7)]

# Every key gets its own Column instance; the comprehensions build a new one per name.
schema = pa.DataFrameSchema({
    "LIMIT_BAL": Column(pa.Float, Check.ge(0)),
    "SEX": Column(pa.Int, Check.isin([1, 2])),
    "EDUCATION": Column(pa.Int, Check.isin([0, 1, 2, 3, 4, 5, 6])),
    "MARRIAGE": Column(pa.Int, Check.isin([0, 1, 2, 3])),
    "AGE": Column(pa.Int, Check.between(18, 100)),
    **{name: Column(pa.Int, Check.between(-2, 9)) for name in pay_status_names},
    **{name: Column(pa.Float) for name in bill_names},
    **{name: Column(pa.Float, Check.ge(0)) for name in pay_amount_names},
    "utilization1": Column(pa.Float),
    "payment_ratio1": Column(pa.Float),
    "max_delay": Column(pa.Int, Check.between(-2, 9)),
    "default.payment.next.month": Column(pa.Int, Check.isin([0, 1])),
})

# Validate both train and test; a violation raises with a descriptive message.
for split in ("train", "test"):
    _df = pd.read_csv(PROC_DIR / f"{split}.csv")
    schema.validate(_df, lazy=True)
    print(f"Валидация OK для {split}")


Валидация OK для train
Валидация OK для test


In [11]:
import numpy as np

# Reload the saved splits and report their shapes and positive-class shares.
train, test = (pd.read_csv(PROC_DIR / f"{name}.csv") for name in ("train", "test"))

splits = {"train": train, "test": test}
summary = {
    **{f"{name}_shape": frame.shape for name, frame in splits.items()},
    **{f"target_mean_{name}": float(frame[TARGET].mean()) for name, frame in splits.items()},
}
summary


{'train_shape': (24000, 27),
 'test_shape': (6000, 27),
 'target_mean_train': 0.22120833333333334,
 'target_mean_test': 0.22116666666666668}

In [17]:
import numpy as np
import pandas as pd
from pathlib import Path

# Re-declared in this cell. Unlike the ROOT-based definition earlier in the notebook,
# this path is relative, so it resolves against the current working directory.
PROC_DIR = Path("data/processed")
# Name of the label column.
TARGET = "default.payment.next.month"

def clean_frame(df: pd.DataFrame, target: "str | None" = None) -> pd.DataFrame:
    """Return a cleaned copy of a credit-default frame.

    Steps, in order:
      1. Merge the "other" category codes: EDUCATION 0/5/6 -> 4, MARRIAGE 0 -> 3.
      2. Clip every BILL_AMT*/PAY_AMT* column to its own 1st..99th percentile range.
      3. Coerce SEX, EDUCATION, MARRIAGE, AGE, the six PAY_* columns and the target
         to integers. Values that are missing or not numeric are replaced with the
         column mode (0 when the column has no valid value at all).
      4. Drop fully duplicated rows and print how many were removed.

    Args:
        df: Input frame; it is not modified.
        target: Name of the label column. Defaults to the module-level ``TARGET``.

    Returns:
        A new DataFrame with the steps above applied.

    Raises:
        KeyError: If EDUCATION, MARRIAGE or any of the integer columns is missing.
    """
    if target is None:
        target = TARGET  # module-level default defined next to this function
    cleaned = df.copy()

    # Fold the "other" category codes into a single code per column.
    cleaned["EDUCATION"] = cleaned["EDUCATION"].replace({0: 4, 5: 4, 6: 4})
    cleaned["MARRIAGE"] = cleaned["MARRIAGE"].replace({0: 3})

    # Outlier protection: clip each money column to its own 1%..99% quantile range.
    money_cols = [c for c in cleaned.columns if c.startswith(("BILL_AMT", "PAY_AMT"))]
    for col in money_cols:
        lower = cleaned[col].quantile(0.01)
        upper = cleaned[col].quantile(0.99)
        cleaned[col] = cleaned[col].clip(lower=lower, upper=upper)

    # Explicit integer types. The fill and cast must run for EVERY column, so they
    # live inside the loop; previously only the last column was converted.
    int_cols = ["SEX", "EDUCATION", "MARRIAGE", "AGE",
                "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", target]
    for col in int_cols:
        numeric = pd.to_numeric(cleaned[col], errors="coerce")
        # Gaps are filled with the column mode (more robust than ffill for categories).
        modes = numeric.mode(dropna=True)
        fill_value = modes.iloc[0] if not modes.empty else 0
        cleaned[col] = numeric.fillna(fill_value).round().astype(int)

    # Remove fully duplicated rows and report how many were dropped.
    before = len(cleaned)
    cleaned = cleaned.drop_duplicates()
    print(f"Дубликаты: удалено {before - len(cleaned)} строк")

    return cleaned

# Clean each saved split and write it back over the same file.
for split in ("train", "test"):
    split_path = PROC_DIR / f"{split}.csv"
    _df = clean_frame(pd.read_csv(split_path))
    _df.to_csv(split_path, index=False)

print("Очистка завершена и сохранена.")


Дубликаты: удалено 0 строк
Дубликаты: удалено 0 строк
Очистка завершена и сохранена.


In [19]:
import pandera as pa
from pandera import Column, Check, DataFrameSchema
import pandas as pd
from pathlib import Path

# Relative to the current working directory.
PROC_DIR = Path("data/processed")
TARGET = "default.payment.next.month"

# Schema for the cleaned splits. coerce=True lets pandera cast each column to the
# declared dtype before the checks run.
schema = DataFrameSchema(
    {
        "LIMIT_BAL": Column(pa.Float, Check.ge(0), coerce=True),
        "SEX":       Column(pa.Int, Check.isin([1,2]), coerce=True),
        "EDUCATION": Column(pa.Int, Check.isin([1,2,3,4]), coerce=True),   # after mapping 0/5/6 -> 4
        "MARRIAGE":  Column(pa.Int, Check.isin([1,2,3]), coerce=True),     # after mapping 0 -> 3
        "AGE":       Column(pa.Int, Check.between(18, 100), coerce=True),
        **{f"PAY_{k}": Column(pa.Int, Check.between(-2,9), coerce=True) for k in [0,2,3,4,5,6]},
        **{f"BILL_AMT{i}": Column(pa.Float, coerce=True) for i in range(1,7)},
        **{f"PAY_AMT{i}":  Column(pa.Float, Check.ge(0), coerce=True) for i in range(1,7)},
        "utilization1":   Column(pa.Float, nullable=True, coerce=True),
        "payment_ratio1": Column(pa.Float, nullable=True, coerce=True),
        "max_delay":      Column(pa.Int, Check.between(-2,9), coerce=True),
        TARGET: Column(pa.Int, Check.isin([0,1]), coerce=True),
    },
    checks=[
        # Frame-level sanity check: the positive-class share must stay within 5%..50%.
        Check(lambda frame: 0.05 <= frame[TARGET].mean() <= 0.5, error="Аномальная доля класса-1"),
    ],
)

# The loop variable is `_df`, not `df`, so the raw frame loaded earlier in the
# notebook is not silently replaced by the last validated split.
for split in ["train","test"]:
    _df = pd.read_csv(PROC_DIR / f"{split}.csv")
    schema.validate(_df, lazy=True)
    print(f"Валидация пройдена: {split} ({_df.shape})")


Валидация пройдена: train ((23974, 27))
Валидация пройдена: test ((5998, 27))


In [21]:
import json, numpy as np, pandas as pd
from pathlib import Path
# Relative to the current working directory.
PROC_DIR = Path("data/processed")
TARGET = "default.payment.next.month"

train = pd.read_csv(PROC_DIR / "train.csv")
test  = pd.read_csv(PROC_DIR / "test.csv")

summary = {
    "train_rows": int(len(train)),
    "test_rows": int(len(test)),
    "n_features": int(train.shape[1] - 1),  # all columns except the target
    "target_mean_train": float(train[TARGET].mean()),
    "target_mean_test": float(test[TARGET].mean()),
    # Basic statistics on key fields, handy for monitoring.
    "limit_bal_mean_train": float(train["LIMIT_BAL"].mean()),
    "limit_bal_mean_test": float(test["LIMIT_BAL"].mean()),
    "utilization1_p95_train": float(train["utilization1"].quantile(0.95)),
    "utilization1_p95_test": float(test["utilization1"].quantile(0.95)),
}

# ensure_ascii=False can emit non-ASCII characters, so the file encoding is set
# explicitly instead of relying on the platform default.
summary_path = PROC_DIR / "summary.json"
summary_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8")
summary


{'train_rows': 23974,
 'test_rows': 5998,
 'n_features': 26,
 'target_mean_train': 0.22128138817051807,
 'target_mean_test': 0.22107369123041012,
 'limit_bal_mean_train': 167326.7706682239,
 'limit_bal_mean_test': 167932.2574191397,
 'utilization1_p95_train': 1.012913611111111,
 'utilization1_p95_test': 1.013461}

In [4]:
from pathlib import Path
import sys
# Put the project root on sys.path so the `src` package can be imported.
# The membership check keeps repeated runs of this cell from adding duplicates.
project_root = str(Path.cwd())
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data.validation import SCHEMA  


In [6]:
import pandas as pd
import pytest
from src.data.validation import SCHEMA

def test_schema_fails_on_out_of_range():
    """SCHEMA must reject a SEX code outside the allowed set {1, 2}."""
    # Imported locally so the test does not rely on an earlier cell having imported pandera.
    # NOTE(review): assumes SCHEMA is a pandera schema like the ones built in this
    # notebook - confirm against src/data/validation.py.
    from pandera.errors import SchemaErrors

    bad = pd.read_csv("data/processed/train.csv")
    bad.loc[0, "SEX"] = 3  # outside the allowed {1, 2}
    # With lazy=True pandera collects the failures and raises SchemaErrors.
    # Expecting a bare Exception would also pass on unrelated errors.
    with pytest.raises(SchemaErrors):
        SCHEMA.validate(bad, lazy=True)

def test_schema_fails_on_nan_in_int():
    """SCHEMA must reject a missing value in the integer AGE column."""
    # Imported locally so the test does not rely on an earlier cell having imported pandera.
    # NOTE(review): assumes SCHEMA is a pandera schema like the ones built in this
    # notebook - confirm against src/data/validation.py.
    from pandera.errors import SchemaErrors

    bad = pd.read_csv("data/processed/train.csv")
    bad.loc[0, "AGE"] = None  # introduces a missing value into AGE
    # With lazy=True pandera collects the failures and raises SchemaErrors.
    # Expecting a bare Exception would also pass on unrelated errors.
    with pytest.raises(SchemaErrors):
        SCHEMA.validate(bad, lazy=True)


In [8]:
def add_age_bin(df):
    """Add an integer ``age_bin`` column to ``df`` in place and return ``df``.

    AGE is bucketed into right-closed intervals (0, 25], (25, 35], (35, 45],
    (45, 55], (55, 100], coded 0 through 4.
    """
    edges = [0, 25, 35, 45, 55, 100]
    codes = list(range(len(edges) - 1))
    binned = pd.cut(df["AGE"], bins=edges, labels=codes, right=True)
    df["age_bin"] = binned.astype(int)
    return df

# Add the age bucket to both saved splits and overwrite the CSV files.
for split in ("train", "test"):
    csv_path = f"data/processed/{split}.csv"
    f = add_age_bin(pd.read_csv(csv_path))
    f.to_csv(csv_path, index=False)
