
# **Balanceo de datos**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned dataset, then separate the target column from the features.
df = pd.read_csv("base_clean.csv")

y = df["fraud_bool"]
X = df.drop("fraud_bool", axis=1)


## Dividir datos

Train: 70%

Validation: 15%

Test: 15%

In [2]:

# First cut: keep 70% for training and hold out the remaining 30%,
# stratifying on the target so the class ratio is kept in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.30,
    random_state=42,
)

# Second cut: halve the 30% hold-out into validation and test
# (15% of the full dataset each), again stratified on the target.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.50,
    random_state=42,
)


## Oversampling / Undersampling / Hybrid

In [3]:
# balance.py
from typing import Literal, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

ArrayLike = Union[pd.DataFrame, pd.Series, np.ndarray]


def _has_categorical(categorical_features) -> bool:
    """Return True when *categorical_features* designates at least one column."""
    if categorical_features is None:
        return False
    try:
        # len() works for lists, tuples, strings and arrays alike and avoids
        # NumPy's ambiguous truth value for multi-element arrays (as well as
        # the falsy single-element array ``np.array([0])``).
        return len(categorical_features) > 0
    except TypeError:
        # Unsized objects keep the plain truthiness test.
        return bool(categorical_features)


def balance_data(X_train: ArrayLike, y_train: ArrayLike,
                 method: Literal["oversample", "undersample", "hybrid"] = "oversample",
                 *, use_smote: bool = True,
                 categorical_features: Optional[Union[Sequence, np.ndarray]] = None,
                 sampling_strategy="auto",
                 random_state=42) -> Tuple[ArrayLike, ArrayLike]:
    """
    Apply a class-balancing method to the training set.

    Args:
        X_train: Training features.
        y_train: Training labels.
        method: Which resampler to use:
            ``"oversample"``  -> SMOTE, SMOTENC or RandomOverSampler
            (see ``use_smote`` and ``categorical_features``);
            ``"undersample"`` -> RandomUnderSampler;
            ``"hybrid"``      -> SMOTEENN.
        use_smote: Only read when ``method="oversample"``. If False, a
            RandomOverSampler is used instead of SMOTE/SMOTENC.
        categorical_features: Only read when ``method="oversample"`` and
            ``use_smote`` is True. When it is non-empty, SMOTENC is used and
            this value is forwarded to it unchanged; otherwise plain SMOTE
            is used. It is ignored by every other method.
        sampling_strategy: Forwarded unchanged to the selected sampler.
        random_state: Forwarded unchanged to the selected sampler.

    Returns:
        Whatever ``sampler.fit_resample(X_train, y_train)`` returns, i.e. the
        resampled features and labels.

    Raises:
        ValueError: If ``method`` is not one of the three supported names.
    """
    # Validate first so a bad method name fails before any sampler is built.
    if method not in ("oversample", "undersample", "hybrid"):
        raise ValueError("method debe ser 'oversample', 'undersample' o 'hybrid'.")

    # Arguments shared by every sampler.
    common = {
        "sampling_strategy": sampling_strategy,
        "random_state": random_state,
    }

    if method == "undersample":
        sampler = RandomUnderSampler(**common)
    elif method == "hybrid":
        sampler = SMOTEENN(**common)
    elif not use_smote:
        sampler = RandomOverSampler(**common)
    elif _has_categorical(categorical_features):
        # Categorical columns that are not one-hot encoded need SMOTENC.
        # Imported lazily because it is only required on this path.
        from imblearn.over_sampling import SMOTENC
        sampler = SMOTENC(categorical_features=categorical_features, **common)
    else:
        sampler = SMOTE(**common)

    return sampler.fit_resample(X_train, y_train)

#### Elegir uno de los 3 métodos de balanceo:

In [4]:
# Option 1: oversampling with SMOTE (the active choice).
X_tr_os, y_tr_os = balance_data(
    X_train,
    y_train,
    "oversample",
    use_smote=True,
)

# Option 2: undersampling (uncomment to use).
#X_tr_us, y_tr_us = balance_data(X_train, y_train, method="undersample")

# Option 3: hybrid SMOTEENN (uncomment to use).
#X_tr_hy, y_tr_hy = balance_data(X_train, y_train, method="hybrid")