# **Preparación de los datos**

- Este archivo prepara los datos, dejándolos listos para ser modelados.
- Utiliza las variantes Base, III y V, previamente limpias (transformadas).
- Es necesario tener en la misma carpeta los csv: "base_clean.csv", "viii_clean.csv", y "vv_clean.csv".
- La salida del archivo es una carpeta "prepared_data" que contiene los archivos .pkl que se usarán en la etapa de modelado.

In [57]:
import pandas as pd
import numpy as np
from typing import Dict, Tuple, List, Optional, Union

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

### Función de balanceo

In [58]:
from collections import Counter
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler

def oversample_train(X_train: pd.DataFrame, y_train: pd.Series, random_state: int = 42):
    """
    Balance a training set by oversampling.

    Sampler selection:
    - RandomOverSampler when the smallest class has a single sample
      (too few samples to apply SMOTE).
    - SMOTENC when X contains categorical columns.
    - SMOTE otherwise.

    Args:
        X_train: Feature frame. It is copied, never modified in place.
        y_train: Class labels for the rows of ``X_train``.
        random_state: Seed forwarded to whichever sampler is used.

    Returns:
        The ``(X_res, y_res)`` pair produced by the sampler's ``fit_resample``.

    Raises:
        ValueError: If ``y_train`` contains fewer than two distinct classes
            (this includes an empty ``y_train``); there is nothing to balance.
    """
    # Fail fast with a clear message instead of an error raised deep inside
    # the sampler when there is no second class to balance against.
    cnt = Counter(y_train)
    if len(cnt) < 2:
        raise ValueError(
            "oversample_train requiere al menos dos clases en y_train; "
            f"se encontraron {len(cnt)}."
        )

    X = X_train.copy()
    y = y_train.copy()

    # --- 1. Normalise dtypes ---
    # bool -> int
    for col in X.select_dtypes(include=["bool"]).columns:
        X[col] = X[col].astype(int)
    # object -> category (columns that are already 'category' stay as they are)
    for col in X.select_dtypes(include=["object"]).columns:
        X[col] = X[col].astype("category")

    # --- 2. Positions of the categorical columns (needed by SMOTENC) ---
    categorical_idx = [
        i for i, dt in enumerate(X.dtypes) if str(dt) == "category"
    ]

    # --- 3. Size of the smallest class ---
    n_min = min(cnt.values())

    # --- 4. Choose the sampler ---
    if n_min <= 1:
        # k_neighbors is not used by this sampler, so it is not reported here.
        print(f"Fraudes en train={n_min}. Muy pocos fraudes, usando RandomOverSampler.")
        sampler = RandomOverSampler(random_state=random_state)
    else:
        # n_min >= 2 here, so n_min - 1 >= 1: k_neighbors stays within 1..5.
        k_neighbors = min(5, n_min - 1)
        print(f"Fraudes en train={n_min}. Usando k_neighbors={k_neighbors}.")
        if categorical_idx:
            sampler = SMOTENC(
                categorical_features=categorical_idx,
                k_neighbors=k_neighbors,
                random_state=random_state,
            )
        else:
            sampler = SMOTE(
                k_neighbors=k_neighbors,
                random_state=random_state,
            )

    # --- 5. Fit + resample ---
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res


### Funciones útiles

In [59]:
# Column names every input frame must provide.
TARGET_COL = "fraud_bool"
MONTH_COL = "month"

def check_required_columns(df: pd.DataFrame, name: str):
    """
    Verify that ``df`` has both TARGET_COL and MONTH_COL.

    ``name`` is only used to label the error message. Raises ValueError
    listing the absent columns; returns None when nothing is missing.
    """
    missing = []
    for required in (TARGET_COL, MONTH_COL):
        if required not in df.columns:
            missing.append(required)
    if not missing:
        return
    raise ValueError(f"{name}: faltan columnas requeridas {missing}")

def split_by_month(
    df: pd.DataFrame,
    train_months: Tuple[int, int] = (1, 5),
    val_month: int = 6,
    test_months: Tuple[int, int] = (7, 8),
) -> Dict[str, pd.DataFrame]:
    """
    Split ``df`` into temporal train / val / test subsets using MONTH_COL.

    Args:
        df: Input frame; it is copied, not modified.
        train_months: Inclusive ``(first, last)`` month range for "train".
            Defaults to months 1-5.
        val_month: Single month used for "val". Defaults to 6.
        test_months: Inclusive ``(first, last)`` month range for "test".
            Defaults to months 7-8.

    Returns:
        Dict with the keys "train", "val" and "test", each a subset of rows
        of the (month-normalised) copy of ``df``.

    Raises:
        ValueError: If any of the three subsets has no rows.
    """
    # Normalise the month to a nullable integer in case it arrives as text;
    # values that cannot be parsed as numbers are coerced to missing.
    df = df.copy()
    df[MONTH_COL] = pd.to_numeric(df[MONTH_COL], errors="coerce").astype("Int64")
    months = df[MONTH_COL]

    splits = {
        "train": df[months.between(*train_months, inclusive="both")],
        "val":   df[months == val_month],
        "test":  df[months.between(*test_months, inclusive="both")],
    }
    # Minimal sanity check: an empty split means the month values do not
    # cover the requested windows.
    for split_name, part in splits.items():
        if part.empty:
            raise ValueError(f"Split '{split_name}' quedó vacío. Revisa la columna 'month' y sus valores.")
    return splits

def align_feature_space(
    ref_X: pd.DataFrame, other_X: pd.DataFrame
) -> pd.DataFrame:
    """
    Return ``other_X`` laid out with exactly the columns of ``ref_X``.

    - columns of ``ref_X`` missing from ``other_X`` are added, filled with 0
    - columns are placed in the order of ``ref_X``
    - columns of ``other_X`` that ``ref_X`` does not have are dropped

    The rows and index of ``other_X`` are kept, and ``other_X`` itself is not
    modified.

    Note: pandas refuses to reindex an axis with duplicate labels, so an
    ``other_X`` with repeated column names that needs re-laying-out raises
    ValueError.
    """
    # One reindex replaces the per-column insert loop, which fragments the
    # frame when many columns are missing.
    return other_X.reindex(columns=ref_X.columns, fill_value=0)

def get_X_y(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Separate ``df`` into features and target.

    Only TARGET_COL is removed from the features; every other column
    (MONTH_COL included, when present) stays in X. The target is cast to int.
    """
    features = df.drop([TARGET_COL], axis="columns")
    labels = df[TARGET_COL].astype(int)
    return features, labels

def infer_categorical_indices(X: pd.DataFrame) -> Optional[List[int]]:
    """
    Locate columns that are still categorical (dtype 'object' or 'category'),
    i.e. not yet one-hot encoded.

    Returns their positional indices, suitable for SMOTENC, or None when
    there are no such columns.
    """
    positions: List[int] = []
    for position, dtype in enumerate(X.dtypes):
        if str(dtype) in ("object", "category"):
            positions.append(position)
    return positions or None


### Carga de CSVs

In [60]:
# Load the three cleaned variants from the working directory.
base = pd.read_csv("base_clean.csv")
v3 = pd.read_csv("viii_clean.csv")  # Variant III
v5 = pd.read_csv("vv_clean.csv")    # Variant V

# Fail fast if any of them lacks the target or month column.
for _frame, _label in (
    (base, "base_clean.csv"),
    (v3, "viii_clean.csv (Variant III)"),
    (v5, "vv_clean.csv (Variant V)"),
):
    check_required_columns(_frame, _label)

In [61]:
# Show the dtype of every column of the Base frame.
print(base.dtypes)

fraud_bool                        int64
income                          float64
name_email_similarity           float64
prev_address_months_count       float64
current_address_months_count    float64
                                 ...   
device_os_linux                   int64
device_os_macintosh               int64
device_os_other                   int64
device_os_windows                 int64
device_os_x11                     int64
Length: 62, dtype: object


### Splits Temporales

In [62]:
# Temporal split of the Base variant into train / val / test frames.
splits_base = split_by_month(base)
train_base, val_base, test_base = (
    splits_base[part] for part in ("train", "val", "test")
)

# Separate features and target for each split.
X_train_base, y_train_base = get_X_y(train_base)
X_val_base, y_val_base = get_X_y(val_base)
X_test_base, y_test_base = get_X_y(test_base)


### Balanceo en conjuntos de train

In [63]:
# NOTE: everything below is a bare string literal, not executable code, so
# the balancing step is disabled and X_train_bal / y_train_bal are never
# defined by this cell.
# NOTE(review): inside the string, categorical_features is built from
# `.columns.tolist()` (column names), while its own comment talks about
# positional indices — confirm which one is intended before re-enabling.
""" from imblearn.over_sampling import SMOTENC

# Indica los índices (posición de columna) que son categóricos
categorical_features = X_train_base.select_dtypes(include=["object", "category"]).columns.tolist()


smote_nc = SMOTENC(
    categorical_features=categorical_features,
    random_state=42,
    k_neighbors=5,
    sampling_strategy="auto"
)

X_train_bal, y_train_bal = smote_nc.fit_resample(X_train_base, y_train_base)

 """

' from imblearn.over_sampling import SMOTENC\n\n# Indica los índices (posición de columna) que son categóricos\ncategorical_features = X_train_base.select_dtypes(include=["object", "category"]).columns.tolist()\n\n\nsmote_nc = SMOTENC(\n    categorical_features=categorical_features,\n    random_state=42,\n    k_neighbors=5,\n    sampling_strategy="auto"\n)\n\nX_train_bal, y_train_bal = smote_nc.fit_resample(X_train_base, y_train_base)\n\n '

### Preparar conjuntos de evaluación en III y V

- Usamos SOLO sus meses 7–8 como "test externo"
- Alineamos columnas con el espacio de features del train Base

In [64]:
def external_test_slice(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the rows of ``df`` whose MONTH_COL is 7 or 8 (inclusive range).

    Works on a copy in which MONTH_COL has been converted to a nullable
    integer (unparseable values are coerced to missing). Raises ValueError
    when no row falls in that range.
    """
    frame = df.copy()
    month_as_int = pd.to_numeric(frame[MONTH_COL], errors="coerce").astype("Int64")
    frame[MONTH_COL] = month_as_int
    in_window = frame[MONTH_COL].between(7, 8, inclusive="both")
    ext = frame[in_window]
    if ext.empty:
        raise ValueError("El conjunto externo quedó vacío (no hay months 7–8).")
    return ext

# External hold-out rows (months 7-8) of Variant III and Variant V.
ext_v3, ext_v5 = external_test_slice(v3), external_test_slice(v5)

# Features / target for each external set.
(X_ext_v3, y_ext_v3), (X_ext_v5, y_ext_v5) = get_X_y(ext_v3), get_X_y(ext_v5)

# Align to the training feature space (balancing does not change the columns).
X_ext_v3, X_ext_v5 = (
    align_feature_space(X_train_base, frame) for frame in (X_ext_v3, X_ext_v5)
)

# (Optional) also align the internal val/test sets for full consistency.
X_val_base, X_test_base = (
    align_feature_space(X_train_base, frame) for frame in (X_val_base, X_test_base)
)


### Salidas listas para modelado

In [65]:
import joblib
import os

# Datasets handed to the modelling stage. Each key is also the stem of the
# .pkl file written below.
prepared = {
    # Base (internal train / validation / test)
    "X_train_base": X_train_base,       "y_train_base": y_train_base,
    # Disabled: the balanced training set is not produced in this file.
    # "X_train_bal": X_train_bal,         "y_train_bal": y_train_bal,  # <- use this one for training
    "X_val_base": X_val_base,           "y_val_base": y_val_base,
    "X_test_base": X_test_base,         "y_test_base": y_test_base,

    # External evaluation (generalisation / robustness)
    "X_test_v3": X_ext_v3,              "y_test_v3": y_ext_v3,   # Variant III (months 7-8)
    "X_test_v5": X_ext_v5,              "y_test_v5": y_ext_v5,   # Variant V   (months 7-8)
}


# Output folder
output_dir = "prepared_data"
os.makedirs(output_dir, exist_ok=True)

# Save every dataset as a pickle (fast and keeps dtypes). Looping over the
# dict keeps the written files in sync with its keys; the paths are the same
# "prepared_data/<key>.pkl" as before, written in the same order.
for dataset_name, dataset in prepared.items():
    joblib.dump(dataset, f"{output_dir}/{dataset_name}.pkl")

['prepared_data/y_test_v5.pkl']