In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score
from sklearn.model_selection import train_test_split
from datetime import datetime
import os
import sys

CSV_PATH = "train_df_full.csv"
LOG_FILE = "experiment_log.csv"
MODEL_NAME = "RandomForest"

# ---------- Utilidades ----------
def find_col(df: pd.DataFrame, base: str, required=True):
    """
    Busca una columna por nombre base, aceptando variantes: base, base_y, base_x (en ese orden).
    """
    candidates = [base, f"{base}_y", f"{base}_x"]
    for c in candidates:
        if c in df.columns:
            return c
    if required:
        raise KeyError(f"No se encontró la columna '{base}' ni sus variantes {candidates} en el CSV.")
    return None

def find_label(df: pd.DataFrame):
    for c in ["label", "target", "y"]:
        if c in df.columns:
            return c
    raise KeyError("No se encontró la columna de etiqueta ('label', 'target' o 'y').")

def precision_at_k(y_true, y_scores, k):
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)
    k = max(1, min(k, len(y_scores)))
    # ordenar de mayor a menor score
    top_k_idx = np.argsort(-y_scores)[:k]
    return float(np.mean(y_true[top_k_idx]))

# ---------- Carga ----------
try:
    df = pd.read_csv(CSV_PATH)
except Exception as e:
    print(f"Error al leer {CSV_PATH}: {e}")
    sys.exit(1)

# ---------- Columnas clave ----------
cust_col = find_col(df, "customer_id")
label_col = find_label(df)

# Features base que quieres usar
base_features = [
    "edad", "total_compras", "gasto_promedio", "antiguedad_dias",
    "item_price", "item_avg_rating", "item_num_ratings"
]

# Resolver nombres reales en el CSV (con/sin sufijo)
feature_cols = []
for b in base_features:
    # 'required=True' porque son necesarias para entrenar
    feature_cols.append(find_col(df, b, required=True))

# ---------- Limpieza mínima ----------
# Sólo elimina filas con NaN en features o etiqueta
df = df.dropna(subset=feature_cols + [label_col])

# ---------- Split cold-start por cliente ----------
clientes = df[cust_col].unique()
if len(clientes) < 5:
    print("Advertencia: hay muy pocos clientes para un split representativo.")

clientes_train, clientes_test = train_test_split(clientes, test_size=0.2, random_state=42)
train_df = df[df[cust_col].isin(clientes_train)]
test_df  = df[df[cust_col].isin(clientes_test)]

X_train = train_df[feature_cols]
y_train = train_df[label_col].astype(int)
X_test  = test_df[feature_cols]
y_test  = test_df[label_col].astype(int)

# ---------- Entrenamiento ----------
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# ---------- Evaluación ----------
y_pred  = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

accuracy   = accuracy_score(y_test, y_pred)
roc_auc    = roc_auc_score(y_test, y_proba) if len(np.unique(y_test)) > 1 else float("nan")
precision  = precision_score(y_test, y_pred, zero_division=0)
p_at_100   = precision_at_k(y_test.values, y_proba, 100)

# ---------- Logging ----------
log_row = pd.DataFrame([{
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "model": MODEL_NAME,
    "n_estimators": model.n_estimators,
    "features_used": ",".join(feature_cols),
    "accuracy": accuracy,
    "roc_auc": roc_auc,
    "precision": precision,
    "precision@100": p_at_100,
    "n_train_rows": len(train_df),
    "n_test_rows": len(test_df),
    "n_clients_train": len(clientes_train),
    "n_clients_test": len(clientes_test),
    "data_version": "v1.0"
}])

if os.path.exists(LOG_FILE):
    log_row.to_csv(LOG_FILE, mode='a', header=False, index=False)
else:
    log_row.to_csv(LOG_FILE, index=False)

print(f"OK. Experimento registrado en {LOG_FILE}")


KeyError: "No se encontró la columna 'customer_id' ni sus variantes ['customer_id', 'customer_id_y', 'customer_id_x'] en el CSV."