In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score
from sklearn.model_selection import train_test_split
from datetime import datetime
import os

# 1 Load the enriched training data
df = pd.read_csv("train_df_full_enriched.csv")

# 2 Features
features = [
    'edad',
    'total_compras',
    'gasto_promedio',
    'antiguedad_dias',
    'item_price_y',
    'item_avg_rating_y',
    'item_num_ratings_y',
]
# Discard rows that are missing any model input or the target.
required_columns = features + ['label']
df = df.dropna(subset=required_columns)

# 3 Train/test split by customer (cold-start): every row of a given customer
#   lands on exactly one side, so test customers are unseen during training.
clientes = df['customer_id'].unique()
clientes_train, clientes_test = train_test_split(
    clientes,
    test_size=0.2,
    random_state=42,
)

in_train = df['customer_id'].isin(clientes_train)
in_test = df['customer_id'].isin(clientes_test)
train_df = df.loc[in_train]
test_df = df.loc[in_test]

X_train, y_train = train_df[features], train_df['label']
X_test, y_test = test_df[features], test_df['label']

# 4 Training (fit returns the estimator itself)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# 5 Evaluation
# Column 1 of predict_proba is the second entry of model.classes_
# (presumably the positive class for a 0/1 label -- confirm).
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

def precision_at_k(y_true, y_scores, k):
    """Return the mean label among the ``k`` highest-scored items.

    With binary 0/1 labels this is precision@k: the share of positives in
    the top ``k`` items when ranked by score.

    Args:
        y_true: Array-like of labels, one per item. Coerced to a NumPy array
            so a pandas Series is indexed by position rather than by its
            index labels, and plain lists are accepted.
        y_scores: Array-like of scores aligned item-by-item with ``y_true``.
        k: Number of top-ranked items to evaluate. Values larger than the
            number of items are capped at the number of items.

    Returns:
        The mean of the labels of the top-``k`` items as a float, or ``nan``
        when the inputs are empty.

    Raises:
        ValueError: If ``k`` is not positive, or if ``y_true`` and
            ``y_scores`` do not have the same shape.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_scores)

    if labels.shape != scores.shape:
        raise ValueError(
            f"y_true and y_scores must have the same shape, "
            f"got {labels.shape} and {scores.shape}"
        )
    # Guard explicitly: a slice of [-0:] would select *every* item and
    # silently return the overall label mean instead of a top-k metric.
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")
    if scores.size == 0:
        return float("nan")

    top_n = min(k, scores.size)
    # argsort is ascending, so the last top_n positions hold the best scores.
    top_k_idx = np.argsort(scores)[-top_n:]
    return float(np.mean(labels[top_k_idx]))

# Metrics on the held-out customers
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
precision = precision_score(y_test, y_pred)
# Pass the raw ndarray so the top-k lookup is positional.
precision_k = precision_at_k(y_test.values, y_proba, 100)

# 6 Logging to CSV: one row per experiment run
experiment_record = {
    "timestamp": str(datetime.now()),
    "model": "RandomForest",
    "n_estimators": model.n_estimators,
    "accuracy": accuracy,
    "roc_auc": roc_auc,
    "precision": precision,
    "precision@100": precision_k,
    "data_version": "v1.0",
}
log_row = pd.DataFrame([experiment_record])

log_file = "experiment_log.csv"
# Append without a header when the log already exists; otherwise create the
# file and write the header row first.
log_already_exists = os.path.exists(log_file)
log_row.to_csv(
    log_file,
    mode='a' if log_already_exists else 'w',
    header=not log_already_exists,
    index=False,
)

print(" Experimento registrado en experiment_log.csv")


 Experimento registrado en experiment_log.csv
