In [14]:
import mlflow
from mlflow.tracking import MlflowClient
import optuna
import re
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
#df_DST = pd.read_csv("../data/FantasyPros_Fantasy_Football_Statistics_DST.csv")
df_DST = pd.read_csv("../data/DST.csv")
df_K = pd.read_csv("../data/K.csv")
df_QB = pd.read_csv("../data/QB.csv")
df_RB = pd.read_csv("../data/RB.csv")
df_TE = pd.read_csv("../data/TE.csv")
df_WR = pd.read_csv("../data/WR.csv")

df_DST['Position'] = 'DST'
df_K['Position'] = 'K'
df_QB['Position'] = 'QB'
df_RB['Position'] = 'RB'
df_TE['Position'] = 'TE'
df_WR['Position'] = 'WR'

df = pd.concat([df_DST, df_K, df_QB, df_RB, df_TE, df_WR], ignore_index=True)

print(f"Total de filas: {len(df)}")
print(f"\nDistribuci√≥n por posici√≥n:")
print(df['Position'].value_counts())
print(f"\nPrimeras filas:")
print(df.head())
print(f"\nColumnas:")
print(df.columns.tolist())
print(f"\nInfo del DataFrame:")
print(df.info())

Total de filas: 938

Distribuci√≥n por posici√≥n:
Position
WR     325
RB     208
TE     192
QB     121
K       58
DST     34
Name: count, dtype: int64

Primeras filas:
   Rank                      Player  SACK  INT   FR   FF  DEF TD  SFTY  \
0   1.0      Seattle Seahawks (SEA)  12.0  7.0  0.0  0.0     0.0   0.0   
1   2.0  Jacksonville Jaguars (JAC)   7.0  9.0  4.0  5.0     0.0   0.0   
2   3.0     Minnesota Vikings (MIN)  11.0  2.0  5.0  8.0     2.0   0.0   
3   4.0   Philadelphia Eagles (PHI)   5.0  3.0  2.0  4.0     0.0   0.0   
4   5.0         Detroit Lions (DET)  14.0  3.0  3.0  4.0     0.0   0.0   

   SPC TD    G  ...  TD  SACKS ATT.1 YDS.1  TD.1  FL  20+  TGT  REC  Y/R  
0     2.0  4.0  ... NaN    NaN   NaN   NaN   NaN NaN  NaN  NaN  NaN  NaN  
1     1.0  4.0  ... NaN    NaN   NaN   NaN   NaN NaN  NaN  NaN  NaN  NaN  
2     0.0  4.0  ... NaN    NaN   NaN   NaN   NaN NaN  NaN  NaN  NaN  NaN  
3     2.0  4.0  ... NaN    NaN   NaN   NaN   NaN NaN  NaN  NaN  NaN  NaN  
4     1.0  4

In [16]:
# ============================================================
# PREP: Codificaci√≥n de 'Position' + Selecci√≥n de features
# ============================================================

# ---------- 1) Definir target ----------
TARGET = "FPTS"

# ---------- 2) Quitar columnas que NO deben ser features ----------
# - Identificadores y texto
id_like = ["Player", "Team"]  # agrega otras si las tienes (e.g., 'PlayerId')
# - Fugas de informaci√≥n (derivadas del target o rankings)
leak_like_patterns = [
    r"^FPTS\/G$",      # puntos por juego (deriva del target)
    r"rank",           # cualquier 'rank' o variantes
    r"tier",           # tiers si existieran
]
# Compilar regex para filtrar
leak_regex = re.compile("|".join(leak_like_patterns), flags=re.IGNORECASE)

drop_cols = set(id_like + [TARGET])
drop_cols.update([c for c in df.columns if leak_regex.search(str(c))])

# ---------- 3) Seleccionar columnas num√©ricas y categ√≥ricas ----------
num_cols = [c for c in df.select_dtypes(include=[np.number]).columns
            if c not in drop_cols and c != TARGET]

# Asegurar que Position est√© como categ√≥rica
cat_cols = ["Position"]



In [17]:
# ---------- 4) Imputaci√≥n + OneHot para 'Position' ----------
numeric_transformer = SimpleImputer(strategy="constant", fill_value=0)
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop"
)

# ---------- 5) Ajustar transformador y generar X, y ----------
X = preprocessor.fit_transform(df)
y = df[TARGET].values

# ---------- 6) Obtener nombres de features transformadas ----------
ohe_feature_names = preprocessor.named_transformers_["cat"].get_feature_names_out(cat_cols)
feature_names = np.r_[num_cols, ohe_feature_names]

print(f"Total features num√©ricas: {len(num_cols)}")
print(f"Total features categ√≥ricas (one-hot): {len(ohe_feature_names)}")
print(f"Total de features finales: {len(feature_names)}")

# Vista r√°pida de las primeras 25 columnas transformadas
print("\nEjemplo de nombres de features resultantes:")
print(feature_names[:25])

Total features num√©ricas: 32
Total features categ√≥ricas (one-hot): 6
Total de features finales: 38

Ejemplo de nombres de features resultantes:
['SACK' 'INT' 'FR' 'FF' 'DEF TD' 'SFTY' 'SPC TD' 'G' 'FG' 'FGA' 'PCT' 'LG'
 '1-19' '20-29' '30-39' '40-49' '50+' 'XPT' 'XPA' 'CMP' 'ATT' 'Y/A' 'TD'
 'SACKS' 'ATT.1']


In [18]:
# ============================================================
# MODELO RANDOM FOREST (TODO EN UNA SOLA CELDA) ‚Äî Todas las posiciones combinadas
# ============================================================
# - Limpieza robusta del target
# - Codificaci√≥n de Position (One-Hot)
# - Selecci√≥n de features sin fugas (sin FPTS/G, Rank, Tier, etc.)
# - Entrenamiento y evaluaci√≥n
# - Importancias y Top 10 predicho (con manejo de alias de columnas)
# ============================================================

# ------------------ 0) Configuraci√≥n ------------------
TARGET = "FPTS"

# ------------------ 1) Asegurar tipos num√©ricos y target limpio ------------------
df2 = df.copy()

# Forzar TARGET a num√©rico
df2[TARGET] = pd.to_numeric(df2[TARGET], errors="coerce")

# Quitar filas con FPTS NaN/inf
mask = np.isfinite(df2[TARGET])
df2 = df2.loc[mask].reset_index(drop=True)

print(f"Filas despu√©s de limpiar {TARGET}: {len(df2)}")


Filas despu√©s de limpiar FPTS: 926


In [19]:
# ------------------ 2) Definir columnas a eliminar (no-features) ------------------
# Identificadores y texto que no deben entrar como features
id_like = ["Player", "Team"]  # si faltan, se manejan luego para imprimir
# Fugas de informaci√≥n: cualquier cosa derivada del target o rankings
leak_like_patterns = [
    r"^FPTS\/G$",   # puntos por juego (deriva del target)
    r"\brank\b",    # rank, Rank, RANK
    r"\btier\b",    # tier, Tier, TIER
]
leak_regex = re.compile("|".join(leak_like_patterns), flags=re.IGNORECASE)

drop_cols = set(id_like + [TARGET])
drop_cols.update([c for c in df2.columns if leak_regex.search(str(c))])

In [20]:
# ------------------ 3) Columnas num√©ricas y categ√≥ricas ------------------
# Asegurar que Position exista
if "Position" not in df2.columns:
    raise ValueError("No se encontr√≥ la columna 'Position' en el DataFrame.")

num_cols = [c for c in df2.select_dtypes(include=[np.number]).columns
            if c not in drop_cols and c != TARGET]
cat_cols = ["Position"]

print(f"Total features num√©ricas (detectadas): {len(num_cols)}")
print(f"Total features categ√≥ricas (one-hot): {len(cat_cols)} ‚Üí {cat_cols}")

# ------------------ 4) Preprocesamiento ------------------
numeric_transformer = SimpleImputer(strategy="constant", fill_value=0)
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop"
)

Total features num√©ricas (detectadas): 32
Total features categ√≥ricas (one-hot): 1 ‚Üí ['Position']


In [21]:
X_train_df, X_test_df, y_train, y_test = train_test_split(
    df2.drop(columns=[TARGET]),
    df2[TARGET],
    test_size=0.2,
    random_state=42
)

In [22]:
# ------------------ C√ìDIGO COMPLETO PARA RANDOM FOREST CON MLFLOW ------------------

EXPERIMENT_NAME = "/Users/almendarez1002@gmail.com/FantasyDraft"
model_name = "FantasyDraft_RandomForest_Model"  # Nombre diferente para RF

mlflow.set_experiment(EXPERIMENT_NAME)
client = MlflowClient()

# ------------------ 1) PREPARAR DATOS ------------------
X_train_df, X_test_df, y_train, y_test = train_test_split(
    df2.drop(columns=[TARGET]),
    df2[TARGET],
    test_size=0.2,
    random_state=42
)

print(f"üìä Datos preparados:")
print(f"   Train: {X_train_df.shape}")
print(f"   Test: {X_test_df.shape}")

# ------------------ 2) Definir funci√≥n objetivo Optuna para Random Forest ------------------
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "random_state": 42,
        "n_jobs": -1
    }

    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        model = RandomForestRegressor(**params)
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("model", model)
        ])
        pipeline.fit(X_train_df, y_train)
        y_pred = pipeline.predict(X_test_df)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
    return rmse

# ------------------ 3) ENTRENAMIENTO PRINCIPAL ------------------
print("\nüöÄ INICIANDO ENTRENAMIENTO - RANDOM FOREST")
print("="*60)

with mlflow.start_run(run_name="randomforest_FantasyDraft_training") as run:
    print(f"üìä Run ID: {run.info.run_id}")

    # Optimizaci√≥n con Optuna
    print("\nüîç Optimizando hiperpar√°metros con Optuna...")
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=10)  # M√°s trials para RF

    best_params = study.best_params
    mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})
    mlflow.log_metric("best_rmse", study.best_value)

    print(f"\nüéØ Mejores par√°metros encontrados:")
    for k, v in best_params.items():
        print(f"   {k}: {v}")

    # Entrenar modelo final con mejores par√°metros
    print(f"\nüå≤ Entrenando Random Forest final...")
    best_model = RandomForestRegressor(**best_params)
    rf_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", best_model)
    ])
    rf_pipeline.fit(X_train_df, y_train)

    # Evaluar
    y_pred = rf_pipeline.predict(X_test_df)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    mlflow.log_metrics({
        "final_rmse": rmse,
        "final_r2": r2,
        "final_mae": mae
    })

    print(f"\nüìà M√âTRICAS DEL MODELO:")
    print(f"   RMSE: {rmse:.4f}")
    print(f"   MAE:  {mae:.4f}")
    print(f"   R¬≤:   {r2:.4f}")

    # Feature Importance (espec√≠fico de Random Forest)
    try:
        feature_names = (
            preprocessor.named_transformers_['num'].get_feature_names_out().tolist() +
            preprocessor.named_transformers_['cat'].get_feature_names_out().tolist()
        )
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)

        print(f"\nüîù Top 10 Features m√°s importantes:")
        print(feature_importance.head(10).to_string(index=False))

        # Guardar como artefacto
        feature_importance.to_csv("feature_importance.csv", index=False)
        mlflow.log_artifact("feature_importance.csv")

    except Exception as e:
        print(f"‚ö†Ô∏è No se pudo calcular feature importance: {e}")

    # ------------------ 4) GUARDAR Y REGISTRAR MODELO ------------------
    print("\n" + "="*60)
    print("üíæ GUARDANDO MODELO EN MODEL REGISTRY")
    print("="*60)

    try:
        # Registrar modelo
        model_info = mlflow.sklearn.log_model(
            sk_model=rf_pipeline,
            artifact_path="best_model",
            input_example=X_train_df.head(1),
            registered_model_name=model_name
        )

        model_uri = f"runs:/{run.info.run_id}/best_model"
        print(f"‚úÖ Modelo guardado en: {model_uri}")
        print(f"‚úÖ Registrado como: {model_name}")

        # Tags personalizados
        mlflow.set_tags({
            "model_type": "RandomForest",
            "model_version": "latest",
            "rmse": f"{rmse:.6f}",
            "r2": f"{r2:.6f}",
            "mae": f"{mae:.6f}",
            "training_date": pd.Timestamp.now().isoformat(),
            "n_estimators": str(best_params['n_estimators']),
            "max_depth": str(best_params.get('max_depth', 'None'))
        })

    except Exception as e:
        print(f"‚ö†Ô∏è Error al registrar: {type(e).__name__}: {e}")
        model_info = mlflow.sklearn.log_model(
            sk_model=rf_pipeline,
            artifact_path="best_model",
            input_example=X_train_df.head(1)
        )
        print(f"‚úÖ Modelo guardado como artefacto (sin registro)")

    # ------------------ 5) CHAMPION vs CHALLENGER ------------------
    print("\n" + "="*60)
    print("‚öîÔ∏è EVALUACI√ìN CHAMPION vs CHALLENGER")
    print("="*60)

    try:
        # Buscar todos los runs de Random Forest
        all_runs = mlflow.search_runs(
            experiment_ids=[run.info.experiment_id],
            filter_string=f"tags.mlflow.runName LIKE '%randomforest%' AND metrics.final_rmse > 0",
            order_by=["metrics.final_rmse ASC"],
            max_results=10
        )

        if len(all_runs) > 0:
            print(f"\nüìä HISTORIAL DE RUNS RANDOM FOREST (Top 5 por RMSE):")
            print("-"*70)

            for idx, run_row in all_runs.head(5).iterrows():
                run_id = run_row['run_id']
                run_rmse = run_row.get('metrics.final_rmse', float('nan'))
                run_r2 = run_row.get('metrics.final_r2', float('nan'))
                run_mae = run_row.get('metrics.final_mae', float('nan'))
                run_date = pd.to_datetime(run_row['start_time']).strftime('%Y-%m-%d %H:%M')

                symbol = "üèÜ" if idx == all_runs.index[0] else "üå≤"
                is_current = "‚Üê ACTUAL" if run_id == run.info.run_id else ""

                print(f"{symbol} {run_date} | RMSE: {run_rmse:.4f} | MAE: {run_mae:.4f} | R¬≤: {run_r2:.4f} {is_current}")

            # Comparaci√≥n con el mejor hist√≥rico
            best_historical_rmse = all_runs.iloc[0]['metrics.final_rmse']
            best_run_id = all_runs.iloc[0]['run_id']

            print("\n" + "-"*70)
            if run.info.run_id == best_run_id:
                print("‚úÖ üèÜ ESTE ES EL MEJOR MODELO RANDOM FOREST HASTA AHORA")
                print(f"   RMSE: {rmse:.4f}")
            else:
                mejora = ((best_historical_rmse - rmse) / best_historical_rmse * 100)
                if mejora > 0:
                    print(f"‚úÖ üèÜ NUEVO CHAMPION RF - Mejora: {mejora:.2f}%")
                    print(f"   Anterior mejor RMSE: {best_historical_rmse:.4f}")
                    print(f"   Nuevo RMSE: {rmse:.4f}")
                else:
                    print(f"üå≤ CHALLENGER - No supera al champion")
                    print(f"   Champion RMSE: {best_historical_rmse:.4f}")
                    print(f"   Este modelo RMSE: {rmse:.4f}")
                    print(f"   Diferencia: {abs(mejora):.2f}% peor")
        else:
            print("‚ÑπÔ∏è Este es tu primer modelo Random Forest üéâ")

    except Exception as e:
        print(f"‚ö†Ô∏è No se pudo comparar con modelos anteriores: {type(e).__name__}")



    # ------------------ 7) RESUMEN FINAL ------------------
    print("\n" + "="*60)
    print("üìã RESUMEN DEL ENTRENAMIENTO")
    print("="*60)
    print(f"‚úÖ Modelo: {model_name}")
    print(f"‚úÖ Algoritmo: Random Forest")
    print(f"‚úÖ Run ID: {run.info.run_id}")
    print(f"‚úÖ RMSE: {rmse:.4f}")
    print(f"‚úÖ MAE: {mae:.4f}")
    print(f"‚úÖ R¬≤: {r2:.4f}")
    print(f"‚úÖ N¬∞ trials Optuna: 10")
    print(f"‚úÖ N¬∞ estimators: {best_params['n_estimators']}")
    print(f"‚úÖ Max depth: {best_params.get('max_depth', 'None')}")
    print(f"\nüí° Para usar el modelo:")
    print(f"   model_uri = 'runs:/{run.info.run_id}/best_model'")
    print(f"   loaded_model = mlflow.sklearn.load_model(model_uri)")
    print("="*60)

print("\n‚úÖ PROCESO COMPLETADO")



üìä Datos preparados:
   Train: (740, 38)
   Test: (186, 38)

üöÄ INICIANDO ENTRENAMIENTO - RANDOM FOREST


[I 2025-11-11 22:53:37,290] A new study created in memory with name: no-name-8ff91c38-e17c-47ef-aeec-4c2444dc45d2


üìä Run ID: 1d622d94bc314f29b2bbfef85b369cc1

üîç Optimizando hiperpar√°metros con Optuna...


[I 2025-11-11 22:53:37,800] Trial 0 finished with value: 6.66546215573489 and parameters: {'n_estimators': 481, 'max_depth': 35, 'min_samples_split': 20, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 6.66546215573489.
[I 2025-11-11 22:53:38,048] Trial 1 finished with value: 5.598797146925973 and parameters: {'n_estimators': 361, 'max_depth': 36, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': False}. Best is trial 1 with value: 5.598797146925973.
[I 2025-11-11 22:53:38,394] Trial 2 finished with value: 6.159163196366403 and parameters: {'n_estimators': 325, 'max_depth': 19, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': True}. Best is trial 1 with value: 5.598797146925973.
[I 2025-11-11 22:53:38,786] Trial 3 finished with value: 6.868593050281402 and parameters: {'n_estimators': 407, 'max_depth': 35, 'min_samples_split': 16, 'min_samples_leaf': 10, 'max_features': 'l


üéØ Mejores par√°metros encontrados:
   n_estimators: 369
   max_depth: 13
   min_samples_split: 8
   min_samples_leaf: 1
   max_features: sqrt
   bootstrap: False

üå≤ Entrenando Random Forest final...





üìà M√âTRICAS DEL MODELO:
   RMSE: 5.2151
   MAE:  2.2352
   R¬≤:   0.9145

üîù Top 10 Features m√°s importantes:
feature  importance
     TD    0.170858
    ATT    0.122712
    CMP    0.075293
      G    0.068650
     LG    0.066389
    REC    0.062957
  YDS.1    0.055583
    TGT    0.045089
  ATT.1    0.035886
    PCT    0.034506

üíæ GUARDANDO MODELO EN MODEL REGISTRY


Registered model 'FantasyDraft_RandomForest_Model' already exists. Creating a new version of this model...
Created version '5' of model 'FantasyDraft_RandomForest_Model'.
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 2320.23it/s] 


‚úÖ Modelo guardado en: runs:/1d622d94bc314f29b2bbfef85b369cc1/best_model
‚úÖ Registrado como: FantasyDraft_RandomForest_Model

‚öîÔ∏è EVALUACI√ìN CHAMPION vs CHALLENGER

üìä HISTORIAL DE RUNS RANDOM FOREST (Top 5 por RMSE):
----------------------------------------------------------------------
üèÜ 2025-11-12 04:50 | RMSE: 5.1958 | MAE: 2.1920 | R¬≤: 0.9151 
üå≤ 2025-11-12 04:53 | RMSE: 5.2151 | MAE: 2.2352 | R¬≤: 0.9145 ‚Üê ACTUAL
üå≤ 2025-11-12 04:50 | RMSE: 5.3088 | MAE: 2.2579 | R¬≤: 0.9114 
üå≤ 2025-11-12 04:48 | RMSE: 5.3474 | MAE: 2.1411 | R¬≤: 0.9101 
üå≤ 2025-11-12 04:47 | RMSE: 5.7364 | MAE: 2.4785 | R¬≤: 0.8965 

----------------------------------------------------------------------
üå≤ CHALLENGER - No supera al champion
   Champion RMSE: 5.1958
   Este modelo RMSE: 5.2151
   Diferencia: 0.37% peor

üìã RESUMEN DEL ENTRENAMIENTO
‚úÖ Modelo: FantasyDraft_RandomForest_Model
‚úÖ Algoritmo: Random Forest
‚úÖ Run ID: 1d622d94bc314f29b2bbfef85b369cc1
‚úÖ RMSE: 5.2151
‚úÖ M

In [13]:
# ------------------ 7) Evaluaci√≥n ------------------
y_pred_test = rf_pipeline.predict(X_test_df)
rmse = mean_squared_error(y_test, y_pred_test, squared=False)
r2 = r2_score(y_test, y_pred_test)
print(f"RMSE (test): {rmse:.2f}")
print(f"R¬≤ (test):   {r2:.3f}")


RMSE (test): 5.44
R¬≤ (test):   0.907




In [14]:
# ------------------ 8) Importancias de caracter√≠sticas ------------------
# Recuperar nombres de columnas transformadas
num_cols_fitted = preprocessor.transformers_[0][2]
cat_cols_fitted = preprocessor.transformers_[1][2]
ohe_names = rf_pipeline.named_steps["preprocessor"] \
                       .named_transformers_["cat"] \
                       .named_steps["ohe"] \
                       .get_feature_names_out(cat_cols_fitted)
feature_names_fitted = np.r_[num_cols_fitted, ohe_names]

importances = rf_pipeline.named_steps["model"].feature_importances_
feat_imp = pd.DataFrame({"Feature": feature_names_fitted, "Importance": importances}) \
            .sort_values("Importance", ascending=False)

print("\n--- Top 15 Features m√°s importantes (RandomForest) ---")
print(feat_imp.head(15).reset_index(drop=True))



--- Top 15 Features m√°s importantes (RandomForest) ---
         Feature  Importance
0             TD    0.577748
1             LG    0.134396
2            ATT    0.055451
3            REC    0.043026
4              G    0.042692
5           SACK    0.028223
6            INT    0.016714
7             FG    0.015672
8            CMP    0.015101
9          YDS.1    0.013720
10  Position_DST    0.012268
11           TGT    0.006953
12          TD.1    0.005717
13           FGA    0.003977
14           Y/R    0.003807


In [16]:
# ------------------ 9) Predicciones completas y Top 10 (robusto a alias) ------------------
df_pred = df2.copy()

# Crear columna Team si viene con alias com√∫n
if "Team" not in df_pred.columns:
    for alt in ["Tm", "TEAM", "TeamAbbrev", "Franchise, Club"]:
        if alt in df_pred.columns:
            df_pred["Team"] = df_pred[alt]
            break

# Crear columna Player si viene con alias
if "Player" not in df_pred.columns:
    for alt in ["Name", "PLAYER", "PlayerName"]:
        if alt in df_pred.columns:
            df_pred["Player"] = df_pred[alt]
            break

# Predicci√≥n sobre todo el dataset limpio
df_pred["Pred_FPTS"] = rf_pipeline.predict(df2.drop(columns=[TARGET]))

# Top 10
top_pred = df_pred.sort_values("Pred_FPTS", ascending=False).head(10)

# Seleccionar columnas disponibles para imprimir sin romper
display_cols = ["Player", "Position", "Team", "Pred_FPTS", "FPTS"]
available_cols = [c for c in display_cols if c in top_pred.columns]

print("\n--- Top 10 Jugadores Predichos por FPTS (RandomForest) ---")
print(top_pred[available_cols].reset_index(drop=True))


--- Top 10 Jugadores Predichos por FPTS (RandomForest) ---
                    Player Position  Pred_FPTS  FPTS
0         Josh Allen (BUF)       QB  90.057035  99.5
1      Lamar Jackson (BAL)       QB  87.152801  94.4
2  Patrick Mahomes II (KC)       QB  85.801986  89.6
3        Jalen Hurts (PHI)       QB  84.400590  84.2
4          Drake Maye (NE)       QB  83.987662  85.5
5     Caleb Williams (CHI)       QB  82.580730  84.1
6      Baker Mayfield (TB)       QB  78.077254  80.1
7             Bo Nix (DEN)       QB  74.128717  70.1
8         Jordan Love (GB)       QB  73.368962  75.2
9         Jared Goff (DET)       QB  71.654180  72.3
