In [6]:
import pandas as pd
import re
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Location and file-name pattern of the per-position FantasyPros exports.
DATA_DIR = "../data"
STATS_FILE_TEMPLATE = "FantasyPros_Fantasy_Football_Statistics_{position}.csv"


def load_position_stats(position):
    """Read the statistics CSV of one position and tag every row with it.

    Parameters
    ----------
    position : str
        Position code. It is used both as the file-name suffix and as the
        value written into the added 'Position' column (e.g. "QB").

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus a constant 'Position' column.
    """
    stats = pd.read_csv(f"{DATA_DIR}/" + STATS_FILE_TEMPLATE.format(position=position))
    stats["Position"] = position
    return stats


# One frame per position; the individual names are kept for any cell that uses them.
df_DST = load_position_stats("DST")
df_K = load_position_stats("K")
df_QB = load_position_stats("QB")
df_RB = load_position_stats("RB")
df_TE = load_position_stats("TE")
df_WR = load_position_stats("WR")

# Stack all positions; columns that a position does not have become NaN.
df = pd.concat([df_DST, df_K, df_QB, df_RB, df_TE, df_WR], ignore_index=True)

print(f"Total de filas: {len(df)}")
print("\nDistribución por posición:")
print(df['Position'].value_counts())
print("\nPrimeras filas:")
print(df.head())
print("\nColumnas:")
print(df.columns.tolist())
print("\nInfo del DataFrame:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

Total de filas: 938

Distribución por posición:
WR     325
RB     208
TE     192
QB     121
K       58
DST     34
Name: Position, dtype: int64

Primeras filas:
   Rank                      Player  SACK  INT   FR   FF  DEF TD  SFTY  \
0   1.0      Seattle Seahawks (SEA)  12.0  7.0  0.0  0.0     0.0   0.0   
1   2.0  Jacksonville Jaguars (JAC)   7.0  9.0  4.0  5.0     0.0   0.0   
2   3.0     Minnesota Vikings (MIN)  11.0  2.0  5.0  8.0     2.0   0.0   
3   4.0   Philadelphia Eagles (PHI)   5.0  3.0  2.0  4.0     0.0   0.0   
4   5.0         Detroit Lions (DET)  14.0  3.0  3.0  4.0     0.0   0.0   

   SPC TD    G  ...  TD  SACKS ATT.1 YDS.1  TD.1  FL  20+  TGT  REC  Y/R  
0     2.0  4.0  ... NaN    NaN   NaN   NaN   NaN NaN  NaN  NaN  NaN  NaN  
1     1.0  4.0  ... NaN    NaN   NaN   NaN   NaN NaN  NaN  NaN  NaN  NaN  
2     0.0  4.0  ... NaN    NaN   NaN   NaN   NaN NaN  NaN  NaN  NaN  NaN  
3     2.0  4.0  ... NaN    NaN   NaN   NaN   NaN NaN  NaN  NaN  NaN  NaN  
4     1.0  4.0  ... 

In [8]:
# ============================================================
# PREP: 'Position' encoding + feature selection
# ============================================================

# ---------- 1) Target ----------
TARGET = "FPTS"

# ---------- 2) Columns that must NOT become features ----------
# Identifiers / free text (extend with e.g. 'PlayerId' if present).
id_like = ["Player", "Team"]
# Information leaks: anything derived from the target, or rankings.
leak_like_patterns = [
    r"^FPTS\/G$",      # points per game (derived from the target)
    r"rank",           # any column name containing 'rank'
    r"tier",           # any column name containing 'tier'
]
# One case-insensitive alternation of all the leak patterns.
leak_regex = re.compile("|".join(leak_like_patterns), flags=re.IGNORECASE)

leaky_cols = {col for col in df.columns if leak_regex.search(str(col))}
drop_cols = {*id_like, TARGET} | leaky_cols

# ---------- 3) Numeric and categorical feature columns ----------
# TARGET is already a member of drop_cols, so a single membership test
# is enough to keep it out of the features.
numeric_candidates = df.select_dtypes(include=[np.number]).columns
num_cols = [col for col in numeric_candidates if col not in drop_cols]

# 'Position' is the only categorical feature.
cat_cols = ["Position"]



In [9]:
# ---------- 4) Imputation + one-hot encoding of 'Position' ----------
# Missing numeric values are filled with the constant 0.
numeric_transformer = SimpleImputer(strategy="constant", fill_value=0)
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

column_steps = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
# Every column not listed in a step is discarded.
preprocessor = ColumnTransformer(transformers=column_steps, remainder="drop")

# ---------- 5) Fit the transformer and build X, y ----------
X = preprocessor.fit_transform(df)
# NOTE(review): y is taken from the unfiltered frame. A later cell reports
# fewer rows once non-finite FPTS values are dropped, so y may hold invalid
# targets -- confirm before fitting anything on this X, y pair.
y = df[TARGET].values

# ---------- 6) Names of the transformed features ----------
fitted_ohe = preprocessor.named_transformers_["cat"]
ohe_feature_names = fitted_ohe.get_feature_names_out(cat_cols)
# Numeric columns come first, then the one-hot columns (same order as the steps).
feature_names = np.r_[num_cols, ohe_feature_names]

n_numeric = len(num_cols)
n_one_hot = len(ohe_feature_names)
n_total = len(feature_names)
print(f"Total features numéricas: {n_numeric}")
print(f"Total features categóricas (one-hot): {n_one_hot}")
print(f"Total de features finales: {n_total}")

# Quick look at the first 25 transformed column names.
print("\nEjemplo de nombres de features resultantes:")
print(feature_names[:25])

Total features numéricas: 32
Total features categóricas (one-hot): 6
Total de features finales: 38

Ejemplo de nombres de features resultantes:
['SACK' 'INT' 'FR' 'FF' 'DEF TD' 'SFTY' 'SPC TD' 'G' 'FG' 'FGA' 'PCT' 'LG'
 '1-19' '20-29' '30-39' '40-49' '50+' 'XPT' 'XPA' 'CMP' 'ATT' 'Y/A' 'TD'
 'SACKS' 'ATT.1']


In [10]:
# ============================================================
# RANDOM FOREST MODEL -- all positions combined
# ============================================================
# Steps (numbered 0-9 in the cells that follow):
# - robust cleaning of the target
# - one-hot encoding of Position
# - leak-free feature selection (no FPTS/G, Rank, Tier, ...)
# - training and evaluation
# - feature importances and predicted Top 10
# ============================================================

# ------------------ 0) Configuration ------------------
TARGET = "FPTS"

# ------------------ 1) Numeric, clean target ------------------
# Anything that cannot be parsed as a number becomes NaN.
target_numeric = pd.to_numeric(df[TARGET], errors="coerce")

# Keep only the rows whose target is finite (drops NaN and +/-inf).
mask = np.isfinite(target_numeric)
df2 = (
    df.assign(**{TARGET: target_numeric})
      .loc[mask]
      .reset_index(drop=True)
)

print(f"Filas después de limpiar {TARGET}: {len(df2)}")


Filas después de limpiar FPTS: 926


In [11]:
# ------------------ 2) Columns to exclude (non-features) ------------------
# Identifiers / free text that must not be used as features.
id_like = ["Player", "Team"]
# Information leaks: anything derived from the target, or rankings.
# `\b` needs a word boundary and is matched case-insensitively, so 'Rank'
# or 'RANK' match, while names such as 'PosRank' or 'ECR_Rank' do NOT
# (letters, digits and '_' are all word characters).
leak_like_patterns = [
    r"^FPTS\/G$",   # points per game (derived from the target)
    r"\brank\b",    # rank as a whole word
    r"\btier\b",    # tier as a whole word
]
leak_regex = re.compile("|".join(leak_like_patterns), flags=re.IGNORECASE)

leaky_cols = {col for col in df2.columns if leak_regex.search(str(col))}
drop_cols = {*id_like, TARGET} | leaky_cols

In [12]:
# ------------------ 3) Numeric and categorical columns ------------------
# 'Position' is the only categorical feature, so fail early if it is missing.
if "Position" not in df2.columns:
    raise ValueError("No se encontró la columna 'Position' en el DataFrame.")

# Numeric columns that are neither flagged for dropping nor the target.
excluded_cols = set(drop_cols) | {TARGET}
numeric_candidates = df2.select_dtypes(include=[np.number]).columns
num_cols = [col for col in numeric_candidates if col not in excluded_cols]
cat_cols = ["Position"]

print(f"Total features numéricas (detectadas): {len(num_cols)}")
print(f"Total features categóricas (one-hot): {len(cat_cols)} → {cat_cols}")

# ------------------ 4) Preprocessing ------------------
# Numeric: missing values are filled with the constant 0.
numeric_transformer = SimpleImputer(strategy="constant", fill_value=0)
# Categorical: fill missing values with the most frequent category, then one-hot encode.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

column_steps = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
# Every column not listed in a step is discarded.
preprocessor = ColumnTransformer(transformers=column_steps, remainder="drop")

Total features numéricas (detectadas): 32
Total features categóricas (one-hot): 1 → ['Position']


In [13]:
# ------------------ 5) Train/test split on the raw frame ------------------
# The split is done before any transformation; the pipeline's preprocessor
# is therefore fitted on the training rows only.
features_df = df2.drop(columns=[TARGET])
target_series = df2[TARGET]
X_train_df, X_test_df, y_train, y_test = train_test_split(
    features_df, target_series, test_size=0.2, random_state=42
)

# ------------------ 6) Random Forest model ------------------
rf_params = {
    "n_estimators": 300,
    "max_depth": None,
    "min_samples_split": 5,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)

# Preprocessing and model chained into one estimator.
rf_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", rf)])

# Train
rf_pipeline.fit(X_train_df, y_train)

In [14]:
# ------------------ 7) Evaluation ------------------
y_pred_test = rf_pipeline.predict(X_test_df)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where the
# original call raises a TypeError. sqrt(MSE) works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred_test)))
r2 = r2_score(y_test, y_pred_test)
print(f"RMSE (test): {rmse:.2f}")
print(f"R² (test):   {r2:.3f}")


RMSE (test): 5.44
R² (test):   0.907


In [15]:
# ------------------ 8) Feature importances ------------------
# Everything is read from the *fitted pipeline* rather than from the global
# `preprocessor` variable: the global is only fitted as a side effect of
# `rf_pipeline.fit`, and it goes stale (or unfitted) if its cell is re-run
# or if the pipeline ever clones its steps.
fitted_preprocessor = rf_pipeline.named_steps["preprocessor"]
fitted_model = rf_pipeline.named_steps["model"]

# `transformers_` holds (name, transformer, columns) tuples; look the column
# lists up by step name instead of relying on their position in the list.
fitted_columns = {name: cols for name, _, cols in fitted_preprocessor.transformers_}
num_cols_fitted = fitted_columns["num"]
cat_cols_fitted = fitted_columns["cat"]

ohe_names = (
    fitted_preprocessor.named_transformers_["cat"]
    .named_steps["ohe"]
    .get_feature_names_out(cat_cols_fitted)
)
# Numeric columns first, then the one-hot columns (same order as the steps).
feature_names_fitted = np.r_[num_cols_fitted, ohe_names]

importances = fitted_model.feature_importances_
# Guard against names and importances drifting apart (e.g. a transformer
# dropping a column), which would otherwise mislabel the importances.
if len(feature_names_fitted) != len(importances):
    raise ValueError(
        f"Se reconstruyeron {len(feature_names_fitted)} nombres de features "
        f"pero el modelo tiene {len(importances)} importancias."
    )

feat_imp = (
    pd.DataFrame({"Feature": feature_names_fitted, "Importance": importances})
    .sort_values("Importance", ascending=False)
)

print("\n--- Top 15 Features más importantes (RandomForest) ---")
print(feat_imp.head(15).reset_index(drop=True))



--- Top 15 Features más importantes (RandomForest) ---
         Feature  Importance
0             TD    0.577753
1             LG    0.134418
2            ATT    0.055232
3            REC    0.043033
4              G    0.042687
5           SACK    0.028223
6            INT    0.016714
7             FG    0.015672
8            CMP    0.015277
9          YDS.1    0.013776
10  Position_DST    0.012268
11           TGT    0.006964
12          TD.1    0.005679
13           FGA    0.003984
14           Y/R    0.003798


In [16]:
# ------------------ 9) Full predictions and Top 10 (alias-tolerant) ------------------
def _copy_first_alias(frame, target, aliases):
    """Create `frame[target]` from the first alias column present, in place.

    Does nothing when `target` already exists or when no alias is found.
    """
    if target in frame.columns:
        return
    found = next((alias for alias in aliases if alias in frame.columns), None)
    if found is not None:
        frame[target] = frame[found]


df_pred = df2.copy()

# Team / Player may come under a common alternative column name.
# "Franchise" and "Club" were previously fused into the single string
# "Franchise, Club" (a quoting typo), so neither alias could ever match.
_copy_first_alias(df_pred, "Team", ["Tm", "TEAM", "TeamAbbrev", "Franchise", "Club"])
_copy_first_alias(df_pred, "Player", ["Name", "PLAYER", "PlayerName"])
# NOTE(review): the printed Player values look like "Name (TEAM)"; if no Team
# column exists it could be parsed from there -- confirm the format first.

# Prediction over the whole cleaned dataset.
# NOTE: this includes the rows the model was trained on, so Pred_FPTS is
# mostly in-sample and will look closer to FPTS than the test metrics suggest.
df_pred["Pred_FPTS"] = rf_pipeline.predict(df2.drop(columns=[TARGET]))

# Top 10 by predicted points.
top_pred = df_pred.sort_values("Pred_FPTS", ascending=False).head(10)

# Print only the display columns that actually exist, so a missing one cannot break the cell.
display_cols = ["Player", "Position", "Team", "Pred_FPTS", "FPTS"]
available_cols = [c for c in display_cols if c in top_pred.columns]

print("\n--- Top 10 Jugadores Predichos por FPTS (RandomForest) ---")
print(top_pred[available_cols].reset_index(drop=True))


--- Top 10 Jugadores Predichos por FPTS (RandomForest) ---
                    Player Position  Pred_FPTS  FPTS
0         Josh Allen (BUF)       QB  90.057035  99.5
1      Lamar Jackson (BAL)       QB  87.152801  94.4
2  Patrick Mahomes II (KC)       QB  85.801986  89.6
3        Jalen Hurts (PHI)       QB  84.400590  84.2
4          Drake Maye (NE)       QB  83.987662  85.5
5     Caleb Williams (CHI)       QB  82.580730  84.1
6      Baker Mayfield (TB)       QB  78.077254  80.1
7             Bo Nix (DEN)       QB  74.128717  70.1
8         Jordan Love (GB)       QB  73.368962  75.2
9         Jared Goff (DET)       QB  71.654180  72.3
