# Masters 1000 modelos

### Cargar dataframe para añadir columnas


In [None]:
# Imports used throughout the notebook (standard library first, then third-party)
import os

import numpy as np
import optuna
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import (
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_sample_weight

# Ruta base del archivo
ruta_base, nombre_archivo = (
    r"./Masters1000/columnas_añadidas",  # base directory
    "escaladofinal.csv",                 # CSV file name
)


In [None]:
# Build the CSV location from the base directory and the file name
ruta_completa = os.path.join(ruta_base, nombre_archivo)

# The file is ';'-separated and writes decimals with a comma
df = pd.read_csv(ruta_completa, sep=";", decimal=",")

# Parse day/month/year dates (values that do not match the format become NaT),
# then order the rows chronologically
df = df.assign(
    Date=pd.to_datetime(df["Date"], format="%d/%m/%Y", errors="coerce")
).sort_values(by="Date")

# Column dtypes and non-null counts
df.info()


## Modelo Regresión Logística (M1000)

In [None]:
# Numeric predictors used by every model in this notebook; all of them are standardised below.
# (The GrandSlams_Favorite / GrandSlams_Not_Favorite columns are disabled here.)
features_to_scale = [
    "Surface_WinRate_Favorite",
    "Surface_WinRate_Not_Favorite",
    "Surface_Matches_Favorite",
    "Surface_Matches_Not_Favorite",
    "WinStreak_Favorite",
    "WinStreak_Not_Favorite",
    "Win_Streak_Diff",
    "Rank_Favorite",
    "Rank_Not_Favorite",
    "Rank_Diff_Signed",
    "Rank_Diff_Abs",
    "Masters1000_Favorite",
    "Masters1000_Not_Favorite",
]

# Force every predictor to a numeric dtype; values that cannot be parsed become NaN
df[features_to_scale] = df[features_to_scale].apply(pd.to_numeric, errors="coerce")

# Standardise each predictor to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before any train/test split or
# cross-validation, so held-out rows influence the scaling statistics. Consider
# fitting it inside a sklearn Pipeline instead.
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])


In [None]:

# Missing-value count for every model input plus the target (all counts should be 0)
print(df.loc[:, [*features_to_scale, "Favorite_Wins"]].isna().sum())

In [None]:

# Feature matrix restricted to the scaled predictors, and the target column
X, y = df[features_to_scale], df["Favorite_Wins"]

In [None]:

# Sanity check: neither the features nor the target should contain missing values
print("¿NaNs en X?:", X.isnull().values.any())
print("¿NaNs en y?:", y.isnull().values.any())

In [None]:
# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Plain logistic regression (no class re-weighting), fitted on the training part
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = model.predict(X_test)

# Balanced accuracy on the held-out rows
score = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy:", round(score, 3))


In [None]:
# Class balance of the target: share of rows for each Favorite_Wins value
print(y.value_counts(normalize=True))

In [None]:
# Same 80/20 hold-out split as before (identical seed, so identical rows)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with class weights inversely proportional to class frequency
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = model.predict(X_test)

# Balanced accuracy on the held-out rows
score = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy:", round(score, 3))

### Prueba con validación cruzada (M1000)

In [None]:
# Features (scaled predictors only) and target
X, y = df[features_to_scale], df["Favorite_Wins"]

# Class-weighted logistic regression, evaluated with 5-fold out-of-fold predictions
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
y_pred_cv = cross_val_predict(model, X, y, cv=5)

# Balanced accuracy over the pooled out-of-fold predictions
score_cv = balanced_accuracy_score(y, y_pred_cv)
print("Balanced Accuracy (CV):", round(score_cv, 3))

### Evaluación del modelo Regresión Logística (M1000)

In [None]:
# Out-of-fold class probabilities (5 folds) for the logistic regression defined in the previous cell
y_proba = cross_val_predict(model, X, y, cv=5, method='predict_proba')
# Positive class is predicted when its probability reaches 0.5
y_pred = (y_proba[:, 1] >= 0.5).astype(int)

# Metrics computed from hard predictions
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Balanced Accuracy:", balanced_accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, round(metric(y, y_pred), 3))

# Metrics computed from the positive-class probability
for label, metric in (("AUC-ROC:", roc_auc_score), ("Log Loss:", log_loss)):
    print(label, round(metric(y, y_proba[:, 1]), 3))

# Confusion matrix and per-class report
print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
print("\nClassification Report:\n", classification_report(y, y_pred))


## Random Forest + validación cruzada (M1000)

In [None]:
# Class-weighted random forest with 100 trees
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)

# Out-of-fold class probabilities (5 folds); positive class when probability >= 0.5
y_proba_rf = cross_val_predict(rf_model, X, y, cv=5, method='predict_proba')
y_pred_rf = (y_proba_rf[:, 1] >= 0.5).astype(int)

In [None]:
# Metrics computed from the random forest's hard predictions
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Balanced Accuracy:", balanced_accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, round(metric(y, y_pred_rf), 3))

# Metrics computed from the positive-class probability
for label, metric in (("AUC-ROC:", roc_auc_score), ("Log Loss:", log_loss)):
    print(label, round(metric(y, y_proba_rf[:, 1]), 3))

# Confusion matrix and per-class report
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_rf))
print("\nClassification Report:\n", classification_report(y, y_pred_rf))

## Modelo Gradient Boosting (M1000)

In [None]:
# Gradient boosting baseline (no class re-weighting)
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

# Out-of-fold class probabilities (5 folds); positive class when probability >= 0.5
y_proba_gb = cross_val_predict(gb_model, X, y, cv=5, method='predict_proba')
y_pred_gb = (y_proba_gb[:, 1] >= 0.5).astype(int)

# Metrics computed from hard predictions
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Balanced Accuracy:", balanced_accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, round(metric(y, y_pred_gb), 3))

# Metrics computed from the positive-class probability
for label, metric in (("AUC-ROC:", roc_auc_score), ("Log Loss:", log_loss)):
    print(label, round(metric(y, y_proba_gb[:, 1]), 3))

# Confusion matrix and per-class report
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_gb))
print("\nClassification Report:\n", classification_report(y, y_pred_gb))

## Modelo Gradient Boosting balanceado (M1000)

In [None]:

# Histogram-based gradient boosting with balanced class weights
gb_model = HistGradientBoostingClassifier(
    class_weight='balanced',
    max_iter=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)

# Out-of-fold class probabilities (5 folds); positive class when probability >= 0.5
y_proba_gb = cross_val_predict(gb_model, X, y, cv=5, method='predict_proba')
y_pred_gb = (y_proba_gb[:, 1] >= 0.5).astype(int)

# Metrics computed from hard predictions
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Balanced Accuracy:", balanced_accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, round(metric(y, y_pred_gb), 3))

# Metrics computed from the positive-class probability
for label, metric in (("AUC-ROC:", roc_auc_score), ("Log Loss:", log_loss)):
    print(label, round(metric(y, y_proba_gb[:, 1]), 3))

# Confusion matrix and per-class report
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_gb))
print("\nClassification Report:\n", classification_report(y, y_pred_gb))

## MLPClassifier (M1000)

In [None]:
# Multi-layer perceptron baseline: a single hidden layer of 100 units trained with Adam
mlp_settings = dict(
    hidden_layer_sizes=(100,),
    max_iter=300,
    alpha=1e-4,
    solver='adam',
    random_state=42,
)
mlp_model = MLPClassifier(**mlp_settings)

# Out-of-fold class probabilities (5 folds); positive class when probability >= 0.5
y_proba_mlp = cross_val_predict(mlp_model, X, y, cv=5, method='predict_proba')
y_pred_mlp = (y_proba_mlp[:, 1] >= 0.5).astype(int)

# Metrics computed from hard predictions
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Balanced Accuracy:", balanced_accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, round(metric(y, y_pred_mlp), 3))

# Metrics computed from the positive-class probability
for label, metric in (("AUC-ROC:", roc_auc_score), ("Log Loss:", log_loss)):
    print(label, round(metric(y, y_proba_mlp[:, 1]), 3))

# Confusion matrix and per-class report
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_mlp))
print("\nClassification Report:\n", classification_report(y, y_pred_mlp))


# MLPClassifier con SMOTE (M1000)

In [None]:
# Step 1: 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: oversample with SMOTE on the training part only; the test part stays untouched
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Step 3: fit the MLP on the resampled training data
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=42,
)
mlp_model.fit(X_train_sm, y_train_sm)

# Step 4: predict classes and probabilities for the original (non-resampled) test rows
y_pred_mlp = mlp_model.predict(X_test)
y_proba_mlp = mlp_model.predict_proba(X_test)

# Step 5: metrics from hard predictions, then from the positive-class probability
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Balanced Accuracy:", balanced_accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, round(metric(y_test, y_pred_mlp), 3))

for label, metric in (("AUC-ROC:", roc_auc_score), ("Log Loss:", log_loss)):
    print(label, round(metric(y_test, y_proba_mlp[:, 1]), 3))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_mlp))
print("\nClassification Report:\n", classification_report(y_test, y_pred_mlp))

## Comparación entre modelos

| Modelo                    | Accuracy | Balanced Accuracy | Precision | Recall | F1-Score | AUC-ROC | Log Loss |
|--------------------------|----------|--------------------|-----------|--------|----------|---------|----------|
| Regresión Logística      | 0.589    | 0.592              | 0.724     | 0.583  | 0.646    | 0.624   | 0.671    |
| Random Forest            | 0.639    | 0.543              | 0.667     | 0.877  | 0.758    | 0.610   | 0.642    |
| Gradient Boosting        | 0.650    | 0.544              | 0.666     | 0.917  | 0.771    | 0.634   | 0.625    |
| Gradient Boosting (bal.) | 0.588    | 0.595              | 0.730     | 0.570  | 0.640    | 0.633   | 0.659    |
| MLPClassifier             | 0.642    | 0.541              | 0.665     | 0.893  | 0.762    | 0.622   | 0.634    |
| MLP + SMOTE              | 0.571    | 0.587              | 0.723     | 0.528  | 0.611    | 0.615   | 0.684    |


## Modelo Gradient Boosting con Optuna (M1000)

In [None]:
# Optimisation objective
def objective(trial):
    """Return the mean stratified 5-fold AUC-ROC of a Gradient Boosting model.

    Hyperparameters (learning rate, number of trees, tree depth, row subsample)
    are sampled from `trial`. The notebook-level `X` and `y` are used as data.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean AUC-ROC over the 5 validation folds (higher is better).
    """
    # Hyperparameters to optimise
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    }

    # Same shuffled, seeded folds for every trial so that trials are comparable
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, test_idx in kf.split(X, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[test_idx]
        # GradientBoostingClassifier has no class_weight option, so the class
        # imbalance is handled through per-sample weights on the training fold
        weights = compute_sample_weight(class_weight='balanced', y=y_tr)

        model = GradientBoostingClassifier(
            **params,
            random_state=42
        )

        model.fit(X_tr, y_tr, sample_weight=weights)
        fold_proba = model.predict_proba(X_val)[:, 1]
        aucs.append(roc_auc_score(y_val, fold_proba))

    return np.mean(aucs)

# Create and run the study.
# The sampler is seeded so the search is reproducible (together with n_jobs=1);
# without a seed every run explores different hyperparameters.
study = optuna.create_study(
    direction='maximize',
    study_name='GB_AUC_Optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, n_jobs=1)  # raise n_trials for a more thorough search

# Show the best result
print(" Mejor AUC-ROC:", round(study.best_value, 4))
print(" Mejores hiperparámetros encontrados:")
print(study.best_params)

Mejor AUC-ROC: 0.6496
 Mejores hiperparámetros encontrados:
{'learning_rate': 0.011215234584834815, 'n_estimators': 392, 'max_depth': 4, 'subsample': 0.7922129095414916}

In [None]:
# Best hyperparameters reported by the Optuna study, hard-coded so this cell can run on its own
best_gb_params = dict(
    learning_rate=0.011215234584834815,
    n_estimators=392,
    max_depth=4,
    subsample=0.7922129095414916,
    random_state=42,
)

# Same shuffled, seeded folds as in the Optuna objective
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold buffers, filled fold by fold
y_proba = np.zeros(len(y))
y_pred = np.zeros(len(y))

# Manual cross-validation: fit on each training fold with balanced sample weights,
# then store the predictions for the matching validation rows
for train_idx, test_idx in kf.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    w = compute_sample_weight(class_weight='balanced', y=y_train)

    model = GradientBoostingClassifier(**best_gb_params)
    model.fit(X_train, y_train, sample_weight=w)

    y_proba[test_idx] = model.predict_proba(X_test)[:, 1]
    y_pred[test_idx] = model.predict(X_test)

# The buffer was created as float; cast the class labels back to int
y_pred = y_pred.astype(int)

# Evaluation over the pooled out-of-fold predictions
print("MÉTRICAS DEL MODELO FINAL (Optuna):")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Balanced Accuracy:", balanced_accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, round(metric(y, y_pred), 3))

for label, metric in (("AUC-ROC:", roc_auc_score), ("Log Loss:", log_loss)):
    print(label, round(metric(y, y_proba), 3))

print("\nMatriz de Confusión:\n", confusion_matrix(y, y_pred))
print("\nClassification Report:\n", classification_report(y, y_pred))

| Métrica              | Gradient Boosting (Sin optimizar) | Gradient Boosting (Optuna) |
|----------------------|------------------------------------|-----------------------------|
| **Accuracy**         | 0.650                              | 0.594                       |
| **Balanced Accuracy**| 0.544                              | 0.606                       |
| **Precision**        | 0.666                              | 0.743                       |
| **Recall**           | 0.917                              | 0.563                       |
| **F1-Score**         | 0.771                              | 0.641                       |
| **AUC-ROC**          | 0.634                              | **0.649**                   |
| **Log Loss**         | 0.625                              | 0.654                       |

|                       | Matriz de Confusión                |
|-----------------------|-----------------------------------|
| **Sin optimizar**     | [[ 856, 4165], [ 755, 8297]]       |
| **Optuna optimizado** | [[3256, 1765], [3953, 5099]]       |

|                       | Conclusión                                              |
|-----------------------|----------------------------------------------------------------|
| **Sin optimizar**     | Muy alto recall, pero modelo sesgado hacia victorias del favorito. |
| **Optuna optimizado** | Menor recall pero mejor AUC y balanced accuracy; predicción más equilibrada. |


## MLPClassifier con Optuna (M1000)

In [None]:
# Uncomment to see per-trial progress in the console:
# optuna.logging.set_verbosity(optuna.logging.INFO)

# Optimisation objective
def objective(trial):
    """Return the mean stratified 5-fold AUC-ROC of an MLPClassifier.

    Architecture, L2 penalty, initial learning rate, solver and activation are
    sampled from `trial`. The notebook-level `X` and `y` are used as data.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean AUC-ROC over the 5 validation folds (higher is better).
    """
    params = {
        # NOTE(review): Optuna emits a warning for tuple-valued categorical choices
        # (they are not supported by persistent storages); this works with the
        # default in-memory study used here.
        'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (50, 50), (100, 50), (100, 100)]),
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-2, log=True),
        'learning_rate_init': trial.suggest_float('learning_rate_init', 0.0005, 0.1, log=True),
        'solver': trial.suggest_categorical('solver', ['adam', 'sgd']),
        'activation': trial.suggest_categorical('activation', ['relu', 'tanh'])
    }

    aucs = []
    # Same shuffled, seeded folds for every trial so that trials are comparable
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_idx, test_idx in kf.split(X, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = MLPClassifier(
            **params,
            max_iter=300,
            random_state=42
        )

        model.fit(X_tr, y_tr)
        fold_proba = model.predict_proba(X_val)[:, 1]
        aucs.append(roc_auc_score(y_val, fold_proba))

    return np.mean(aucs)

# Create the study and optimise.
# The sampler is seeded so the search is reproducible (together with n_jobs=1);
# without a seed every run explores different hyperparameters.
study = optuna.create_study(
    direction='maximize',
    study_name='MLP_AUC_Optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=150, n_jobs=1)

# Show the best result
print("Mejor AUC-ROC:", round(study.best_value, 4))
print("Mejores hiperparámetros encontrados:")
print(study.best_params)


In [None]:
# Recorded console output of the Optuna MLP study (kept for reference only).
# It is commented out because the raw text is not valid Python and would stop
# a Restart & Run All with a SyntaxError.
#
#   Mejor AUC-ROC: 0.638
#   Mejores hiperparámetros encontrados:
#   {'hidden_layer_sizes': (50,), 'alpha': 0.002854303279912305, 'learning_rate_init': 0.003089711339816946, 'solver': 'sgd', 'activation': 'relu'}

In [None]:
# Step 1: MLP configured with the best hyperparameters reported by the Optuna study
best_mlp_params = dict(
    hidden_layer_sizes=(50,),
    alpha=0.002854303279912305,
    learning_rate_init=0.003089711339816946,
    solver='sgd',
    activation='relu',
    max_iter=300,
    random_state=42,
)
best_mlp = MLPClassifier(**best_mlp_params)

# Step 2: out-of-fold class probabilities (5 folds); positive class when probability >= 0.5
y_proba = cross_val_predict(best_mlp, X, y, cv=5, method='predict_proba')
y_pred = (y_proba[:, 1] >= 0.5).astype(int)

# Step 3: metrics from hard predictions, then from the positive-class probability
print("MÉTRICAS DEL MLP OPTIMIZADO (Optuna actualizado):")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Balanced Accuracy:", balanced_accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, round(metric(y, y_pred), 3))

for label, metric in (("AUC-ROC:", roc_auc_score), ("Log Loss:", log_loss)):
    print(label, round(metric(y, y_proba[:, 1]), 3))

# Step 4: confusion matrix and per-class report
print("\nMatriz de Confusión:\n", confusion_matrix(y, y_pred))
print("\nClassification Report:\n", classification_report(y, y_pred))

| Métrica              | MLP (Sin optimizar)               | MLP (Optuna)               |
|----------------------|-----------------------------------|----------------------------|
| **Accuracy**         | 0.642                             | 0.647                      |
| **Balanced Accuracy**| 0.541                             | 0.535                      |
| **Precision**        | 0.665                             | 0.661                      |
| **Recall**           | 0.893                             | 0.927                      |
| **F1-Score**         | 0.762                             | 0.772                      |
| **AUC-ROC**          | 0.622                             | **0.635**                  |
| **Log Loss**         | 0.634                             | 0.625                      |

|                       | Matriz de Confusión               |
|-----------------------|----------------------------------|
| **Sin optimizar**     | [[ 947, 4074], [ 969, 8083]]      |
| **Optuna optimizado** | [[ 716, 4305], [ 659, 8393]]      |

|                       | Conclusión                                                  |
|-----------------------|-------------------------------------------------------------|
| **Sin optimizar**     | Buen F1, pero bajo rendimiento en derrotas del favorito.    |
| **Optuna optimizado** | AUC y recall mejorados; más fiable prediciendo victorias.  |


# Comparación de los mejores modelos: Grand Slams y ATP Masters 1000

| Métrica               | Grand Slams (GB Optuna) | Masters 1000 (GB Optuna) |
|-----------------------|-------------------------|---------------------------|
| **Accuracy**          | 0.643                   | 0.594                     |
| **Balanced Accuracy** | **0.658**               | 0.606                     |
| **Precision**         | **0.835**               | 0.743                     |
| **Recall**            | 0.623                   | 0.563                     |
| **F1-Score**          | 0.714                   | 0.641                     |
| **AUC-ROC**           | **0.713**               | 0.649                     |
| **Log Loss**          | 0.613                   | 0.654                     |

|                           | Matriz de Confusión              |
|---------------------------|----------------------------------|
| **Grand Slams (GB Optuna)** | [[2546, 1125], [3453, 5703]]     |
| **Masters 1000 (GB Optuna)**| [[3256, 1765], [3953, 5099]]     |

### Conclusión Final

- El modelo optimizado con **Gradient Boosting + Optuna** para **Grand Slams** logra **el mejor AUC-ROC (0.713)** y también destaca en **Balanced Accuracy y Precision**, lo que indica que es más capaz de distinguir entre clases y equilibrar el sesgo.
- El modelo equivalente para **Masters 1000** también mejora respecto al no optimizado, pero su rendimiento sigue siendo inferior en AUC-ROC, F1 y Balanced Accuracy.
- **Conclusión**: Aunque los **Masters 1000** tienen más datos, los **Grand Slams resultan más predecibles** desde el punto de vista del modelo, especialmente en cuanto a su capacidad de clasificación general medida por AUC-ROC.
