In [22]:
import joblib
import numpy as np
import pandas as pd

# Visualización
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocesado
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Modelos
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Métricas
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report, f1_score



In [23]:
# Read the player dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="final.csv")
df.head(n=5)

Unnamed: 0,Player,Nation,Age,Pos,Squad,Comp,MP,Starts,Min,90s,...,Won,Lost,Cmp,Clr,FatigueIndex,StyleInsight,PlayerCluster,PlayerStyle,PlayerImg,TeamImg
0,Brenden Aaronson,us USA,25.0,"FW,MF",Leeds United,eng Premier League,13,9,791,8.8,...,6,13,178,10,0.389392,Liga equilibrada,4,9 Goleador de Área,https://upload.wikimedia.org/wikipedia/commons...,https://upload.wikimedia.org/wikipedia/en/thum...
1,Jones El-Abdellaoui,ma MAR,19.0,"MF,FW",Celta Vigo,es La Liga,7,0,139,1.5,...,2,3,43,5,-0.935444,Liga equilibrada,0,Interior Conductor,https://upload.wikimedia.org/wikipedia/commons...,https://upload.wikimedia.org/wikipedia/en/thum...
2,Himad Abdelli,dz ALG,26.0,MF,Angers,fr Ligue 1,11,9,764,8.5,...,8,3,422,10,0.7381,Liga ofensiva y vertical,3,Organizador / Regista,https://upload.wikimedia.org/wikipedia/commons...,https://upload.wikimedia.org/wikipedia/en/thum...
3,Ali Abdi,tn TUN,31.0,"DF,MF",Nice,fr Ligue 1,7,5,353,3.9,...,5,2,85,9,-0.521286,Liga ofensiva y vertical,0,Defensa Posicional,https://upload.wikimedia.org/wikipedia/commons...,https://upload.wikimedia.org/wikipedia/en/thum...
4,Salis Abdul Samed,gh GHA,25.0,MF,Nice,fr Ligue 1,10,7,485,5.4,...,4,3,242,3,-0.145675,Liga ofensiva y vertical,0,Interior Mixto,https://upload.wikimedia.org/wikipedia/commons...,https://upload.wikimedia.org/wikipedia/en/thum...


## VALOR DE MERCADO

In [24]:
# Columns of df used as model inputs by every model in this notebook.
features = [
    "Age", "MP", "Starts", "Min",
    "Touches", "Carries",
    "PrgDist", "PrgC", "PrgP",
    "Gls", "Ast", "xG", "xAG",
    "KP", "SCA", "GCA"
]

# Keep only the rows where every feature is present. The explicit .copy()
# makes df_ml an independent frame, so the target columns assigned to it in
# later cells can never alias df or raise SettingWithCopyWarning.
df_ml = df[features].dropna().copy()

In [25]:
# Synthetic "market value" target: a fixed weighted sum of six columns.
# All six (Gls, Ast, xG, xAG, KP, SCA) are also listed in `features`, so the
# target is an exact linear function of the model inputs.
df_ml["MarketValue"] = (
    df_ml["Gls"].mul(8)
    .add(df_ml["Ast"].mul(6))
    .add(df_ml["xG"].mul(10))
    .add(df_ml["xAG"].mul(8))
    .add(df_ml["KP"].mul(4))
    .add(df_ml["SCA"].mul(2))
)

In [26]:
# Select the inputs by name instead of "everything except MarketValue".
# df_ml gains PerformanceIndex and MatchResult columns further down the
# notebook; with a drop-based selection, re-running this cell afterwards would
# feed those derived columns to the models. Selecting `features` is idempotent
# and matches how the later sections build their inputs.
X = df_ml[features]
y = df_ml["MarketValue"]

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

## Modelo 1: REGRESIÓN LINEAL

In [27]:
# Linear regression trained on the standardised features.
lr = LinearRegression().fit(X_train_s, y_train)
pred_lr = lr.predict(X_test_s)

In [28]:
# Test-set error summary for the linear model.
print("Regression Lineal")
mae_lr = mean_absolute_error(y_test, pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, pred_lr))
r2_lr = r2_score(y_test, pred_lr)
print("MAE:", mae_lr)
print("RMSE:", rmse_lr)
print("R2:", r2_lr)


Regression Lineal
MAE: 2.3738922585581724e-14
RMSE: 3.197868009129087e-14
R2: 1.0


## Modelo 2 : KNN 

In [29]:
# k-nearest-neighbours regressor (k = 7) trained on the standardised features.
knn = KNeighborsRegressor(n_neighbors=7).fit(X_train_s, y_train)
pred_knn = knn.predict(X_test_s)


In [30]:
# Test-set error summary for the KNN model.
print("KNN")
mae_knn = mean_absolute_error(y_test, pred_knn)
rmse_knn = np.sqrt(mean_squared_error(y_test, pred_knn))
r2_knn = r2_score(y_test, pred_knn)
print("MAE:", mae_knn)
print("RMSE:", rmse_knn)
print("R2:", r2_knn)

KNN
MAE: 8.13976833976834
RMSE: 12.27975909156076
R2: 0.9704792731631255


## Modelo 3: ÁRBOL DE DECISIÓN

In [31]:
# Regression tree with depth capped at 8, fitted on the unscaled features.
dt = DecisionTreeRegressor(max_depth=8, random_state=42).fit(X_train, y_train)
pred_dt = dt.predict(X_test)

In [32]:
# Test-set error summary for the decision tree.
print("DECISION TREE")
mae_dt = mean_absolute_error(y_test, pred_dt)
rmse_dt = np.sqrt(mean_squared_error(y_test, pred_dt))
r2_dt = r2_score(y_test, pred_dt)
print("MAE:", mae_dt)
print("RMSE:", rmse_dt)
print("R2:", r2_dt)

DECISION TREE
MAE: 6.464747980926322
RMSE: 12.701411233540972
R2: 0.9684171507434292


## Modelo 4: RANDOM FOREST

In [33]:
# Random forest of 200 trees (depth capped at 12), fitted on the unscaled features.
rf = RandomForestRegressor(
    n_estimators=200, max_depth=12, random_state=42
).fit(X_train, y_train)
pred_rf = rf.predict(X_test)


In [34]:
# Test-set error summary for the random forest.
print("RANDOM FOREST")
mae_rf = mean_absolute_error(y_test, pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
r2_rf = r2_score(y_test, pred_rf)
print("MAE:", mae_rf)
print("RMSE:", rmse_rf)
print("R2:", r2_rf)

RANDOM FOREST
MAE: 3.5505701847394224
RMSE: 7.911571966262451
R2: 0.987746139879655


## COMPARACIÓN DE LOS MODELOS DE VALOR DE MERCADO

In [35]:
# One row per market-value model with its test-set MAE and R2.
results = pd.DataFrame(
    [
        {
            "Modelo": label,
            "MAE": mean_absolute_error(y_test, preds),
            "R2": r2_score(y_test, preds),
        }
        for label, preds in (
            ("Linear", pred_lr),
            ("KNN", pred_knn),
            ("Decision Tree", pred_dt),
            ("Random Forest", pred_rf),
        )
    ]
)

results

Unnamed: 0,Modelo,MAE,R2
0,Linear,2.373892e-14,1.0
1,KNN,8.139768,0.970479
2,Decision Tree,6.464748,0.968417
3,Random Forest,3.55057,0.987746


## RENDIMIENTO DEL JUGADOR

In [36]:
# Performance index: unweighted sum of six columns, all of which are also
# listed in `features` (so the target is a linear function of the inputs).
df_ml["PerformanceIndex"] = (
    df_ml["xG"]
    .add(df_ml["xAG"])
    .add(df_ml["Gls"])
    .add(df_ml["Ast"])
    .add(df_ml["SCA"])
    .add(df_ml["GCA"])
)


In [37]:
# Inputs and target for the performance-index models.
y_perf = df_ml["PerformanceIndex"]
X_perf = df_ml[features]

# 80/20 hold-out with a fixed seed.
# NOTE: this rebinds X_train / X_test / y_train / y_test (and the scaled arrays
# below) that the market-value section created earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_perf, y_perf, random_state=42, test_size=0.2
)

# Learn the scaling from the training split, then transform both splits.
scaler_perf = StandardScaler().fit(X_train)
X_train_s = scaler_perf.transform(X_train)
X_test_s = scaler_perf.transform(X_test)

In [38]:
# Linear regression (standardised inputs)
lr_perf = LinearRegression().fit(X_train_s, y_train)
pred_lr_perf = lr_perf.predict(X_test_s)

# k-nearest neighbours, k = 7 (standardised inputs)
knn_perf = KNeighborsRegressor(n_neighbors=7).fit(X_train_s, y_train)
pred_knn_perf = knn_perf.predict(X_test_s)

# Decision tree, depth capped at 8 (unscaled inputs)
dt_perf = DecisionTreeRegressor(max_depth=8, random_state=42).fit(X_train, y_train)
pred_dt_perf = dt_perf.predict(X_test)

# Random forest: 200 trees, depth capped at 12 (unscaled inputs)
rf_perf = RandomForestRegressor(
    n_estimators=200, max_depth=12, random_state=42
).fit(X_train, y_train)
pred_rf_perf = rf_perf.predict(X_test)

## COMPARACIÓN DE MODELOS DE RENDIMIENTO 

In [39]:
# One row per performance-index model with its test-set MAE and R2.
results_perf = pd.DataFrame(
    [
        {
            "Modelo": label,
            "MAE": mean_absolute_error(y_test, preds),
            "R2": r2_score(y_test, preds),
        }
        for label, preds in (
            ("Linear", pred_lr_perf),
            ("KNN", pred_knn_perf),
            ("Decision Tree", pred_dt_perf),
            ("Random Forest", pred_rf_perf),
        )
    ]
)

results_perf

Unnamed: 0,Modelo,MAE,R2
0,Linear,8.479184e-15,1.0
1,KNN,2.163944,0.962453
2,Decision Tree,1.182739,0.982233
3,Random Forest,0.6814852,0.991605


In [40]:
# -----------------------------
# 1. Target variable
# -----------------------------
# Binary label: 1 when a row's PerformanceIndex is above the median of df_ml,
# 0 otherwise. Despite the column name, the label is derived only from the
# PerformanceIndex column, which is itself a sum of columns in `features`;
# no match-outcome data is involved, so near-perfect accuracy is expected.
df_ml["MatchResult"] = np.where(
    df_ml["PerformanceIndex"] > df_ml["PerformanceIndex"].median(),
    1,
    0
)

# -----------------------------
# 2. Predictors and target
# -----------------------------
X_win = df_ml[features]
y_win = df_ml["MatchResult"]

# -----------------------------
# 3. Train-test split
# -----------------------------
# The split, the scaled arrays and the predictions use _win / _clf names so
# that this cell does not overwrite X_train / X_test / y_train / y_test /
# pred_dt / pred_rf, which the regression comparison cells read.
X_train_win, X_test_win, y_train_win, y_test_win = train_test_split(
    X_win, y_win, test_size=0.2, random_state=42
)

# -----------------------------
# 4. Scaling (used by logistic regression only)
# -----------------------------
scaler_match = StandardScaler()
X_train_win_s = scaler_match.fit_transform(X_train_win)
X_test_win_s = scaler_match.transform(X_test_win)

# -----------------------------
# 5. Models
# -----------------------------

# 5a. Logistic regression (standardised inputs)
log_clf = LogisticRegression(max_iter=2000)
log_clf.fit(X_train_win_s, y_train_win)
pred_log = log_clf.predict(X_test_win_s)
print("Logistic Regression Accuracy:", accuracy_score(y_test_win, pred_log))
print(classification_report(y_test_win, pred_log))

# 5b. Decision tree (unscaled inputs)
dt_clf = DecisionTreeClassifier(max_depth=8, random_state=42)
dt_clf.fit(X_train_win, y_train_win)
pred_dt_clf = dt_clf.predict(X_test_win)
print("Decision Tree Accuracy:", accuracy_score(y_test_win, pred_dt_clf))
print(classification_report(y_test_win, pred_dt_clf))

# 5c. Random forest (unscaled inputs)
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
rf_clf.fit(X_train_win, y_train_win)
pred_rf_clf = rf_clf.predict(X_test_win)
print("Random Forest Accuracy:", accuracy_score(y_test_win, pred_rf_clf))
print(classification_report(y_test_win, pred_rf_clf))

Logistic Regression Accuracy: 0.9833679833679834
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       246
           1       1.00      0.97      0.98       235

    accuracy                           0.98       481
   macro avg       0.98      0.98      0.98       481
weighted avg       0.98      0.98      0.98       481

Decision Tree Accuracy: 0.9854469854469855
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       246
           1       0.98      0.99      0.99       235

    accuracy                           0.99       481
   macro avg       0.99      0.99      0.99       481
weighted avg       0.99      0.99      0.99       481

Random Forest Accuracy: 0.9792099792099792
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       246
           1       0.99      0.97      0.98       235

    accuracy                           0.98    

In [41]:
# Persist the fitted random forests and scalers to the working directory.
# joblib is imported in the notebook's top import cell.
#
# NOTE(review): every forest saved here was fitted on the UNSCALED split
# (rf, rf_perf and rf_clf all call .fit(X_train, ...)); the scalers were only
# applied to the linear / KNN / logistic models. Passing scaler-transformed
# inputs to these forests would put the features on the wrong scale. Confirm
# how the consumer of the scaler_*.pkl files uses them.
artifacts = {
    # Market value
    "rf_market_value.pkl": rf,
    "scaler_market_value.pkl": scaler,
    # Performance index
    "rf_performance.pkl": rf_perf,
    "scaler_performance.pkl": scaler_perf,
    # MatchResult classifier
    "rf_match.pkl": rf_clf,
    "scaler_match.pkl": scaler_match,
}

# Writing inside a loop also keeps joblib.dump's return value (the list of
# written filenames) from being echoed as the cell output.
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)



['scaler_match.pkl']