In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

In [2]:
df = pd.read_csv('all_players.csv')

# Create a DataFrame containing only goalkeepers
df_gk = df[df['Position'] == 'GK'].copy()

# Define the goalkeeper-specific features and the target
gk_features = ['GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes']
target = 'OVR'

In [3]:
df_gk = df_gk.dropna(subset=gk_features)

X_gk = df_gk[gk_features]
y_gk = df_gk[target]

In [4]:
X_train_gk, X_test_gk, y_train_gk, y_test_gk = train_test_split(X_gk, y_gk, test_size=0.2, random_state=42)

In [7]:
gk_ovr_model = XGBRegressor(objective='reg:squarederror',
                          n_estimators=300,
                          learning_rate=0.05,
                          max_depth=5,
                          random_state=42,
                          n_jobs=-1)

gk_ovr_model.fit(X_train_gk, y_train_gk)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [8]:
y_pred_gk = gk_ovr_model.predict(X_test_gk)
mae_gk = mean_absolute_error(y_test_gk, y_pred_gk)
r2_gk = r2_score(y_test_gk, y_pred_gk)

print("--- Goalkeeper OVR Prediction Model Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae_gk:.2f}")
print(f"R-squared (R²): {r2_gk:.4f}")
print("(An MAE close to 0 and an R² close to 1.0 indicate a very accurate model)")

--- Goalkeeper OVR Prediction Model Evaluation ---
Mean Absolute Error (MAE): 0.67
R-squared (R²): 0.9859
(An MAE close to 0 and an R² close to 1.0 indicate a very accurate model)


In [10]:
joblib.dump(gk_ovr_model, 'models/gk_ovr_model.pkl')
print("\nGoalkeeper OVR model saved as 'gk_ovr_model.pkl'")


Goalkeeper OVR model saved as 'gk_ovr_model.pkl'


In [15]:
def load_gk_components():
    """
    Loads all models and data files needed for GK prediction.
    """
    components = {
        'gk_ovr_model': joblib.load('models/gk_ovr_model.pkl'),
        'league_model': joblib.load('models/league_tier_model.pkl'),
        'league_encoder': joblib.load('models/league_tier_encoder.pkl'),
        'gk_similarity_scaler': joblib.load('models/gk_similarity_scaler.pkl'),
        'scaled_gk_attributes': np.load('models/scaled_gk_attributes.npy'),
        'gk_player_info': pd.read_csv('models/gk_player_info.csv')
    }
    return components

In [16]:
def predict_gk_profile(gk_attributes, num_similar=5):
    """
    Predicts OVR, league tier, and finds similar players for a goalkeeper.

    Args:
        gk_attributes (dict): A dictionary of the GK's attributes. Must include
                              the 5 main GK stats AND the 6 face card stats.
        num_similar (int): The number of similar players to find.

    Returns:
        dict: A comprehensive profile of the goalkeeper.
    """
    components = load_gk_components()
    gk_df = pd.DataFrame([gk_attributes])

    # --- 1. Predict Goalkeeper OVR ---
    gk_ovr_features = ['GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes']
    predicted_ovr = round(float(components['gk_ovr_model'].predict(gk_df[gk_ovr_features])[0]))

    # --- 2. Predict League Tier (using hybrid logic) ---
    predicted_league = ''
    if predicted_ovr >= 86:
        predicted_league = 'Top Tier'
    elif predicted_ovr < 72:
        predicted_league = 'Lower Tier'
    else:
        # Use the general league model for the middle ground
        league_features_df = gk_df[['PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY']].copy()
        league_features_df['OVR'] = predicted_ovr
        league_feature_order = ['OVR', 'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY']
        
        league_pred_encoded = components['league_model'].predict(league_features_df[league_feature_order])
        predicted_league = components['league_encoder'].inverse_transform(league_pred_encoded)[0]

    # --- 3. Find Similar Goalkeepers ---
    gk_similarity_features = ['GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes', 'Reactions']
    input_gk_scaled = components['gk_similarity_scaler'].transform(gk_df[gk_similarity_features])
    
    # Calculate distances and find the closest matches
    distances = euclidean_distances(input_gk_scaled, components['scaled_gk_attributes'])
    similar_gk_indices = distances.argsort().flatten()[1 : num_similar + 1]
    
    similar_gks_df = components['gk_player_info'].iloc[similar_gk_indices]
    similar_players = similar_gks_df.to_dict('records')

    return {
        'predicted_ovr': predicted_ovr,
        'predicted_league_tier': predicted_league,
        'similar_players': similar_players
    }

In [19]:
world_class_gk_profile = {
    # Main GK Stats
    'GK Diving': 80, 'GK Handling': 77, 'GK Kicking': 85, 'GK Positioning': 75, 'GK Reflexes': 67,
    # Other important stats
    'Reactions': 81,
    # Face card stats (even if low, they are needed for the league model)
    'PAC': 60, 'SHO': 55, 'PAS': 65, 'DRI': 60, 'DEF': 40, 'PHY': 85
}
# Get the complete profile
full_gk_profile = predict_gk_profile(world_class_gk_profile)

# --- Display the Results ---
print("--- Full Goalkeeper Profile Prediction ---")
print(f"  - Predicted Overall (OVR): {full_gk_profile['predicted_ovr']}")
print(f"  - Predicted League Tier: {full_gk_profile['predicted_league_tier']}")
print("\n--- Top 5 Similar Goalkeepers ---")
for player in full_gk_profile['similar_players']:
    print(f"  - {player['Name']} ({player['OVR']} {player['Position']}) - {player['Team']}")

--- Full Goalkeeper Profile Prediction ---
  - Predicted Overall (OVR): 76
  - Predicted League Tier: Lower Tier

--- Top 5 Similar Goalkeepers ---
  - Matz Sels (77 GK) - Nott'm Forest
  - Justin Bijlow (77 GK) - Feyenoord
  - Ørjan Nyland (76 GK) - Sevilla FC
  - Benjamin Lecomte (77 GK) - Montpellier
  - Kasper Schmeichel (80 GK) - Celtic
