6. Splitting the Dataset into Training and Testing


We will select relevant features and the target variable then split the data into training and testing sets for model building and evaluation.

In [3]:
import pandas as pd
df=pd.read_excel('data/cleaned/sofascore/sofascore_players_cleaned.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: 'data/cleaned/sofascore/sofascore_players_cleaned.xlsx'

In [None]:
# Define features and target
from sklearn.model_selection import train_test_split


features = [
    'age',
    'overall_rating',
    'p_matches_played',
    'assists',
    'expected_goals_(xg)',
    'successful_dribbles',
    'tackles',
    'accurate_passes_%',
    'p_goals_per_90',
    'assists_per_90',
    'expected_goals_per_90',
    'p_minutes_played',
    
]
X = df[features]
y = df['p_goals']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Standardize features
from sklearn.discriminant_analysis import StandardScaler


goal_scaler = StandardScaler()
X_train_scaled = goal_scaler.fit_transform(X_train)
X_test_scaled = goal_scaler.transform(X_test)


In [None]:
# Train model
from sklearn.metrics import r2_score  
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


goal_model = RandomForestRegressor(n_estimators=100, random_state=42)
goal_model.fit(X_train_scaled, y_train)

# Predict
y_pred = goal_model.predict(X_test_scaled)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output evaluation
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Absolute Error (MAE): 0.25
R² Score: 0.92


Build the Deep Learning Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.discriminant_analysis import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np


In [None]:
# Feature columns
features = [
    'age', 'overall_rating', 'p_goals', 'assists',
    'expected_goals_(xg)', 'successful_dribbles', 'tackles',
    'accurate_passes_%',  'points', 'p_matches_played',
    'p_goals_per_90', 'assists_per_90', 'expected_goals_per_90', 'p_minutes_played'
]

X = df[features].fillna(0)
df['log_market_value'] = np.log1p(df['market_value'])
y = df['log_market_value']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
value_scaler = StandardScaler()
X_train_scaled = value_scaler.fit_transform(X_train)
X_test_scaled = value_scaler.transform(X_test)

In [None]:


# Build model
value_model = Sequential()
value_model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
value_model.add(Dropout(0.2))
value_model.add(Dense(32, activation='relu'))
value_model.add(Dropout(0.2))
value_model.add(Dense(1))  # Regression output

value_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train model
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = value_model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stop], verbose=1)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 49934.7305 - mae: 155.7777 - val_loss: 102.4054 - val_mae: 8.1954
Epoch 2/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 14652.6562 - mae: 83.3819 - val_loss: 37.5034 - val_mae: 5.1357
Epoch 3/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5099.0591 - mae: 50.4428 - val_loss: 13.9319 - val_mae: 3.1713
Epoch 4/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2864.1875 - mae: 35.7620 - val_loss: 2.3268 - val_mae: 1.1941
Epoch 5/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1310.7671 - mae: 24.2812 - val_loss: 15.5554 - val_mae: 3.6582
Epoch 6/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 892.7627 - mae: 20.2346 - val_loss: 81.6030 - val_mae: 8.5983
Epoch 7/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

In [None]:
# Evaluate on test set
loss, mae = value_model.evaluate(X_test_scaled, y_test)
print(f"Test MAE (log space): {mae:.2f}")

# Predict and reverse the log scale
y_pred_log = value_model.predict(X_test_scaled)
y_pred_actual = np.expm1(y_pred_log.flatten())  # exp(log(x)) - 1
y_test_actual = np.expm1(y_test)

# Real-world MAE (in million euros)
real_mae = mean_absolute_error(y_test_actual, y_pred_actual)
print(f"Real Market Value MAE: €{real_mae:.2f}M")


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 24.9377 - mae: 4.9887 
Test MAE (log space): 4.99
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Real Market Value MAE: €140.34M


In [None]:
import joblib

# Save the model
joblib.dump(goal_model, 'goal_model.pkl')

# Save the scaler
joblib.dump(goal_scaler, 'goal_scaler.pkl')
 

value_model.save("market_value_model.h5")
joblib.dump(value_scaler, "market_value_scaler.pkl")




['market_value_scaler.pkl']