6. Splitting the Dataset into Training and Testing


We will select relevant features and the target variable then split the data into training and testing sets for model building and evaluation.

In [1]:
import pandas as pd
df=pd.read_excel('encoded_data.xlsx')

In [None]:
# Define features and target
from sklearn.model_selection import train_test_split


features = [
    'age',
    'overall_rating',
    'p_matches_played',
    'assists',
    'expected_goals_(xg)',
    'successful_dribbles',
    'tackles',
    'accurate_passes_%',
    'p_goals_per_90',
    'assists_per_90',
    'expected_goals_per_90',
    'p_minutes_played',
    
]
X = df[features]
y = df['p_goals']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
# Standardize features
from sklearn.discriminant_analysis import StandardScaler


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
# Train model
from sklearn.metrics import r2_score  
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output evaluation
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Absolute Error (MAE): 0.23
R² Score: 0.93


Build the Deep Learning Model

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.discriminant_analysis import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np


In [14]:
# Feature columns
features = [
    'age', 'overall_rating', 'p_goals', 'assists',
    'expected_goals_(xg)', 'successful_dribbles', 'tackles',
    'accurate_passes_%',  'points', 'p_matches_played',
    'p_goals_per_90', 'assists_per_90', 'expected_goals_per_90', 'p_minutes_played'
]

X = df[features].fillna(0)
df['log_market_value'] = np.log1p(df['market_value'])
y = df['log_market_value']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:


# Build model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))  # Regression output

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train model
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stop], verbose=1)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 54352.9258 - mae: 159.8654 - val_loss: 781.4849 - val_mae: 24.0691
Epoch 2/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 10548.4570 - mae: 70.4951 - val_loss: 481.4910 - val_mae: 18.7151
Epoch 3/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3748.4724 - mae: 41.7451 - val_loss: 9.1135 - val_mae: 2.7441
Epoch 4/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2019.1733 - mae: 30.3189 - val_loss: 34.8863 - val_mae: 4.9936
Epoch 5/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1307.8899 - mae: 23.5657 - val_loss: 12.0137 - val_mae: 3.0205
Epoch 6/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 772.0363 - mae: 17.2774 - val_loss: 6.7068 - val_mae: 2.2497
Epoch 7/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [16]:
loss, mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {mae:.2f}")


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2690 - mae: 0.4270 
Test MAE: 0.41
