In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from pathlib import Path

In [7]:
# Load CSV file
filepath = Path.cwd().parent/"dataSet" / "player_stats.csv"
data = pd.read_csv(filepath, encoding='latin1')

# Remove dots and dollar signs, and convert to float
data['value'] = data['value'].replace('[^\d]', '', regex=True).astype(float)

# Remove null column and player column as the name should not affect the market value  
data.drop('marking', axis=1, inplace=True)
data.drop('player', axis=1, inplace=True)

#one hot encode the country and club features
encoded_df = pd.get_dummies(data, columns=['country','club'], prefix=['country','club'])

In [8]:
# Separate features and target
X = encoded_df.drop('value', axis=1)  
y = encoded_df['value']

# Split the dataset into training and test sets (70% training, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = 0.7, random_state=42)

# Further split the test set into training and validation sets (70% testing, 30% validation)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,train_size = 0.7, random_state=42)

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Create SVR model
svm_model = SVR(kernel='poly')

# Train the model
svm_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = svm_model.predict(X_test)

# Calculate evaluation metrics (MSE and R-squared)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared Score: {r2}')

Mean Squared Error (MSE): 77383602009153.78
R-squared Score: -0.07165676103191321


In [20]:
from sklearn.model_selection import GridSearchCV
# Define SVM model
svm_model = SVR()

# Define hyperparameters grid for tuning
param_grid = {'C': [20, 30, 42], 'kernel': ['Exponential', 'rbf', 'poly'], 'gamma': [5, 15, 32, 0.6]}
print("running")
# Perform GridSearchCV
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=5)
grid_search.fit(X_train_scaled, y_train)

# Get best hyperparameters
best_params = grid_search.best_params_

# Train SVM model with best hyperparameters
best_svm_model = SVR(**best_params)
best_svm_model.fit(X_train_scaled, y_train)
                  # Predictions
y_pred = best_svm_model.predict(X_test_scaled)

# Evaluate with different metrics
r2_score = best_svm_model.score(X_test_scaled, y_test)
mse = mean_squared_error(y_test, y_pred)

print(f"R^2 Score: {r2_score}")
print(f"Mean Squared Error: {mse}")

running
Fitting 5 folds for each of 36 candidates, totalling 180 fits


60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rifaat/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rifaat/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/rifaat/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/rifaat/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_paramete

R^2 Score: -0.23410155434322877
Mean Squared Error: 89113629468652.78
[CV 3/5] END ...C=20, gamma=5, kernel=Exponential;, score=nan total time=   0.2s
[CV 2/5] END ........C=20, gamma=5, kernel=rbf;, score=-0.116 total time=  12.2s
[CV 5/5] END ........C=20, gamma=5, kernel=rbf;, score=-0.089 total time=  13.2s
[CV 4/5] END ........C=20, gamma=5, kernel=poly;, score=0.309 total time=  15.8s
[CV 3/5] END .......C=20, gamma=15, kernel=rbf;, score=-0.078 total time=  17.0s
[CV 2/5] END .......C=20, gamma=15, kernel=poly;, score=0.447 total time=  16.8s
[CV 1/5] END ..C=20, gamma=32, kernel=Exponential;, score=nan total time=   0.0s
[CV 2/5] END ..C=20, gamma=32, kernel=Exponential;, score=nan total time=   0.1s
[CV 3/5] END ..C=20, gamma=32, kernel=Exponential;, score=nan total time=   0.0s
[CV 4/5] END ..C=20, gamma=32, kernel=Exponential;, score=nan total time=   0.1s
[CV 5/5] END ..C=20, gamma=32, kernel=Exponential;, score=nan total time=   0.0s
[CV 1/5] END .......C=20, gamma=32, ker

[CV 4/5] END ...C=20, gamma=5, kernel=Exponential;, score=nan total time=   0.2s
[CV 5/5] END ...C=20, gamma=5, kernel=Exponential;, score=nan total time=   0.0s
[CV 4/5] END ........C=20, gamma=5, kernel=rbf;, score=-0.103 total time=  12.3s
[CV 2/5] END ........C=20, gamma=5, kernel=poly;, score=0.447 total time=  16.1s
[CV 2/5] END ..C=20, gamma=15, kernel=Exponential;, score=nan total time=   0.1s
[CV 4/5] END ..C=20, gamma=15, kernel=Exponential;, score=nan total time=   0.1s
[CV 1/5] END .......C=20, gamma=15, kernel=rbf;, score=-0.057 total time=  13.7s
[CV 4/5] END .......C=20, gamma=15, kernel=rbf;, score=-0.103 total time=  17.4s
[CV 3/5] END .......C=20, gamma=15, kernel=poly;, score=0.424 total time=  16.7s
[CV 2/5] END .......C=20, gamma=32, kernel=rbf;, score=-0.116 total time=  11.0s
[CV 1/5] END .......C=20, gamma=32, kernel=poly;, score=0.359 total time=  14.6s
[CV 5/5] END .......C=20, gamma=32, kernel=poly;, score=0.369 total time=  13.0s
[CV 5/5] END ......C=20, gam