In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# --- Data loading and preparation ---
# Read the player table and replace every missing value with 0 so that the
# later NumPy conversion and scaling never see NaN.
data_pd = pd.read_csv('final_df_new.csv').fillna(0)

# Expand the categorical 'Pos' and 'League' columns into indicator columns.
# A single call handles both; the 'Pos' indicators are appended first,
# followed by the 'League' indicators.
data_pd = pd.get_dummies(data_pd, columns=['Pos', 'League'])

# Nationality indicators: Spain, England, and a catch-all for everything else.
# Rows whose 'Nation' was missing hold 0 after the fill above, so they match
# neither code and land in the catch-all.
data_pd['CountryESP'] = data_pd['Nation'].eq('ESP').astype(int)
data_pd['CountryENG'] = data_pd['Nation'].eq('ENG').astype(int)
data_pd['CountryOther'] = (~data_pd['Nation'].isin(['ESP', 'ENG'])).astype(int)

# Train-test split
# Half of the rows go to each side. The fixed random_state makes the shuffle,
# and therefore every metric computed from this split, reproducible from run
# to run instead of changing on each execution.
train_df, test_df = train_test_split(data_pd, test_size=0.5, random_state=42)

# Drop unnecessary columns
# 'Market Value' is the regression target, so it must not remain among the
# features; the other columns listed here are removed before the feature
# matrix is converted to a NumPy array.
dropped_columns = ['Unnamed: 0', 'Player', 'Nation', 'Squad', '90s', 'Market Value']
train_df_dropped_cols = train_df.drop(dropped_columns, axis=1)
test_df_dropped_cols = test_df.drop(dropped_columns, axis=1)

# Convert to numpy arrays
train_arr = train_df_dropped_cols.to_numpy()
test_arr = test_df_dropped_cols.to_numpy()

# Normalize the feature data
# The scaler is fitted on the training half only and then re-used for the
# test half, so no test-set statistics leak into the preprocessing.
train_features_sc = StandardScaler()
train_features_norm = train_features_sc.fit_transform(train_arr)
test_features_norm = train_features_sc.transform(test_arr)

# Normalize the target variable
# StandardScaler expects 2-D input, hence the reshape to a single column.
train_mktval = train_df['Market Value'].to_numpy().reshape(-1, 1)
test_mktval = test_df['Market Value'].to_numpy().reshape(-1, 1)

# As with the features, the target scaler learns its mean/variance from the
# training targets only.
train_mktval_sc = StandardScaler()
train_mktval_norm = train_mktval_sc.fit_transform(train_mktval)
test_mktval_norm = train_mktval_sc.transform(test_mktval)

# --- SVM regression: fit ---
# RBF-kernel support vector regressor trained on the standardized features.
# The standardized target is a single-column 2-D array, so it is flattened
# to 1-D before being handed to fit().
svm_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svm_model.fit(train_features_norm, train_mktval_norm.ravel())

# --- Predict on both halves ---
predictions = svm_model.predict(test_features_norm)
train_predictions = svm_model.predict(train_features_norm)

# --- Score on the standardized target scale ---
# Both metrics compare standardized targets with standardized predictions,
# so the MSE values are in standardized units, not in market-value units.
mse, r2 = (
    metric(test_mktval_norm, predictions)
    for metric in (mean_squared_error, r2_score)
)
train_mse, train_r2 = (
    metric(train_mktval_norm, train_predictions)
    for metric in (mean_squared_error, r2_score)
)

# --- Map test predictions and targets back to the original scale ---
# inverse_transform needs 2-D input, so the 1-D prediction vector gets a
# trailing axis first.
predictions_original_scale = train_mktval_sc.inverse_transform(predictions[:, np.newaxis])
test_mktval_original_scale = train_mktval_sc.inverse_transform(test_mktval_norm)

# --- Report ---
# Test-set metrics
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

# First 10 test predictions next to the first 10 actual values
print("Predictions (original scale):", predictions_original_scale[:10, 0])
print("Actual values (original scale):", test_mktval_original_scale[:10, 0])

# Training-set metrics, for comparison against the test-set figures
print(f"Train Mean Squared Error (MSE): {train_mse:.4f}")
print(f"Train R-squared (R2): {train_r2:.4f}")

Mean Squared Error (MSE): 0.4130
R-squared (R2): 0.4164
Predictions (original scale): [ 5523517.31973698 12023571.783857   54462122.14480911 23478867.44223009
  8586109.06671578 18578111.48368444 15666426.57055459  1205302.52818821
 34112329.77152855  4794224.7202271 ]
Actual values (original scale): [ 4000000. 50000000. 40000000. 17000000.  8000000. 30000000. 30000000.
  5000000. 55000000. 20000000.]
Train Mean Squared Error (MSE): 0.3028
Train R-squared (R2): 0.6972
