In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import preprocessing

# Fit Random Forest regression model to data

### Load data, feature engineering and encoding

In [20]:
data = pd.read_csv("../data/merged_data.csv")
positions = [(data.Pos == "FW") | (data.Pos == "FWMF") | (data.Pos == "FWDF"),
             (data.Pos == "MF") | (data.Pos == "MFFW") | (data.Pos == "MFDF"),
             (data.Pos == "DF") | (data.Pos == "DFMF") | (data.Pos == "DFFW")]
posNames = ["Forward","Midfielder","Defender"]
gPos = np.select(positions,posNames)

# Create encoding for the categorical variable
new_var = pd.get_dummies(gPos, drop_first=True)
print(data)

data_new = data.join(new_var)

            date    datetime    dateweek  player_id  current_club_id  \
0     2022-08-05  2022-08-05  2022-08-01      41414              398   
1     2022-08-05  2022-08-05  2022-08-01     661284            55686   
2     2022-07-29  2022-07-29  2022-07-25     663225               29   
3     2022-07-29  2022-07-29  2022-07-25     670883              148   
4     2022-07-29  2022-07-29  2022-07-25     289846             1010   
...          ...         ...         ...        ...              ...   
2312  2021-12-22  2021-12-22  2021-12-20     377387              167   
2313  2021-12-16  2021-12-16  2021-12-13      76277               40   
2314  2021-10-13  2021-10-13  2021-10-11      26399              131   
2315  2021-10-05  2021-10-05  2021-10-04     592474              987   
2316  2021-10-05  2021-10-05  2021-10-04     627248               29   

      market_value_in_eur player_club_domestic_competition_id  \
0                 1000000                                 IT1   
1    

### Create test train split and transform variables

In [21]:
X = data_new.drop(columns=["market_value_in_eur"]).select_dtypes(exclude=['object'])
Y = data_new['market_value_in_eur']

Y_log = np.log(Y)
X_scaler = preprocessing.StandardScaler().fit(X)
X_scaled = X_scaler.transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_scaled,Y_log,test_size = 0.33, shuffle = True)

In [32]:
fs = SelectKBest(score_func = f_regression, k = 20)
fs.fit(X_train, Y_train)

X_train_red = fs.transform(X_train)
X_test_red = fs.transform(X_test)

vars = pd.DataFrame(X.columns, columns = ['vars'])
scores = pd.DataFrame(fs.scores_, columns = ['scores'])

vars.join(scores).sort_values(by = 'scores', ascending=False).head(20).reset_index(drop = True)

Unnamed: 0,vars,scores
0,MP,666.554232
1,90s,646.15148
2,Min,646.054621
3,Starts,627.899742
4,stadium_seats,192.941653
5,DriSucc%,167.013145
6,PasLonCmp%,135.312114
7,ShoDist,132.188172
8,G/SoT,125.037578
9,SoT%,116.06471


According to the regression framework (f_regression), these are the 20 variables that perform the best. 

# Creating models

### Linear Regression 

In [23]:
regr = linear_model.LinearRegression()

regr.fit(X_train_red, Y_train)
print("R^2 for training data is:", regr.score(X_train_red, Y_train))
print("R^2 for testing data is:", regr.score(X_test_red, Y_test))

R^2 for training data is: 0.4676995953678198
R^2 for testing data is: 0.4763105575048827


### Ridge Linear Regression

Since the regression show clear signs of overfitting, we need to apply regularization. Ridge regression is also referred to as L2 Regularization. 

In [26]:
#linear_model.Ridge(alpha = 0.1, random_state = 0).fit(X_train_red, Y_train)
regr = linear_model.RidgeCV(alphas = [0.01, 0.1, 1, 10]).fit(X_train_red, Y_train) # Does the hyper parameter tuning for us

print("R^2 for training data is {}".format(regr.score(X_train_red, Y_train)))
print("R^2 for testing data is {}".format(regr.score(X_test_red, Y_test)))
print("Alpha used is {}".format(regr.alpha_))

R^2 for training data is 0.46732165011547966
R^2 for testing data is 0.47525845439570746
Alpha used is 10.0


  w = ((singvals_sq + alpha) ** -1) - (alpha ** -1)


### Random Forest

Basic model with depth of 2.

In [27]:
regr = RandomForestRegressor(max_depth=2, random_state=0).fit(X_train_red, Y_train)

print("R^2 for training data is:", regr.score(X_train_red, Y_train))
print("R^2 for testing data is:", regr.score(X_test_red, Y_test))

R^2 for training data is: 0.4073640699900307
R^2 for testing data is: 0.38086821198679177


**Hyper Parameter Tuning**

In [28]:
param_grid = {
    "max_depth": [1,2,3,4,5]
}

search = GridSearchCV(estimator=RandomForestRegressor(random_state=0), param_grid = param_grid, cv = 5)
search.fit(X_train_red, Y_train)
print("Best parameters was: {}".format(search.best_estimator_))
print("Best score was: {}".format(search.best_score_))

Best parameters was: RandomForestRegressor(max_depth=5, random_state=0)
Best score was: 0.46742572961517004


# XG Boost

In [29]:
regr = GradientBoostingRegressor(random_state=0)
regr.fit(X_train_red, Y_train)

print("R^2 for training data is:", regr.score(X_train_red, Y_train))
print("R^2 for testing data is:", regr.score(X_test_red, Y_test))

R^2 for training data is: 0.7097531363692727
R^2 for testing data is: 0.5195733303251673


**Hyper-parameter tuning**

In [30]:
param_grid = {
    "max_depth": [2,3,4,5],
    "min_samples_split": [2,3,4,5],
    "learning_rate": [0.01],
    "loss": ["squared_error"],
}

search = GridSearchCV(estimator=GradientBoostingRegressor(random_state=0), param_grid = param_grid, cv = 5)
search.fit(X_train_red, Y_train)
print("Best parameters was: {}".format(search.best_estimator_))
print("Best score was: {}".format(search.best_score_))

Best parameters was: GradientBoostingRegressor(learning_rate=0.01, max_depth=5, min_samples_split=5,
                          random_state=0)
Best score was: 0.4030636842159071


XGBoost shows signs of overfitting, and may therefore not be the best choice.

# MLP Regressor

In [31]:
from sklearn.neural_network import MLPRegressor

regr = MLPRegressor(random_state=0, activation='logistic', solver = 'lbfgs', alpha = 1, max_iter = 1000).fit(X_train_red, Y_train)
print("R^2 for training data is:", regr.score(X_train_red, Y_train))
print("R^2 for testing data is:", regr.score(X_test_red, Y_test))

R^2 for training data is: 0.7928114889734811
R^2 for testing data is: 0.35820431489201765


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


**Hyper Parameter tuning**

In [18]:
param_grid = {
    'activation': ['logistic'], 
    'solver': ['lbfgs'], 
    'alpha': [0.1, 0.5, 1]
}

search = GridSearchCV(estimator=MLPRegressor(random_state=0, max_iter = 2000), param_grid = param_grid, cv = 5)
search.fit(X_train_red, Y_train)
print("Best parameters was: {}".format(search.best_estimator_))
print("Best score was: {}".format(search.best_score_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Best parameters was: MLPRegressor(activation='logistic', alpha=1, max_iter=2000, random_state=0,
             solver='lbfgs')
Best score was: 0.31101179992994804


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
