In [234]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import preprocessing

# Fit Random Forest regression model to data

### Load data, feature engineering and encoding

In [63]:
def oneHotEncode(df,colNames):
    for col in colNames:
        if( df[col].dtype == np.dtype('object')):
            dummies = pd.get_dummies(df[col],prefix=col)
            df = pd.concat([df,dummies],axis=1)

            #drop the encoded column
            df.drop([col],axis = 1 , inplace=True)
    return df

data = pd.read_csv("merged_data.csv")

positions = [(data.Pos == "FW") | (data.Pos == "FWMF") | (data.Pos == "FWDF"),
             (data.Pos == "MF") | (data.Pos == "MFFW") | (data.Pos == "MFDF"),
             (data.Pos == "DF") | (data.Pos == "DFMF") | (data.Pos == "DFFW")]
posNames = ["Forward","Midfielder","Defender"]
gPos = np.select(positions,posNames)

# Create encoding for the categorical variable
new_var = pd.get_dummies(gPos, drop_first=True)
print(data)

data_new = data.join(new_var)

            date    datetime    dateweek  player_id  current_club_id  \
0     2022-08-05  2022-08-05  2022-08-01      41414              398   
1     2022-08-05  2022-08-05  2022-08-01     661284            55686   
2     2022-07-29  2022-07-29  2022-07-25     663225               29   
3     2022-07-29  2022-07-29  2022-07-25     670883              148   
4     2022-07-29  2022-07-29  2022-07-25     289846             1010   
...          ...         ...         ...        ...              ...   
2312  2021-12-22  2021-12-22  2021-12-20     377387              167   
2313  2021-12-16  2021-12-16  2021-12-13      76277               40   
2314  2021-10-13  2021-10-13  2021-10-11      26399              131   
2315  2021-10-05  2021-10-05  2021-10-04     592474              987   
2316  2021-10-05  2021-10-05  2021-10-04     627248               29   

      market_value_in_eur player_club_domestic_competition_id  \
0                 1000000                                 IT1   
1    

### Create test train split

In [140]:
X = data_new.drop(columns=["market_value_in_eur"]).select_dtypes(exclude=['object'])
Y = data_new['market_value_in_eur']

X_scaler = preprocessing.StandardScaler().fit(X)
X_scaled = X_scaler.transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_scaled,Y,test_size = 0.33, shuffle = True)

In [187]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

fs = SelectKBest(score_func = f_regression, k = 20)
fs.fit(X_train, Y_train)

X_train_red = fs.transform(X_train)
X_test_red = fs.transform(X_test)

vars = pd.DataFrame(X.columns, columns = ['vars'])
scores = pd.DataFrame(fs.scores_, columns = ['scores'])

vars.join(scores).sort_values(by = 'scores', ascending=False).head(20)

Unnamed: 0,vars,scores
10,90s,290.210028
9,Min,290.15649
8,Starts,288.264526
7,MP,224.774937
3,stadium_seats,207.035885
123,RecTarg,98.458549
124,Rec,96.399803
115,Carries,95.695544
117,CarPrgDist,85.28812
116,CarTotDist,84.541563


# Creating models

### Linear Regression 

In [188]:
regr = linear_model.LinearRegression()

regr.fit(X_train_red, Y_train)
print("R^2 for training data is:", regr.score(X_train_red, Y_train))
print("R^2 for testing data is:", regr.score(X_test_red, Y_test))

R^2 for training data is: 0.3560908934994388
R^2 for testing data is: 0.37117506612756745


### Ridge Linear Regression

Since the regression show clear signs of overfitting, we need to apply regularization. Ridge regression is also referred to as L2 Regularization. 

In [192]:
for i in range(0,100, 10):
    regr = linear_model.Ridge(alpha = i)
    regr.fit(X_train_red, Y_train)
    print('\n','Alpha set to', i)
    print("R^2 for training data is:", regr.score(X_train_red, Y_train))
    print("R^2 for testing data is:", regr.score(X_test_red, Y_test))


 Alpha set to 0
R^2 for training data is: 0.35600945582396226
R^2 for testing data is: 0.370745536932511

 Alpha set to 10
R^2 for training data is: 0.35385841125540796
R^2 for testing data is: 0.37138858046017464

 Alpha set to 20
R^2 for training data is: 0.35194015637296494
R^2 for testing data is: 0.36973289081860605

 Alpha set to 30
R^2 for training data is: 0.35019091058166996
R^2 for testing data is: 0.36805012863553244

 Alpha set to 40
R^2 for training data is: 0.34860130996227556
R^2 for testing data is: 0.36644325573261527

 Alpha set to 50
R^2 for training data is: 0.34715330792042176
R^2 for testing data is: 0.3649356183878364

 Alpha set to 60
R^2 for training data is: 0.3458285180970535
R^2 for testing data is: 0.36352702474770116

 Alpha set to 70
R^2 for training data is: 0.3446101789057451
R^2 for testing data is: 0.36220968796807207

 Alpha set to 80
R^2 for training data is: 0.3434837618839287
R^2 for testing data is: 0.36097392165730413

 Alpha set to 90
R^2 for 

In [199]:
regr = linear_model.RidgeCV(alphas = [0.01, 0.1, 1, 10, 100]).fit(X_train_red, Y_train)

print("R^2 for training data is {}".format(regr.score(X_train_red, Y_train)))
print("R^2 for testing data is {}".format(regr.score(X_test_red, Y_test)))
print("Alpga used is {}".format(regr.alpha_))

R^2 for training data is 0.355493542664586
R^2 for testing data is 0.3722754386029551
Alpga used is 1.0


  w = ((singvals_sq + alpha) ** -1) - (alpha ** -1)


### Random Forest

Basic model with depth of 2.

In [206]:
regr = RandomForestRegressor(max_depth=2, random_state=0).fit(X_train_red, Y_train)

print("R^2 for training data is:", regr.score(X_train_red, Y_train))
print("R^2 for testing data is:", regr.score(X_test_red, Y_test))

R^2 for training data is: 0.3814398280038128
R^2 for testing data is: 0.2920456697382401


Simple model optimization for depth. 

In [207]:
for i in range(1,6):
    regr = RandomForestRegressor(max_depth=i, random_state=0)
    regr.fit(X_train_red, Y_train)
    print('\n', 'Max depth set to', i)
    print("R^2 for training data is:", regr.score(X_train_red, Y_train))
    print("R^2 for testing data is:", regr.score(X_test_red, Y_test))


 Max depth set to 1
R^2 for training data is: 0.18154940394665586
R^2 for testing data is: 0.15690928547495697

 Max depth set to 2
R^2 for training data is: 0.3814398280038128
R^2 for testing data is: 0.2920456697382401

 Max depth set to 3
R^2 for training data is: 0.5029434674962263
R^2 for testing data is: 0.37216849359698223

 Max depth set to 4
R^2 for training data is: 0.5958326528995674
R^2 for testing data is: 0.41672854374657475

 Max depth set to 5
R^2 for training data is: 0.6739725881364853
R^2 for testing data is: 0.442269818051107


The Random Forest model performs noticably better than the Ridge Regression. The marginal increase after depth = 5 is very limited, since the training score gains are not transfered to the testing score. Therefore we use depth = 5 going forward.

# XG Boost

In [253]:
regr = GradientBoostingRegressor(random_state=0)
regr.fit(X_train_red, Y_train)

print("R^2 for training data is:", regr.score(X_train_red, Y_train))
print("R^2 for testing data is:", regr.score(X_test_red, Y_test))

R^2 for training data is: 0.7998655960770708
R^2 for testing data is: 0.4604091358810396


### Hyper-parameter tuning

In [247]:
param_grid = {
    "max_depth": [2,3,4],
    "min_samples_split": [2,3,4],
    "learning_rate": [0.01],
    "loss": ["squared_error"],
}

search = GridSearchCV(estimator=GradientBoostingRegressor(random_state=0), param_grid = param_grid, cv = 5)
search.fit(X_train_red, Y_train)
print("Best parameters was: {}".format(search.best_estimator_))
print("Best score was: {}".format(search.best_score_))

Best parameters was: GradientBoostingRegressor(learning_rate=0.01, max_depth=4, min_samples_split=4,
                          random_state=0)
Best score was: 0.33445829872984134


# MLP Regressor

In [259]:
from sklearn.neural_network import MLPRegressor

regr = MLPRegressor(random_state=0, activation='logistic', solver = 'lbfgs', alpha = 1, max_iter = 2000).fit(X_train_red, Y_train)
print("R^2 for training data is:", regr.score(X_train_red, Y_train))
print("R^2 for testing data is:", regr.score(X_test_red, Y_test))

R^2 for training data is: 0.2663210827571666
R^2 for testing data is: 0.28267675834810135


In [260]:
param_grid = {
    'activation': ['logistic'], 
    'solver': ['lbfgs'], 
    'alpha': [0.1, 0.2, 0.4, 0.6, 0.8, 1]
}

search = GridSearchCV(estimator=MLPRegressor(random_state=0, max_iter = 2000), param_grid = param_grid, cv = 5)
search.fit(X_train_red, Y_train)
print("Best parameters was: {}".format(search.best_estimator_))
print("Best score was: {}".format(search.best_score_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
