In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)
import pickle

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.decomposition import PCA # If algorithms are taking too much time

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor

# Baseline from FM

In [2]:
a = pd.read_csv('data/clean_data.csv')
a.value = a.value.str.replace('k', '000').str.replace('m', '000000').str.replace('£', '').astype(int)/1000000
a.cost = a.cost.str.replace('k', '000').str.replace('m', '000000').str.replace('£', '').astype(int)/1000000
a = a[['fee_cleaned','cost','value']].dropna()
a.fee_cleaned = a.fee_cleaned.astype(int)
r2_score(a.fee_cleaned, a.cost), r2_score(a.fee_cleaned, a.value)

(-0.20842484604363998, 0.42839417062946805)

# Actually Evaluating

In [3]:
identifiers = ['player_name', 'club_involved_name', 'season']
player_var = ['age', 'wage', 'ratingFM', 'potential', 'injprone', 'afterloan', 'contract']
team_var = ['reputation', 'balance', 'wagebudget', 'trainingfacilities', 'youthfacilities', 'youthacademy', 'stadiumcap', 'avgattendance']
stats_var = ['gamesPlayed', 'subedOn', 'MotM', 'ratingWS']
offensive_var = ['goalsScored', 'assists', 'shots', 'passes', 'passesSuccess', 'aerialsWon', 'keyPasses', 'dribbles', 'offsides', 'dispossessed', 'badTouch', 'crosses', 'longBalls', 'throughBalls']
defensive_var = ['yellow', 'red', 'tackles', 'interceptions', 'fouls', 'offsidesProvoked', 'clearances', 'blockedPass', 'ownGoal']
previous_var = ['ATaction', 'ATgoals', 'ATassists', 'ATshots', 'ATpasses', 'ATpassesSuccess', 'ATtackles', 'ATinterceptions', 'ATMotM', 'ATrating', 'CLaction', 'CLgoals', 'CLassists', 'CLMotM', 'CLrating', 'ELaction', 'ELgoals', 'ELassists', 'ELMotM', 'ELrating']
pos_var = ['GK', 'DEF', 'MID', 'STR', 'Wing', 'Central']
external_var = ['year', 'GDP', 'CountryPopulation', 'UEFAranking', 'eurgbp']
features = player_var + team_var + stats_var + offensive_var + defensive_var + previous_var + pos_var + external_var
labels = ['fee_cleaned', 'fee_log']
target = 'fee_cleaned'

removed_features = []

In [4]:
## Bayesian optimization-based feature selection
### Objective function
def objective_selection(trial):
    print(trial.number, end = '\r')
    weighted_df = df[features + [target]]
    for i, feature in enumerate(features):
        if trial.suggest_int(feature, 0, 1) == 0:
            weighted_df = weighted_df.drop(feature, axis = 1)
    features_list = list(weighted_df.drop(target, axis = 1))

    if len(features_list) == 0:
        return 99999999999

    train_set = weighted_df.iloc[:int(internal_test_validation_split * len(weighted_df))]
    test_set = weighted_df.iloc[int(internal_test_validation_split * len(weighted_df)):]

    model.fit(train_set[features_list], train_set[target])
    return -r2_score(model.predict(test_set[features_list]), test_set[target])

def bayesian_optimization_selection(model, df, features, target, n_trials = 500):
    study = optuna.create_study()
    study.optimize(objective_selection, n_trials=n_trials)

    # need to normalize for feature mean value
    selected_features = []
    for feature in features:
        if study.best_trial.params[feature]:
            selected_features.append(feature)

    return selected_features, study.best_trial.number

In [5]:
## Bayesian optimization-based feature weighting
def objective_weighting(trial):
    print(trial.number, end = '\r')
    weighted_df = df.iloc[:test_loc].copy()
    for i, feature in enumerate(features):
        weighted_df[feature] *= trial.suggest_uniform(feature, 0, 1)

    train_set = weighted_df.iloc[:int(internal_test_validation_split * len(weighted_df))]
    test_set = weighted_df.iloc[int(internal_test_validation_split * len(weighted_df)):]

    model.fit(train_set[features], train_set['fee_log'])
    return -r2_score(model.predict(test_set[features]), test_set['fee_log']) 

def bayesian_optimization_weighting(model, df, features, target, n_trials = 500):
    study = optuna.create_study()
    study.optimize(objective_weighting, n_trials=n_trials)

    # need to normalize for feature mean value
    weighted_df = df.copy()
    for feature in features:
        weighted_df[feature] *= study.best_trial.params[feature]

    return weighted_df, study.best_trial.number

In [6]:
external_test_validation_split = 0.8
internal_test_validation_split = 0.6

In [7]:
df = pd.read_csv('data/data.csv')
df = df[df.fee_cleaned > 0]
df = df.sample(len(df), random_state=0)
test_loc = int(external_test_validation_split * len(df))

In [8]:
df

Unnamed: 0.1,Unnamed: 0,player_name,club_involved_name,season,age,wage,value,cost,ratingFM,potential,...,STR,Wing,Central,year,GDP,CountryPopulation,UEFAranking,eurgbp,fee_cleaned,fee_log
904,918,Tommy Elphick,Aston Villa,2016/2017,0.147256,-0.081761,-0.377166,-0.370716,0.013683,0.094465,...,-0.662387,-0.655120,1.025943,-0.457995,0.402287,0.172786,-1.231711,-0.020706,3.51,1.506297
159,160,Samuel Umtiti,FC Barcelona,2016/2017,-1.317347,0.127030,0.006686,0.283480,0.157913,0.401829,...,-0.662387,-0.655120,1.025943,-0.457995,0.437768,0.177493,0.183647,-0.020706,22.50,3.157000
974,989,Maicosuel,Ajax Amsterdam,2014/2015,0.147256,-0.238354,-0.377166,-0.458676,-0.687638,-0.101706,...,1.509692,1.526437,-0.974713,-1.481549,-0.053215,0.036322,-0.759925,-0.751406,2.70,1.308333
744,752,James McClean,Stoke City,2018/2019,0.391356,0.440217,0.902342,-0.132827,-0.473095,-0.036617,...,-0.662387,1.526437,-0.974713,0.565559,2.433959,-1.119264,0.419540,0.668371,5.04,1.798404
1452,1475,Miso Brecko,1.FC Nuremberg,2015/2016,0.879557,-0.212255,-0.505117,-0.370716,-0.593888,-0.058765,...,-0.662387,1.526437,-0.974713,-0.969772,0.624527,0.513024,-0.288139,-1.964614,0.45,0.371564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,773,Jasmin Kurtic,SPAL 2013,2018/2019,0.391356,-0.107859,0.518489,0.105063,0.518489,0.248147,...,-0.662387,-0.655120,1.025943,0.565559,-0.530556,-1.178832,-0.759925,0.668371,4.77,1.752672
835,846,Charlie Austin,West Bromwich Albion,2019/2020,0.635457,1.223183,1.286194,0.164535,1.329786,0.564551,...,1.509692,-0.655120,1.025943,1.077336,0.402287,0.172786,-1.231711,0.856442,3.87,1.583094
1216,1233,Tim Ream,Fulham FC,2015/2016,-0.096845,-0.342749,-0.121265,-0.370716,-0.821051,-0.049725,...,-0.662387,-0.655120,1.025943,-0.969772,1.078203,5.629762,0.419540,-1.964614,1.53,0.928219
559,564,José Manuel Jurado,Watford FC,2015/2016,0.391356,0.257525,0.006686,-0.132827,0.678946,0.242271,...,-0.662387,-0.655120,-0.974713,-0.969772,-1.384666,1.791458,1.127219,-1.964614,7.74,2.167910


# Linear Regressor

In [9]:
model = LinearRegression(n_jobs = -1)

In [10]:
bayesian_selection_LR = ['age', 'wage', 'ratingFM', 'afterloan', 'contract', 'balance', 'wagebudget', 'gamesPlayed', 'subedOn', 'MotM', 'ratingWS', 'assists', 'shots', 'aerialsWon', 'keyPasses', 'dribbles', 'offsides', 'dispossessed', 'throughBalls', 'yellow', 'red', 'tackles', 'interceptions', 'ownGoal', 'ATaction', 'ATpasses', 'ATtackles', 'ATrating', 'CLgoals', 'CLassists', 'ELaction', 'ELassists', 'ELrating', 'Wing', 'Central', 'year', 'GDP', 'CountryPopulation', 'UEFAranking']

In [11]:
scores = cross_val_predict(model, df[bayesian_selection_LR], df['fee_cleaned'], cv=120)
df['LR'] = [max(0,i) for i in scores]

In [12]:
r2_score(df.iloc[test_loc:].fee_cleaned, df.iloc[test_loc:].LR), r2_score(df.fee_cleaned, df.LR)

(0.6595954525286688, 0.6391123626191157)

# K-Nearest Neighbors

In [13]:
model = KNeighborsRegressor(n_neighbors=15, n_jobs = -1)

In [14]:
bayesian_selection_kNN = ['age', 'wage', 'ratingFM', 'contract', 'reputation', 'trainingfacilities', 'youthacademy', 'stadiumcap', 'avgattendance', 'gamesPlayed', 'MotM', 'ratingWS', 'goalsScored', 'shots', 'keyPasses', 'dribbles', 'offsides', 'dispossessed', 'crosses', 'throughBalls', 'tackles', 'offsidesProvoked', 'blockedPass', 'ownGoal', 'ATgoals', 'ATassists', 'ATshots', 'ATpasses', 'ATpassesSuccess', 'ATtackles', 'ATinterceptions', 'ATMotM', 'ATrating', 'ELaction', 'GK', 'DEF', 'year', 'UEFAranking', 'eurgbp']

In [15]:
BW_df, i = bayesian_optimization_weighting(model, df, bayesian_selection_kNN, 'fee_cleaned', n_trials = 1000)
i



949

In [16]:
scores = cross_val_predict(model, BW_df[bayesian_selection_kNN], BW_df['fee_cleaned'], cv=120)
df['BWKNN'] = scores

In [17]:
r2_score(df.iloc[test_loc:].fee_cleaned, df.iloc[test_loc:].BWKNN), r2_score(df.fee_cleaned, df.BWKNN)

(0.5596835040257819, 0.5390487454012527)

# Random Forest

In [18]:
model = RandomForestRegressor(n_estimators=300, n_jobs = -1)

In [19]:
scores = cross_val_predict(model, df[features], df['fee_cleaned'], cv=120)
df['RF'] = scores

In [20]:
r2_score(df.iloc[test_loc:].fee_cleaned, df.iloc[test_loc:].RF), r2_score(df.fee_cleaned, df.RF)

(0.5803939739853858, 0.5954374072917223)

# Gradient Boosting

In [21]:
model = GradientBoostingRegressor(n_estimators=1000)

In [22]:
scores = cross_val_predict(model, df[features], df['fee_cleaned'], cv=120)
df['GB'] = scores

In [23]:
r2_score(df.iloc[test_loc:].fee_cleaned, df.iloc[test_loc:].GB), r2_score(df.fee_cleaned, df.GB)

(0.5951985358845231, 0.6634573209271926)

# Deep Learning

In [24]:
model = MLPRegressor(hidden_layer_sizes=(200, 200, 200), activation='relu', learning_rate_init=0.001, max_iter=600)

In [25]:
bayesian_selection_NN = ['age', 'wage', 'ratingFM', 'injprone', 'afterloan', 'contract', 'reputation', 'balance', 'wagebudget', 'youthacademy', 'stadiumcap', 'gamesPlayed', 'ratingWS', 'assists', 'shots', 'passesSuccess', 'aerialsWon', 'keyPasses', 'dispossessed', 'badTouch', 'tackles', 'interceptions', 'fouls', 'blockedPass', 'ownGoal', 'ATassists', 'ATshots', 'ATpasses', 'CLaction', 'CLMotM', 'ELassists', 'ELMotM', 'DEF', 'CountryPopulation', 'UEFAranking', 'eurgbp']

In [26]:
scores = cross_val_predict(model, df[bayesian_selection_NN], df['fee_cleaned'], cv=120)
df['NN'] = scores

In [27]:
r2_score(df.iloc[test_loc:].fee_cleaned, df.iloc[test_loc:].NN), r2_score(df.fee_cleaned, df.NN)

(0.5733444365413221, 0.6776206972691076)

# Feature importance

In [28]:
modelLR = LinearRegression(n_jobs = -1)
modelLR.fit(df.iloc[:test_loc][bayesian_selection_LR], df.iloc[:test_loc].fee_cleaned)
#modelLR.coef_

LinearRegression(n_jobs=-1)

In [29]:
bw_weights = {}
for feature in bayesian_selection_kNN:
    bw_weights[feature] = BW_df[feature].mean() / df[feature].mean()

In [30]:
modelRF = RandomForestRegressor(n_estimators=300, n_jobs = -1)
modelRF.fit(df.iloc[:test_loc][features], df.iloc[:test_loc].fee_cleaned)
# modelRF.feature_importances_

RandomForestRegressor(n_estimators=300, n_jobs=-1)

In [31]:
modelGB = GradientBoostingRegressor(n_estimators=1000)
modelGB.fit(df.iloc[:test_loc][features], df.iloc[:test_loc].fee_cleaned)
# modelGB.feature_importances_

GradientBoostingRegressor(n_estimators=1000)

# Feature impacts

In [32]:
big_list = []
for i, feature in enumerate(features):
    small_list = [feature]
    if feature in bayesian_selection_LR:
        small_list.append(modelLR.coef_[bayesian_selection_LR.index(feature)])
    else:
        small_list.append(0)
    if feature in bw_weights.keys():
        small_list.append(bw_weights[feature])
    else:
        small_list.append(0)
    small_list.append(modelRF.feature_importances_[i])
    small_list.append(modelGB.feature_importances_[i])
    if feature in bayesian_selection_NN:
        small_list.append(1)
    else:
        small_list.append(0)
    big_list.append(small_list)

In [33]:
features_impacts = pd.DataFrame(big_list, columns=['feature','LR', 'BWKNN', 'RF', 'GB', 'NN'])
features_impacts.to_csv('misc/feature_impacts.csv')
features_impacts

Unnamed: 0,feature,LR,BWKNN,RF,GB,NN
0,age,-5.252281,0.998725,0.020144,0.047291,1
1,wage,5.513349,0.872637,0.144428,0.132652,1
2,ratingFM,2.882290,0.854993,0.032010,0.016201,1
3,potential,0.000000,1.000000,0.327152,0.322859,0
4,injprone,0.000000,1.000000,0.003201,0.001295,1
...,...,...,...,...,...,...
68,year,1.518230,0.418116,0.001982,0.000186,0
69,GDP,0.006010,1.000000,0.004010,0.001128,0
70,CountryPopulation,0.347647,1.000000,0.004572,0.004685,1
71,UEFAranking,-2.020774,0.933996,0.005542,0.009505,1


# Ensembling

In [34]:
LRstacking = pickle.load(open('misc/lrstacking.sav', 'rb'))
NNstacking = pickle.load(open('misc/nnstacking.sav', 'rb'))

In [35]:
df['LRstack'] = LRstacking.predict(df[['LR', 'BWKNN', 'RF', 'GB', 'NN']])
df['NNstack'] = NNstacking.predict(df[['LR', 'BWKNN', 'RF', 'GB', 'NN']])
df['SimpleAvg'] = (df.LR + df.BWKNN + df.RF + df.GB + df.NN) / 5

In [36]:
r2_score(df.loc[test_loc:].fee_cleaned, df.loc[test_loc:].SimpleAvg), r2_score(df.loc[test_loc:].fee_cleaned, df.loc[test_loc:].LRstack), r2_score(df.loc[test_loc:].fee_cleaned, df.loc[test_loc:].NNstack)

(0.6998018997588282, 0.7202572438702107, 0.7270250155720341)

In [37]:
r2_score(df.fee_cleaned, df.SimpleAvg), r2_score(df.fee_cleaned, df.LRstack), r2_score(df.fee_cleaned, df.NNstack)

(0.6959308706344443, 0.7205184199577142, 0.7294925193778399)

In [38]:
df[['player_name','club_involved_name','fee_cleaned','LR', 'BWKNN', 'RF', 'GB', 'NN', 'LRstack', 'NNstack']].sort_values('fee_cleaned', ascending = False).head(20)

Unnamed: 0,player_name,club_involved_name,fee_cleaned,LR,BWKNN,RF,GB,NN,LRstack,NNstack
0,Neymar,Paris Saint-Germain,199.8,115.019384,65.893333,90.9052,94.076057,158.211968,135.782007,139.422759
1,Kylian Mbappé,Paris Saint-Germain,130.5,75.49884,47.4,32.298367,64.341505,108.483593,90.270519,93.43661
2,Ousmane Dembélé,FC Barcelona,124.2,37.932904,33.255333,29.1499,45.342377,60.401954,52.838002,51.692281
3,João Félix,Atlético Madrid,113.4,30.947381,12.688267,41.838867,65.073057,81.397343,56.73133,70.389442
4,Antoine Griezmann,FC Barcelona,108.0,157.774607,62.503333,88.490267,87.992128,150.584702,136.817789,121.030181
5,Cristiano Ronaldo,Juventus FC,105.3,119.311156,68.551333,98.838783,80.960597,121.186454,118.177789,110.374377
6,Eden Hazard,Real Madrid,103.5,57.156228,46.173333,82.967067,86.928312,86.213095,79.947293,74.042026
7,Paul Pogba,Manchester United,94.5,49.31784,48.663333,53.6048,92.832793,91.049162,79.578102,72.176423
8,Gonzalo Higuaín,Juventus FC,81.0,40.791414,37.959333,43.773867,49.651552,21.544575,35.595506,39.264926
9,Harry Maguire,Manchester United,78.3,33.319654,22.662,22.76515,37.399004,34.714896,33.709079,35.577833


# Error discretization

In [39]:
quartile_data = []
tdf = df[(df.fee_cleaned > df.fee_cleaned.quantile(0.25))]
quartile_data.append([
    r2_score(tdf.fee_cleaned, tdf.LR), 
    r2_score(tdf.fee_cleaned, tdf.BWKNN), 
    r2_score(tdf.fee_cleaned, tdf.RF), 
    r2_score(tdf.fee_cleaned, tdf.GB), 
    r2_score(tdf.fee_cleaned, tdf.NN), 
    r2_score(tdf.fee_cleaned, tdf.LRstack), 
    r2_score(tdf.fee_cleaned, tdf.NNstack)])
tdf = df[(df.fee_cleaned < df.fee_cleaned.quantile(0.5)) & (df.fee_cleaned > df.fee_cleaned.quantile(0.25))]
quartile_data.append([
    r2_score(tdf.fee_cleaned, tdf.LR), 
    r2_score(tdf.fee_cleaned, tdf.BWKNN), 
    r2_score(tdf.fee_cleaned, tdf.RF), 
    r2_score(tdf.fee_cleaned, tdf.GB), 
    r2_score(tdf.fee_cleaned, tdf.NN), 
    r2_score(tdf.fee_cleaned, tdf.LRstack), 
    r2_score(tdf.fee_cleaned, tdf.NNstack)])
tdf = df[(df.fee_cleaned < df.fee_cleaned.quantile(0.75)) & (df.fee_cleaned > df.fee_cleaned.quantile(0.5))]
quartile_data.append([
    r2_score(tdf.fee_cleaned, tdf.LR), 
    r2_score(tdf.fee_cleaned, tdf.BWKNN), 
    r2_score(tdf.fee_cleaned, tdf.RF), 
    r2_score(tdf.fee_cleaned, tdf.GB), 
    r2_score(tdf.fee_cleaned, tdf.NN), 
    r2_score(tdf.fee_cleaned, tdf.LRstack), 
    r2_score(tdf.fee_cleaned, tdf.NNstack)])
tdf = df[(df.fee_cleaned > df.fee_cleaned.quantile(0.75))]
quartile_data.append([
    r2_score(tdf.fee_cleaned, tdf.LR), 
    r2_score(tdf.fee_cleaned, tdf.BWKNN), 
    r2_score(tdf.fee_cleaned, tdf.RF), 
    r2_score(tdf.fee_cleaned, tdf.GB), 
    r2_score(tdf.fee_cleaned, tdf.NN), 
    r2_score(tdf.fee_cleaned, tdf.LRstack), 
    r2_score(tdf.fee_cleaned, tdf.NNstack)])
pd.DataFrame(quartile_data, columns=['LR','BW','RF','GB','NN','LRStack','NNStack'])

Unnamed: 0,LR,BW,RF,GB,NN,LRStack,NNStack
0,0.609294,0.488475,0.555778,0.629342,0.642826,0.69198,0.70162
1,-63.850867,-28.216869,-43.287575,-36.288595,-24.911874,-26.9608,-28.712739
2,-10.022265,-4.74312,-8.498522,-7.837566,-8.678402,-7.513751,-7.441517
3,0.469887,0.168153,0.341302,0.464173,0.481234,0.558508,0.577257


# The outliers

In [42]:
df['LRdelta'] = df.LR - df.fee_cleaned
df['BWKNNdelta'] = df.BWKNN - df.fee_cleaned
df['RFdelta'] = df.RF - df.fee_cleaned
df['GBdelta'] = df.GB - df.fee_cleaned
df['NNdelta'] = df.NN - df.fee_cleaned
df['NNstackdelta'] = df.NNstack - df.fee_cleaned

In [43]:
df[['player_name', 'club_involved_name', 'fee_cleaned', 'NNstackdelta', 'LRdelta', 'BWKNNdelta', 'RFdelta', 'GBdelta', 'NNdelta']].sort_values('fee_cleaned').tail(20)

Unnamed: 0,player_name,club_involved_name,fee_cleaned,NNstackdelta,LRdelta,BWKNNdelta,RFdelta,GBdelta,NNdelta
19,Arthur,Juventus FC,64.8,-19.891302,-23.358196,-33.732,-34.260967,-17.542433,-23.69733
18,Romelu Lukaku,Inter Milan,66.6,-33.624248,-22.716533,-38.598,-30.853833,-26.020976,-48.280316
17,Ángel Di María,Manchester United,67.5,-19.655993,-30.246143,-29.826,-29.7308,-15.096314,-11.102034
16,James Rodríguez,Real Madrid,67.5,-25.565183,-35.375444,-40.076667,-30.261367,-23.329187,-18.637601
15,Kevin De Bruyne,Cardiff City,68.4,-9.325684,-25.895121,-24.018,-27.635767,-15.206705,2.189367
13,Nicolas Pépé,Arsenal FC,72.0,-34.502987,-41.674501,-45.9,-45.643,-47.31048,-29.044265
14,Kai Havertz,Chelsea FC,72.0,-22.157193,-34.446958,-43.356,-24.044933,-26.752086,-11.183836
12,Lucas Hernández,Bayern Munich,72.0,-40.924127,-44.084423,-55.736667,-45.729933,-43.552489,-34.680889
11,Luis Suárez,FC Barcelona,73.55,18.300642,4.844606,-20.894,-20.2986,9.14112,25.556482
10,Romelu Lukaku,Manchester United,76.23,-30.626744,-43.121844,-41.069333,-44.444867,-38.091502,-21.514677
