In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)
import pickle

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.decomposition import PCA # If algorithms are taking too much time

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor

In [2]:
identifiers = ['player_name', 'club_involved_name', 'season']
player_var = ['age', 'wage', 'ratingFM', 'potential', 'injprone', 'afterloan', 'contract']
team_var = ['reputation', 'balance', 'wagebudget', 'trainingfacilities', 'youthfacilities', 'youthacademy', 'stadiumcap', 'avgattendance']
stats_var = ['gamesPlayed', 'subedOn', 'MotM', 'ratingWS']
offensive_var = ['goalsScored', 'assists', 'shots', 'passes', 'passesSuccess', 'aerialsWon', 'keyPasses', 'dribbles', 'offsides', 'dispossessed', 'badTouch', 'crosses', 'longBalls', 'throughBalls']
defensive_var = ['yellow', 'red', 'tackles', 'interceptions', 'fouls', 'offsidesProvoked', 'clearances', 'blockedPass', 'ownGoal']
previous_var = ['ATaction', 'ATgoals', 'ATassists', 'ATshots', 'ATpasses', 'ATpassesSuccess', 'ATtackles', 'ATinterceptions', 'ATMotM', 'ATrating', 'CLaction', 'CLgoals', 'CLassists', 'CLMotM', 'CLrating', 'ELaction', 'ELgoals', 'ELassists', 'ELMotM', 'ELrating']
pos_var = ['GK', 'DEF', 'MID', 'STR', 'Wing', 'Central']
external_var = ['year', 'GDP', 'CountryPopulation', 'UEFAranking', 'eurgbp']
features = player_var + team_var + stats_var + offensive_var + defensive_var + previous_var + pos_var + external_var
labels = ['fee_cleaned', 'fee_log']
target = 'fee_cleaned'

removed_features = []

In [3]:
## Bayesian optimization-based feature selection
### Objective function
def objective_selection(trial):
    #print(trial.number, end = ' ')
    weighted_df = train_df[features + [target]]
    for i, feature in enumerate(features):
        if trial.suggest_int(feature, 0, 1) == 0:
            weighted_df = weighted_df.drop(feature, axis = 1)
    features_list = list(weighted_df.drop(target, axis = 1))

    if len(features_list) == 0:
        return 99999999999

    train_set = weighted_df.iloc[:int(internal_test_validation_split * len(weighted_df))]
    test_set = weighted_df.iloc[int(internal_test_validation_split * len(weighted_df)):]

    model.fit(train_set[features_list], train_set[target])
    return -r2_score(model.predict(test_set[features_list]), test_set[target])

def bayesian_optimization_selection(model, train_df, features, target, n_trials = 500):
    study = optuna.create_study()
    study.optimize(objective_selection, n_trials=n_trials)

    # need to normalize for feature mean value
    selected_features = []
    for feature in features:
        if study.best_trial.params[feature]:
            selected_features.append(feature)

    return selected_features, study.best_trial.number

In [4]:
## Bayesian optimization-based feature weighting
def objective_weighting(trial):
    #print(trial.number, end = ' ')
    weighted_df = train_df.copy()
    for i, feature in enumerate(features):
        weighted_df[feature] *= trial.suggest_uniform(feature, 0, 1)

    train_set = weighted_df.iloc[:int(internal_test_validation_split * len(weighted_df))]
    test_set = weighted_df.iloc[int(internal_test_validation_split * len(weighted_df)):]

    model.fit(train_set[features], train_set['fee_log'])
    return -r2_score(model.predict(test_set[features]), test_set['fee_log']) 

def bayesian_optimization_weighting(model, df, features, target, n_trials = 500):
    study = optuna.create_study()
    study.optimize(objective_weighting, n_trials=n_trials)

    # need to normalize for feature mean value
    weighted_df = train_df.copy()
    for feature in features:
        weighted_df[feature] *= study.best_trial.params[feature]

    return weighted_df, study.best_trial.number, study.best_trial.params

In [5]:
external_test_validation_split = 0.8
internal_test_validation_split = 0.6

In [6]:
df = pd.read_csv('data/data.csv')
df = df[df.fee_cleaned > 0]
df = df.sample(len(df), random_state=0)
train_df = df.iloc[:int(external_test_validation_split * len(df))]

In [7]:
train_df

Unnamed: 0.1,Unnamed: 0,player_name,club_involved_name,season,age,wage,value,cost,ratingFM,potential,...,STR,Wing,Central,year,GDP,CountryPopulation,UEFAranking,eurgbp,fee_cleaned,fee_log
904,918,Tommy Elphick,Aston Villa,2016/2017,0.147256,-0.081761,-0.377166,-0.370716,0.013683,0.094465,...,-0.662387,-0.655120,1.025943,-0.457995,0.402287,0.172786,-1.231711,-0.020706,3.51,1.506297
159,160,Samuel Umtiti,FC Barcelona,2016/2017,-1.317347,0.127030,0.006686,0.283480,0.157913,0.401829,...,-0.662387,-0.655120,1.025943,-0.457995,0.437768,0.177493,0.183647,-0.020706,22.50,3.157000
974,989,Maicosuel,Ajax Amsterdam,2014/2015,0.147256,-0.238354,-0.377166,-0.458676,-0.687638,-0.101706,...,1.509692,1.526437,-0.974713,-1.481549,-0.053215,0.036322,-0.759925,-0.751406,2.70,1.308333
744,752,James McClean,Stoke City,2018/2019,0.391356,0.440217,0.902342,-0.132827,-0.473095,-0.036617,...,-0.662387,1.526437,-0.974713,0.565559,2.433959,-1.119264,0.419540,0.668371,5.04,1.798404
1452,1475,Miso Brecko,1.FC Nuremberg,2015/2016,0.879557,-0.212255,-0.505117,-0.370716,-0.593888,-0.058765,...,-0.662387,1.526437,-0.974713,-0.969772,0.624527,0.513024,-0.288139,-1.964614,0.45,0.371564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832,843,Nathan Baker,Bristol City,2017/2018,-0.340945,0.048734,-0.249215,-0.251772,0.338201,0.302388,...,-0.662387,-0.655120,1.025943,0.053782,0.402287,0.172786,0.419540,0.565087,3.90,1.589235
951,966,David Ospina,SSC Napoli,2019/2020,0.635457,0.570711,1.158243,-0.251772,1.618247,0.614272,...,-0.662387,-0.655120,-0.974713,1.077336,-1.619825,-0.171556,-1.231711,0.856442,3.15,1.423108
1144,1160,Nick Viergever,Ajax Amsterdam,2014/2015,-0.829146,-0.342749,-0.249215,-0.073354,-0.332471,0.142830,...,-0.662387,-0.655120,1.025943,-1.481549,1.079317,-0.860587,1.599005,-0.751406,1.80,1.029619
1037,1052,Sandro Wagner,TSG 1899 Hoffenheim,2016/2017,0.147256,-0.368848,-0.377166,-0.430189,0.897094,0.279787,...,1.509692,-0.655120,1.025943,-0.457995,0.624527,0.513024,-0.288139,-0.020706,2.52,1.258461


# Linear Regressor

In [8]:
model = LinearRegression(n_jobs = -1)

In [9]:
bayesian_selection_LR, i = bayesian_optimization_selection(model, train_df, features, 'fee_cleaned', n_trials = 500)
i

351

In [10]:
for feature in bayesian_selection_LR: print(feature, end = ', ')

age, wage, ratingFM, afterloan, contract, balance, wagebudget, gamesPlayed, subedOn, MotM, ratingWS, assists, shots, aerialsWon, keyPasses, dribbles, offsides, dispossessed, throughBalls, yellow, red, tackles, interceptions, ownGoal, ATaction, ATpasses, ATtackles, ATrating, CLgoals, CLassists, ELaction, ELassists, ELrating, Wing, Central, year, GDP, CountryPopulation, UEFAranking, 

In [11]:
scores = cross_val_predict(model, train_df[features], train_df['fee_cleaned'], cv=5)
train_df['LRclean'] = scores

In [12]:
scores = cross_val_predict(model, train_df[bayesian_selection_LR], train_df['fee_cleaned'], cv=5)
train_df['LRselection'] = scores

In [13]:
scores = cross_val_predict(model, train_df[features], train_df['fee_log'], cv=5)
train_df['LRlog'] = scores
train_df['LRdelog'] = np.exp(scores) - 1

In [14]:
train_df = train_df.fillna(0)
(r2_score(train_df.fee_cleaned, train_df.LRclean), 
r2_score(train_df.fee_cleaned, train_df.LRselection), 
r2_score(train_df.fee_log, train_df.LRlog),
r2_score(train_df.fee_cleaned, train_df.LRdelog))

(0.581760875165358,
 0.6032958375518847,
 0.6842718491575767,
 -3.0874112638480478)

In [15]:
train_df[['player_name','club_involved_name','fee_cleaned','LRclean','LRselection','LRdelog','fee_log','LRlog']].sort_values('LRclean').tail(10)

Unnamed: 0,player_name,club_involved_name,fee_cleaned,LRclean,LRselection,LRdelog,fee_log,LRlog
677,Robin van Persie,Fulham FC,5.85,48.798678,44.815446,23.598617,1.924249,3.20269
54,Alexis Sánchez,Arsenal FC,38.25,49.3439,47.864326,53.109012,3.669951,3.991001
7,Paul Pogba,Manchester United,94.5,49.922553,50.515867,120.371719,4.559126,4.798858
18,Romelu Lukaku,Inter Milan,66.6,54.411634,45.313887,80.20049,4.213608,4.396921
6,Eden Hazard,Real Madrid,103.5,58.824961,58.757208,79.012312,4.649187,4.382181
103,Cesc Fàbregas,Chelsea FC,29.7,65.982032,63.052931,53.751773,3.424263,4.00281
1,Kylian Mbappé,Paris Saint-Germain,130.5,84.719747,78.069587,79.27114,4.879007,4.38541
0,Neymar,Paris Saint-Germain,199.8,118.730558,120.935769,712.286197,5.302309,6.569883
5,Cristiano Ronaldo,Juventus FC,105.3,136.020411,141.408948,274.94913,4.666265,5.620217
4,Antoine Griezmann,FC Barcelona,108.0,187.355805,170.204157,971.648667,4.691348,6.880023


In [16]:
model.fit(train_df[features], train_df['fee_cleaned'])
for feature, weight in zip(features, model.coef_): print(feature.rjust(20), weight, end = '\n')

                 age -4.949692926940159
                wage 6.013314528014815
            ratingFM 2.667556947366331
           potential -0.030175703353564565
            injprone -0.22965829317901126
           afterloan -0.49352164720704517
            contract 1.866291634011613
          reputation 1.2939898770060538
             balance 0.11220639875317351
          wagebudget -0.23236379871954926
  trainingfacilities 0.15039320171714854
     youthfacilities -0.8631830148993382
        youthacademy 1.027045935034352
          stadiumcap 0.2947008423878441
       avgattendance -1.042568891916747
         gamesPlayed 1.239836485452671
             subedOn -0.22847946967302168
                MotM 0.3614546862791377
            ratingWS 3.860962123712547
         goalsScored 0.3910154475538614
             assists 1.556082336307878
               shots 0.03278099852907712
              passes -1.2304007330596785
       passesSuccess 1.3235557359765142
          aerialsWon -1.0483380

# K-Nearest Neighbors

In [7]:
scores = []
for i in [3, 5, 7, 9, 11, 15, 20, 30]:
    model = KNeighborsRegressor(n_neighbors=i, n_jobs = -1)
    scores.append(cross_val_score(model, train_df[features], train_df['fee_cleaned'], cv=5).mean())

In [8]:
scores

[0.3395082986596324,
 0.33655959114811945,
 0.3522754084029859,
 0.34818120062350816,
 0.3415462912500539,
 0.3344087341044324,
 0.32115103274954737,
 0.29698681089542867]

In [10]:
model = KNeighborsRegressor(n_neighbors=7, n_jobs = -1)

In [11]:
bayesian_selection_kNN, i = bayesian_optimization_selection(model, train_df, features, 'fee_cleaned', n_trials = 500)
i

493

In [12]:
BW_train_df1, i, params = bayesian_optimization_weighting(model, train_df, features, 'fee_cleaned', n_trials = 1000)
i

998

In [13]:
BW_train_df2, i, params = bayesian_optimization_weighting(model, train_df, bayesian_selection_kNN, 'fee_cleaned', n_trials = 1000)
i

937

In [14]:
for feature in bayesian_selection_kNN: print(feature, end = ', ')

age, ratingFM, injprone, reputation, balance, subedOn, ratingWS, goalsScored, aerialsWon, keyPasses, badTouch, yellow, offsidesProvoked, clearances, ATassists, ATpasses, ATpassesSuccess, ATtackles, ATinterceptions, ATrating, CLMotM, CLrating, ELgoals, ELassists, ELMotM, ELrating, GK, DEF, year, CountryPopulation, UEFAranking, 

In [15]:
for feature in bayesian_selection_kNN: print(params[feature], end = ', ')

0.9080832407084423, 0.7012925463875592, 0.2433677465030823, 0.43873643556493297, 0.13733987827695543, 0.12729724414928298, 0.8103820282148873, 0.1276090501672072, 0.9792140578383226, 0.21118815847435216, 0.0958656589375252, 0.3596522735687437, 0.24420936283503494, 0.2357162180810693, 0.3466849770262128, 0.19745340463169717, 0.355265733461544, 0.036631127766739664, 0.13893172122336592, 0.18949802117182915, 0.3764041693301598, 0.8004980043255004, 0.06863175718700944, 0.3342559092858046, 0.5447965017295756, 0.7349332525126426, 0.18799016060440082, 0.5818613058601788, 0.5619548418945728, 0.03938408091365346, 0.8781337827713466, 

In [16]:
scores = cross_val_predict(model, train_df[features], train_df['fee_cleaned'], cv=5)
train_df['KNNclean'] = scores

In [17]:
scores = cross_val_predict(model, train_df[bayesian_selection_kNN], train_df['fee_cleaned'], cv=5)
train_df['KNNselection'] = scores

In [18]:
scores = cross_val_predict(model, BW_train_df1[features], BW_train_df1['fee_cleaned'], cv=5)
train_df['KNNweighted'] = scores

In [19]:
scores = cross_val_predict(model, BW_train_df2[bayesian_selection_kNN], BW_train_df2['fee_cleaned'], cv=5)
train_df['KNNweighted2'] = scores

In [20]:
scores = cross_val_predict(model, train_df[features], train_df['fee_log'], cv=5)
train_df['KNNlog'] = scores
train_df['KNNdelog'] = np.exp(scores) - 1

In [21]:
train_df = train_df.fillna(0)
(r2_score(train_df.fee_cleaned, train_df.KNNclean), 
r2_score(train_df.fee_cleaned, train_df.KNNselection), 
r2_score(train_df.fee_cleaned, train_df.KNNweighted),
r2_score(train_df.fee_cleaned, train_df.KNNweighted2),
r2_score(train_df.fee_log, train_df.KNNlog),
r2_score(train_df.fee_cleaned, train_df.KNNdelog))

(0.3691810541522962,
 0.36187819099713514,
 0.46572916867406844,
 0.37514258822901414,
 0.39860494275885694,
 0.26907509518809836)

In [22]:
train_df[['player_name','club_involved_name','fee_cleaned','KNNclean','KNNselection','KNNweighted','KNNweighted2','KNNdelog','fee_log','KNNlog']].sort_values('KNNclean').tail(10)

Unnamed: 0,player_name,club_involved_name,fee_cleaned,KNNclean,KNNselection,KNNweighted,KNNweighted2,KNNdelog,fee_log,KNNlog
342,Dimitri Payet,West Ham United,13.5,34.457143,30.471429,31.114286,27.617143,20.698453,2.674149,3.077241
56,Henrikh Mkhitaryan,Manchester United,37.8,36.064286,36.257143,31.307143,33.75,23.182884,3.65842,3.185645
59,Sadio Mané,Liverpool FC,37.08,36.367143,24.32,33.757143,31.61,29.549592,3.639689,3.419351
17,Ángel Di María,Manchester United,67.5,36.54,55.478571,38.764286,32.592857,30.8417,4.226834,3.460777
16,James Rodríguez,Real Madrid,67.5,42.364286,27.835714,21.664286,18.9,30.320605,4.226834,3.444276
6,Eden Hazard,Real Madrid,103.5,44.524286,55.864286,37.697143,35.948571,36.437239,4.649187,3.622666
103,Cesc Fàbregas,Chelsea FC,29.7,45.064286,33.814286,42.544286,33.364286,35.848766,3.424263,3.606822
0,Neymar,Paris Saint-Germain,199.8,58.088571,75.445714,66.728571,61.817143,42.025396,5.302309,3.761791
4,Antoine Griezmann,FC Barcelona,108.0,82.671429,35.994286,86.335714,29.462857,40.448393,4.691348,3.724449
5,Cristiano Ronaldo,Juventus FC,105.3,85.95,62.228571,81.578571,61.778571,58.493169,4.666265,4.085862


# Random Forest

In [31]:
scores = []
for i in tqdm([30, 100, 300, 500]):
    model = RandomForestRegressor(n_estimators=i, n_jobs = -1)
    scores.append(cross_val_score(model, train_df[features], train_df['fee_cleaned'], cv=5).mean())

100%|██████████| 4/4 [00:33<00:00,  8.38s/it]


In [32]:
scores

[0.48367680749018616,
 0.5324610047528318,
 0.5277942465130059,
 0.5306557942519766]

In [33]:
model = RandomForestRegressor(n_estimators=300, n_jobs = -1)

In [34]:
bayesian_selection_RF, i = bayesian_optimization_selection(model, train_df, features, 'fee_cleaned', n_trials = 500)
i

454

In [35]:
for feature in bayesian_selection_RF: print(feature, end = ', ')

age, wage, potential, injprone, afterloan, contract, balance, trainingfacilities, youthfacilities, stadiumcap, gamesPlayed, subedOn, MotM, ratingWS, assists, dispossessed, throughBalls, interceptions, offsidesProvoked, ownGoal, ATaction, ATpasses, ATpassesSuccess, ATtackles, CLaction, CLgoals, CLassists, CLrating, ELaction, GK, MID, Central, CountryPopulation, 

In [36]:
scores = cross_val_predict(model, train_df[features], train_df['fee_cleaned'], cv=5)
train_df['RFclean'] = scores

In [37]:
scores = cross_val_predict(model, train_df[bayesian_selection_RF], train_df['fee_cleaned'], cv=5)
train_df['RFselection'] = scores

In [38]:
scores = cross_val_predict(model, train_df[features], train_df['fee_log'], cv=5)
train_df['RFlog'] = scores
train_df['RFdelog'] = np.exp(scores) - 1

In [39]:
train_df = train_df.fillna(0)
(r2_score(train_df.fee_cleaned, train_df.RFclean), 
r2_score(train_df.fee_cleaned, train_df.RFselection), 
r2_score(train_df.fee_log, train_df.RFlog),
r2_score(train_df.fee_cleaned, train_df.RFdelog))

(0.5393858286364899,
 0.5593179253834581,
 0.6488220457017393,
 0.45828373824975155)

In [40]:
train_df[['player_name','club_involved_name','fee_cleaned','RFclean','RFselection','RFdelog','fee_log','RFlog']].sort_values('RFclean')

Unnamed: 0,player_name,club_involved_name,fee_cleaned,RFclean,RFselection,RFdelog,fee_log,RFlog
1538,Gabriele Perico,AC Cesena,0.090,0.679253,0.833697,0.636132,0.086178,0.492335
1505,Sebastian Eriksson,FK Krasnodar,0.293,0.691377,0.609533,0.620375,0.256965,0.482658
1465,Branko Jovicic,SBV Excelsior Rotterdam,0.450,0.737377,0.780553,0.629659,0.371564,0.488371
1402,Ismaël Diomandé,SM Caen,0.675,0.851703,1.174503,0.872858,0.515813,0.627465
854,Davide Brivio,Genoa CFC,3.600,0.893727,0.899117,0.779190,1.526056,0.576158
...,...,...,...,...,...,...,...,...
4,Antoine Griezmann,FC Barcelona,108.000,93.878033,111.219767,37.668931,4.691348,3.655036
6,Eden Hazard,Real Madrid,103.500,93.928367,97.005600,47.844853,4.649187,3.888649
29,Luka Jovic,Real Madrid,54.000,99.932867,96.203433,18.353797,4.007333,2.962889
184,Luka Jovic,Eintracht Frankfurt,20.250,99.983867,96.204333,18.016731,3.056357,2.945319


# Gradient Boosting

In [41]:
scores = []
for i in tqdm([30, 100, 300, 500, 700, 1000]):
    model = GradientBoostingRegressor(n_estimators=i)
    scores.append(cross_val_score(model, train_df[features], train_df['fee_cleaned'], cv=5).mean())

100%|██████████| 6/6 [01:51<00:00, 18.62s/it]


In [42]:
scores

[0.5126168389858454,
 0.5643571104130096,
 0.5769807395129604,
 0.5595037987341529,
 0.5637608356579934,
 0.5643680740521304]

In [43]:
model = GradientBoostingRegressor(n_estimators=1000)

In [44]:
bayesian_selection_GB, i = bayesian_optimization_selection(model, train_df, features, 'fee_cleaned', n_trials = 500)
i

463

In [45]:
for feature in bayesian_selection_GB: print(feature, end = ', ')

age, ratingFM, potential, afterloan, contract, reputation, balance, wagebudget, trainingfacilities, stadiumcap, gamesPlayed, subedOn, passes, passesSuccess, keyPasses, offsides, dispossessed, crosses, yellow, red, interceptions, offsidesProvoked, clearances, ownGoal, ATaction, CLgoals, CLMotM, ELaction, GK, MID, Central, year, CountryPopulation, UEFAranking, 

In [46]:
scores = cross_val_predict(model, train_df[features], train_df['fee_cleaned'], cv=5)
train_df['GBclean'] = scores

In [47]:
scores = cross_val_predict(model, train_df[features], train_df['fee_cleaned'], cv=5)
train_df['GBselection'] = scores

In [48]:
scores = cross_val_predict(model, train_df[features], train_df['fee_log'], cv=5)
train_df['GBlog'] = scores
train_df['GBdelog'] = np.exp(scores) - 1

In [49]:
train_df = train_df.fillna(0)
(r2_score(train_df.fee_cleaned, train_df.GBclean), 
r2_score(train_df.fee_cleaned, train_df.GBselection), 
r2_score(train_df.fee_log, train_df.GBlog),
r2_score(train_df.fee_cleaned, train_df.GBdelog))

(0.5861165041680523,
 0.5776995254336622,
 0.6993029745784354,
 0.6185865195934572)

In [50]:
train_df[['player_name','club_involved_name','fee_cleaned','GBclean','GBselection','GBdelog','fee_log','GBlog']].sort_values('GBclean')

Unnamed: 0,player_name,club_involved_name,fee_cleaned,GBclean,GBselection,GBdelog,fee_log,GBlog
856,Miha Mevlja,FC Sochi,3.600,-4.991574,-4.991574,1.380422,1.526056,0.867278
1382,Rajiv van La Parra,Huddersfield Town,0.810,-2.978535,-2.901342,1.020566,0.593327,0.703377
1464,Emil Hallfredsson,Frosinone Calcio,0.450,-2.552578,-2.399905,0.003374,0.371564,0.003369
1451,Gegé,ACF Fiorentina,0.450,-2.474441,-2.862698,0.035454,0.371564,0.034840
1497,Kenneth Paal,PEC Zwolle,0.315,-2.282616,-1.439295,0.181563,0.273837,0.166838
...,...,...,...,...,...,...,...,...
29,Luka Jovic,Real Madrid,54.000,89.171178,99.225470,30.156088,4.007333,3.439010
0,Neymar,Paris Saint-Germain,199.800,107.244404,104.902972,271.153740,5.302309,5.606367
6,Eden Hazard,Real Madrid,103.500,110.200801,107.340505,83.884898,4.649187,4.441296
5,Cristiano Ronaldo,Juventus FC,105.300,122.142792,110.584632,19.454063,4.666265,3.018182


# Deep Learning

In [51]:
model = MLPRegressor(hidden_layer_sizes=(200, 200, 200), activation='relu', learning_rate_init=0.001, max_iter=600)

In [52]:
bayesian_selection_NN, i = bayesian_optimization_selection(model, train_df, features, 'fee_cleaned', n_trials = 500)
i

370

In [53]:
for feature in bayesian_selection_NN: print(feature, end = ', ')

age, wage, ratingFM, injprone, afterloan, contract, reputation, balance, wagebudget, youthacademy, stadiumcap, gamesPlayed, ratingWS, assists, shots, passesSuccess, aerialsWon, keyPasses, dispossessed, badTouch, tackles, interceptions, fouls, blockedPass, ownGoal, ATassists, ATshots, ATpasses, CLaction, CLMotM, ELassists, ELMotM, DEF, CountryPopulation, UEFAranking, eurgbp, 

In [54]:
scores = cross_val_predict(model, train_df[features], train_df['fee_cleaned'], cv=5)
train_df['NNclean'] = scores

In [55]:
scores = cross_val_predict(model, train_df[bayesian_selection_NN], train_df['fee_cleaned'], cv=5)
train_df['NNselection'] = scores

In [56]:
scores = cross_val_predict(model, train_df[features], train_df['fee_log'], cv=5)
train_df['NNlog'] = scores
train_df['NNdelog'] = np.exp(scores) - 1

In [57]:
train_df = train_df.fillna(0)
(r2_score(train_df.fee_cleaned, train_df.NNclean), 
r2_score(train_df.fee_cleaned, train_df.NNselection),
r2_score(train_df.fee_log, train_df.NNlog),
r2_score(train_df.fee_cleaned, train_df.NNdelog))

(0.659420705035654,
 0.6810572310566261,
 0.5989513616802553,
 -10.255606898063055)

In [58]:
train_df[['player_name','club_involved_name','fee_cleaned','NNclean','NNselection','NNdelog','fee_log','NNlog']].sort_values('NNclean')

Unnamed: 0,player_name,club_involved_name,fee_cleaned,NNclean,NNselection,NNdelog,fee_log,NNlog
1003,Thomas Touré,SCO Angers,2.700,-1.488517,0.535790,6.426560,1.308333,2.005063
1531,Anouar Kali,FC Utrecht,0.135,-0.945229,0.417662,-0.062590,0.126633,-0.064635
1404,Cauê,Ajax Amsterdam,0.675,-0.837385,0.473847,0.947005,0.515813,0.666292
1500,Clint Leemans,PEC Zwolle,0.315,-0.799803,0.906280,0.496794,0.273837,0.403326
1270,Alfred Gomis,SPAL 2013,1.350,-0.704799,2.100146,0.941437,0.854415,0.663429
...,...,...,...,...,...,...,...,...
7,Paul Pogba,Manchester United,94.500,80.320398,105.356395,124.921293,4.559126,4.835657
1,Kylian Mbappé,Paris Saint-Germain,130.500,89.215063,83.025670,74.087242,4.879007,4.318651
5,Cristiano Ronaldo,Juventus FC,105.300,106.885456,112.721457,308.940666,4.666265,5.736381
4,Antoine Griezmann,FC Barcelona,108.000,162.868999,184.955370,1766.602249,4.691348,7.477379


# Ensembling

In [81]:
edf = pd.read_csv('misc/data_from_modelling.csv')

In [82]:
edf['SimpleAvg'] = (edf.LRselection + edf.KNNweighted2 + edf.RFselection + edf.GBselection + edf.NNselection) / 5
r2_score(edf.fee_cleaned, edf.SimpleAvg)

0.6851896083905907

In [83]:
stacking_features = ['LRselection', 'KNNweighted2', 'RFclean', 'GBclean', 'NNselection']

In [84]:
model = LinearRegression(n_jobs = -1)
model.fit(edf[stacking_features], edf.fee_cleaned)
pickle.dump(model, open('misc/lrstacking.sav', 'wb'))
edf['stackingLR'] = model.predict(edf[stacking_features])
r2_score(edf.fee_cleaned, edf.stackingLR)

0.7269798204347787

In [85]:
for feature, weight in zip(stacking_features, model.coef_):
    print(feature.rjust(20), weight, end = '\n')

         LRselection 0.15624793239724905
        KNNweighted2 0.37279796886935535
             RFclean 0.09296444454788022
             GBclean 0.010547433350557966
         NNselection 0.536519292366993


In [88]:
model = MLPRegressor(hidden_layer_sizes=(20, 20), activation='relu', learning_rate_init=0.001, max_iter=600)
model.fit(edf[stacking_features], edf.fee_cleaned)
pickle.dump(model, open('misc/nnstacking.sav', 'wb'))
edf['stackingNN'] = model.predict(edf[stacking_features])
r2_score(edf.fee_cleaned, edf.stackingNN)

0.747058684272197

In [89]:
edf[['player_name','club_involved_name','fee_cleaned','stackingLR','stackingNN']].sort_values('fee_cleaned').tail(20)

Unnamed: 0,player_name,club_involved_name,fee_cleaned,stackingLR,stackingNN
1031,Thomas Lemar,Atlético Madrid,63.0,51.309805,56.47394
62,Arthur,Juventus FC,64.8,36.998948,40.165025
122,Romelu Lukaku,Inter Milan,66.6,40.419884,40.75558
1001,Ángel Di María,Manchester United,67.5,50.521454,49.712469
613,James Rodríguez,Real Madrid,67.5,38.7925,41.573533
577,Kevin De Bruyne,Cardiff City,68.4,42.001307,43.637566
735,Nicolas Pépé,Arsenal FC,72.0,39.285877,40.092912
148,Kai Havertz,Chelsea FC,72.0,62.985563,62.901593
909,Lucas Hernández,Bayern Munich,72.0,27.091183,27.698575
549,Romelu Lukaku,Manchester United,76.23,62.061249,63.403807


# The outliers

In [90]:
train_df['LRdelta'] = train_df.LRclean - train_df.fee_cleaned
train_df['KNNdelta'] = train_df.KNNclean - train_df.fee_cleaned
train_df['RFdelta'] = train_df.RFclean - train_df.fee_cleaned
train_df['GBdelta'] = train_df.GBclean - train_df.fee_cleaned
train_df['NNdelta'] = train_df.NNclean - train_df.fee_cleaned

In [91]:
train_df[['player_name','club_involved_name','fee_cleaned','LRdelta', 'KNNdelta', 'RFdelta', 'GBdelta', 'NNdelta']].sort_values('fee_cleaned')

Unnamed: 0,player_name,club_involved_name,fee_cleaned,LRdelta,KNNdelta,RFdelta,GBdelta,NNdelta
1544,Birkir Bjarnason,Delfino Pescara 1936,0.018,1.353014,4.900867,3.315510,2.194562,1.044757
1543,Oualid El Hajjam,ES Troyes AC,0.032,-3.565990,3.235667,1.795057,-0.150551,0.427235
1542,Chaker Alhadhur,SM Caen,0.045,-1.998812,3.960000,1.727753,1.483532,3.142972
1539,Domenico Maietta,Bologna FC 1909,0.090,-3.863142,1.760467,1.663130,2.080873,1.683154
1537,Thorsten Kirschbaum,1.FC Nuremberg,0.090,-22.300631,3.538800,1.630707,-1.278075,-0.330693
...,...,...,...,...,...,...,...,...
4,Antoine Griezmann,FC Barcelona,108.000,79.355805,-53.978667,-14.121967,32.286205,54.868999
3,João Félix,Atlético Madrid,113.400,-79.905963,-93.870000,-67.815567,-70.429045,-46.756628
2,Ousmane Dembélé,FC Barcelona,124.200,-86.791721,-111.054000,-98.164333,-83.852464,-87.928909
1,Kylian Mbappé,Paris Saint-Germain,130.500,-45.780253,-101.010000,-98.957333,-82.539438,-41.284937


In [92]:
train_df.to_csv('misc/data_from_modelling.csv')