In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)
import pickle

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.decomposition import PCA # If algorithms are taking too much time

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor

# Actually Evaluating

In [2]:
identifiers = ['player_name', 'club_involved_name', 'season']
player_var = ['age', 'wage', 'ratingFM', 'potential', 'injprone', 'afterloan', 'contract']
team_var = ['reputation', 'balance', 'wagebudget', 'trainingfacilities', 'youthfacilities', 'youthacademy', 'stadiumcap', 'avgattendance']
stats_var = ['gamesPlayed', 'subedOn', 'MotM', 'ratingWS']
offensive_var = ['goalsScored', 'assists', 'shots', 'passes', 'passesSuccess', 'aerialsWon', 'keyPasses', 'dribbles', 'offsides', 'dispossessed', 'badTouch', 'crosses', 'longBalls', 'throughBalls']
defensive_var = ['yellow', 'red', 'tackles', 'interceptions', 'fouls', 'offsidesProvoked', 'clearances', 'blockedPass', 'ownGoal']
previous_var = ['ATaction', 'ATgoals', 'ATassists', 'ATshots', 'ATpasses', 'ATpassesSuccess', 'ATtackles', 'ATinterceptions', 'ATMotM', 'ATrating', 'CLaction', 'CLgoals', 'CLassists', 'CLMotM', 'CLrating', 'ELaction', 'ELgoals', 'ELassists', 'ELMotM', 'ELrating']
pos_var = ['GK', 'DEF', 'MID', 'STR', 'Wing', 'Central']
external_var = ['year', 'GDP', 'CountryPopulation', 'UEFAranking', 'eurgbp']
features = player_var + team_var + stats_var + offensive_var + defensive_var + previous_var + pos_var + external_var
labels = ['fee_cleaned', 'fee_log']
target = 'fee_cleaned'

removed_features = []

In [3]:
## Bayesian optimization-based feature selection
### Objective function
def objective_selection(trial):
    weighted_df = df[features + [target]]
    for i, feature in enumerate(features):
        if trial.suggest_int(feature, 0, 1) == 0:
            weighted_df = weighted_df.drop(feature, axis = 1)
    features_list = list(weighted_df.drop(target, axis = 1))

    if len(features_list) == 0:
        return 99999999999

    train_set = weighted_df.iloc[:int(internal_test_validation_split * len(weighted_df))]
    test_set = weighted_df.iloc[int(internal_test_validation_split * len(weighted_df)):]

    model.fit(train_set[features_list], train_set[target])
    return -r2_score(model.predict(test_set[features_list]), test_set[target])

def bayesian_optimization_selection(model, df, features, target, n_trials = 500):
    study = optuna.create_study()
    study.optimize(objective_selection, n_trials=n_trials)

    # need to normalize for feature mean value
    selected_features = []
    for feature in features:
        if study.best_trial.params[feature]:
            selected_features.append(feature)

    return selected_features, study.best_trial.number

In [4]:
## Bayesian optimization-based feature weighting
def objective_weighting(trial):
    weighted_df = df.copy()
    for i, feature in enumerate(features):
        weighted_df[feature] *= trial.suggest_uniform(feature, 0, 1)

    train_set = weighted_df.iloc[:int(internal_test_validation_split * len(weighted_df))]
    test_set = weighted_df.iloc[int(internal_test_validation_split * len(weighted_df)):]

    model.fit(train_set[features], train_set['fee_log'])
    return -r2_score(model.predict(test_set[features]), test_set['fee_log']) 

def bayesian_optimization_weighting(model, df, features, target, n_trials = 500):
    study = optuna.create_study()
    study.optimize(objective_weighting, n_trials=n_trials)

    # need to normalize for feature mean value
    weighted_df = df.copy()
    for feature in features:
        weighted_df[feature] *= study.best_trial.params[feature]

    return weighted_df, study.best_trial.number

In [5]:
external_test_validation_split = 0.8
internal_test_validation_split = 0.6

In [6]:
df = pd.read_csv('data/data.csv')
train_df = df[df.fee_cleaned > 0]
test_df = df[df.fee_cleaned == 0]

In [7]:
df

Unnamed: 0.1,Unnamed: 0,player_name,club_involved_name,season,age,wage,value,cost,ratingFM,potential,...,STR,Wing,Central,year,GDP,CountryPopulation,UEFAranking,eurgbp,fee_cleaned,fee_log
0,0,Neymar,Paris Saint-Germain,2017/2018,-0.585046,12.315201,7.299882,11.404813,3.330984,0.957797,...,1.509692,1.526437,-0.974713,0.053782,-1.434860,3.183356,-1.703498,0.565087,199.8,5.302309
1,1,Kylian Mbappé,Paris Saint-Germain,2018/2019,-2.049648,7.434713,6.148324,8.490667,1.275699,-3.630068,...,1.509692,-0.655120,1.025943,0.565559,0.437768,0.177493,0.183647,0.668371,130.5,4.879007
2,2,Ousmane Dembélé,FC Barcelona,2017/2018,-1.805548,0.701205,2.565702,6.706495,0.680749,-7.472122,...,1.509692,1.526437,-0.974713,0.053782,0.437768,0.177493,-0.288139,0.565087,124.2,4.829912
3,3,João Félix,Atlético Madrid,2019/2020,-2.049648,-0.499342,0.518489,2.602902,1.492045,0.993957,...,1.509692,-0.655120,1.025943,1.077336,-0.675067,-1.008040,0.655433,0.856442,113.4,4.739701
4,4,Antoine Griezmann,FC Barcelona,2019/2020,0.147256,20.040466,8.835291,6.647023,4.178338,1.147640,...,1.509692,-0.655120,1.025943,1.077336,0.437768,0.177493,-1.703498,0.856442,108.0,4.691348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2944,3001,Daniele Capelli,AC Cesena,2015/2016,0.391356,-0.368848,-0.505117,-0.430189,-0.451461,0.014912,...,-0.662387,-0.655120,1.025943,-0.969772,-0.053215,0.036322,-0.759925,-1.964614,0.0,0.000000
2945,3002,Henrikh Mkhitaryan,AS Roma,2020/2021,0.879557,4.094058,3.205456,-0.073354,2.249255,0.781514,...,-0.662387,-0.655120,-0.974713,1.589113,-1.795314,-1.160675,-1.231711,0.939686,0.0,0.000000
2946,3003,Gastón Ramírez,Middlesbrough FC,2016/2017,-0.585046,0.440217,0.390539,0.342952,0.356230,0.257187,...,-0.662387,-0.655120,-0.974713,-0.457995,-1.238797,-1.150154,-1.231711,-0.020706,0.0,0.000000
2947,3004,Antonio Cassano,Parma FC,2014/2015,0.879557,0.205327,-0.249215,-0.073354,0.911517,0.283403,...,1.509692,-0.655120,1.025943,-1.481549,-0.053215,0.036322,-0.759925,-0.751406,0.0,0.000000


# Linear Regressor

In [8]:
model = LinearRegression(n_jobs = -1)

In [9]:
bayesian_selection_LR = ['age', 'wage', 'ratingFM', 'afterloan', 'contract', 'balance', 'wagebudget', 'gamesPlayed', 'subedOn', 'MotM', 'ratingWS', 'assists', 'shots', 'aerialsWon', 'keyPasses', 'dribbles', 'offsides', 'dispossessed', 'throughBalls', 'yellow', 'red', 'tackles', 'interceptions', 'ownGoal', 'ATaction', 'ATpasses', 'ATtackles', 'ATrating', 'CLgoals', 'CLassists', 'ELaction', 'ELassists', 'ELrating', 'Wing', 'Central', 'year', 'GDP', 'CountryPopulation', 'UEFAranking']

In [10]:
model.fit(train_df[bayesian_selection_LR], train_df['fee_cleaned'])
test_df['LR'] = model.predict(test_df[bayesian_selection_LR])

# K-Nearest Neighbors

In [11]:
model = KNeighborsRegressor(n_neighbors=7, n_jobs = -1)

In [12]:
bayesian_selection_kNN = ['age', 'wage', 'ratingFM', 'contract', 'reputation', 'trainingfacilities', 'youthacademy', 'stadiumcap', 'avgattendance', 'gamesPlayed', 'MotM', 'ratingWS', 'goalsScored', 'shots', 'keyPasses', 'dribbles', 'offsides', 'dispossessed', 'crosses', 'throughBalls', 'tackles', 'offsidesProvoked', 'blockedPass', 'ownGoal', 'ATgoals', 'ATassists', 'ATshots', 'ATpasses', 'ATpassesSuccess', 'ATtackles', 'ATinterceptions', 'ATMotM', 'ATrating', 'ELaction', 'GK', 'DEF', 'year', 'UEFAranking', 'eurgbp']

In [13]:
BW_df, i = bayesian_optimization_weighting(model, df, bayesian_selection_kNN, 'fee_cleaned', n_trials = 1000)
i

904

In [14]:
bw_train_df = BW_df[BW_df.fee_cleaned > 0]
bw_test_df = BW_df[BW_df.fee_cleaned == 0]

In [15]:
model.fit(bw_train_df[bayesian_selection_kNN], bw_train_df.fee_cleaned)
test_df['BWKNN'] = model.predict(bw_test_df[bayesian_selection_kNN])

# Random Forest

In [16]:
model = RandomForestRegressor(n_estimators=300, n_jobs = -1)

In [17]:
model.fit(train_df[features], train_df['fee_cleaned'])
test_df['RF'] = model.predict(test_df[features])

# Gradient Boosting

In [18]:
model = GradientBoostingRegressor(n_estimators=1000)

In [19]:
model.fit(train_df[features], train_df['fee_cleaned'])
test_df['GB'] = model.predict(test_df[features])

# Deep Learning

In [20]:
model = MLPRegressor(hidden_layer_sizes=(200, 200, 200), activation='relu', learning_rate_init=0.001, max_iter=600)

In [21]:
bayesian_selection_NN = ['age', 'wage', 'ratingFM', 'injprone', 'afterloan', 'contract', 'reputation', 'balance', 'wagebudget', 'youthacademy', 'stadiumcap', 'gamesPlayed', 'ratingWS', 'assists', 'shots', 'passesSuccess', 'aerialsWon', 'keyPasses', 'dispossessed', 'badTouch', 'tackles', 'interceptions', 'fouls', 'blockedPass', 'ownGoal', 'ATassists', 'ATshots', 'ATpasses', 'CLaction', 'CLMotM', 'ELassists', 'ELMotM', 'DEF', 'CountryPopulation', 'UEFAranking', 'eurgbp']

In [22]:
model.fit(train_df[bayesian_selection_NN], train_df['fee_cleaned'])
test_df['NN'] = model.predict(test_df[bayesian_selection_NN])

# Ensembling

In [23]:
LRstacking = pickle.load(open('misc/lrstacking.sav', 'rb'))
NNstacking = pickle.load(open('misc/nnstacking.sav', 'rb'))

In [24]:
test_df['LRstack'] = LRstacking.predict(test_df[['LR', 'BWKNN', 'RF', 'GB', 'NN']])
test_df['NNstack'] = NNstacking.predict(test_df[['LR', 'BWKNN', 'RF', 'GB', 'NN']])
test_df['SimpleAvg'] = (test_df.LR + test_df.BWKNN + test_df.RF + test_df.GB + test_df.NN) / 5

In [25]:
r2_score(test_df.fee_cleaned, test_df.SimpleAvg), r2_score(test_df.fee_cleaned, test_df.LRstack), r2_score(test_df.fee_cleaned, test_df.NNstack)

(0.0, 0.0, 0.0)

In [26]:
test_df[['player_name','club_involved_name','fee_cleaned','LR', 'BWKNN', 'RF', 'GB', 'NN', 'LRstack', 'NNstack']].sort_values('fee_cleaned', ascending = False).head(20)

Unnamed: 0,player_name,club_involved_name,fee_cleaned,LR,BWKNN,RF,GB,NN,LRstack,NNstack
1545,Mustapha Carayol,Ajax Amsterdam,0.0,-6.440048,1.659286,1.871997,1.343791,1.311756,-0.577101,0.939864
2478,Stefan Reinartz,Eintracht Frankfurt,0.0,-0.365743,2.841429,4.401177,4.620612,2.152995,1.533724,1.380247
2486,Carlos Eduardo,Ajax Amsterdam,0.0,1.167366,2.931429,3.44418,2.549051,3.025505,2.164123,1.980953
2485,Vladimir Granat,Spartak Moscow,0.0,-9.194761,1.023429,2.368557,0.409032,-0.658815,-2.265512,0.449783
2484,Fábio Santos,De Graafschap Doetinchem,0.0,-1.991573,2.282143,3.422077,4.29901,0.535559,0.108992,1.15287
2483,Anton Sosnin,Dynamo Moscow,0.0,-3.700289,2.058429,1.995663,0.621963,-1.804053,-1.668028,0.789097
2482,Aílton,SC Heerenveen,0.0,0.554641,2.661429,3.977777,2.408555,1.879789,1.401156,1.567022
2481,César Navas,FK Rostov,0.0,-8.377565,0.867857,3.385443,2.961919,-1.545384,-2.550024,1.138358
2480,Peter Niemeyer,SV Darmstadt 98,0.0,-10.242729,1.883571,1.52883,-3.585183,1.894001,-0.859153,0.4124
2479,Oliver Kirch,SC Paderborn 07,0.0,-4.77499,3.018571,2.893607,-0.68905,2.102624,0.687648,1.2386


In [27]:
test_df.to_csv('misc/free_transfers.csv')