In [120]:
import numpy as np
import pandas as pd

In [121]:
# Load df_clean.csv; the first CSV column becomes the row index.
df = pd.read_csv('../data/df_clean.csv', index_col=0)

Load all result files with the predictions from the individual models.

In [122]:
def _read_prediction_file(csv_path, prediction_column):
    """Read one model-result CSV and give its single column a descriptive name.

    The first CSV column is used as the index. Assigning a one-element column
    list raises if the file does not hold exactly one remaining column.
    """
    predictions = pd.read_csv(csv_path, index_col=0)
    predictions.columns = [prediction_column]
    return predictions


# One frame per model variant (simple / extensive) and file suffix (fs / tw).
simple_results_fs = _read_prediction_file(
    '../data/df_simple_model_fs_results.csv', 'PredictedValueSimpleModelFs')

simple_results_tw = _read_prediction_file(
    '../data/df_simple_model_tw_results.csv', 'PredictedValueSimpleModelTw')

extensive_results_fs = _read_prediction_file(
    '../data/df_extensive_model_fs_results.csv', 'PredictedValueExtensiveModelFs')

extensive_results_tw = _read_prediction_file(
    '../data/df_extensive_model_tw_results.csv', 'PredictedValueExtensiveModelTw')

Combine all predictions in one file.

In [123]:
# Column-wise concatenation: rows of the prediction frames are aligned with df on the index labels.
prediction_frames = [simple_results_fs, simple_results_tw, extensive_results_fs, extensive_results_tw]
df = pd.concat([df] + prediction_frames, axis=1)

Aggregate the predictions into a single score per player:
    - use the extensive model's prediction if available, otherwise fall back to the simple model

Create a consistent predicted-value column in which 10'000 is used as the minimum value: any predicted value below 10'000 is raised to 10'000. Then fill all records without a prediction with this minimum value as well, because a player is probably not very good if essential information is missing from the data.

In [124]:
def find_best_prediction(row):
    """Return the preferred model prediction for one player row.

    Rows whose 'PositionCategory' equals 'Torwart' read the '...Tw' prediction
    columns; every other row reads the '...Fs' columns. Within the chosen pair
    the extensive-model value wins; the simple-model value is returned only
    when the extensive one is missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide 'PositionCategory' and the 'PredictedValueExtensiveModel*'
        / 'PredictedValueSimpleModel*' entries for the selected suffix.

    Returns
    -------
    The selected prediction. It is returned unchanged, so it may itself be
    missing when neither model produced a value.
    """
    suffix = 'Tw' if row['PositionCategory'] == 'Torwart' else 'Fs'
    extensive_prediction = row['PredictedValueExtensiveModel' + suffix]
    # pd.isna also accepts None / pd.NA / object values, where np.isnan would
    # raise a TypeError.
    if pd.isna(extensive_prediction):
        return row['PredictedValueSimpleModel' + suffix]
    return extensive_prediction


# Row-wise selection of the preferred prediction for every player.
df['PredictedValue'] = df.apply(find_best_prediction, axis=1)

# Floor for predictions; also assigned to players that have no prediction at all.
ZERO_VALUE = 10000
# clip() raises every value below the floor in one vectorized step and leaves
# NaN untouched; fillna() then gives the floor to rows without a prediction.
df['PredictedValueConsistent'] = (
    df['PredictedValue']
    .clip(lower=ZERO_VALUE)
    .fillna(ZERO_VALUE)
)


In [125]:
def normalize_variable(df, col):
    """Add a standardized copy of ``df[col]`` as column ``col + 'Normalized'``.

    The new column is ``(x - mean) / std`` computed over the whole column.
    The frame is modified in place and the same object is returned.
    """
    target_column = col + 'Normalized'
    values = df[col]
    centered = values - values.mean()
    df[target_column] = centered / values.std()
    return df

# Add a '<name>Normalized' column for the actual value and both prediction variants.
for column_name in ('Value', 'PredictedValue', 'PredictedValueConsistent'):
    df = normalize_variable(df, column_name)

Create and try different measures for sorting players and evaluating whether a player is over- or undervalued.

In [126]:
# Shorthand for the columns every measure below is built from.
actual = df['Value']
actual_z = df['ValueNormalized']
predicted = df['PredictedValue']
predicted_z = df['PredictedValueNormalized']
predicted_floored = df['PredictedValueConsistent']
predicted_floored_z = df['PredictedValueConsistentNormalized']

# Absolute measures are actual minus predicted:
# positive means overvalued, negative means undervalued.
df['AbsoluteDifference'] = actual - predicted
df['AbsoluteDifferenceNormalized'] = actual_z - predicted_z
df['AbsoluteDifferenceConsistent'] = actual - predicted_floored
df['AbsoluteDifferenceConsistentNormalized'] = actual_z - predicted_floored_z

# Percent measures are predicted divided by actual, times 100. They are ratios,
# not differences: for a positive Value, a result above 100 means the prediction
# exceeds the value (undervalued) - the opposite direction of the absolute measures.
# The normalized variants divide two z-scores and blow up when ValueNormalized is near 0.
df['PercentDifference'] = predicted / actual * 100
df['PercentDifferenceNormalized'] = predicted_z / actual_z * 100
df['PercentDifferenceConsistent'] = predicted_floored / actual * 100
df['PercentDifferenceConsistentNormalized'] = predicted_floored_z / actual_z * 100

In [127]:
# Top 20 rows by AbsoluteDifference (Value minus PredictedValue), largest first.
absolute_view_columns = ['League', 'Club', 'Name', 'Position', 'Value', 'PredictedValue', 'AbsoluteDifference']
df.sort_values(by='AbsoluteDifference', ascending=False)[absolute_view_columns].head(20)

Unnamed: 0,League,Club,Name,Position,Value,PredictedValue,AbsoluteDifference
3423,Ligue 1,Paris SG,Kylian Mbappé,Mittelstürmer,180000000.0,89146340.0,90853660.0
7677,Premier League,Manchester City,Erling Haaland,Mittelstürmer,170000000.0,80677110.0,89322890.0
9479,Eredivisie,PSV Eindhoven,Walter Benítez,Torwart,12000000.0,596159.7,11403840.3
9503,Eredivisie,Feyenoord,Justin Bijlow,Torwart,13000000.0,3268679.8,9731320.2
9453,Eredivisie,Ajax Amsterdam,Gerónimo Rulli,Torwart,10000000.0,930885.6,9069114.4
9976,Süper Lig,Fenerbahce,Altay Bayindir,Torwart,13000000.0,5757902.0,7242098.0
1107,Bundesliga,B. Leverkusen,Florian Wirtz,Offensives Mittelfeld,85000000.0,77868024.0,7131976.0
7683,Premier League,FC Chelsea,Wesley Fofana,Innenverteidiger,65000000.0,57985588.0,7014412.0
8141,Premier League,FC Fulham,Issa Diop,Innenverteidiger,18000000.0,11240158.0,6759842.0
8111,Premier League,Leeds United,Georginio Rutter,Mittelstürmer,22000000.0,15247075.0,6752925.0


In [128]:
# Top 10 rows by PercentDifference (PredictedValue / Value * 100), largest first.
percent_view_columns = ['League', 'Club', 'Name', 'Position', 'Value', 'PredictedValue', 'PercentDifference']
df.sort_values(by='PercentDifference', ascending=False)[percent_view_columns].head(10)

Unnamed: 0,League,Club,Name,Position,Value,PredictedValue,PercentDifference
15186,Ekstraklasa,Jagiellonia,Michal Ozga,Innenverteidiger,25000.0,6457719.5,25830.878
5348,Serie B,Benevento,Francesco Perlingieri,Rechter Verteidiger,10000.0,637620.25,6376.2025
17292,SuperLiga,FC Voluntari,Aymar Meleke,Mittelstürmer,10000.0,597704.8,5977.048
17825,Allsvenskan,IFK Värnamo,Johnbosco Samuel Kalu,Mittelstürmer,10000.0,591292.06,5912.9206
10804,1.Lig,Genclerbirligi,Chukwuma Akabueze,Offensives Mittelfeld,25000.0,1341897.2,5367.5888
7574,Liga Sabseg,Vilafranquense,Nenê,Mittelstürmer,25000.0,1043500.4,4174.0016
18262,Eliteserien,HamKam,Lars Brotangen,Innenverteidiger,10000.0,414704.62,4147.0462
7504,Liga Sabseg,CF Estrela,Miguel Lopes,Rechter Verteidiger,50000.0,1987235.5,3974.471
12055,U19 M-Liga,Akhmat II,Abubakar Inalkaev,Defensives Mittelfeld,50000.0,1969307.6,3938.6152
17724,Allsvenskan,Kalmar FF,Rony Jansson,Innenverteidiger,10000.0,345408.78,3454.0878


In [129]:
# Top 30 rows by the z-score gap between Value and the floored prediction, largest first.
consistent_view_columns = ['League', 'Club', 'Name', 'Position', 'Value', 'PredictedValueConsistent', 'AbsoluteDifferenceConsistentNormalized']
df.sort_values(by='AbsoluteDifferenceConsistentNormalized', ascending=False)[consistent_view_columns].head(30)

Unnamed: 0,League,Club,Name,Position,Value,PredictedValueConsistent,AbsoluteDifferenceConsistentNormalized
3423,Ligue 1,Paris SG,Kylian Mbappé,Mittelstürmer,180000000.0,89146340.0,11.933662
7677,Premier League,Manchester City,Erling Haaland,Mittelstürmer,170000000.0,80677110.0,11.789791
9479,Eredivisie,PSV Eindhoven,Walter Benítez,Torwart,12000000.0,596159.7,1.605818
9503,Eredivisie,Feyenoord,Justin Bijlow,Torwart,13000000.0,3268679.8,1.351432
9453,Eredivisie,Ajax Amsterdam,Gerónimo Rulli,Torwart,10000000.0,930885.6,1.278388
9976,Süper Lig,Fenerbahce,Altay Bayindir,Torwart,13000000.0,5757902.0,0.984958
8141,Premier League,FC Fulham,Issa Diop,Innenverteidiger,18000000.0,11240158.0,0.87321
8111,Premier League,Leeds United,Georginio Rutter,Mittelstürmer,22000000.0,15247075.0,0.839593
9494,Eredivisie,PSV Eindhoven,Xavi Simons,Offensives Mittelfeld,30000000.0,23370034.0,0.756295
8242,Championship,FC Watford,João Pedro,Mittelstürmer,24000000.0,17761638.0,0.747538


In [130]:
# Print a summary of the merged frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17796 entries, 0 to 18357
Data columns (total 63 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   LeagueCountry                           17796 non-null  object 
 1   League                                  17796 non-null  object 
 2   NationalLeagueLevel                     17796 non-null  object 
 3   Club                                    17796 non-null  object 
 4   No                                      17511 non-null  float64
 5   Name                                    17796 non-null  object 
 6   Value                                   17796 non-null  float64
 7   HighestValue                            17796 non-null  float64
 8   Age                                     17794 non-null  float64
 9   Height                                  16717 non-null  float64
 10  Nationality                             17796 non-null  ob

In [131]:
# Persist the merged frame with all predictions and difference measures;
# this file is then read by the streamlit app.
df.to_csv('../data/df_model_full_merge.csv')