In [1]:
# Data Manipulation
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import missingno

# Models
from sklearn.ensemble import RandomForestClassifier

# Apply seaborn's theme and make every figure wide and short by default
# (figure.figsize is given as (width, height)).
sns.set(rc={"figure.figsize": (16, 3)})

# Ideas

- List possible models
    - Linear regression
    - Tree regressors
    - Ensemble regressors
    - XGBoost
    
- Use Optuna for hyperparameter optimization

- Build a web app with sliders to visualize the model

# Overview

Our goal is to build a model that takes information about a player (age, position, skills, etc.) and predicts the player's value. To do this, we first fit a baseline model, which then has to be beaten by a more sophisticated one that we build with feature engineering, hyperparameter tuning, etc.

## Note: Use Optuna for Param Tuning

## Baseline: Random Forest

In [2]:
# Read the post-EDA data set from the sibling "data" directory and preview
# the first rows.
csv_path = os.path.join("..", "data", "post_eda_data.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,short_name,long_name,age,height_cm,weight_kg,nationality,club,overall,potential,value_eur,...,is_LW,is_LWB,is_RB,is_RM,is_RW,is_RWB,is_ST,is_defender,is_midfielder,is_striker
0,L. Messi,Lionel Andrés Messi Cuccittini,32,170,72,Argentina,FC Barcelona,94,94,95500000,...,0,0,0,0,1,0,1,False,False,True
1,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,187,83,Portugal,Juventus,93,93,58500000,...,1,0,0,0,0,0,1,False,False,True
2,Neymar Jr,Neymar da Silva Santos Junior,27,175,68,Brazil,Paris Saint-Germain,92,92,105500000,...,1,0,0,0,0,0,0,False,True,True
3,J. Oblak,Jan Oblak,26,188,87,Slovenia,Atlético Madrid,91,93,77500000,...,0,0,0,0,0,0,0,False,False,False
4,E. Hazard,Eden Hazard,28,175,74,Belgium,Real Madrid,91,91,90000000,...,1,0,0,0,0,0,0,False,False,True


Let's get rid of all information we don't want to provide to our model for predicting the value of a player:

In [3]:
# Columns we do not want to provide to the model.
drop_columns = [
    "short_name",
    "long_name",
    "nationality",
    "club",
    "potential",
    "wage_eur",
    "player_positions",
    "release_clause_eur",
    "player_tags",
    "team_jersey_number",
]

# Remove them column-wise; the result replaces `df`.
# NOTE: re-running this cell without reloading `df` raises a KeyError,
# because the listed columns are already gone.
df = df.drop(columns=drop_columns)

In [4]:
# Show the column labels that remain in `df`.
df.columns

Index(['age', 'height_cm', 'weight_kg', 'overall', 'value_eur',
       'preferred_foot', 'international_reputation', 'weak_foot',
       'skill_moves', 'team_position', 'contract_valid_until', 'pace',
       'shooting', 'passing', 'dribbling', 'defending', 'physic',
       'player_traits', 'attacking_crossing', 'attacking_finishing',
       'attacking_heading_accuracy', 'attacking_short_passing',
       'attacking_volleys', 'skill_dribbling', 'skill_curve',
       'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
       'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking', 'defending_standing_tackle',
       'defending_sliding_ta