In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pickle

In [2]:
# Load the pre-processed footballer dataset; the 'Birth Date' column is
# parsed into datetimes while reading.
df = pd.read_csv(
    '../../Data/data_footballer_processed.csv',
    parse_dates=['Birth Date'],
)

In [3]:
# Build the one-hot feature frame for the categorical columns.
#
# League and Nation are encoded as whole values. dtype=int keeps the
# indicators as 0/1 integers on every pandas version (pandas >= 2.0 would
# otherwise emit booleans), so they match the integer output of
# Series.str.get_dummies used for the positions below.
one_hot_nation_league_df = pd.get_dummies(df[['League', 'Nation']], dtype=int)
# 'Preferred Positions' is split on ', ' so a player listed with several
# positions gets a 1 in each matching column.
position_df = df['Preferred Positions']
one_hot_position_df = position_df.str.get_dummies(', ')
# custom order from GK -> ST
custom_order = ['GK', 'LWB', 'LB', 'CB', 'RB', 'RWB', 'CDM', 'CM', 'LM', 'RM', 'CAM', 'LW', 'RW', 'CF', 'ST']
# reindex instead of plain column selection: a position that never occurs in
# the data becomes an all-zero column rather than raising KeyError, so the
# feature columns are the same regardless of which positions are present.
one_hot_position_df = one_hot_position_df.reindex(columns=custom_order, fill_value=0)
# merge the two encoded frames on the shared row index
one_hot_df = one_hot_nation_league_df.merge(one_hot_position_df, left_index=True, right_index=True)
one_hot_df

Unnamed: 0,League_Argentina Primera Division (1),League_Australia A-League (1),League_Austria Bundesliga (1),League_Belgium Pro League (1),League_CONMEBOL Libertadores,League_CONMEBOL Sudamericana,League_China Super League (1),League_Denmark Superliga (1),League_England Championship (2),League_England League One (3),...,RWB,CDM,CM,LM,RM,CAM,LW,RW,CF,ST
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10016,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
10017,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
10018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Numeric columns that are passed to the model as-is.
numeric_columns = [
    'Height', 'Weight', 'Age', 'OVR', 'POT',
    'Ball Skills', 'Defence', 'Mental', 'Passing', 'Physical', 'Shooting', 'Goalkeeper',
]
numeric_df = df.loc[:, numeric_columns]
# Index-aligned merge of the numeric columns with the one-hot columns.
encoded_df = pd.merge(numeric_df, one_hot_df, left_index=True, right_index=True)
encoded_df

Unnamed: 0,Height,Weight,Age,OVR,POT,Ball Skills,Defence,Mental,Passing,Physical,...,RWB,CDM,CM,LM,RM,CAM,LW,RW,CF,ST
0,195,94,23,91,94,80.5,38.0,80.2,59.0,83.7,...,0,0,0,0,0,0,0,0,0,1
1,182,75,24,91,94,92.5,33.0,76.7,78.3,89.0,...,0,0,0,0,0,0,1,0,0,1
2,181,75,32,91,91,89.0,61.5,84.0,94.3,75.7,...,0,0,1,0,0,1,0,0,0,0
3,188,85,30,90,90,84.5,42.0,81.3,85.0,75.9,...,0,0,0,0,0,0,0,0,0,1
4,199,96,31,90,90,18.0,17.0,41.5,27.3,54.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10015,191,87,21,65,75,16.5,12.5,30.0,31.3,40.4,...,0,0,0,0,0,0,0,0,0,0
10016,177,68,20,65,81,66.0,58.0,58.7,60.7,70.7,...,1,0,0,0,0,0,0,0,0,0
10017,170,60,23,65,74,65.0,58.5,60.3,59.0,58.4,...,0,0,1,0,0,0,0,0,0,0
10018,187,77,25,65,72,48.0,67.5,55.2,52.0,63.1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Target is the 'Value' column; features are every column of encoded_df.
y = df['Value']
X = encoded_df
# Hold out 30% of the rows as the test set; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Random forest regressor fitted on the training split.
# random_state pins the bootstrap sampling inside the forest (same seed as the
# train/test split), so re-running the notebook rebuilds the same model
# instead of a slightly different one each time.
rf_reg = RandomForestRegressor(
    bootstrap=True,
    max_depth=30,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=42,
)
rf_reg.fit(X_train, y_train)

In [7]:
# Persist the fitted model to model.pkl in the working directory.
# The context manager closes the file as soon as the dump finishes, so the
# bytes are flushed to disk and the handle is not left open in the kernel.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rf_reg, model_file)