In [34]:
%load_ext autoreload
%autoreload 2

In [35]:
import pandas as pd
import numpy as np

In [36]:
from market_value_predictor.preproc import manual_encoding, reduce_number_of_classes

In [92]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import set_config; set_config(display='diagram')
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

In [67]:
df = pd.read_csv("../../raw_data/master_df_with_webscraping.csv")
df_train = pd.read_csv("../../raw_data/master_data_train.csv").drop(columns="Unnamed: 0")
df_test = pd.read_csv("../../raw_data/master_data_test.csv").drop(columns="Unnamed: 0")

# Preprocessing

In [38]:
from sklearn.preprocessing import FunctionTransformer

drop_nas = FunctionTransformer(lambda df: df.dropna(subset=list(df.select_dtypes(object).columns)))

feat_eng_player_traits = FunctionTransformer(lambda df: manual_encoding(df, "player_traits"))
feat_eng_player_tags = FunctionTransformer(lambda df: manual_encoding(df, "player_tags"))
feat_eng_player_positions = FunctionTransformer(lambda df: manual_encoding(df, "player_positions"))

dim_reduction_nationality = FunctionTransformer(lambda df: reduce_number_of_classes(df, "nationality", 50))
dim_reduction_league_name = FunctionTransformer(lambda df: reduce_number_of_classes(df, "league_name", 100))

cluster_team_position = FunctionTransformer(lambda df: cluster_team_position(df))

feat_eng = Pipeline([
    ("player_traits", feat_eng_player_traits),
    ("player_tags", feat_eng_player_tags),
    ("player_positions", feat_eng_player_positions),
    ("nationality", dim_reduction_nationality),
    ("league_name", dim_reduction_league_name),
    ("drop_nas", drop_nas),
#     ("team_position", cluster_team_position)
])

feat_eng

In [40]:
df_train_transformed = feat_eng.fit_transform(df_train)

In [41]:
df_test_transformed = feat_eng.transform(df_test)

# Pipeline

## Define lists of columns for pipeline

In [51]:
all_cats = list(df_train.select_dtypes(object).columns)

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

all_numerics = list(df_train.select_dtypes(include=numerics).columns)

all_numerics.remove("fee_cleaned")

encoded_columns = [elem for elem in all_numerics if "player_tags_" in elem] + [
    elem for elem in all_numerics if "player_positions_" in elem
] + [elem for elem in all_numerics if "player_traits_" in elem]

all_numerics_wo_encoded = []
for elem in all_numerics:
    if elem not in encoded_columns:
        all_numerics_wo_encoded.append(elem)

numericals_zero_impute = [
    "gk_diving", "gk_handling", "gk_kicking", "gk_reflexes", "gk_speed",
    "gk_positioning", "release_clause_eur"
]

numericals_mean_impute = []

for elem in all_numerics_wo_encoded:
    if elem not in numericals_zero_impute:
        numericals_mean_impute.append(elem)

## Define transformers

In [84]:
from tempfile import mkdtemp
from shutil import rmtree
cachedir = mkdtemp()

In [90]:
num_zero_tr = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", MinMaxScaler())
])

num_mean_tr = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler())
])

cat_tr = OneHotEncoder(handle_unknown='ignore', )

## Build pipeline

In [96]:
preprocessor = ColumnTransformer(
    [("numerics_zero_imputing", num_zero_tr, numericals_zero_impute),
     ("numerics_mean_imputing", num_mean_tr, numericals_mean_impute),
     ("cat_tr", cat_tr, all_cats), ],
remainder="passthrough")

pipe = Pipeline([
    #("feat_eng", feat_eng),
    ("preprocessing", preprocessor), 
    ("regressor", ElasticNet())], memory=cachedir)

pipe

## Train pipeline    

In [97]:
X_train = df_train.drop(columns="fee_cleaned")
y_train = df_train[["fee_cleaned"]]
X_test = df_test.drop(columns="fee_cleaned")
y_test = df_test[["fee_cleaned"]]

In [98]:
pipe.fit(X_train, y_train)

In [99]:
y_pred = pipe.predict(X_test)

In [101]:
r2_score(y_test, y_pred)

0.04508533798990666

## Grid search

In [102]:
X = df.drop(columns="fee_cleaned")
y = df[["fee_cleaned"]]

In [103]:
cross_val_score(pipe, X, y, cv=5, scoring='r2').mean()

0.032431296126178656

In [104]:
pipe.get_params()

{'memory': '/tmp/tmpugcxxij0',
 'steps': [('preprocessing',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('numerics_zero_imputing',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(fill_value=0,
                                                                   strategy='constant')),
                                                    ('scaler', MinMaxScaler())]),
                                    ['gk_diving', 'gk_handling', 'gk_kicking',
                                     'gk_reflexes', 'gk_speed', 'gk_positioning',
                                     'release_clause_eur']),
                                   ('numerics_mean_imputing',
                                    Pipeline(steps=[('imputer', Simple...
                                     'skill_curve', 'skill_fk_accuracy',
                                     'skill_long_passing', 'skill_ball_control',
   

In [105]:
param_grid = {
    "preprocessing__numerics_zero_imputing__scaler": [RobustScaler(), MinMaxScaler(), StandardScaler()],
    "preprocessing__numerics_mean_imputing__scaler": [RobustScaler(), MinMaxScaler(), StandardScaler()],
    'preprocessing__numerics_zero_imputing__imputer__strategy': ['constant', "mean"],
    'regressor__alpha': [0.01, 0.1, 1], 
    'regressor__l1_ratio': [0.2, 0.5, 0.8]
}

In [106]:
grid_search = GridSearchCV(
    pipe, 
    param_grid=param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1)

grid_search.fit(X, y)
print(grid_search.best_params_)
print(grid_search.best_score_)

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
  model = cd_fast.spar

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
  model = cd_fast.sparse_enet_coordinate_descent(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
  model = cd_fast.spar

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly p

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
  model = cd_fast.sparse_enet_coordinate_descent(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments f

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS 

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS 

{'preprocessing__numerics_mean_imputing__scaler': StandardScaler(), 'preprocessing__numerics_zero_imputing__imputer__strategy': 'mean', 'preprocessing__numerics_zero_imputing__scaler': StandardScaler(), 'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.2}
0.33531911844772627
