In [None]:
# Load IPython's autoreload extension and switch it to mode 2, which re-imports
# modules automatically before each cell executes (useful while editing the
# market_value_predictor package alongside this notebook).
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from market_value_predictor.preproc import reduce_number_of_classes

In [None]:
# Seed every random number generator in scope, not only the stdlib one:
# random.seed() does not touch NumPy's global RNG, which is what the numerical
# stack falls back to when no explicit random_state is given.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Import data

In [None]:
# Read the train/test CSVs and discard the "Unnamed: 0" column
# (NOTE(review): presumably an index written out by an earlier to_csv — confirm).
train_csv_path = "../../raw_data/master_data_train.csv"
test_csv_path = "../../raw_data/master_data_test.csv"

df_train = pd.read_csv(train_csv_path).drop("Unnamed: 0", axis=1)
df_test = pd.read_csv(test_csv_path).drop("Unnamed: 0", axis=1)

In [None]:
# Summarise every column of the training frame: dtype plus non-null count.
df_train.info(show_counts=True, verbose=True)

In [None]:
# dtype names treated as "numerical" when selecting feature columns below.
numericals = [
    "int64",
    "float64",
]

In [None]:
# Feature matrix: every int64/float64 column except the target.
X = df_train.select_dtypes(include=numericals).drop("fee_cleaned", axis=1)

In [None]:
# Target vector.
y = df_train["fee_cleaned"]

# Feature selection

In [None]:
# Correlation of every numerical column (target included) with the target.
numeric_columns = df_train.select_dtypes(include=numericals)
correlations = numeric_columns.corr()["fee_cleaned"]

In [None]:
# Ten features most positively correlated with the target. The target's own
# entry is removed by label instead of assuming it sorts into position 0
# (a tie at 1.0 or a NaN self-correlation would break that assumption).
correlations.drop("fee_cleaned").sort_values(ascending=False).head(10)

In [None]:
# Names of the five features most positively correlated with the target.
# The target is dropped by label so it can never leak into the selection,
# whatever position it ends up at after sorting.
top_5_corr_num_features = (
    correlations.drop("fee_cleaned")
    .sort_values(ascending=False)
    .head(5)
    .index.tolist()
)

In [None]:
# Restrict the feature matrix to the five selected columns.
X_selection = X.loc[:, top_5_corr_num_features]

In [None]:
# Preview only the first rows instead of rendering the whole frame.
X_selection.head()

# Train model 1: top 5 correlated numerical features

In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import set_config; set_config(display='diagram')
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [None]:
# Baseline model: fill missing values with 0, scale with MinMaxScaler,
# then fit an ordinary least-squares regression.
baseline_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", MinMaxScaler()),
    ("estimator", LinearRegression()),
]
pipe = Pipeline(steps=baseline_steps)
pipe

In [None]:
from sklearn.model_selection import cross_val_score

# Mean R² of the baseline pipeline over 5 cross-validation folds.
baseline_r2_scores = cross_val_score(pipe, X_selection, y, scoring="r2", cv=5)
baseline_r2_scores.mean()

# Feature engineering

In [None]:
from market_value_predictor.preproc import manual_encoding

In [None]:
# Re-bind df_train to the output of the project helper manual_encoding for "player_tags".
# NOTE(review): the helper's body is not visible here; a later cell looks for
# "player_tags_" in numeric column names, so presumably it adds such columns — confirm.
df_train = manual_encoding(df_train, "player_tags")

In [None]:
# Re-bind df_train to the output of the project helper manual_encoding for "player_positions".
# NOTE(review): the helper's body is not visible here; a later cell looks for
# "player_positions_" in numeric column names, so presumably it adds such columns — confirm.
df_train = manual_encoding(df_train, "player_positions")

In [None]:
# Re-bind df_train to the output of the project helper manual_encoding for "player_traits".
# NOTE(review): the helper's body is not visible here; a later cell looks for
# "player_traits_" in numeric column names, so presumably it adds such columns — confirm.
df_train = manual_encoding(df_train, "player_traits")

In [None]:
# Preview the remaining object-dtype columns (first rows only, not the whole frame).
df_train.select_dtypes(object).head()

## Reduce number of classes on 'nationality'

In [None]:
# Row count per nationality as a two-column frame: "nationality", "count".
# Both columns are named explicitly. The previous
# rename({"index": "nationality", "nationality": "count"}) depended on how
# pandas < 2.0 names the value_counts() result; on pandas >= 2.0 it produces
# two columns called "count" and the following cells fail.
nationality_df = (
    df_train["nationality"]
    .value_counts()
    .rename_axis("nationality")
    .reset_index(name="count")
)

In [None]:
# Keep a nationality's own name when it occurs more than 50 times,
# otherwise bucket it under "other".
temp_list = [
    name if occurrences > 50 else "other"
    for name, occurrences in zip(nationality_df.nationality, nationality_df["count"])
]

In [None]:
# Attach the reduced labels next to the original nationality names.
nationality_df = nationality_df.assign(nationality_cleaned=temp_list)

In [None]:
# The counts are no longer needed once the labels are assigned.
nationality_df = nationality_df.drop("count", axis=1)

In [None]:
# Bring "nationality_cleaned" into the training frame via a left join,
# then discard the original high-cardinality column.
train_with_nationality = df_train.merge(nationality_df, how="left", on="nationality")
df_train = train_with_nationality.drop("nationality", axis=1)

## Reduce number of classes on 'league_name'

In [None]:
# Row count per league as a two-column frame: "league_name", "count".
# Both columns are named explicitly. The previous
# rename({"index": "league_name", "league_name": "count"}) depended on how
# pandas < 2.0 names the value_counts() result; on pandas >= 2.0 it produces
# two columns called "count" and the following cells fail.
league_name_df = (
    df_train["league_name"]
    .value_counts()
    .rename_axis("league_name")
    .reset_index(name="count")
)

In [None]:
# Keep a league's own name when it occurs more than 100 times,
# otherwise bucket it under "other".
temp_list = [
    league if occurrences > 100 else "other"
    for league, occurrences in zip(league_name_df.league_name, league_name_df["count"])
]

In [None]:
# Attach the reduced labels next to the original league names.
league_name_df = league_name_df.assign(league_name_cleaned=temp_list)

In [None]:
# The counts are no longer needed once the labels are assigned.
league_name_df = league_name_df.drop("count", axis=1)

In [None]:
# Bring "league_name_cleaned" into the training frame via a left join,
# then discard the original high-cardinality column.
train_with_league = df_train.merge(league_name_df, how="left", on="league_name")
df_train = train_with_league.drop("league_name", axis=1)

## Club_name

In [None]:
# Dense one-hot encoder for the club_name exploration below.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older releases.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

In [None]:
# One-hot encode club_name and wrap the resulting array in a DataFrame.
club_name_matrix = ohe.fit_transform(df_train[["club_name"]])
club_name_encoded = pd.DataFrame(club_name_matrix)

In [None]:
# Label the one-hot columns with the names generated by the fitted encoder.
club_feature_names = ohe.get_feature_names_out()
club_name_encoded.columns = club_feature_names

In [None]:
# Add the target so each club indicator can be correlated with it
# (the assignment aligns on the row index).
club_name_encoded["fee_cleaned"] = df_train["fee_cleaned"]

In [None]:
# Top 20 entries by correlation with the target (the target itself is included).
club_target_corr = club_name_encoded.corr()["fee_cleaned"]
club_target_corr.sort_values(ascending=False).head(20)

In [None]:
### Drop club_name

In [None]:
# Remove club_name. Rebinding (rather than inplace=True) matches how every
# other transformation step in this notebook updates df_train.
df_train = df_train.drop(columns="club_name")

## Reduce number of classes on 'team_position'

In [None]:
# Position codes grouped into coarse clusters. Each code belongs to exactly one
# group: "CB" used to be listed under both mid and defense (and "LCM" twice
# under mid), which made a first-match lookup classify centre-backs as mid.
attack = ["ST", "LS", "LW", "RS", "RW", "RF", "LF", "CF"]
mid = ["LCM", "RM", "CAM", "LM", "CM", "CDM", "RCM", "RDM", "LDM", "RAM", "LAM"]
defense = ["RCB", "LCB", "CB", "RB", "LB", "RWB", "LWB"]
goal = ["GK"]
sub = ["SUB", "RES"]

In [None]:
# df_train["position_cluster"] = df_train.team_position.map(lambda x: "attack" if x in attack else "mid"
#                            if x in mid else "defense" if x in defense else "goal"
#                            if x in goal else "sub" if x in sub else "nan")

In [None]:
# Remove team_position. Rebinding (rather than inplace=True) matches how every
# other transformation step in this notebook updates df_train.
df_train = df_train.drop(columns="team_position")

## Pipelining feature engineering

In [None]:
from sklearn.preprocessing import FunctionTransformer


def _assign_position_cluster(df):
    """Return a copy of ``df`` with a ``position_cluster`` column.

    Every ``team_position`` value is mapped to "attack", "mid", "defense",
    "goal" or "sub" using the position lists defined in the
    "Reduce number of classes on 'team_position'" section; the first list that
    contains a code wins, and anything else (missing values included) becomes
    the string "nan". This is the mapping from the commented-out cell in that
    section, expressed as a reusable function.
    """
    position_to_cluster = {}
    for cluster_name, positions in (
        ("attack", attack),
        ("mid", mid),
        ("defense", defense),
        ("goal", goal),
        ("sub", sub),
    ):
        for position in positions:
            # setdefault keeps the first cluster a position was seen in.
            position_to_cluster.setdefault(position, cluster_name)
    return df.assign(
        position_cluster=df["team_position"].map(
            lambda position: position_to_cluster.get(position, "nan")
        )
    )


# Wrap the project helpers as transformers so they can be chained in a Pipeline.
feat_eng_player_traits = FunctionTransformer(lambda df: manual_encoding(df, "player_traits"))
feat_eng_player_tags = FunctionTransformer(lambda df: manual_encoding(df, "player_tags"))
feat_eng_player_positions = FunctionTransformer(lambda df: manual_encoding(df, "player_positions"))

dim_reduction_nationality = FunctionTransformer(lambda df: reduce_number_of_classes(df, "nationality", 50))
dim_reduction_league_name = FunctionTransformer(lambda df: reduce_number_of_classes(df, "league_name", 100))

# Previously this wrapped `lambda df: cluster_team_position(df)`, i.e. the lambda
# called the very FunctionTransformer it was being bound to and would have raised
# "TypeError: 'FunctionTransformer' object is not callable" as soon as the step ran.
cluster_team_position = FunctionTransformer(_assign_position_cluster)

feat_eng = Pipeline([
    ("player_traits", feat_eng_player_traits),
    ("player_tags", feat_eng_player_tags),
    ("player_positions", feat_eng_player_positions),
    ("nationality", dim_reduction_nationality),
    ("league_name", dim_reduction_league_name),
#     ("team_position", cluster_team_position)
])

feat_eng

# Drop rows with NaNs in object columns

In [None]:
# Discard every row that has a missing value in any object-dtype column.
object_columns = df_train.select_dtypes(object).columns.tolist()
df_train = df_train.dropna(subset=object_columns)

# Train model 2: After feature engineering

## Define lists for column transformers

In [None]:
# Object-dtype columns, handed to the one-hot encoder.
all_cats = df_train.select_dtypes(object).columns.tolist()

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Numeric feature columns, with the target taken out.
all_numerics = df_train.select_dtypes(include=numerics).columns.tolist()
all_numerics.remove("fee_cleaned")

# Numeric columns whose name contains one of these markers, gathered marker by
# marker; they are left to the ColumnTransformer's remainder.
encoded_markers = ("player_tags_", "player_positions_", "player_traits_")
encoded_columns = [
    column
    for marker in encoded_markers
    for column in all_numerics
    if marker in column
]

# Every other numeric column still needs imputing and scaling.
all_numerics_wo_encoded = [
    column for column in all_numerics if column not in encoded_columns
]

# Columns whose missing values are filled with 0.
numericals_zero_impute = [
    "gk_diving",
    "gk_handling",
    "gk_kicking",
    "gk_reflexes",
    "gk_speed",
    "gk_positioning",
    "release_clause_eur",
]

# The rest of the plain numeric columns are filled with the column mean.
numericals_mean_impute = [
    column
    for column in all_numerics_wo_encoded
    if column not in numericals_zero_impute
]

## Define transformers

In [None]:
# Numeric branch 1: constant-0 imputation followed by min-max scaling.
zero_impute_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", MinMaxScaler()),
]
num_zero_tr = Pipeline(steps=zero_impute_steps)

# Numeric branch 2: mean imputation followed by min-max scaling.
mean_impute_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler()),
]
num_mean_tr = Pipeline(steps=mean_impute_steps)

# Categorical branch: one-hot encoding, with handle_unknown="ignore" so
# categories not seen during fit do not raise at transform time.
cat_tr = OneHotEncoder(handle_unknown="ignore")

## Build pipeline

In [None]:
# CatBoostRegressor is otherwise only imported in the later "Grid search"
# section, so on a fresh kernel this cell raised NameError. Import it here.
from catboost import CatBoostRegressor

# Route each column group through its own transformer; every column not listed
# is passed through unchanged.
preprocessor = ColumnTransformer(
    [("numerics_zero_imputing", num_zero_tr, numericals_zero_impute),
     ("numerics_mean_imputing", num_mean_tr, numericals_mean_impute),
     ("cat_tr", cat_tr, all_cats)],
    remainder="passthrough")

# Full model: preprocessing followed by a CatBoost regressor.
pipe_2 = Pipeline([
    #("feat_eng", feat_eng),
    ("preprocessing", preprocessor),
    ("regressor", CatBoostRegressor(silent=True))])

pipe_2

In [None]:
# Split the engineered training frame into features and target.
X_2_train = df_train.drop("fee_cleaned", axis=1)
y_2_train = df_train.loc[:, "fee_cleaned"]

In [None]:
# Mean R² of the full pipeline over 5 cross-validation folds.
# Uses X_2_train / y_2_train as defined in the cell above; the previous
# X_2 / y_2 names are never defined in this notebook and raised NameError.
cross_val_score(pipe_2, X_2_train, y_2_train, cv=5, scoring="r2").mean()

## Grid search

In [None]:
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [None]:
# List all tunable parameter names of the pipeline (nested steps included),
# to look up valid keys for the grid below.
pipe_2.get_params(deep=True)

In [None]:
# Estimators swapped into the pipeline's "regressor" step by the grid search;
# the commented-out entries are currently disabled.
candidate_regressors = [
    #Ridge(),
    #LinearRegression(),
    #XGBRegressor(),
    #CatBoostRegressor(),
    #SGDRegressor(),
    KernelRidge(),
    ElasticNet(),
    BayesianRidge(),
    GradientBoostingRegressor(),
    SVR(),
]

param_grid = {
#     'columntransformer__num_tr__scaler': [RobustScaler()],
    "regressor": candidate_regressors,
}

In [None]:
# Compare the candidate regressors with 5-fold cross-validated R², using all
# available cores.
grid_search = GridSearchCV(
    pipe_2,
    param_grid=param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1)

# Fit on X_2_train / y_2_train; the previous X_2 / y_2 names are never defined
# in this notebook and raised NameError.
grid_search.fit(X_2_train, y_2_train)
print(grid_search.best_params_)
print(grid_search.best_score_)