In [None]:
from datetime import date
import pandas as pd

# Allow up to 350 columns to be shown when displaying wide DataFrames.
pd.set_option('display.max_columns', 350)

In [None]:
import os
def get_project_root_path():
    """Return the path of the project root, derived from the current working directory.

    The working directory is split on ``os.sep`` and truncated right after the
    first component named ``fifa22-players-analysis``.

    Raises:
        ValueError: if no component of the working directory has that name.
    """
    root_name = "fifa22-players-analysis"
    cwd_parts = os.getcwd().split(os.sep)
    # Keep everything up to and including the first occurrence of the root folder.
    root_position = cwd_parts.index(root_name)
    return os.sep.join(cwd_parts[: root_position + 1])


import joblib
def save_model(model, model_name):
    """Serialize ``model`` with joblib under ``<project root>/saved_models``.

    The file is named ``model_name_<model_name>.pkl`` and that name is printed
    before the file is written.

    Args:
        model: object passed to ``joblib.dump``.
        model_name: text interpolated into the file name.
    """
    filename = f"model_name_{model_name}.pkl"
    print(filename)
    models_dir = os.sep.join([get_project_root_path(), "saved_models"])
    # Create the target folder on first use so the dump does not fail with
    # FileNotFoundError when the folder has not been created yet.
    os.makedirs(models_dir, exist_ok=True)
    model_path = os.sep.join([models_dir, filename])
    joblib.dump(model, model_path)

In [None]:
# Load the tab-separated processed table and preview its first rows.
processed_data_path = "data/processed_data.tsv"
data = pd.read_csv(processed_data_path, sep="\t")
data.head()

In [None]:
def count_na(df: pd.DataFrame) -> None:
    """Display the number of missing values for each column that has at least one."""
    missing_per_column = df.isna().sum()
    has_missing = missing_per_column.gt(0)
    display(missing_per_column.loc[has_missing])

# Show which columns of the loaded table contain missing values.
count_na(data)

Analisando essa tabela com os valores nulos podemos perceber que alguns jogadores do dataset não possuem um time atual, isso explica
`wage_eur`, `club_team_id`, `club_position`, `club_jersey_number` terem a mesma quantidade de valores nulos (61),
porém `value_eur` deveria seguir esse mesmo padrão, assim podemos dizer que 13 (74 - 61) jogadores que estão ativos não possuem um valor,
portanto precisamos prever esses dados.

Também podemos perceber que 2132 jogadores não possuem `pace`, `shooting`, `passing`, `dribbling`, `defending`, `physic`. Vamos analisar esses jogadores.

In [None]:
def percent_of_gk(df: pd.DataFrame) -> None:
    """Display the share of rows in ``df`` whose ``GK`` column equals 1.

    The displayed value is a fraction between 0 and 1 (it is not multiplied
    by 100). An empty frame raises ``ZeroDivisionError``.
    """
    total_rows = len(df)
    # int() keeps plain Python division, so an empty frame still raises.
    gk_rows = int((df["GK"] == 1).sum())
    display(gk_rows / total_rows)


# Rows that have no `pace` value.
missing_main_attr_data = data.loc[data["pace"].isna()]

# Show the goalkeeper share among those rows, then the rows themselves.
percent_of_gk(missing_main_attr_data)
missing_main_attr_data

Como vimos, os goleiros não têm os atributos listados acima, mas possuem um overall em cada posição; portanto, seria interessante prever esses atributos.

In [None]:
# Rows without `goalkeeping_speed`, counted together with the rows without `pace`
# and compared against the total number of rows.
missing_gk_speed_data = data.loc[data["goalkeeping_speed"].isna()]
sum_of_missing_data = len(missing_gk_speed_data) + len(missing_main_attr_data)
print(f"Jogadores sem gk speed + jogadores sem atributos primários: {sum_of_missing_data}")
print(f"Numero de jogadores: {len(data)}")

Assim podemos ver que todos os jogadores que não são goleiros não possuem `goalkeeping_speed`

## Prevendo valor dos jogadores
Nessa etapa vamos criar um modelo para prever o valor dos 13 jogadores que não possuem valor

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFECV
from sklearn.impute import SimpleImputer
import numpy as np

def predict_and_populate(df_to_populate, df_to_train, model, column_to_predict, round_=None, save=False, random_state=None) -> pd.DataFrame:
    """Train ``model`` to predict one column and write its predictions into another frame.

    Steps: mean-impute the training features, hold out 20% of the rows, fit
    ``model`` on the rest, print the held-out score, MSE and MAE, then predict
    ``column_to_predict`` for every row of ``df_to_populate``.

    Args:
        df_to_populate: frame whose ``column_to_predict`` is overwritten with
            predictions. It is modified in place and also returned.
        df_to_train: frame holding the features plus a known ``column_to_predict``.
        model: estimator exposing ``fit``, ``score`` and ``predict``.
        column_to_predict: name of the target column, present in both frames.
        round_: number of decimals to round predictions to; ``None`` skips rounding.
        save: when true, the fitted model is stored through ``save_model``.
        random_state: forwarded to ``train_test_split``; pass an int to get a
            reproducible split. ``None`` keeps the original unseeded behaviour.

    Returns:
        ``df_to_populate`` with ``column_to_predict`` filled.
    """
    # Separate the target variable from the features.
    target = df_to_train[column_to_predict]
    features = df_to_train.drop(column_to_predict, axis=1)

    # Impute missing feature values with the per-column mean.
    # NOTE(review): the imputer is fitted on all training rows before the
    # train/test split, so the held-out metrics use means that include test rows.
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed_features = mean_imputer.fit_transform(features)

    # Train/test split.
    features_train, features_test, target_train, target_test = train_test_split(
        imputed_features, target, test_size=0.2, random_state=random_state
    )

    # Train the model.
    trained_model = model.fit(features_train, target_train)

    # Save the model, named after the estimator class and the target column.
    if save:
        model_name = str(model).split("(")[0]
        filename = f"{model_name}_{column_to_predict}"
        save_model(trained_model, filename)

    # Held-out score as reported by the estimator's own ``score`` method.
    model_score = trained_model.score(features_test, target_test)
    print(f"Score do R² modelo: {model_score:.4f}")

    # The original guard (`model == LinearRegression or DummyRegressor or ...`)
    # was always truthy and raised NameError whenever those classes had not been
    # imported yet, so the error metrics are now reported unconditionally.
    test_predictions = trained_model.predict(features_test)
    mse = mean_squared_error(target_test, test_predictions)
    mae = mean_absolute_error(target_test, test_predictions)
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")

    # Predict the missing values, reusing the imputer fitted on the training features.
    feature_to_populate = mean_imputer.transform(df_to_populate.drop(column_to_predict, axis=1))
    predicted_values = trained_model.predict(feature_to_populate)

    if round_ is not None:
        predicted_values = predicted_values.round(round_)

    df_to_populate[column_to_predict] = predicted_values

    return df_to_populate




In [None]:
# Players that have a club but no market value: these are the rows to fill.
has_club = data["club_team_id"].notna()
lacks_value = data["value_eur"].isna()
players_with_club_and_no_value = data.loc[has_club & lacks_value].copy()
data_without_players_with_club_and_no_value = data.drop(index=players_with_club_and_no_value.index)

# Train only on the remaining rows whose value is known.
data_to_train = data_without_players_with_club_and_no_value.loc[
    data_without_players_with_club_and_no_value["value_eur"].notna()
].copy()

# Columns removed from both the training frame and the frame to fill.
columns_to_drop = [
    "sofifa_id", "player_url", "short_name", "long_name",
    "dob", "club_position", "player_traits", "player_tags",
    "player_positions", "work_rate", "body_type",
    "preferred_foot", "nationality_name",
]

data_to_train = data_to_train.drop(columns=columns_to_drop)
data_to_fill = players_with_club_and_no_value.drop(columns=columns_to_drop)

In [None]:
# Normalizando os dados
# from sklearn.preprocessing import StandardScaler
#
# standerd_scaler =  StandardScaler()
# standardized_features = standerd_scaler.fit_transform(imputed_features)
# standardized_features

## Comparando modelos diferentes

In [None]:
# TODO: look at both MSE and MAE
# TODO: apply RFECV to each model

from sklearn.dummy import DummyRegressor
# Baseline: DummyRegressor with default settings, fitted on the same train/fill frames.
dummy = DummyRegressor()
predict_and_populate(data_to_fill, data_to_train, dummy, "value_eur")

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression for `value_eur` and display the filled rows.
linear_regression = LinearRegression()
predict_and_populate(data_to_fill, data_to_train, linear_regression, "value_eur")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with a fixed seed, all available cores (n_jobs=-1) and out-of-bag
# scoring enabled. The fitted model is saved and the filled frame is kept in
# `players_with_club_and_no_value`; the trailing ';' suppresses the cell output.
random_forest = RandomForestRegressor(random_state=0, n_jobs=-1, oob_score=True)
players_with_club_and_no_value = predict_and_populate(
    data_to_fill, data_to_train, random_forest, "value_eur", save=True
);

### Preenchendo os dados a partir do modelo criado

In [None]:
# Copy the predicted `value_eur` back into the main table (aligned by index)
# and display the rows that were just filled.
data.loc[players_with_club_and_no_value.index, "value_eur"] = players_with_club_and_no_value.value_eur
data.loc[players_with_club_and_no_value.index]

In [None]:
# Re-check the missing values after writing the predicted `value_eur` back.
count_na(data)

In [None]:
# Removing inactive players (rows without a `club_team_id`)
data = data.query("not club_team_id.isna()").copy()

# Re-check the missing values on the remaining rows.
count_na(data)

## Prevendo pace, shooting, passing, dribbling, defending, physic dos goleiros

In [None]:
# Split the players into goalkeepers (rows to fill) and everyone else (training rows).
gk_data = data.query("GK == 1").copy()
data_without_gk = data.query("GK == 0").copy()

columns_to_predict = ["pace", "shooting", "passing", "dribbling", "defending", "physic"]

for column_to_predict in columns_to_predict:

    # Drop the other target attributes together with the columns in
    # `columns_to_drop`, so only the current target remains among the six.
    columns_to_drop_with_other_atributes = [
        *(col for col in columns_to_predict if col != column_to_predict),
        *columns_to_drop,
    ]

    data_to_train = data_without_gk.drop(columns=columns_to_drop_with_other_atributes)
    data_to_predict = gk_data.drop(columns=columns_to_drop_with_other_atributes)

    linear_regression = LinearRegression()
    print(f"Prevendo {column_to_predict}")
    # Predictions are rounded to 0 decimals and each fitted model is saved.
    filled_data = predict_and_populate(
        df_to_populate=data_to_predict,
        df_to_train=data_to_train,
        model=linear_regression,
        column_to_predict=column_to_predict,
        round_=0,
        save=True,
    )
    print()

    # Write the predictions for this attribute back into the main table.
    data.loc[filled_data.index, column_to_predict] = filled_data[column_to_predict]

# data.loc[gk_data.index]

In [None]:
# Re-check the missing values after filling the goalkeepers' attributes.
count_na(data)

## Prevendo goalkeeping_speed dos jogadores

In [None]:
# Non-goalkeeper rows (GK == 0), copied so later changes do not touch `data`.
players_data = data.query("GK == 0").copy()
players_data

In [None]:
# Rows to fill: the non-goalkeepers. Training rows: the goalkeeper frame.
# NOTE(review): `gk_data` was copied before the attribute loop wrote its
# predictions into `data`, so it still holds the pre-fill values for those
# columns. Confirm that training on it, rather than on fresh rows of `data`,
# is intended.
data_to_predict = players_data.drop(columns=columns_to_drop)
data_to_train = gk_data.drop(columns=columns_to_drop)

column_to_predict = "goalkeeping_speed"

linear_regression = LinearRegression()
# Predictions are rounded to 0 decimals and the fitted model is saved.
filled_data = predict_and_populate(
    data_to_predict, data_to_train, linear_regression, column_to_predict, 0, save=True
)

# Write the predicted values back into the main table, aligned by index.
data.loc[filled_data.index, column_to_predict] = filled_data[column_to_predict]

In [None]:
# Re-check the missing values after filling `goalkeeping_speed`.
count_na(data)

In [None]:
from datetime import date
# Drop the release-clause column before exporting. Rebinding `data` (instead of
# `inplace=True`) together with errors="ignore" keeps this cell re-runnable:
# a second execution no longer raises KeyError for the already-removed column.
data = data.drop(columns=["release_clause_eur"], errors="ignore")

# Write the filled table as TSV under <project root>/preprocessing/data and show the path.
filled_data_save_path = os.sep.join([get_project_root_path(), "preprocessing", "data", "filled_data.tsv"])
data.to_csv(filled_data_save_path, index=False, sep="\t")
filled_data_save_path

In [None]:
# Final missing-value check on the exported table.
count_na(data)

In [None]:
# Summary statistics, transposed (one row per described column) and rounded to whole numbers.
data.describe().T.round()