In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Filtramos el dataset por posición

In [2]:
# Load the dataset from data/df_cleaned.csv (path is relative to the working directory).
df=pd.read_csv('data/df_cleaned.csv')

In [3]:
# Keep only the rows whose position label is 'portero'.
is_portero = df['player_positions'].eq('portero')
df = df[is_portero]

## Elimino columnas overall

In [4]:
# Drop the per-position columns (ls, st, ..., gk) in place.
position_columns = [
    "ls", "st", "rs", "lw", "lf", "cf", "rf", "rw",
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
    'ldm', 'cdm', 'rdm', 'lwb', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb', "gk",
]
df.drop(columns=position_columns, inplace=True)

## Feature engineering

In [5]:
# Categorical column: to be one-hot encoded.
df['preferred_foot'].value_counts()

Right    8565
Left     1069
Name: preferred_foot, dtype: int64

In [6]:
# Categorical column: to be transformed with an ordinal encoder.
df['work_rate'].value_counts()

Medium/Medium    9634
Name: work_rate, dtype: int64

In [7]:
# Idea: encode body_type from less to more muscular with an ordinal encoder
# (Lean < Normal < Stocky), mapping 'Unique' to -1.
# oe_cat = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, categories=[unique_categories for _ in ["columns", "you", "want"]])
df['body_type'].value_counts()

Normal    6863
Lean      2140
Stocky     614
Unique      17
Name: body_type, dtype: int64

In [8]:
# Categorical column: to be one-hot encoded.
df['real_face'].value_counts()

No     8861
Yes     773
Name: real_face, dtype: int64

In [9]:
# Categorical column: to be one-hot encoded.
df['año_version'].value_counts()

2019    1476
2020    1461
2021    1435
2017    1400
2018    1386
2016    1266
2015    1210
Name: año_version, dtype: int64

In [10]:
# Categorical column: to be one-hot encoded.
df['player_positions'].value_counts()

portero    9634
Name: player_positions, dtype: int64

In [11]:
# Lift the display limit on columns, then list every column name.
pd.options.display.max_columns = None
all_columns = list(df.columns)
print(all_columns)

['sofifa_id', 'short_name', 'long_name', 'player_positions', 'overall', 'potential', 'value_eur', 'wage_eur', 'age', 'dob', 'height_cm', 'weight_kg', 'club_team_id', 'club_name', 'league_name', 'league_level', 'club_position', 'club_jersey_number', 'club_joined', 'club_contract_valid_until', 'nationality_id', 'nationality_name', 'nation_team_id', 'nation_position', 'preferred_foot', 'weak_foot', 'skill_moves', 'international_reputation', 'work_rate', 'body_type', 'real_face', 'player_traits', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentalit

In [12]:
# Drop columns that are known to contribute only noise.
# ('nationality_id' was listed twice in the original list; each label now appears once.)
noise_columns = [
    "sofifa_id", 'short_name', 'long_name', "nationality_id", "dob",
    'club_team_id', 'club_name', 'league_name',
    'club_jersey_number', 'club_joined', 'club_contract_valid_until',
    'nationality_name', 'player_traits',
    'ranking', 'puntos',
]
df.drop(columns=noise_columns, inplace=True)

## Feature selection

In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

In [14]:
# Category orders passed to OrdinalEncoder: position in each list is the code assigned.
rate_levels = ('Low', 'Medium', 'High')
work_rate_ord = [f'{first}/{second}' for first in rate_levels for second in rate_levels]
# League levels as strings, in the order 0.0, 4.0, 3.0, 2.0, 1.0.
league_level_ord = [str(float(level)) for level in (0, 4, 3, 2, 1)]
# body_type_ord=['Lean','Normal','Stocky']  -> handled with OHE instead

In [15]:
# Convert league_level values to their string form so they match league_level_ord.
league_level_as_text = df['league_level'].map(str)
df['league_level'] = league_level_as_text

In [16]:
# Remove the position column in place.
df.drop(columns=['player_positions'], inplace=True)

In [17]:
# Per-column flag: True where the dtype is object; then show only the flagged columns.
categorical = df.dtypes.eq(object)
categorical[categorical]

league_level      True
preferred_foot    True
work_rate         True
body_type         True
real_face         True
dtype: bool

In [18]:
# Split the frame into the regression target (y) and the predictors (X).
y = df['progresion_anual']
X = df.drop(columns='progresion_anual')

In [19]:

# Column groups handled by each preprocessing step.
ordinal_cols = ['work_rate', 'league_level']
onehot_cols = ['preferred_foot', 'body_type', 'real_face', 'año_version', 'nation_team_id', 'club_position']
# Numeric columns, excluding the three numeric ones that are one-hot encoded instead.
scaled_cols = (
    X.select_dtypes('number')
    .drop(columns=['año_version', 'club_position', 'nation_team_id'])
    .columns
)

feature_engineering = make_column_transformer(
    (OrdinalEncoder(categories=[work_rate_ord, league_level_ord]), ordinal_cols),
    (OneHotEncoder(drop='first'), onehot_cols),
    (StandardScaler(), scaled_cols),
)

In [20]:
# Fit the column transformer on X and turn every column into a numeric feature.
X_features = feature_engineering.fit_transform(X)

In [21]:
# Check the size of the transformed matrix.
X_features.shape

(9634, 72)

In [22]:
from sklearn.feature_selection import SelectFromModel, SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline

In [23]:
# Select features using a default Ridge as the scoring model.
# NOTE(review): X_model is reassigned by the next cell before being read, so this result is never used.
X_model = SelectFromModel(Ridge()).fit_transform(X_features, y)

In [24]:
# Select features with a Ridge model (max_iter raised to 10000) and keep the reduced matrix.
ridge_selector = SelectFromModel(Ridge(max_iter=10000))
X_model = ridge_selector.fit_transform(X_features, y)

In [25]:
X_model.shape # number of columns here = number of features to keep (19)

(9634, 19)

## Feature importance

In [26]:
# Helper to recover the output column names behind X_features.
def get_column_names_from_ColumnTransformer(column_transformer):
    """Collect output feature names from a fitted ColumnTransformer.

    Walks ``column_transformer.transformers_`` and, for every entry except the
    last one, asks the fitted transformer (or the final step of a pipeline-like
    object exposing ``steps``) for its feature names.

    Name lookup order per transformer:
      1. ``get_feature_names_out()`` (current scikit-learn API),
      2. ``get_feature_names()`` (legacy API, absent in newer releases),
      3. the raw column spec that was given to the ColumnTransformer.

    Parameters
    ----------
    column_transformer : object
        Fitted ColumnTransformer, i.e. anything exposing ``transformers_`` as a
        sequence of ``(name, transformer, columns)`` entries.

    Returns
    -------
    list
        Flat list of feature names, in transformer order.

    Notes
    -----
    * The last entry of ``transformers_`` is always skipped, exactly as in the
      original implementation, which assumed it to be the 'remainder' entry.
      NOTE(review): if no remainder entry is present, a real transformer is
      skipped instead and the caller has to append those names itself -
      confirm against the scikit-learn version in use.
    * Entries whose transformer is the string ``'drop'`` produce no output
      columns and are therefore skipped.
    * Raw column specs that are not a str, list, tuple, ndarray or pandas
      Index (e.g. slices or callables) are ignored.
    """
    col_name = []
    for entry in column_transformer.transformers_[:-1]:
        fitted, raw_col_name = entry[1], entry[2]

        # Dropped columns never reach the output matrix.
        if isinstance(fitted, str) and fitted == 'drop':
            continue

        # Pipeline-like objects: the final step decides the output names.
        steps = getattr(fitted, 'steps', None)
        transformer = steps[-1][1] if steps else fitted

        getter = (getattr(transformer, 'get_feature_names_out', None)
                  or getattr(transformer, 'get_feature_names', None))
        try:
            names = getter() if callable(getter) else raw_col_name
        except AttributeError:
            # Same fallback as the original: use the raw column spec.
            names = raw_col_name

        if isinstance(names, str):
            col_name.append(names)
        elif isinstance(names, np.ndarray):
            col_name += names.tolist()
        elif isinstance(names, (list, tuple, pd.Index)):
            col_name += list(names)
    return col_name

In [27]:
# Mutual information between each transformed feature and the target.
# random_state is fixed so the scores (and the resulting ranking) are reproducible
# across runs; the value 42 matches the seed already used for train_test_split.
m = mutual_info_regression(X_features, y, random_state=42)

In [28]:
# Build the full list of feature names: the names reported by the helper for the
# encoders, followed by the numeric columns that go through the scaler.
encoded_names = get_column_names_from_ColumnTransformer(feature_engineering)
scaled_names = (
    X.select_dtypes('number')
    .drop(columns=['año_version', 'club_position', 'nation_team_id'])
    .columns
    .to_list()
)
variables = encoded_names + scaled_names



In [29]:
# Label the mutual-information scores and show the 19 highest.
mi_scores = pd.Series(m, index=variables)
mi_scores.sort_values(kind="quicksort", ascending=False).head(19)

overall                     2.222923
value_eur                   0.732051
goalkeeping_diving          0.640205
goalkeeping_positioning     0.629509
goalkeeping_reflexes        0.621344
potential                   0.610446
goalkeeping_handling        0.555294
wage_eur                    0.379606
movement_reactions          0.370051
goalkeeping_kicking         0.281461
mean_league/year            0.188652
mean_team/year              0.169631
age                         0.147012
mentality_composure         0.123727
international_reputation    0.118070
overall_team/year           0.106865
power_shot_power            0.088913
power_jumping               0.081460
mentality_vision            0.079157
dtype: float64

In [30]:
# Keep the 19 highest mutual-information scores as a Series.
ranked_scores = pd.Series(m, index=variables).sort_values(kind="quicksort", ascending=False)
top_X = ranked_scores.iloc[:19]

In [31]:
# Replace the Series with the plain list of its labels (the feature names).
# Author's note: the encoded name 'x5_1' corresponds to 'club_position'.
top_X = top_X.index.to_list()

In [32]:
# Display the selected feature names.
top_X

['overall',
 'value_eur',
 'goalkeeping_diving',
 'goalkeeping_positioning',
 'goalkeeping_reflexes',
 'potential',
 'goalkeeping_handling',
 'wage_eur',
 'movement_reactions',
 'goalkeeping_kicking',
 'mean_league/year',
 'mean_team/year',
 'age',
 'mentality_composure',
 'international_reputation',
 'overall_team/year',
 'power_shot_power',
 'power_jumping',
 'mentality_vision']

In [33]:
# Disabled code kept as a bare string literal (the cell only echoes the text):
# it would remove the encoded names 'x5_1' and 'x2_Yes' from top_X.
'''top_X.remove('x5_1')
top_X.remove('x2_Yes')'''

"top_X.remove('x5_1')\ntop_X.remove('x2_Yes')"

In [34]:
# Disabled code kept as a bare string literal (the cell only echoes the text):
# it would append the raw column names 'club_position' and 'real_face' to top_X.
'''top_X.append('club_position')
top_X.append('real_face')'''

"top_X.append('club_position')\ntop_X.append('real_face')"

## Creación de pipeline

In [35]:
# Rebuild X from the selected feature names; y is still the regression target.
y = df['progresion_anual']
X = df.loc[:, top_X]

In [36]:
# Top-19 variant: standard-scale the numeric columns of X.
numeric_cols = X.select_dtypes('number').columns
feature_engineering = make_column_transformer(
    (StandardScaler(), numeric_cols),
)

In [37]:
# Disabled alternative kept as a bare string literal (the cell only echoes the text):
# one-hot encode 'club_position' and scale the remaining numeric columns.
'''#Con top 20 variables
feature_engineering=make_column_transformer((OneHotEncoder(drop='first'),['club_position']),
                                            (StandardScaler(),  X.select_dtypes('number').drop(['club_position'],axis=1).columns)
                                           )
                                           '''

"#Con top 20 variables\nfeature_engineering=make_column_transformer((OneHotEncoder(drop='first'),['club_position']),\n                                            (StandardScaler(),  X.select_dtypes('number').drop(['club_position'],axis=1).columns)\n                                           )\n                                           "

In [38]:
# Disabled alternative kept as a bare string literal (the cell only echoes the text):
# one-hot encode 'club_position' and 'real_face', scale the remaining numeric columns.
'''#Con top 30 variables
feature_engineering=make_column_transformer((OneHotEncoder(drop='first'),['club_position','real_face']),
                                            (StandardScaler(),  X.select_dtypes('number').drop(['club_position'],axis=1).columns)
                                           )
                                           '''

"#Con top 30 variables\nfeature_engineering=make_column_transformer((OneHotEncoder(drop='first'),['club_position','real_face']),\n                                            (StandardScaler(),  X.select_dtypes('number').drop(['club_position'],axis=1).columns)\n                                           )\n                                           "

In [39]:
# Chain the preprocessing step and a default Ridge regressor into a single pipeline.
pipe = make_pipeline(feature_engineering, Ridge())

In [40]:
# Fit the scaler + Ridge pipeline on the full X, y (no hold-out split here).
pipe.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  Index(['overall', 'value_eur', 'goalkeeping_diving', 'goalkeeping_positioning',
       'goalkeeping_reflexes', 'potential', 'goalkeeping_handling', 'wage_eur',
       'movement_reactions', 'goalkeeping_kicking', 'mean_league/year',
       'mean_team/year', 'age', 'mentality_composure',
       'international_reputation', 'overall_team/year', 'power_shot_power',
       'power_jumping', 'mentality_vision'],
      dtype='object'))])),
                ('ridge', Ridge())])

In [41]:
# Score of the fitted pipeline on the same X, y it was trained on (training score, not a validation score).
pipe.score(X,y)

0.18529726717577655

In [42]:
from sklearn.model_selection import GridSearchCV

In [43]:
# Search grid for the Ridge step: alpha = 10**k for k in -3..2.
alpha_exponents = np.arange(-3, 3)
params = dict(ridge__alpha=np.power(10.0, alpha_exponents))

In [44]:
# 5-fold grid search over the pipeline; fit() returns the fitted search object.
grid_search = GridSearchCV(estimator=pipe, param_grid=params, cv=5)
model = grid_search.fit(X, y)

In [45]:
# Parameter setting of the best candidate found by the grid search.
model.best_params_

{'ridge__alpha': 0.001}

In [46]:
# Best regression model: score on the full X, y.
# GridSearchCV (default refit=True) has already refit the best estimator on all
# of X, y, so the extra .fit(X, y) of the original was redundant; model.score
# delegates to that refit estimator.
# NOTE(review): this is a score on the training data, not a validation score.
model.score(X, y)

0.18530815168845216

## Modelos de clasificación

In [47]:
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn import ensemble


In [48]:
from sklearn.metrics import classification_report
# Binarise the regression target: 1 when progresion_anual is above the chosen
# threshold, 0 otherwise. The threshold controls the share of 0s and 1s.
PROGRESSION_THRESHOLD = 0.1
above_threshold = df['progresion_anual'] > PROGRESSION_THRESHOLD
target = above_threshold.astype('int64').tolist()
df["target"] = target


In [49]:
# Fit the current column transformer on X and keep the transformed matrix for the classifiers.
X_features = feature_engineering.fit_transform(X)

In [50]:
# Use the transformed matrix as X and the binary label as y.
X, y = X_features, df['target']


In [51]:
# Rebalance the classes by random under-sampling.
# random_state is fixed so the sampled subset (and every metric computed below)
# is reproducible; 42 matches the seed used for the split.
import imblearn
from imblearn.under_sampling import RandomUnderSampler
X_res, y_res = RandomUnderSampler(random_state=42).fit_resample(X, y)
# Hold out 15% of the rebalanced data for testing.
# NOTE(review): X was transformed on the full data before this split, so the
# test rows took part in fitting the preprocessing - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.15, random_state=42)

In [52]:
from sklearn.svm import SVC

In [53]:
# Inspect the training matrix.
X_train

array([[-1.02485241, -0.33818712, -0.61946142, ..., -0.59499779,
        -0.47476068, -1.0975021 ],
       [-1.54896144, -0.35240928, -0.87732053, ..., -1.44822434,
        -0.5630452 ,  0.03984778],
       [ 0.15439293, -0.26885407,  0.02518636, ...,  1.18902137,
         0.67293808, -1.7040887 ],
       ...,
       [-2.07307048, -0.34885374, -1.2641092 , ...,  0.33579482,
         0.14323096,  0.03984778],
       [ 0.15439293, -0.24218751,  0.92769326, ...,  1.34415347,
         2.0854904 , -0.7942088 ],
       [-0.36971611, -0.3088539 ,  0.02518636, ..., -0.67256384,
         0.7612226 , -0.7942088 ]])

In [54]:
# Model 3: decision tree classifier.
# random_state is fixed so the fitted tree and the report below are reproducible.
model3 = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model3.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.60      0.66        63
           1       0.68      0.78      0.72        67

    accuracy                           0.69       130
   macro avg       0.70      0.69      0.69       130
weighted avg       0.70      0.69      0.69       130



In [55]:
# Model 4: support vector classifier with default settings.
model4=SVC()

In [56]:
# Show the estimator repr (not fitted yet at this point).
model4

SVC()

In [57]:
# Fit the SVC on the training split, then report its score on the test split.
model4.fit(X_train, y_train)
model4.score(X_test, y_test)


0.7

In [58]:
# Predict on the test split with the SVC and print the per-class report.
y_pred = model4.predict(X_test)
svc_report = classification_report(y_test, y_pred)
print(svc_report)

              precision    recall  f1-score   support

           0       0.74      0.59      0.65        63
           1       0.68      0.81      0.73        67

    accuracy                           0.70       130
   macro avg       0.71      0.70      0.69       130
weighted avg       0.71      0.70      0.70       130



In [59]:
# Show the SVC predictions for the test split.
y_pred

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0],
      dtype=int64)