In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTEN

In [3]:
df=pd.read_csv('data/df_cleaned.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


## depuración columna "gk"

In [4]:
df['gk'] =df['gk'].replace("+","")

In [5]:
df['gk'] = df['gk'].str.replace(r'\+', '', regex=True)


In [6]:
df.gk.fillna(0,inplace=True)

In [7]:
df['gk']=df.gk.astype(int)

In [8]:
df['gk']

0        15
1        16
2        14
3        17
4        87
         ..
87742     0
87743     0
87744     0
87745     0
87746     0
Name: gk, Length: 87747, dtype: int64

## Añadir variable años pendientes de contrato

In [9]:
df['años_de_contrato']=np.where((df['club_contract_valid_until']-df['año_version'])<0,0,(df['club_contract_valid_until']-df['año_version']))

In [10]:
#comprobación
df.groupby('años_de_contrato')['años_de_contrato'].count()

años_de_contrato
0.0    30707
1.0    21182
2.0    17779
3.0     9287
4.0     4872
5.0     2502
6.0     1413
7.0        3
8.0        2
Name: años_de_contrato, dtype: int64

## Eliminación de ruido

In [11]:
#elimino columnas (según Sergio)
'''df.drop(['sofifa_id',
 'short_name',
 'long_name',
 'player_positions',
     'dob', 'club_team_id',
 'club_name',
 'league_name',
 'league_level',
'club_jersey_number',
 'club_joined',
 'club_contract_valid_until',
 'nationality_id',
 'nationality_name',
 'nation_team_id','work_rate',
 'body_type',
 'real_face',
    'player_traits'],axis=1,inplace=True)'''

"df.drop(['sofifa_id',\n 'short_name',\n 'long_name',\n 'player_positions',\n     'dob', 'club_team_id',\n 'club_name',\n 'league_name',\n 'league_level',\n'club_jersey_number',\n 'club_joined',\n 'club_contract_valid_until',\n 'nationality_id',\n 'nationality_name',\n 'nation_team_id','work_rate',\n 'body_type',\n 'real_face',\n    'player_traits'],axis=1,inplace=True)"

In [12]:
#eliminamos columnas que sabemos seguro que sólo aportan ruido (según Carlos)
df.drop(["sofifa_id",'short_name', 'long_name',"nationality_id","dob",'club_team_id', 'club_name', 'league_name',
        'club_jersey_number', 'club_joined', 'club_contract_valid_until','nationality_id', 'nationality_name','player_traits',
        'ranking','puntos','año_version','player_positions'],axis=1,inplace=True)

## Agrupar columna bodytype

In [13]:
Lean=['Lean (170-)','Lean (185+)','Lean (170-185)']
Normal=['Normal (170-)','Normal (170-185)','Normal (185+)']
Stocky=['Stocky (170-)','Stocky (170-185)','Stocky (185+)']

#agrupo posiciones en posiciones genéricas    
for i in df.index:
    if df.at[i,'body_type'] in Lean:
        df.at[i,'body_type']='Lean'
    elif df.at[i,'body_type'] in Normal:
        df.at[i,'body_type']='Normal'
    elif df.at[i,'body_type'] in Stocky:
        df.at[i,'body_type']='Stocky'
    else:
        pass

## Exploración feature engineering

In [14]:
df.preferred_foot.value_counts() #categorica a OHE

Right    66970
Left     20777
Name: preferred_foot, dtype: int64

In [15]:
df.work_rate.value_counts() #categorica transformarla con un ordinal encoder

Medium/Medium    47248
High/Medium      15911
Medium/High       8196
High/High         4712
Medium/Low        4072
High/Low          3332
Low/Medium        2078
Low/High          2058
Low/Low            140
Name: work_rate, dtype: int64

In [16]:
df.body_type.value_counts() #categorica a OHE

Normal    51581
Lean      30898
Stocky     5099
Unique      169
Name: body_type, dtype: int64

In [17]:
df.real_face.value_counts() #categorica a OHE

No     78885
Yes     8862
Name: real_face, dtype: int64

## Feature engineering (transformar todas las variables a numéricas menos player_positions)

In [18]:
#preparación para transformación categorica a num
work_rate_ord=['Low/Low','Low/Medium','Low/High','Medium/Low','Medium/Medium','Medium/High','High/Low','High/Medium','High/High']
league_level_ord=['0.0','4.0','3.0','2.0','1.0']
df['league_level']=df['league_level'].apply(str)

In [19]:
#Ordinal Encoder
df[['work_rate','league_level']]=OrdinalEncoder(categories=[work_rate_ord,league_level_ord]).fit_transform(df[['work_rate','league_level']])

In [20]:
#OHE
df[['preferred_foot','body_type_1','body_type_2','body_type_3','real_face']]=OneHotEncoder(drop='first',sparse=False).fit_transform(df[['preferred_foot','body_type','real_face']])                                          

In [21]:
#comprobar columnas
categorical = df.dtypes == object
categorical[categorical==True]

body_type    True
dtype: bool

In [22]:
#eliminar columna body_type
df.drop(["body_type"],axis=1,inplace=True)

In [23]:
target=[]
#elegimos el % de target para que sea 0 y 1
for i in df.progresion_anual:
    if i>0.1:
        target.append(1)
    else:
        target.append(0)
df["target"]=target

In [24]:
X=df.drop(["progresion_anual","target"],axis=1)
y=df.target

In [25]:
#seleccion de columnas 
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(X,y)

SelectFromModel(estimator=RandomForestClassifier())

In [26]:
#columnas que coge
sel.get_support()

array([ True,  True,  True,  True,  True,  True,  True, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True,  True,  True,  True, False,
       False, False, False])

In [27]:
selected_feat= X.columns[(sel.get_support())]
len(selected_feat) #selecciona 42 columnas

48

In [28]:
#vemos que variables son
selected_feat

Index(['overall', 'potential', 'value_eur', 'wage_eur', 'age', 'height_cm',
       'weight_kg', 'pace', 'physic', 'attacking_crossing',
       'attacking_finishing', 'attacking_heading_accuracy',
       'attacking_short_passing', 'attacking_volleys', 'skill_dribbling',
       'skill_curve', 'skill_fk_accuracy', 'skill_long_passing',
       'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed',
       'movement_agility', 'movement_reactions', 'movement_balance',
       'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
       'power_long_shots', 'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking_awareness',
       'defending_standing_tackle', 'defending_sliding_tackle',
       'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
       'goalkeeping_positioning', 'goalkeeping_reflexes', 'gk',
       'overall_team

In [29]:
#redefinimos X:
X=df[selected_feat.to_list()]

## Primer modelo naive

In [30]:
#split el dataframe
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [31]:
model=RandomForestClassifier()

In [32]:
model.fit(X_train,y_train)

RandomForestClassifier()

In [33]:
model.score(X_test,y_test)

0.9490598290598291

In [34]:
y_pred=model.predict(X_test)

In [35]:
print(classification_report(y_test,y_pred)) 
#Se observa accury muy alta => consecuencia posible de df muy desbalanceado

              precision    recall  f1-score   support

           0       0.95      1.00      0.97     16667
           1       0.30      0.01      0.02       883

    accuracy                           0.95     17550
   macro avg       0.62      0.50      0.50     17550
weighted avg       0.92      0.95      0.93     17550



In [36]:
#desbalanceo de X
1845/1927

0.9574468085106383

## Segundo modelo con undersampling

In [37]:
rus = RandomUnderSampler(random_state=0)
X_res, y_res =rus.fit_resample(X, y)

#split el dataframe
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)



In [38]:
model=RandomForestClassifier()
model.fit(X_train,y_train)

y_pred=model.predict(X_test)

print(classification_report(y_test, y_pred))
#random forest classifier consigue un 73% de accuracy

              precision    recall  f1-score   support

           0       0.82      0.77      0.79       853
           1       0.77      0.83      0.80       824

    accuracy                           0.80      1677
   macro avg       0.80      0.80      0.80      1677
weighted avg       0.80      0.80      0.80      1677



In [39]:
model2=SVC()
model2.fit(X_train,y_train)
y_pred=model2.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.62      0.69       853
           1       0.68      0.82      0.74       824

    accuracy                           0.72      1677
   macro avg       0.73      0.72      0.72      1677
weighted avg       0.73      0.72      0.72      1677



## Tercer modelo con Over-sampling

In [40]:
res= SMOTEN()

In [41]:
X_res,y_res=res.fit_resample(X, y)

In [42]:
#comprobación
sum(y_res)/len(y_res) 

0.5

In [43]:
#split el dataframe
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)

In [44]:
model3=RandomForestClassifier()
model3.fit(X_train,y_train)
y_pred=model3.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.99      0.97     16576
           1       0.99      0.95      0.97     16846

    accuracy                           0.97     33422
   macro avg       0.97      0.97      0.97     33422
weighted avg       0.97      0.97      0.97     33422



In [45]:
#model4=SVC()
#model4.fit(X_train,y_train).score(X_test,y_test)
#y_pred=model4.predict(X_test)
#print(classification_report(y_test, y_pred))
#SVC con oversampling consigue un 77% de accuracy

In [47]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_res, y_res = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=0)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     16576
           1       0.99      0.95      0.97     16846

    accuracy                           0.97     33422
   macro avg       0.97      0.97      0.97     33422
weighted avg       0.97      0.97      0.97     33422



In [48]:
import pickle

In [50]:
with open('./models/model_all.pkl','wb') as f:
    pickle.dump(ros,f)