In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training data from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='fifa21_train.csv')

### Cleaning 

In [3]:
# Show every column label as a plain Python list.
print(list(df.columns))

['ID', 'Name', 'Age', 'Nationality', 'Club', 'BP', 'Position', 'Team & Contract', 'Height', 'Weight', 'foot', 'Growth', 'Joined', 'Loan Date End', 'Value', 'Wage', 'Release Clause', 'Contract', 'Attacking', 'Crossing', 'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys', 'Skill', 'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing', 'Ball Control', 'Movement', 'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance', 'Power', 'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots', 'Mentality', 'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure', 'Defending', 'Marking', 'Standing Tackle', 'Sliding Tackle', 'Goalkeeping', 'GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes', 'Total Stats', 'Base Stats', 'W/F', 'SM', 'A/W', 'D/W', 'IR', 'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY', 'Hits', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB'

#### Removing Nulls from several columns

In [4]:
# Keep only the rows where both 'A/W' and 'D/W' are present.
# The explicit copy makes `df` an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df.dropna(subset=['A/W', 'D/W']).copy()


#### Function to group the values of column 'BP' into fewer categories to get better one-hot encoder results

In [5]:
def cleanOperation(x):
    """Collapse a detailed position code into one of four broad roles.

    The lookup is case-insensitive and substring based: the first role
    (checked in the order defender, midfielder, forward) that has one of
    its markers contained in ``x`` wins.

    Parameters
    ----------
    x : str
        Position code, e.g. ``'CB'`` or ``'LWB'``.

    Returns
    -------
    str
        ``'defender'``, ``'midfielder'``, ``'forward'`` or, when no marker
        matches at all, ``'goalkeeper'``.
    """
    code = x.lower()
    # Order matters: roles are tested top to bottom and the first hit is returned.
    role_markers = (
        ("defender", ("cb", "rb", "lb")),
        ("midfielder", ("cm", "cam", "cdm", "lm", "rm", "lwb", "rwb")),
        ("forward", ("st", "rw", "lw", "cf")),
    )
    for role, markers in role_markers:
        if any(marker in code for marker in markers):
            return role
    # Anything that matched none of the markers is treated as a goalkeeper.
    return "goalkeeper"
# Replace every 'BP' entry with its broad role as returned by cleanOperation.
df['BP'] = df['BP'].map(cleanOperation)

#### Removing strings and transforming it to numeric

In [6]:
# Remove the star glyph from each star-rated column so that only the
# digits remain as text.
for star_col in ('IR', 'W/F', 'SM'):
    df[star_col] = df[star_col].str.replace('★', '')


In [7]:
# Cast the star-rated columns from text to integers.
for rating_col in ('IR', 'W/F', 'SM'):
    df[rating_col] = df[rating_col].astype(int)

#### Removing strings and making columns numeric with the right values

In [8]:
# 'Hits' mixes plain counts with values abbreviated by a trailing 'K'
# (thousands). Parse the whole column in one vectorised pass; going through
# str first keeps this cell re-runnable after 'Hits' is already numeric.
hits_text = df['Hits'].astype(str).str.strip()
hits_base = pd.to_numeric(hits_text.str.rstrip('K'))
# Only the entries that carried the 'K' suffix are scaled by 1000.
hits_scaled = hits_base.where(~hits_text.str.endswith('K'), hits_base * 1000)
# Round before casting: the int cast truncates, so floating-point error in
# the multiplication could otherwise drop a value by one.
df['Hits'] = hits_scaled.round().astype(int)

In [9]:
#df['Hits'].unique()

In [10]:
#print(df.columns.tolist())

In [11]:
# 'Value' is text such as '€1.2M', '€500K' or '€0'. Strip the currency sign,
# read the suffix as a multiplier and convert everything in one pass; going
# through str first keeps this cell re-runnable after 'Value' is numeric.
value_text = df['Value'].astype(str).str.replace('€', '', regex=False).str.strip()
# The last character decides the scale; anything other than 'M'/'K' means 1.
value_scale = value_text.str[-1].map({'M': 1_000_000, 'K': 1_000}).fillna(1)
value_base = pd.to_numeric(value_text.str.rstrip('MK'))
# Round before casting: the int cast truncates, so floating-point error in
# the multiplication could otherwise drop a value by one.
df['Value'] = (value_base * value_scale).round().astype(int)

In [12]:
#print(df.columns.tolist())


#### Function to clean the remaining data

In [13]:
def clean_data(df):
    """Return a cleaned copy of ``df`` ready for the X/y split.

    Steps, in order:

    1. lower-case every column label;
    2. drop the columns that are not used as features;
    3. fill missing ``composure`` values with the column median;
    4. convert the per-position rating columns from text such as
       ``'65+2'`` / ``'65-1'`` to the number before the sign.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column labels that contains the dropped columns,
        ``composure`` and all per-position rating columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame with lower-case, unique column labels.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    # rename() returns a new frame, so the caller's labels stay untouched and
    # the assignments below never write into the caller's data.
    df = df.rename(columns=str.lower)

    # Columns considered irrelevant for the model.
    df = df.drop(columns=['id', 'name', 'age', 'nationality', 'club', 'position',
                          'team & contract', 'height', 'weight', 'joined',
                          'loan date end', 'wage', 'release clause', 'contract'])

    # Plain assignment instead of chained `fillna(..., inplace=True)`, which
    # pandas warns about and which has no effect under Copy-on-Write.
    df['composure'] = df['composure'].fillna(df['composure'].median())

    # Per-position rating columns stored as text that the model needs as numbers.
    df_to_clean = ['gk', 'rb', 'rcb', 'cb', 'lcb', 'lb', 'rwb', 'rdm', 'cdm',
                   'ldm', 'lwb', 'rm', 'rcm', 'cm', 'lcm', 'lm', 'ram', 'cam',
                   'lam', 'rw', 'rf', 'cf', 'lf', 'lw', 'rs', 'st', 'ls']
    for col in df_to_clean:
        # Keep only the part before a '+' or '-' modifier.
        base_rating = df[col].str.split('+').str[0].str.split('-').str[0]
        df[col] = pd.to_numeric(base_rating)

    # The converted columns are already part of `df`; they are deliberately
    # not concatenated a second time, which would duplicate every label.
    return df

In [14]:
# Run the remaining cleaning steps and keep the cleaned frame as `df`.
df = df.pipe(clean_data)

In [15]:
#print(df.columns.tolist())

#### X-y Split

In [16]:
# Separate the target column 'ova' from the predictors.
X = df.drop(columns='ova')
y = df['ova']

#### One-hot encoding and training the model without data leakage

In [97]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [98]:
# Partition both splits by dtype: numeric columns go to the scaler,
# object columns go to the one-hot encoder.
X_train_num, X_test_num = (
    part.select_dtypes(include=np.number) for part in (X_train, X_test)
)
X_train_cat, X_test_cat = (
    part.select_dtypes(include=object) for part in (X_train, X_test)
)

In [99]:
from sklearn.preprocessing import MinMaxScaler

In [100]:
# Fit the scaler on the training split only, then apply it to that split.
transformer = MinMaxScaler()
transformer.fit(X_train_num)
X_train_scaled_arr = transformer.transform(X_train_num)
# The resulting frame gets a fresh positional index (0..n-1).
X_train_scaled = pd.DataFrame(data=X_train_scaled_arr, columns=X_train_num.columns)
X_train_scaled.head()

Unnamed: 0,growth,value,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,...,cam,lam,rw,rf,cf,lf,lw,rs,st,ls
0,0.730769,0.002184,0.541772,0.5,0.586957,0.5,0.457831,0.552941,0.522613,0.602273,...,0.520548,0.520548,0.56,0.52,0.52,0.52,0.56,0.52,0.52,0.52
1,0.576923,0.003736,0.496203,0.613636,0.402174,0.511364,0.506024,0.364706,0.479899,0.590909,...,0.520548,0.520548,0.573333,0.493333,0.493333,0.493333,0.573333,0.466667,0.466667,0.466667
2,0.038462,0.012644,0.589873,0.625,0.565217,0.659091,0.650602,0.317647,0.603015,0.704545,...,0.684932,0.684932,0.693333,0.653333,0.653333,0.653333,0.693333,0.626667,0.626667,0.626667
3,0.0,0.002184,0.706329,0.590909,0.663043,0.715909,0.650602,0.729412,0.648241,0.636364,...,0.643836,0.643836,0.64,0.626667,0.626667,0.626667,0.64,0.653333,0.653333,0.653333
4,0.153846,0.009483,0.640506,0.568182,0.630435,0.465909,0.674699,0.717647,0.718593,0.738636,...,0.684932,0.684932,0.693333,0.666667,0.666667,0.666667,0.693333,0.64,0.64,0.64


In [101]:
from sklearn.preprocessing import OneHotEncoder
# Fit the encoder on the training split only; the first level of every
# categorical column is dropped.
encoder = OneHotEncoder(drop='first')
encoder.fit(X_train_cat)
encoded_cat = encoder.transform(X_train_cat).toarray()
# `encoder` and `cols` are reused later when encoding X_test_cat.
# get_feature_names_out() is the method name in scikit-learn 1.0 and higher.
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)
onehot_encoded = pd.DataFrame(data=encoded_cat, columns=cols)
onehot_encoded.head()

Unnamed: 0,bp_forward,bp_goalkeeper,bp_midfielder,foot_Right,a/w_Low,a/w_Medium,d/w_Low,d/w_Medium
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0


In [102]:
# Put the scaled numeric features and the one-hot features side by side.
X_train_treated = pd.concat(objs=[X_train_scaled, onehot_encoded], axis='columns')

In [103]:
from sklearn import linear_model
# Despite its name, `classification` holds a linear *regression* model.
classification = linear_model.LinearRegression()
classification.fit(X=X_train_treated, y=y_train)

LinearRegression()

In [104]:
# Reuse the scaler fitted on the training split; it is not refitted here.
X_test_scaled_arr = transformer.transform(X_test_num)
X_test_scaled = pd.DataFrame(data=X_test_scaled_arr, columns=X_test_num.columns)
X_test_scaled.head()

Unnamed: 0,growth,value,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,...,cam,lam,rw,rf,cf,lf,lw,rs,st,ls
0,0.230769,0.048276,0.663291,0.693182,0.652174,0.511364,0.759036,0.541176,0.756281,0.761364,...,0.753425,0.753425,0.746667,0.706667,0.706667,0.706667,0.746667,0.666667,0.666667,0.666667
1,0.307692,0.01092,0.594937,0.488636,0.619565,0.511364,0.662651,0.564706,0.592965,0.738636,...,0.684932,0.684932,0.68,0.653333,0.653333,0.653333,0.68,0.6,0.6,0.6
2,0.0,0.026437,0.653165,0.659091,0.347826,0.806818,0.771084,0.541176,0.678392,0.454545,...,0.561644,0.561644,0.506667,0.506667,0.506667,0.506667,0.506667,0.506667,0.506667,0.506667
3,0.538462,0.057471,0.675949,0.670455,0.673913,0.522727,0.650602,0.694118,0.668342,0.818182,...,0.712329,0.712329,0.76,0.72,0.72,0.72,0.76,0.693333,0.693333,0.693333
4,0.0,0.034483,0.501266,0.454545,0.293478,0.75,0.674699,0.258824,0.482412,0.590909,...,0.506849,0.506849,0.506667,0.48,0.48,0.48,0.506667,0.48,0.48,0.48


In [105]:
# Reuse the encoder fitted on the training split together with its column names.
encoded_test_cat = encoder.transform(X_test_cat).toarray()
onehot_encoded_test = pd.DataFrame(data=encoded_test_cat, columns=cols)
onehot_encoded_test.head()

Unnamed: 0,bp_forward,bp_goalkeeper,bp_midfielder,foot_Right,a/w_Low,a/w_Medium,d/w_Low,d/w_Medium
0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
1,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [106]:
# Put the scaled numeric features and the one-hot features of the test split side by side.
X_test_treated = pd.concat(objs=[X_test_scaled, onehot_encoded_test], axis='columns')

In [107]:
# Predict the target for the held-out rows; `predictions` is reused by the
# RMSE cell further down.
predictions = classification.predict(X_test_treated)
# Only the last expression of a cell is displayed, so the former bare
# `predictions` line in the middle of the cell was a no-op and is removed.
classification.score(X_test_treated, y_test)

0.9219251581099903

In [108]:
from sklearn.metrics import mean_squared_error

In [109]:
# Root mean squared error on the test split, rounded to four decimals.
# Previous rmse was 2.2675
mse = mean_squared_error(y_test, predictions)
rmse = round(np.sqrt(mse), 4)
rmse

1.9209