In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw dataset from a CSV file; the path is relative to the working directory.
df = pd.read_csv('fifa21_train.csv')

In [3]:
#
# Standardize column names
#
def standardize_column_names(data):
    """
        Lower-cases every column label of the given dataframe.
        Input -> data   Dataframe whose column labels are rewritten in place
        Output -> The same dataframe object, now with lower-case labels
    """
    data.columns = [label.lower() for label in data.columns]
    return data

In [4]:
#
# Clean Columns 0-33
#
def drop_cols(df, min_fraction=0.10):
    """
        Drops sparsely populated columns.
        A column is kept only when it holds at least
        int(min_fraction * number_of_rows + 1) non-missing values.
        Input -> df             Dataframe to filter (not modified)
                 min_fraction   Fraction of the row count used to derive the
                                minimum number of non-missing values; the
                                default 0.10 is the previously hard-coded value
        Output -> New dataframe without the columns below the threshold
    """
    min_non_missing = int(min_fraction * df.shape[0] + 1)
    return df.dropna(axis=1, thresh=min_non_missing)

def fill_club(df, fill):
    """
        Replaces missing entries of the 'club' column.
        Input -> df     Dataframe holding a 'club' column (updated in place)
                 fill   Replacement passed to fillna
        Output -> The same dataframe object
    """
    club_filled = df['club'].fillna(fill)
    df['club'] = club_filled
    return df

def fix_height(x):
    """
        Converts a feet-and-inches height string into decimal feet.
        Input -> x   Height written as feet'inches" (e.g. 5'6"); non-string
                     values are returned unchanged
        Output -> Height in feet as a float, e.g. 5'6" -> 5.5
    """
    if not isinstance(x, str):
        # Missing (NaN) or already-numeric values cannot be split.
        return x
    feet, _, inches = x.partition("'")
    inches = inches.replace('"', "").strip()
    # A foot has 12 inches; a bare feet value such as 6' means 0 inches.
    return float(feet) + (float(inches) if inches else 0.0) / 12

def fix_weight(value):
    """
        Converts a weight entry to a number.
        Input -> value   Weight as a string, optionally carrying an 'lbs'
                         suffix (e.g. '159lbs'); non-string values are
                         passed through
        Output -> The weight as a float for string input, otherwise the
                  original value unchanged
    """
    if not isinstance(value, str):
        # Non-string input used to fall off the end of the function and
        # come back as None, silently discarding numeric weights.
        return value
    # replace() is a no-op when the suffix is absent, so one path suffices.
    return float(value.replace('lbs', ''))
        
def fill_pos(df, fill):
    """
        Replaces missing entries of the 'position' column.
        Input -> df     Dataframe holding a 'position' column (updated in place)
                 fill   Replacement passed to fillna (a scalar or a Series)
        Output -> The same dataframe object
    """
    position_filled = df['position'].fillna(fill)
    df['position'] = position_filled
    return df

def fix_money(x):
    """
        Converts a money string to a plain number.
        Currency signs (€ and $) are removed; a 'K' scales the number by
        1,000 and an 'M' by 1,000,000.
        Input -> x   Amount as a string, e.g. '€100K'
        Output -> The amount as a float
    """
    amount = x.replace('€', '').replace('$', '')
    # 'K' is tested before 'M', mirroring the order of the suffix checks.
    for suffix, multiplier in (("K", 1000), ("M", 1000000)):
        if suffix in amount:
            return float(amount.replace(suffix, "")) * multiplier
    return float(amount)

def clean_0_33(df):
    """
        Cleans columns 0-33.
        Input -> df   Dataframe to clean (a copy is cleaned, not the input)
        Output -> Dataframe with cleaned columns 0-33
    """

    cleaned = fill_club(drop_cols(df.copy()), 'Free Agent')
    # Missing positions fall back to the 'bp' column of the same row.
    cleaned = fill_pos(cleaned, cleaned['bp'])

    # Rows without a 'joined' value are removed by index label.
    missing_joined = cleaned.loc[cleaned['joined'].isna()].index
    cleaned = cleaned.drop(index=missing_joined)

    # Each text column is converted to numbers by its own parser.
    parsers = (
        ('height', fix_height),
        ('weight', fix_weight),
        ('value', fix_money),
        ('release clause', fix_money),
        ('wage', fix_money),
    )
    for column, parser in parsers:
        cleaned[column] = cleaned[column].apply(parser)
    return cleaned


In [5]:
#
# Cleans columns 34-66 of given dataset
#
def clean_34_66(data):
    """
        Cleans columns 34-66.
        Input -> data   Dataframe to clean (a copy is cleaned, not the input)
        Output -> Dataframe with cleaned columns 34-66
    """

    def strip_star(val):
        return val.replace('★', '')

    clean = data.copy()

    # Gaps in these columns take the column mean rounded to 2 decimals.
    for col in ('balance', 'jumping', 'interceptions', 'positioning',
                'vision', 'composure', 'sliding tackle'):
        clean[col] = clean[col].fillna(round(clean[col].mean(), 2))

    # Gaps in 'a/w' and 'd/w' default to 'Medium'.
    for col in ('a/w', 'd/w'):
        clean[col] = clean[col].fillna('Medium')

    # The star glyph is removed; the rest of the text stays as it is.
    for col in ('w/f', 'sm', 'ir'):
        clean[col] = clean[col].apply(strip_star)
    return clean
    

In [6]:
#
# Clean Columns 67-101
#

def clean_hits(value):
    """
        Converts a hits entry to a number.
        Input -> value   Count as a string; a 'K' marks thousands
                         (e.g. '1.5K'); non-string values are passed through
        Output -> The count as a float for string input, otherwise the
                  original value unchanged
    """
    if not isinstance(value, str):
        return value
    if 'K' in value:
        return float(value.replace('K', '')) * 1000
    # Without a 'K' the number is already a plain count and must not be scaled.
    return float(value)

def clean_positions(value):
    """
        Extracts the leading number from a rating string.
        Everything from the first '+' or '-' onwards is discarded, so
        '65+2', '65-2' and '65+-2' all yield 65.0.
        Input -> value   Rating as a string; non-string values are passed
                         through
        Output -> The leading number as a float for string input, otherwise
                  the original value unchanged
    """
    if not isinstance(value, str):
        return value
    # partition() keeps the whole string when the separator is absent, which
    # avoids relying on str.find() results as booleans (-1 is truthy, 0 is
    # falsy).
    base = value.partition('+')[0].partition('-')[0]
    return float(base)
    
def clean_67_101(df3):
    """
        Cleans columns 67-101.
        Input -> df3   Dataframe to clean (a copy is cleaned, not the input)
        Output -> Dataframe with cleaned columns 67-101
    """

    # Columns that all go through clean_positions, in processing order.
    rating_columns = [
        'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
        'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
        'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
        'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
    ]

    clean = df3.copy()
    clean['hits'] = clean['hits'].apply(clean_hits)
    for column in rating_columns:
        clean[column] = clean[column].apply(clean_positions)
    return clean

In [7]:
def clean_FIFA_dataframe(data):
    """
        Cleans the complete FIFA dataframe.
        The steps run in a fixed order on a copy of the input: column names
        first, then the three column-range cleaners.
        Input -> data   Dataframe to clean
        Output -> The cleaned dataframe
    """

    cleaning_steps = (
        standardize_column_names,
        clean_0_33,
        clean_34_66,
        clean_67_101,
    )

    df = data.copy()
    for step in cleaning_steps:
        df = step(df)
    return df


In [8]:
# Run the full cleaning pipeline. `df` is rebound from the raw frame to the
# cleaned one, so re-running this cell feeds the already-cleaned frame back
# into the pipeline — NOTE(review): restart and run all instead of re-running.
df = clean_FIFA_dataframe(df)
# Count the remaining missing values per column (shown as the cell output).
df.isna().sum()

id             0
name           0
age            0
nationality    0
club           0
              ..
cb             0
rcb            0
rb             0
gk             0
ova            0
Length: 100, dtype: int64

In [9]:
# Keep only the numeric columns of the cleaned frame.
df_num = df.select_dtypes(np.number)

In [10]:
# X-y split: separate the feature columns from the target column.
# Features: three columns kept as the best correlated with the target
# (NOTE(review): the correlation analysis is not part of the visible cells).
X = df_num[['base stats', 'reactions', 'pas']]
# Target: the 'ova' column.
y = df_num['ova']

In [11]:
# Display the selected feature columns.
X

Unnamed: 0,base stats,reactions,pas
0,357,66,63
1,412,75,68
2,404,78,78
3,329,55,54
4,360,62,60
...,...,...,...
11695,337,71,47
11696,347,53,56
11697,387,65,66
11698,337,62,60


In [12]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# takes place, so the held-out rows influence the scaling — consider fitting
# on the training portion only.
minMaxScaler = MinMaxScaler()
minMaxScaler.fit(X)
X_normalized = minMaxScaler.transform(X)
# Keep the original row labels: rows were dropped during cleaning, so a fresh
# 0..n-1 index would no longer line up with the labels carried by `y`.
X = pd.DataFrame(X_normalized, columns=X.columns, index=X.index)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def train_and_validate_model(X, y, test_size=0.2, random_state=42):
    """
        Fits a linear regression on a train split and prints its metrics.
        Input -> X              Feature matrix
                 y              Target values
                 test_size      Share of the rows held out for testing
                 random_state   Seed handed to the train/test split
        Output -> Tuple (test-set predictions, train-set predictions)
    """

    # Hold out part of the rows for validation
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    print(f'X_train: {features_train.shape}')
    print(f'X_test:  {features_test.shape}')
    print(f'y_train: {target_train.shape}')
    print(f'y_test:  {target_test.shape}')

    # Fit the regression on the training portion only
    regressor = LinearRegression()
    regressor.fit(features_train, target_train)

    # Training portion: predictions and R2
    predicted_train = regressor.predict(features_train)
    print(f"y_pred_train = {predicted_train}")
    train_r2 = r2_score(target_train, predicted_train)
    print(f"r2_train     = {train_r2}")

    # Held-out portion: predictions and R2
    predicted_test = regressor.predict(features_test)
    print(f"y_pred_test = {predicted_test}")
    test_r2 = r2_score(target_test, predicted_test)
    print(f"r2_test     = {test_r2}")

    # Error metrics on the held-out portion: MSE, its square root, and MAE
    squared_error = mean_squared_error(target_test, predicted_test)
    print(f'MSE = {squared_error}')
    root_squared_error = np.sqrt(squared_error)
    print(f'RMSE = {root_squared_error}')
    absolute_error = mean_absolute_error(target_test, predicted_test)
    print(f'MAE = {absolute_error}')

    return predicted_test, predicted_train

In [14]:
# Hold out 30% of the rows for testing; the split is seeded with 25.
y_pred_test, y_pred_train = train_and_validate_model(
    X, y, test_size=0.3, random_state=25)

X_train: (8162, 3)
X_test:  (3498, 3)
y_train: (8162,)
y_test:  (3498,)
y_pred_train = [69.44845587 65.12032111 75.6441522  ... 63.78600986 76.19613763
 59.97651889]
r2_train     = 0.8350156625500983
y_pred_test = [69.49622754 64.90059153 74.51893392 ... 68.78014526 66.2581453
 65.94207164]
r2_test     = 0.8393484312745522
MSE = 7.367439000622082
RMSE = 2.7143026729939463
MAE = 2.121328157898138
