# Fifa 21 Project

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score
import warnings

#ignore python warnings
warnings.filterwarnings("ignore")

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
%matplotlib inline

# Import dataset
dataset = pd.read_csv('fifa21_train.csv')

# Function to convert K and M to normal numbering
def value_to_float(x):
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0
    return float(x)

# Character replacing function
def remove_char(x,y, z=''):
    return x.apply(lambda d: d.replace(y, z))

# function to split the columns and sum the values
def split_columns(data):
    return pd.to_numeric(data.str.split('+',n=1,expand=True)[0]) + pd.to_numeric(data.str.split('+',n=1,expand=True)[1])

# Functions to apply python snake column names
def std_data(x, y='_'):
    x.columns = [e.lower().replace(' ','_') for e in x]
    return x

# Data = working dataset, drp = columns to drop
def fill_na(data, drp):
    
    # Dropping unwanted columns
    data = data.drop(columns=drp, axis=1)
    data = data[data['composure'].isna() == False]
    
    # Fill Nan's with values
#     data['position'].fillna(value=data['bp'], inplace=True)
#     data['club'].fillna(value='None', inplace=True)
    data = data.reset_index(drop=True)
    
    return data

# Function that encodes categoricals either with one hot encoder, or a mix of one hot encoder + label encoder
def cat_encode(data, exclude='', type_d=True):

    if type_d:
        tmp = data.drop(columns=exclude, axis=1)
        encoder = OneHotEncoder(drop='first').fit(tmp)
    else:
        encoder = global_encoder

    # Categorical encoder Label + onehot
    # If no fields to label encode are provided only the one hot encoder is executed
    if exclude == '':
        
        encoded = encoder.transform(data).toarray()
        cols = encoder.get_feature_names(input_features=data.columns)
        onehot_encoded = pd.DataFrame(encoded, columns=cols)
        
        return onehot_encoded
        
    else:
        
        # Label Encoder
        le = LabelEncoder()
        label_encoded = data[exclude]
        label_encoded[exclude] = label_encoded[exclude].apply(le.fit_transform)
        
        # One hot encoder
        tmp = data.drop(columns=exclude, axis=1)
        encoded = encoder.transform(tmp).toarray()
        cols = encoder.get_feature_names(input_features=tmp.columns)
        onehot_encoded = pd.DataFrame(encoded, columns=cols)
        
        return pd.concat([onehot_encoded, label_encoded], axis=1), encoder
    
# Function that builds the linear regression model
def build_model(data, target):

    # X-y Split
    y = data[target]
    X = data.drop([target], axis=1)
    
    transformer = MinMaxScaler().fit(X)
    X_normalized = transformer.transform(X)
    X_normalized = pd.DataFrame(X_normalized, columns=X.columns)
 
    display(X.head())
        
    #train test Splits
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

    # Creating the linear regression object and training it
    lm = linear_model.LinearRegression()
    lm.fit(X_train,y_train)

    # Making predictions with the taining and tes sub datasets to use in the evaluation section bellow
    predictions = lm.predict(X_train)
    predictions_test = lm.predict(X_test)

    # Results Validation using the previously computed calculations
    print('\n\nLinear Regression Performance Results\n')
    print('  R2 SCORE: Train', round(r2_score(y_train, predictions),3), ' | Test', round(r2_score(y_test, predictions_test), 3))
    print(' MSE SCORE: Train', round(mean_squared_error(y_train,predictions),3), ' | Test', round(mean_squared_error(y_test,predictions_test), 3))
    print('RMSE SCORE: Train', round(np.sqrt(mean_squared_error(y_train,predictions)),3), '| Test', round(np.sqrt(mean_squared_error(y_test,predictions_test)),3))
    print(' MAE SCORE: Train', round(mean_absolute_error(y_train, predictions),3), '| Test', round(mean_absolute_error(y_test, predictions_test), 3))

    # Printing just 5 results that we know from the Label set(y_train) and 5 predictions to check visualy  
    # the model working and the scores calculated above
    print('\n\nTraining Values')
    display(y_train[:5])
    print('Prediction Values')
    display(predictions[:5])
    
    return lm

# Function that does numerous dataset manipulation operations, all described individually
def data_crunch(data, columns_to_drop, currency_values, to_drop, plus_items, type_d=True):

    # Replacing all the column values that have currency type like €numberK/M to simple numbers
    for e in currency_values:
        data[e] = remove_char(data[e],'€')                # Removing the € character
        data[e] = data[e].apply(value_to_float)           # Converting the numbers to simple numbers

    # Converting the numbers of column 'hits' to simple numbers
    data['hits'] = data['hits'].apply(value_to_float)     
    
    # Converting weight_kg column from pounds to kg
    data['weight_kg'] = remove_char(data['weight_kg'],'lbs')
    data['weight_kg'] = data['weight_kg'].astype(float)*0.4532
    # Converting height_cm column from in to cm
    data['height_cm'] = remove_char(data['height_cm'],'"')
    data['height_cm'] = ((data.height_cm.str.split("'").str[0].astype(int) * 12) + (data.height_cm.str.split("'").str[1].astype(int)))*2.54
    
    # Removing ★ character from columns and converting to numerical type
    for e in to_drop:
        data[e] = remove_char(data[e],'★')
        data[e] = data[e].astype(int)
        
    # Summing the values in the columns that have structure like n1+n2
    for e in plus_items:
        for i in data.columns:
            #test if values of i and e are equal
            if i == e:
                #calling split function to make the sum
                data[e] = split_columns(data[e])
                
    # Getting numerical and categorical data separated
    numerical = data.select_dtypes(np.number)
    categorical = data.select_dtypes('object')

    # Encoding the categorical data, a/w and d/w will be label encoded and the remaining will be one hot encoded
    categorical, g_encoder = cat_encode(categorical, ['a/w', 'd/w'],type_d)
    
    # Ploting the correlations in numerical dataset
    correlations_matrix = numerical.corr()
    sns.set(rc = {'figure.figsize':(40,40)})
    sns.heatmap(correlations_matrix)
    plt.show()
    display(correlations_matrix)
    
    # Concat the numerical and categorical sets to be fitted to linear regression and return the dataset
    return pd.concat([numerical, categorical], axis=1), g_encoder
    

# This is our main function wich defines activelly all the manipulation to do on the dataset
# It has many variables that change everything in the operation, making it essy to change and see results almost imediatly
# without having to change any code at all.
# This function only needs the dataset as argument, and retuns the linear regression as intended.
def prep_data(data, type_d=True):
    
    # PARAMETERS TO AJUST THE DATASET /BEGIN
    # Variables to change parameters of cleaning
    columns_to_drop = ['loan_date_end', 'id', 'name','club','position' 'team_&_contract', 'growth', 'joined', 'contract',
                       'crossing','finishing','heading_accuracy','short_passing','volleys','dribbling','curve','fk_accuracy','long_passing',
                       'ball_control','acceleration','sprint_speed','agility','reactions','balance','shot_power','jumping','stamina','strength','long_shots',
                       'aggression','interceptions','positioning','vision','penalties','marking','standing_tackle','sliding_tackle',
                       'gk_diving','gk_handling','gk_kicking','gk_positioning','gk_reflexes','total_stats','pac','sho','pas','dri','def','phy',
                       'lwb','ldm','rdm','rwb','lb','lcb','rcb','rb', 'ls','rs','lf','rf','rw','cam','lm','nationality',
                       'st','lw','cf','lam','ram','lcm','cm','rcm', 'attacking', 'skill', 'cdm', 'value_euro'] # My sugestions only
    currency_values = ['wage_euro', 'release_clause_euro']
    # Special coluns to remove ★ character
    to_drop = ['w/f', 'sm', 'ir']
    # Columns to split by '+' and sum the two halves
    plus_items = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']
    # Target for the linear regression module
    target = 'ova'
    # PARAMETERS TO AJUST THE DATASET /END

    cleandata = std_data(data)
    cleandata.rename(columns = {'height':'height_cm', 'weight':'weight_kg', 'value':'value_euro', 'wage':'wage_euro', 'release_clause':'release_clause_euro'}, inplace = True)
    cleandata = fill_na(cleandata, columns_to_drop)
    
    results, g_encoder = data_crunch(cleandata, columns_to_drop, currency_values, to_drop, plus_items, type_d)
    
    return results, g_encoder

# Using the model
data, global_encoder = prep_data(dataset)
lm = build_model(data, 'ova')

KeyError: "['positionteam_&_contract'] not found in axis"

In [None]:
def evaluate (d1, d2):   
    #     # Printing just 5 results that we know from the Label set(y_train) and 5 predictions to check visualy  
    #     # the model working and the scores calculated above
    print('  R2 SCORE: Train', round(r2_score(d1, d2),3))
    print('RMSE SCORE: Train', round(np.sqrt(mean_squared_error(d1, d2)),3))
    print('Prediction Values')
    display(d2[:5])
    print('Real Values')
    display(d1[:5])
    return

In [None]:
dataset_v = pd.read_csv('fifa21_validate.csv')
data_v = prep_data(dataset_v, False)

# _set = data_v.drop(['ova'], axis=1)
 # Making predictions with the new dataset
predictions = lm.predict(data_v)

evaluate(dataset_v['ova'], predictions)

In [None]:
# !git add .
# !git status
# !git commit -m "Fifa 2021 Final "
# !git push