In [144]:
import pandas as pd
import statistics as stats
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder  
from sklearn.model_selection import train_test_split

In [145]:
file = pd.read_csv('fifa21_train.csv')

## Data Cleaning

In [146]:
def split_str(categoricals):   
    for column in categoricals:
        if [categoricals[column].isin(['+-'])]:
            categoricals[column] = categoricals[column].str.split('+').str[0]
        elif [(categoricals[column].isin(['-']))]:
            categoricals[column] = categoricals[column].str.split('-').str[0]
        else: 
            categoricals[column] = categoricals[column].str.split('+').str[0]
    return categoricals

def split_str(df): 
    
    lst = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF',
       'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']
    
    for i in lst:
        df[[i, i + '_pot']] = df[i].str.split('+', expand=True)
        df[i + '_pot'] = df[i + '_pot'].astype('int') + df[i].astype('int')
        df[i + '_pot'] = df[i + '_pot'].astype('int')
        df[i] = df[i].astype('int')
    
    return df

In [147]:
def remove_char(categoricals):
    specialchars = ['€','★',"lbs"]
    for char in specialchars:
        for column in categoricals:
            categoricals[column] = categoricals[column].replace(specialchars,'', regex=True)
    return categoricals

In [148]:
def height_to_cm(categoricals):
    H_feet = categoricals['Height'].str.split("'").str[0]
    H_inch = categoricals['Height'].str.split("'").str[1].str.split("\"").str[0]
    categoricals['Height'] = (H_feet.astype(float) * 30.48) + (H_inch.astype(float) * 2.54)
    return categoricals

In [149]:
def K_M_multiply(categoricals):
    categoricals['Release_Clause'] = categoricals['Release_Clause'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    categoricals['Wage'] = categoricals['Wage'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    categoricals['Value'] = categoricals['Value'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    categoricals['Hits'] = categoricals['Hits'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    return categoricals

In [150]:
def object_to_num(categoricals):
    col_to_numercic = ['Weight', 'Value', 'Wage', 'Release_Clause', 'W/F', 'SM', 'IR', 'Hits', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']
    for i in range(len(col_to_numercic)):
        categoricals[col_to_numercic[i]] =  pd.to_numeric(categoricals[col_to_numercic[i]])
    return categoricals

In [151]:
def data_clear(file):
    file.columns = list(map(lambda x: x.replace(" ","_"), file.columns))
    file=file.drop(['Club', 'BP','ID', 'Name', 'Nationality', 'Joined','Contract', 'Team_&_Contract', 'Loan_Date_End', 'Position'], axis=1 )
    file=file.dropna()
    file = file.drop_duplicates()
    numerical = file._get_numeric_data()
    categoricals = file.select_dtypes(['object'])
    split_str(categoricals)
    remove_char(categoricals)
    K_M_multiply(categoricals)
    height_to_cm(categoricals)
    object_to_num(categoricals)
    cleaned_data = pd.concat([numerical, categoricals], axis=1)
    return cleaned_data

In [152]:
cleaned_data = data_clear(file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categoricals[column] = categoricals[column].str.split('+').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categoricals[column] = categoricals[column].replace(specialchars,'', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categoricals['Release_Clause'] = categoricals['Release_Cla

## Data Encoding and Normalization

In [153]:
y = cleaned_data['OVA']

In [154]:

from sklearn.preprocessing import OneHotEncoder
categoricals = cleaned_data.select_dtypes(['object'])
encoder = OneHotEncoder(drop='first').fit(categoricals)
encoded = encoder.transform(categoricals).toarray()
cols = encoder.get_feature_names(input_features=categoricals.columns)
onehot_encoded = pd.DataFrame(encoded, columns=cols)

In [155]:

from sklearn.preprocessing import MinMaxScaler
cleaned_data = cleaned_data.drop(['OVA'], axis=1)
numerical = cleaned_data._get_numeric_data()
transformer = MinMaxScaler().fit(numerical)
X_normalized = transformer.transform(numerical)
X_normalized = pd.DataFrame(X_normalized, columns=numerical.columns)


In [156]:
#onehot_encoded = encoding(cleaned_data)
#X_normalized = normalizing(cleaned_data)

In [157]:
X_concat = pd.concat([X_normalized, onehot_encoded], axis=1)

## Defining X,y

In [158]:

X = X_concat

## Training the model

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [160]:
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()

In [161]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(9137, 92)
(2285, 92)
(9137,)
(2285,)


In [162]:
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

0.9144438600725543

In [163]:
predictions = lm.predict(X_test)
r2_score(y_test, predictions)

0.9087974109870399

In [164]:
results = lm.predict(X)

results_OVA = pd.concat([file,pd.Series(results)],axis=1)

In [165]:
results_OVA.head()

Unnamed: 0,ID,Name,Age,Nationality,Club,BP,Position,Team_&_Contract,Height,Weight,...,RDM,RWB,LB,LCB,CB,RCB,RB,GK,OVA,0
0,184383,A. Pasche,26,Switzerland,FC Lausanne-Sport,CM,CM CDM,FC Lausanne-Sport 2015 ~ 2020,"5'9""",161lbs,...,59+1,59+1,58+1,54+1,54+1,54+1,58+1,15+1,64,64.931946
1,188044,Alan Carvalho,30,China PR,Beijing Sinobo Guoan FC,ST,ST LW LM,"Beijing Sinobo Guoan FC Dec 31, 2020 On Loan","6'0""",159lbs,...,53+2,57+2,53+2,48+2,48+2,48+2,53+2,18+2,77,77.66304
2,184431,S. Giovinco,33,Italy,Al Hilal,CAM,CAM CF,Al Hilal 2019 ~ 2022,"5'4""",134lbs,...,56+2,59+2,53+2,41+2,41+2,41+2,53+2,12+2,80,75.848694
3,233796,J. Evans,22,Wales,Swansea City,CDM,CDM CM,Swansea City 2016 ~ 2021,"5'10""",152lbs,...,58+2,56+2,57+2,58+2,58+2,58+2,57+2,14+2,59,61.909637
4,234799,Y. Demoncy,23,France,US Orléans Loiret Football,CDM,CDM CM,US Orléans Loiret Football 2018 ~ 2021,"5'11""",150lbs,...,64+2,64+2,63+2,61+2,61+2,61+2,63+2,15+2,65,67.096069


#### MSE

In [166]:
mse = mean_squared_error(y_test,predictions)
print(mse)

4.272216280507172


#### RMSE

In [167]:
rmse = math.sqrt(mse)
print(rmse)

2.066934029065072


#### MAE

In [168]:
mae = mean_absolute_error(y_test, predictions)
print(mae)

1.611789067308021


## VALIDATION

In [169]:
file_val = pd.read_csv('fifa21_validate.csv')

In [170]:
file_val.shape

(1999, 101)

In [171]:
cleaned_val = data_clear(file_val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categoricals[column] = categoricals[column].str.split('+').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categoricals[column] = categoricals[column].replace(specialchars,'', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categoricals['Release_Clause'] = categoricals['Release_Cla

In [172]:
cleaned_val.isna().sum()

Age          0
Growth       0
Attacking    0
Crossing     0
Finishing    0
            ..
LCB          0
CB           0
RCB          0
RB           0
GK           0
Length: 91, dtype: int64

In [173]:
cleaned_val.shape

(1949, 91)

In [174]:
categoricals_val = cleaned_val.select_dtypes(['object'])
encoded_val = encoder.transform(categoricals_val).toarray()
onehot_encoded_val = pd.DataFrame(encoded_val, columns=cols)

In [175]:
onehot_encoded_val.isna().sum()

foot_Right    0
A/W_Low       0
A/W_Medium    0
D/W_Low       0
D/W_Medium    0
dtype: int64

In [176]:
onehot_encoded_val.shape

(1949, 5)

In [177]:
y_val = cleaned_val['OVA']
cleaned_val = cleaned_val.drop(['OVA'], axis=1)
numerical_val = cleaned_val._get_numeric_data()
X_normalized_val = transformer.transform(numerical_val)
X_normalized_val = pd.DataFrame(X_normalized_val, columns=numerical_val.columns)

In [178]:
X_normalized_val.isna().sum()

Age          0
Growth       0
Attacking    0
Crossing     0
Finishing    0
            ..
LCB          0
CB           0
RCB          0
RB           0
GK           0
Length: 87, dtype: int64

In [179]:
X_normalized_val.shape

(1949, 87)

In [180]:
X_concat_val = pd.concat([X_normalized_val, onehot_encoded_val], axis=1)

In [181]:
X_concat_val.isna().sum()

Age           0
Growth        0
Attacking     0
Crossing      0
Finishing     0
             ..
foot_Right    0
A/W_Low       0
A/W_Medium    0
D/W_Low       0
D/W_Medium    0
Length: 92, dtype: int64

In [182]:
X_concat_val.head()

Unnamed: 0,Age,Growth,Attacking,Crossing,Finishing,Heading_Accuracy,Short_Passing,Volleys,Skill,Dribbling,...,LCB,CB,RCB,RB,GK,foot_Right,A/W_Low,A/W_Medium,D/W_Low,D/W_Medium
0,0.259259,0.269231,0.475949,0.465909,0.195652,0.647727,0.590361,0.418605,0.433255,0.428571,...,0.708333,0.708333,0.708333,0.681159,0.088608,1.0,1.0,0.0,0.0,0.0
1,0.222222,0.192308,0.648101,0.636364,0.619565,0.443182,0.614458,0.767442,0.672131,0.78022,...,0.305556,0.305556,0.305556,0.449275,0.101266,1.0,0.0,0.0,1.0,0.0
2,0.111111,0.653846,0.01519,0.011364,0.021739,0.068182,0.120482,0.0,0.021077,0.010989,...,0.027778,0.027778,0.027778,0.028986,0.556962,1.0,0.0,1.0,0.0,1.0
3,0.0,0.884615,0.437975,0.363636,0.304348,0.568182,0.578313,0.325581,0.423888,0.505495,...,0.527778,0.527778,0.527778,0.565217,0.025316,1.0,0.0,1.0,0.0,1.0
4,0.296296,0.192308,0.640506,0.579545,0.608696,0.454545,0.807229,0.604651,0.665105,0.725275,...,0.722222,0.722222,0.722222,0.797101,0.113924,1.0,0.0,1.0,0.0,1.0


In [183]:
X_val = X_concat_val

In [187]:
results_val = lm.predict(X_val)
pd.concat([cleaned_val,pd.Series(results_val)],axis=1)

Unnamed: 0,Age,Growth,Attacking,Crossing,Finishing,Heading_Accuracy,Short_Passing,Volleys,Skill,Dribbling,...,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,0
0,23.0,7.0,230.0,47.0,21.0,62.0,60.0,40.0,228.0,44.0,...,61.0,61.0,59.0,61.0,67.0,67.0,67.0,61.0,16.0,65.534286
1,22.0,5.0,298.0,62.0,60.0,44.0,62.0,70.0,330.0,76.0,...,47.0,47.0,49.0,45.0,38.0,38.0,38.0,45.0,17.0,65.427765
2,19.0,17.0,48.0,7.0,5.0,11.0,21.0,4.0,52.0,6.0,...,18.0,18.0,16.0,16.0,18.0,18.0,18.0,16.0,53.0,53.865875
3,16.0,23.0,215.0,38.0,31.0,55.0,59.0,32.0,224.0,51.0,...,54.0,54.0,53.0,53.0,54.0,54.0,54.0,53.0,11.0,57.799637
4,24.0,5.0,295.0,57.0,59.0,45.0,78.0,56.0,327.0,71.0,...,72.0,72.0,70.0,69.0,68.0,68.0,68.0,69.0,18.0,69.562195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,21.0,11.0,197.0,59.0,23.0,42.0,51.0,22.0,207.0,51.0,...,52.0,52.0,58.0,58.0,54.0,54.0,54.0,58.0,15.0,
1995,21.0,9.0,77.0,13.0,9.0,14.0,34.0,7.0,70.0,9.0,...,23.0,23.0,20.0,20.0,22.0,22.0,22.0,20.0,58.0,
1996,27.0,0.0,332.0,76.0,72.0,34.0,79.0,71.0,373.0,77.0,...,60.0,60.0,62.0,57.0,47.0,47.0,47.0,57.0,18.0,
1997,34.0,0.0,268.0,58.0,44.0,61.0,62.0,43.0,261.0,56.0,...,62.0,62.0,60.0,60.0,63.0,63.0,63.0,60.0,14.0,


In [188]:
predictions_val = lm.predict(X_val)
r2_score(y_val, predictions_val)

0.910747062881108

In [189]:
mse = mean_squared_error(y_val,predictions_val)
print(mse)
rmse = math.sqrt(mse)
print(rmse)
mae = mean_absolute_error(y_val, predictions_val)
print(mae)

4.037813654405377
2.0094311768272575
1.5734840968500472
