In [30]:
import pandas as pd  # keywords are highlighted in green, other strings in red, etc.
import numpy as np
import statistics as stats

In [31]:
file_1 = pd.read_csv('fifa21_train.csv')
file_1.shape

(11701, 101)

## Data Cleaning

def split_str(categoricals):   
    for column in categoricals:
        if [categoricals[column].isin(['+-'])]:
            categoricals[column] = categoricals[column].str.split('+').str[0]
        elif [(categoricals[column].isin(['-']))]:
            categoricals[column] = categoricals[column].str.split('-').str[0]
        else: 
            categoricals[column] = categoricals[column].str.split('+').str[0]

    return categoricals

In [32]:
def split_str(df): 
    
    lst = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF',
       'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']
    
    for i in lst:
        df[[i, i + '_pot']] = df[i].str.split('+', expand=True)
        df[i + '_pot'] = df[i + '_pot'].astype('int') + df[i].astype('int')
        df[i + '_pot'] = df[i + '_pot'].astype('int')
        df[i] = df[i].astype('int')
    
    return df

In [33]:
def remove_char(categoricals):
    specialchars = ['€','★',"lbs"]
    for char in specialchars:
        for column in categoricals:
            categoricals[column] = categoricals[column].replace(specialchars,'', regex=True)
    
    return categoricals #

In [34]:
def height_to_cm(categoricals):
    H_feet = categoricals['Height'].str.split("'").str[0]
    H_inch = categoricals['Height'].str.split("'").str[1].str.split("\"").str[0]
    categoricals['Height'] = (H_feet.astype(float) * 12) + H_inch.astype(float)
    return categoricals #

In [35]:
def K_M_multiply(categoricals):
    categoricals['Release_Clause'] = categoricals['Release_Clause'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    categoricals['Wage'] = categoricals['Wage'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    categoricals['Value'] = categoricals['Value'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    categoricals['Hits'] = categoricals['Hits'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    return categoricals #

In [36]:
def object_to_num(categoricals):
    col_to_numercic = ['Weight', 'Value', 'Wage', 'Release_Clause', 'W/F', 'SM', 'IR', 'Hits']
    for i in range(len(col_to_numercic)):
        categoricals[col_to_numercic[i]] =  pd.to_numeric(categoricals[col_to_numercic[i]])
    return categoricals #

In [37]:
file_1['LS'].head(5)

0    58+1
1    77+0
2    73+2
3    50+2
4    56+2
Name: LS, dtype: object

In [38]:
def data_clear(file):
    file.columns = list(map(lambda x: x.replace(" ","_"), file.columns))
    file=file.drop(['Club', 'BP', 'ID', 'Name', 'Nationality', 'Joined','Contract', 'Team_&_Contract', 'Loan_Date_End', 'Position'], axis=1 )
    file=file.dropna()
    file = file.drop_duplicates()
    numerical = file._get_numeric_data()
    categoricals = file.select_dtypes(['object'])
    categoricals = split_str(categoricals) #    
    categoricals = remove_char(categoricals) #
    categoricals = K_M_multiply(categoricals) #
    categoricals = height_to_cm(categoricals) #
    categoricals = object_to_num(categoricals) #

      
    cleaned_data = pd.concat([numerical, categoricals], axis=1)
    cleaned_data_drop_OVA = cleaned_data.drop(['OVA'], axis=1)
    
    numerical = cleaned_data._get_numeric_data() #
    categoricals = cleaned_data.select_dtypes(['object']) #

    return cleaned_data, cleaned_data_drop_OVA, categoricals, numerical #

In [39]:
cleaned_data, cleaned_data_drop_OVA, categoricals, numerical = data_clear(file_1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i + '_pot'] = df[i + '_pot'].astype('int') + df[i].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i + '_pot'] = df[i + '_pot'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [40]:
numerical['LS'].head(5)

0    58
1    77
2    73
3    50
4    56
Name: LS, dtype: int32

## Data Encoding and Normalization

In [41]:
def encoding(cleaned_data_drop_OVA):
    from sklearn.preprocessing import OneHotEncoder
    categoricals = cleaned_data_drop_OVA.select_dtypes(['object'])
    encoder = OneHotEncoder(drop='first').fit(categoricals)
    encoded = encoder.transform(categoricals).toarray()
    cols = encoder.get_feature_names(input_features=categoricals.columns)
    onehot_encoded = pd.DataFrame(encoded, columns=cols)
    onehot_encoded.head()
    print(onehot_encoded.shape)
    
    return onehot_encoded

In [42]:
def normalizing(cleaned_data_drop_OVA):
    from sklearn.preprocessing import MinMaxScaler # do not use the function Normalise() - it does something entirely different
    from sklearn.preprocessing import StandardScaler
    numerical = cleaned_data_drop_OVA._get_numeric_data()
    transformer = MinMaxScaler().fit(numerical)
    x_normalized = transformer.transform(numerical)
    x_normalized=pd.DataFrame(x_normalized, columns=numerical.columns)
    x_normalized
    print(x_normalized.shape)

    return x_normalized

In [43]:
onehot_encoded_for_p=encoding(cleaned_data_drop_OVA)
x_normalized=normalizing(cleaned_data_drop_OVA)

(11422, 5)
(11422, 114)


## Defining X,y

In [44]:
y = numerical['OVA']
X = numerical.drop(['OVA'], axis=1)

In [45]:
X_concat = pd.concat([x_normalized, onehot_encoded_for_p], axis=1)
X_concat.shape

(11422, 119)

## TRAINING THE MODEL

In [46]:
# train test split is the way ML generates its claim to fame: 
# we build the model on a portion of the data but we then validate it in 
# another "fresh" portion
# our model has no opportunity to "cheat": it must accurately guess the values 
# in the "fresh" dataset that it never saw before
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_concat, y, test_size=0.2, random_state=42)

In [47]:
from sklearn import linear_model
#we train/fit our model like yesterday
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()

In [48]:
from sklearn.metrics import r2_score
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

0.9194030890890672

In [49]:
# But now we evaluate it in the TEST portion of the data, that we did not use for training.
# This way we know our model is genuinely guessing our donations, not just repeating the values it has seen in the training data

predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)

0.9139121810474327

In [50]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test,predictions_test))

2.008139525752156

In [51]:
mse=mean_squared_error(y_test,predictions_test)
mse

4.032624354888094

In [52]:
#predict and inspect results
results_for_p = lm.predict(X_concat)

pd.concat([file_1['OVA'],pd.Series(results_for_p)],axis=1).head()
#dont retrain transformers

Unnamed: 0,OVA,0
0,64,64.868516
1,77,77.165639
2,80,76.485419
3,59,61.892155
4,65,67.23464


In [53]:
y.describe().T

count    11422.000000
mean        66.876642
std          6.845047
min         45.000000
25%         62.000000
50%         67.000000
75%         72.000000
max         93.000000
Name: OVA, dtype: float64

## VALIDATION

In [54]:
file_2 = pd.read_csv('fifa21_validate.csv')
file_2.shape

(1999, 101)

In [55]:
cleaned_data, cleaned_data_drop_OVA, categoricals, numerical = data_clear(file_2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i + '_pot'] = df[i + '_pot'].astype('int') + df[i].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i + '_pot'] = df[i + '_pot'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [56]:
onehot_encoded_for_p = encoding(cleaned_data_drop_OVA)
x_normalized = normalizing(cleaned_data_drop_OVA)

(1949, 5)
(1949, 114)


In [57]:
y = numerical['OVA']
X = numerical.drop(['OVA'], axis=1)

In [58]:
X_concat = pd.concat([x_normalized, onehot_encoded_for_p], axis=1)
X_concat.shape

(1949, 119)

In [59]:
#predict and inspect results
results_for_p = lm.predict(X_concat)

pd.concat([file_2['OVA'],pd.Series(results_for_p)],axis=1).head()
#dont retrain transformers

Unnamed: 0,OVA,0
0,67,63.659085
1,68,63.995747
2,54,50.410171
3,55,55.176153
4,70,68.554264


In [60]:
from sklearn.metrics import mean_absolute_error, r2_score

In [61]:
mae = mean_absolute_error(y_test, predictions_test)
print(mae)

1.5575777984230337


In [62]:
mse=mean_squared_error(y_test,predictions_test)

In [63]:
import math

rmse = math.sqrt(mse)
print(rmse)

2.008139525752156


In [64]:
r2 = r2_score(y_test, predictions_test)
r2

0.9139121810474327