In [280]:
import pandas as pd  # keywords are highlighted in green, other strings in red, etc.
import numpy as np
import statistics as stats

In [281]:
file = pd.read_csv('fifa21_train.csv')

## Data Cleaning

In [282]:
def split_str(categoricals):   
    for column in categoricals:
        if [categoricals[column].isin(['+-'])]:
            categoricals[column] = categoricals[column].str.split('+').str[0]
        elif [(categoricals[column].isin(['-']))]:
            categoricals[column] = categoricals[column].str.split('-').str[0]
        else: 
            categoricals[column] = categoricals[column].str.split('+').str[0]

In [283]:
def remove_char(categoricals):
    specialchars = ['€','★',"lbs"]
    for char in specialchars:
        for column in categoricals:
            categoricals[column] = categoricals[column].replace(specialchars,'', regex=True)
    return categoricals

In [284]:
def height_to_cm(categoricals):
    H_feet = categoricals['Height'].str.split("'").str[0]
    H_inch = categoricals['Height'].str.split("'").str[1].str.split("\"").str[0]
    categoricals['Height'] = (H_feet.astype(float) * 12) + H_inch.astype(float)
    return categoricals

In [285]:
def K_M_multiply(categoricals):
    categoricals['Release_Clause'] = categoricals['Release_Clause'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    categoricals['Wage'] = categoricals['Wage'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    categoricals['Value'] = categoricals['Value'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
    categoricals['Hits'] = categoricals['Hits'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)

In [286]:
def object_to_num(categoricals):
    col_to_numercic = ['Weight', 'Value', 'Wage', 'Release_Clause', 'W/F', 'SM', 'IR', 'Hits', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']
    for i in range(len(col_to_numercic)):
        categoricals[col_to_numercic[i]] =  pd.to_numeric(categoricals[col_to_numercic[i]])
    return categoricals

In [290]:
def data_clear(file):
    file.columns = list(map(lambda x: x.replace(" ","_"), file.columns))
    file=file.drop(['Club', 'BP','ID', 'Name', 'Nationality', 'Joined','Contract', 'Team_&_Contract', 'Loan_Date_End', 'Position'], axis=1 )
    file=file.dropna()
    file = file.drop_duplicates()
    numerical = file._get_numeric_data()
    categoricals = file.select_dtypes(['object'])
    split_str(categoricals)
    remove_char(categoricals)
    K_M_multiply(categoricals)
    height_to_cm(categoricals)
    object_to_num(categoricals)
    cleaned_data = pd.concat([numerical, categoricals], axis=1)
    
    return cleaned_data

In [None]:
cleaned_data = data_clear(file)
numerical_2 = cleaned_data._get_numeric_data()
categoricals_2= cleaned_data.select_dtypes(['object'])

## Data Encoding and Normalization

In [None]:
def encoding(cleaned_data):
    from sklearn.preprocessing import OneHotEncoder
    categoricals = cleaned_data.select_dtypes(['object'])
    encoder = OneHotEncoder().fit(categoricals)
    encoded_for_p = encoder.transform(categoricals).toarray()
    encoded_for_p
    encoder.categories_

    cols=[colname for row in encoder.categories_ for colname in row]
    cols
    onehot_encoded_for_p = pd.DataFrame(encoded_for_p,columns=cols)
    cols_to_drop=[row[0] for row in encoder.categories_]
    cols_to_drop
    onehot_encoded_for_p = onehot_encoded_for_p.drop(cols_to_drop,axis=1)
    onehot_encoded_for_p

In [299]:
def normalizing(cleaned_data):
    from sklearn.preprocessing import MinMaxScaler # do not use the function Normalise() - it does something entirely different
    from sklearn.preprocessing import StandardScaler
    numerical = cleaned_data._get_numeric_data()
    transformer = MinMaxScaler().fit(numerical)
    x_normalized = transformer.transform(numerical)
    print(numerical_normalized.shape)
    x_normalized=pd.DataFrame(x_normalized, columns=X.columns)
    x_normalized

## Defining X,y

In [297]:
y = numerical_2['OVA']
X = numerical_2.drop(['OVA'],axis=1)

Unnamed: 0,Right,Low,Medium,Low.1,Medium.1
0,1.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,1.0
4,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...
11417,1.0,0.0,0.0,1.0,0.0
11418,0.0,0.0,1.0,0.0,1.0
11419,1.0,0.0,1.0,0.0,1.0
11420,1.0,0.0,0.0,0.0,1.0


(11422, 87)


Unnamed: 0,Age,Growth,Attacking,Crossing,Finishing,Heading_Accuracy,Short_Passing,Volleys,Skill,Dribbling,...,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK
0,0.370370,0.038462,0.546835,0.545455,0.478261,0.431818,0.710843,0.465116,0.569087,0.615385,...,0.614286,0.614286,0.614286,0.642857,0.637681,0.527778,0.527778,0.527778,0.637681,0.075949
1,0.518519,0.000000,0.817722,0.681818,0.826087,0.806818,0.686747,0.837209,0.777518,0.857143,...,0.528571,0.528571,0.528571,0.614286,0.565217,0.444444,0.444444,0.444444,0.565217,0.113924
2,0.629630,0.000000,0.744304,0.761364,0.793478,0.329545,0.807229,0.825581,0.892272,0.879121,...,0.571429,0.571429,0.571429,0.642857,0.565217,0.347222,0.347222,0.347222,0.565217,0.037975
3,0.222222,0.500000,0.506329,0.431818,0.423913,0.602273,0.614458,0.372093,0.505855,0.538462,...,0.600000,0.600000,0.600000,0.600000,0.623188,0.583333,0.583333,0.583333,0.623188,0.063291
4,0.259259,0.307692,0.524051,0.488636,0.369565,0.636364,0.686747,0.348837,0.555035,0.648352,...,0.685714,0.685714,0.685714,0.714286,0.710145,0.625000,0.625000,0.625000,0.710145,0.075949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11417,0.481481,0.000000,0.627848,0.340909,0.750000,0.681818,0.554217,0.651163,0.562061,0.626374,...,0.414286,0.414286,0.414286,0.457143,0.434783,0.375000,0.375000,0.375000,0.434783,0.126582
11418,0.333333,0.192308,0.460759,0.636364,0.228261,0.522727,0.578313,0.279070,0.480094,0.571429,...,0.628571,0.628571,0.628571,0.700000,0.710145,0.625000,0.625000,0.625000,0.710145,0.088608
11419,0.407407,0.153846,0.106329,0.068182,0.119565,0.090909,0.216867,0.139535,0.124122,0.131868,...,0.142857,0.142857,0.142857,0.157143,0.144928,0.138889,0.138889,0.138889,0.144928,0.759494
11420,0.222222,0.307692,0.617722,0.659091,0.684783,0.522727,0.638554,0.430233,0.580796,0.725275,...,0.400000,0.400000,0.400000,0.500000,0.449275,0.263889,0.263889,0.263889,0.449275,0.101266


In [None]:
X_concat = pd.concat([x_normalized, onehot_encoded_for_p], axis=1)

In [None]:
# train test split is the way ML generates its claim to fame: 
# we build the model on a portion of the data but we then validate it in 
# another "fresh" portion
# our model has no opportunity to "cheat": it must accurately guess the values 
# in the "fresh" dataset that it never saw before
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_concat, y, test_size=0.2, random_state=42)

In [None]:
from sklearn import linear_model
#we train/fit our model like yesterday
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

In [None]:
from sklearn.metrics import r2_score
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

In [None]:
# But now we evaluate it in the TEST portion of the data, that we did not use for training.
# This way we know our model is genuinely guessing our donations, not just repeating the values it has seen in the training data

predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)

In [None]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test,predictions_test))

In [None]:
mse=mean_squared_error(y_test,predictions_test)
mse

In [None]:
#predict and inspect results
results_for_p = lm.predict(X_concat)

pd.concat([file,pd.Series(results_for_p)],axis=1).head()
#dont retrain transformers

In [None]:
y.describe().T

## VALIDATION

In [None]:
file = pd.read_csv('fifa21_validate.csv')
file.shape

In [None]:
file.columns = list(map(lambda x: x.replace(" ","_"), file.columns))
file.head()

In [None]:
file=file.drop(['ID', 'Name', 'Nationality', 'Joined','Contract', 'Team_&_Contract', 'Loan_Date_End', 'Position'], axis=1 )
file=file.dropna()
file = file.drop_duplicates()

In [None]:
numerical = file._get_numeric_data()
categoricals= file.select_dtypes(['object'])

In [None]:
split()

In [None]:
remove_char(categoricals)

In [None]:
categoricals['Release_Clause'] = categoricals['Release_Clause'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
categoricals['Wage'] = categoricals['Wage'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
categoricals['Value'] = categoricals['Value'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)
categoricals['Hits'] = categoricals['Hits'].replace({'€': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(float)


In [None]:
H_feet = categoricals['Height'].str.split("'").str[0]
H_inch = categoricals['Height'].str.split("'").str[1].str.split("\"").str[0]

categoricals['Height'] = (H_feet.astype(float) * 12) + H_inch.astype(float)


In [None]:
col_to_numercic = ['Weight', 'Value', 'Wage', 'Release_Clause', 'W/F', 'SM', 'IR', 'Hits', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']

for i in range(len(col_to_numercic)):
    categoricals[col_to_numercic[i]] =  pd.to_numeric(categoricals[col_to_numercic[i]])
#categoricals[col_to_numercic[i]] = [col_to_numercic[i]].map(pd.eval).astype(float)

In [None]:
file_2 = pd.concat([numerical, categoricals], axis=1)
numerical_2 = file_2._get_numeric_data()
categoricals_2= file_2.select_dtypes(['object'])
y = numerical_2['OVA']
X = numerical_2.drop(['OVA'],axis=1)

In [None]:
categoricals_2 = categoricals_2.drop(['Club', 'BP'], axis=1)
categoricals_2.head(5)

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder().fit(categoricals_2)
encoded_for_p = encoder.transform(categoricals_2).toarray()
encoded_for_p
encoder.categories_

cols=[colname for row in encoder.categories_ for colname in row]
cols
onehot_encoded_for_p = pd.DataFrame(encoded_for_p,columns=cols)
cols_to_drop=[row[0] for row in encoder.categories_]
cols_to_drop
onehot_encoded_for_p = onehot_encoded_for_p.drop(cols_to_drop,axis=1)
onehot_encoded_for_p

In [None]:
from sklearn.preprocessing import MinMaxScaler # do not use the function Normalise() - it does something entirely different
from sklearn.preprocessing import StandardScaler

# Normalizing data: make data range from 0 - 1, instead of from min to max
transformer = MinMaxScaler().fit(X)
x_normalized = transformer.transform(X)
print(x_normalized.shape)
x_normalized=pd.DataFrame(x_normalized, columns=X.columns)
x_normalized

In [None]:
X_concat = pd.concat([x_normalized, onehot_encoded_for_p], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_concat, y, test_size=0.2, random_state=42)

In [None]:
from sklearn import linear_model
#we train/fit our model like yesterday
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

In [None]:
from sklearn.metrics import r2_score
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

In [None]:
predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)

In [None]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test,predictions_test))

In [None]:
mse=mean_squared_error(y_test,predictions_test)
mse

In [None]:
#predict and inspect results
results_for_p = lm.predict(X_concat)

pd.concat([file,pd.Series(results_for_p)],axis=1).head()
#dont retrain transformers