# FIFA Case Study - Liga Portugal

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns 
import time
from datetime import date
from sklearn import linear_model

In [None]:
# Open Colab's upload prompt; the returned mapping is indexed by file name below.
from google.colab import files
uploaded = files.upload()

In [None]:
import io
# Wrap the uploaded 'fifa21_train.csv' payload in a file-like buffer for read_csv.
fifa_raw = pd.read_csv(io.BytesIO(uploaded['fifa21_train.csv']))

## Data cleaning - pre-processing

In [None]:
# Re-read the training CSV from a relative path; this rebinds fifa_raw loaded above.
fifa_raw = pd.read_csv('fifa21_train.csv')

In [None]:
# Correlate the numeric columns only: the raw frame also holds text columns,
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
# skipping them.
correlations_matrix = fifa_raw.select_dtypes(include=np.number).corr()
sns.set(rc={'figure.figsize': (40, 25)})
sns.heatmap(correlations_matrix, annot=True)
plt.show()

In [None]:
# Keep only the columns used for modelling; the target 'OVA' is listed last.
fifa_new_columns = fifa_raw.loc[:, [
    'Height', 'Weight', 'foot', 'Growth', 'Joined', 'Value', 'Wage',
    'Attacking', 'Skill', 'Movement', 'Reactions', 'Power', 'Jumping',
    'Strength', 'Mentality', 'Composure', 'Goalkeeping', 'Total Stats',
    'Base Stats', 'W/F', 'SM', 'A/W', 'D/W', 'IR',
    'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY',
    'Hits', 'OVA',
]]

In [None]:
# Number of fully duplicated rows in the selected columns.
fifa_new_columns.duplicated().sum()

In [None]:
# Count of missing values per column.
fifa_new_columns.isna().sum()

In [None]:
# Drop every row that has at least one missing value.
fifa_NaNs_removed = fifa_new_columns.dropna(axis=0)
# A bare `.shape` expression in the middle of a cell is never displayed,
# so print it explicitly.
print(fifa_NaNs_removed.shape)
fifa_NaNs_removed.head()

In [None]:
# Correlate the numeric columns only: several columns are still text at this
# point, and DataFrame.corr() raises on those in pandas >= 2.0.
correlations_matrix = fifa_NaNs_removed.select_dtypes(include=np.number).corr()
sns.set(rc={'figure.figsize': (25, 10)})
sns.heatmap(correlations_matrix, annot=True)
plt.show()

In [None]:
# Convert height (into inches), weight (remove 'lbs'), value (remove the € sign and expand the K/M suffixes into zeros) and wage (ditto) to integers -- then normalise value by contract length

# Convert Joined into DateTime and use year -- this will allow us to slice into subsamples based on experience level <2, 2-5, >5 (if we want to)

# Convert contract into contract length

# Remove stars from W/F, SM, IR

# Hits coerce to numerical

## Data cleaning - processing

In [None]:
# Work on an independent copy so the NaN-free frame stays untouched.
fifa_clean_val = fifa_NaNs_removed.copy(deep=True)

In [None]:
# Remove the 'lbs' unit from the weight strings.
fifa_clean_val['Weight'] = fifa_clean_val['Weight'].replace({r'lbs': ''}, regex=True)

In [None]:
def parse_ht(ht):
    """Convert a feet'inches" height string (e.g. 5'10") to total inches."""
    pieces = ht.split("'")
    feet = int(pieces[0])
    # The inches part may carry a trailing double quote; drop it before parsing.
    inches = int(pieces[1].replace('"', ''))
    return feet * 12 + inches

# Convert every height string to inches.
fifa_clean_val['Height'] = fifa_clean_val['Height'].map(parse_ht)

In [None]:
cols = ['Value', 'Wage']

# '€' has no special meaning in a regex, so it needs no backslash. Writing it
# as a backslash escape inside a normal string literal is an invalid escape
# sequence (a SyntaxWarning on Python 3.12+).
fifa_clean_val[cols] = fifa_clean_val[cols].replace({'€': ''}, regex=True)

In [None]:
def value_to_int(x):
    """Convert an amount string such as '500', '1.6K' or '2.5M' to an int.

    A 'K' suffix multiplies by 1,000 and an 'M' suffix by 1,000,000; a plain
    numeric string is converted directly.

    Non-string inputs (already numeric values, NaN) and strings that cannot be
    parsed are returned unchanged, so a later ``pd.to_numeric(...,
    errors='coerce')`` or ``astype`` call can deal with them.
    """
    if not isinstance(x, str):
        return x

    text = x.strip()
    # 'K' is checked before 'M', matching the original precedence.
    for suffix, factor in (('K', 1000), ('M', 1000000)):
        if suffix in text:
            # round() guards against float artefacts such as 569.9999999.
            return int(round(float(text.replace(suffix, '')) * factor))

    try:
        return int(round(float(text)))
    except ValueError:
        return x

In [None]:
# Expand the K/M suffixes in each amount-like column.
for column in ('Value', 'Wage', 'Hits'):
    fifa_clean_val[column] = fifa_clean_val[column].apply(value_to_int)

In [None]:
cols2 = ['W/F', 'SM', 'IR']

# '★' is an ordinary character in a regex; a backslash before it inside a
# normal string literal is an invalid escape sequence (a SyntaxWarning on
# Python 3.12+).
fifa_clean_val[cols2] = fifa_clean_val[cols2].replace({'★': ''}, regex=True)

In [None]:
# Replace 'Joined' with the year component of each value as parsed by pd.DatetimeIndex.
fifa_clean_val['Joined'] = pd.DatetimeIndex(fifa_clean_val['Joined']).year

In [None]:
# Cast the cleaned string columns to integers.
fifa_clean_val = fifa_clean_val.astype(
    {column: 'int' for column in ('Value', 'Wage', 'W/F', 'SM', 'IR', 'Weight')}
)

In [None]:
# Coerce 'Hits' to a numeric dtype; entries that cannot be parsed become NaN.
fifa_clean_val['Hits'] = pd.to_numeric(fifa_clean_val['Hits'], errors='coerce')

In [None]:
# Final cleaned training frame, copied so later steps cannot alter fifa_clean_val.
fifa = fifa_clean_val.copy(deep=True)
fifa.info()

In [None]:
# Count of missing values per column after cleaning.
fifa.isna().sum()

## Split, encoding and normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

In [None]:
# Features are every column except the target 'OVA'.
X = fifa.drop(columns=['OVA'])

y = fifa['OVA']

In [None]:
# Split the features by dtype: text columns get one-hot encoded, numeric ones scaled.
X_cat = X.select_dtypes(include=object)

X_num = X.select_dtypes(include=np.number)

In [None]:
# Fit the min-max scaler on the numeric training features and scale them in one step.
MinMaxtransformer = MinMaxScaler()
X_normalized = MinMaxtransformer.fit_transform(X_num)
print(X_normalized.shape)

# Back to a DataFrame so the column names survive (the index becomes 0..n-1).
X_normalized = pd.DataFrame(data=X_normalized, columns=X_num.columns)
X_normalized.head()

In [None]:
# One-hot encode the categorical features, dropping the first level of each
# to avoid perfectly collinear dummy columns.
encoder = OneHotEncoder(drop='first').fit(X_cat)
encoded = encoder.transform(X_cat).toarray()
print(encoded)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(encoder, 'get_feature_names_out'):
    cols = encoder.get_feature_names_out(input_features=X_cat.columns)
else:
    cols = encoder.get_feature_names(input_features=X_cat.columns)
print(cols)

onehot_encoded = pd.DataFrame(encoded, columns=cols)
onehot_encoded.head()

In [None]:
# Join the scaled numeric and one-hot columns side by side. reset_index()
# returns a new frame, so its result must be assigned to take effect.
X = pd.concat([X_normalized, onehot_encoded], axis=1).reset_index(drop=True)
X.head()

## Linear regression & predictions

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [None]:
# Ordinary least-squares model on all features.
lm = linear_model.LinearRegression()

lm.fit(X=X_train, y=y_train)

In [None]:
# Predictions on the held-out rows and on the rows the model was fitted on.
predictions_test = lm.predict(X_test)
predictions = lm.predict(X_train)

In [None]:
# Train/test metrics for the first model, rounded to three decimals.
print('Results for lm')
print('  R2 SCORE: Train', round(r2_score(y_true=y_train, y_pred=predictions), 3),
      '     | Test', round(r2_score(y_true=y_test, y_pred=predictions_test), 3))
print(' MSE SCORE: Train', round(mean_squared_error(y_true=y_train, y_pred=predictions), 3),
      ' | Test', round(mean_squared_error(y_true=y_test, y_pred=predictions_test), 3))
print('RMSE SCORE: Train', round(np.sqrt(mean_squared_error(y_true=y_train, y_pred=predictions)), 3),
      '   | Test', round(np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions_test)), 3))
print(' MAE SCORE: Train', round(mean_absolute_error(y_true=y_train, y_pred=predictions), 3),
      '    | Test', round(mean_absolute_error(y_true=y_test, y_pred=predictions_test), 3))

## Improvements on the model?

In [None]:
# One fitted coefficient per feature column, labelled by feature name.
coefficients = pd.DataFrame(data=lm.coef_, index=X.columns, columns=['Coefficients'])
coefficients

In [None]:
# lm was fitted on the min-max-scaled columns of X, so the line for 'Value'
# must use the coefficient belonging to 'Value' (not coef_[0], which is the
# first feature) together with the scaled 'Value' values.
# NOTE: this is only the partial effect of 'Value'; all other features are
# effectively held at zero.
value_coef = lm.coef_[X.columns.get_loc('Value')]
regression_line = lm.intercept_ + value_coef * X['Value'].to_numpy()
plt.plot(fifa['Value'], regression_line, c='orange')

sns.scatterplot(x='Value', y=fifa['OVA'], data=fifa)
plt.show()

In [None]:
# One distribution plot (histogram plus KDE) per numeric feature.
for column in X_num.columns:
    sns.displot(X_num[column], kde=True)

   

In [None]:
# Frequency of each distinct 'Hits' value.
X_num['Hits'].value_counts()

In [None]:
# Scatter of 'Hits' against the target y.
sns.scatterplot(x=X_num['Hits'], y=y, data=X_num)

In [None]:
# Annotated correlation heatmap of the numeric features.
sns.set(rc={'figure.figsize': (25, 10)})
correlations_matrix = X_num.corr()
sns.heatmap(data=correlations_matrix, annot=True)
plt.show()

In [None]:
 # Based on the distributions and the values, we drop Hits.

In [None]:
# Scaled numeric features without the 'Hits' column.
X2_normalized = X_normalized.drop(columns=['Hits'])

In [None]:
# Join the scaled numeric (minus 'Hits') and one-hot columns. reset_index()
# returns a new frame, so its result must be assigned to take effect.
X2 = pd.concat([X2_normalized, onehot_encoded], axis=1).reset_index(drop=True)
X2.head()

#### Testing the model again without the Hits column

In [None]:
# Same test_size and random_state as the first split; note this rebinds y_train and y_test.
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42) 

In [None]:
# Second least-squares model, fitted on the features without 'Hits'.
lm2 = linear_model.LinearRegression()

lm2.fit(X=X2_train, y=y_train)

In [None]:
# Predictions of the second model on its test and training rows.
predictions_test2 = lm2.predict(X2_test)
predictions2 = lm2.predict(X2_train)

In [None]:
# Train/test metrics for the model without 'Hits', rounded to three decimals.
print('Results for lm2')
print('  R2 SCORE: Train', round(r2_score(y_true=y_train, y_pred=predictions2), 3),
      '     | Test', round(r2_score(y_true=y_test, y_pred=predictions_test2), 3))
print(' MSE SCORE: Train', round(mean_squared_error(y_true=y_train, y_pred=predictions2), 3),
      ' | Test', round(mean_squared_error(y_true=y_test, y_pred=predictions_test2), 3))
print('RMSE SCORE: Train', round(np.sqrt(mean_squared_error(y_true=y_train, y_pred=predictions2)), 3),
      '   | Test', round(np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions_test2)), 3))
print(' MAE SCORE: Train', round(mean_absolute_error(y_true=y_train, y_pred=predictions2), 3),
      '    | Test', round(mean_absolute_error(y_true=y_test, y_pred=predictions_test2), 3))

In [None]:
# Metrics of the first model again, for side-by-side comparison with lm2.
print('Results for lm')
print('  R2 SCORE: Train', round(r2_score(y_true=y_train, y_pred=predictions), 3),
      '     | Test', round(r2_score(y_true=y_test, y_pred=predictions_test), 3))
print(' MSE SCORE: Train', round(mean_squared_error(y_true=y_train, y_pred=predictions), 3),
      ' | Test', round(mean_squared_error(y_true=y_test, y_pred=predictions_test), 3))
print('RMSE SCORE: Train', round(np.sqrt(mean_squared_error(y_true=y_train, y_pred=predictions)), 3),
      '   | Test', round(np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions_test)), 3))
print(' MAE SCORE: Train', round(mean_absolute_error(y_true=y_train, y_pred=predictions), 3),
      '    | Test', round(mean_absolute_error(y_true=y_test, y_pred=predictions_test), 3))

In [None]:
# The model errors improved slightly when we removed 'Hits' though R2 dropped by one-thousandth (marginal, imperceptible difference)

## Validation data

In [None]:
# Open Colab's upload prompt again; this rebinds `uploaded` for the validation file.
from google.colab import files
uploaded = files.upload()

In [None]:
import io
# Wrap the uploaded 'fifa21_validate.csv' payload in a file-like buffer for read_csv.
fifa_validate_raw = pd.read_csv(io.BytesIO(uploaded['fifa21_validate.csv']))

In [None]:
# Cleaning steps

In [None]:
# A bare `.shape` expression in the middle of a cell is never displayed,
# so print it explicitly.
print(fifa_validate_raw.shape)
fifa_validate_raw.isna().sum()

In [None]:
# Same column selection as for training, but without 'Hits'.
fifa_validate_clean = fifa_validate_raw.loc[:, [
    'Height', 'Weight', 'foot', 'Growth', 'Joined', 'Value', 'Wage',
    'Attacking', 'Skill', 'Movement', 'Reactions', 'Power', 'Jumping',
    'Strength', 'Mentality', 'Composure', 'Goalkeeping', 'Total Stats',
    'Base Stats', 'W/F', 'SM', 'A/W', 'D/W', 'IR',
    'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY',
    'OVA',
]]

In [None]:
# Shapes of the training and validation selections, one per line.
print(fifa_new_columns.shape, fifa_validate_clean.shape, sep='\n')

In [None]:
# Drop the validation rows with missing values, then confirm none remain.
fifa_validate_NaNs_removed = fifa_validate_clean.dropna(axis='index')
fifa_validate_NaNs_removed.isna().sum()

In [None]:
# Continue cleaning on an independent copy of the NaN-free validation frame.
fifa_validate_clean = fifa_validate_NaNs_removed.copy(deep=True)

In [None]:
# Remove the 'lbs' unit from the weight strings.
fifa_validate_clean['Weight'] = fifa_validate_clean['Weight'].replace({r'lbs': ''}, regex=True)

In [None]:
def parse_ht(ht):
    """Convert a feet'inches" height string (e.g. 5'10") to total inches."""
    pieces = ht.split("'")
    feet = int(pieces[0])
    # The inches part may carry a trailing double quote; drop it before parsing.
    inches = int(pieces[1].replace('"', ''))
    return feet * 12 + inches

# Convert every validation height string to inches.
fifa_validate_clean['Height'] = fifa_validate_clean['Height'].map(parse_ht)

In [None]:
cols = ['Value', 'Wage']
# '€' has no special meaning in a regex, so it needs no backslash. Writing it
# as a backslash escape inside a normal string literal is an invalid escape
# sequence (a SyntaxWarning on Python 3.12+).
fifa_validate_clean[cols] = fifa_validate_clean[cols].replace({'€': ''}, regex=True)

In [None]:
def value_to_int(x):
    """Convert an amount string such as '500', '1.6K' or '2.5M' to an int.

    A 'K' suffix multiplies by 1,000 and an 'M' suffix by 1,000,000; a plain
    numeric string is converted directly.

    Non-string inputs (already numeric values, NaN) and strings that cannot be
    parsed are returned unchanged, so a later ``pd.to_numeric(...,
    errors='coerce')`` or ``astype`` call can deal with them.
    """
    if not isinstance(x, str):
        return x

    text = x.strip()
    # 'K' is checked before 'M', matching the original precedence.
    for suffix, factor in (('K', 1000), ('M', 1000000)):
        if suffix in text:
            # round() guards against float artefacts such as 569.9999999.
            return int(round(float(text.replace(suffix, '')) * factor))

    try:
        return int(round(float(text)))
    except ValueError:
        return x

In [None]:
# Expand the K/M suffixes in the monetary columns.
for column in ('Value', 'Wage'):
    fifa_validate_clean[column] = fifa_validate_clean[column].apply(value_to_int)

In [None]:
cols2 = ['W/F', 'SM', 'IR']

# '★' is an ordinary character in a regex; a backslash before it inside a
# normal string literal is an invalid escape sequence (a SyntaxWarning on
# Python 3.12+).
fifa_validate_clean[cols2] = fifa_validate_clean[cols2].replace({'★': ''}, regex=True)

In [None]:
# Replace 'Joined' with the year component of each value as parsed by pd.DatetimeIndex.
fifa_validate_clean['Joined'] = pd.DatetimeIndex(fifa_validate_clean['Joined']).year

In [None]:
# Cast the cleaned string columns to integers.
fifa_validate_clean = fifa_validate_clean.astype(
    {column: 'int' for column in ('Value', 'Wage', 'W/F', 'SM', 'IR', 'Weight')}
)

In [None]:
# Final cleaned validation frame, copied so later steps cannot alter fifa_validate_clean.
fifa_v = fifa_validate_clean.copy(deep=True)
fifa_v.info()

In [None]:
# Count of missing values per column in the cleaned validation frame.
fifa_v.isna().sum()

In [None]:
# Validation features are every column except the target 'OVA'.
XV = fifa_v.drop(columns=['OVA'])

yv = fifa_v['OVA']

In [None]:
# Split the validation features by dtype, as was done for training.
XV_cat = XV.select_dtypes(include=object)

XV_num = XV.select_dtypes(include=np.number)

In [None]:
# Scale the validation features with the min/max learned from the TRAINING
# data, restricted to the columns present here (min-max scaling is
# per-column, so this matches what lm2 was fitted on). Re-fitting the scaler
# on the validation set would put every feature on a different scale from the
# one the model expects. Scaled values may fall outside [0, 1]; that is expected.
MinMaxtransformer = MinMaxScaler().fit(X_num[XV_num.columns])
XV_normalized = MinMaxtransformer.transform(XV_num)
print(XV_normalized.shape)

XV_normalized = pd.DataFrame(XV_normalized, columns=XV_num.columns)
XV_normalized.head()

In [None]:
# Reuse the encoder fitted on the training categoricals so the validation
# dummy columns match the ones lm2 was trained on. Re-fitting on validation
# data can yield different categories, and therefore different or misaligned
# columns. An unseen category raises here rather than passing silently.
encoded = encoder.transform(XV_cat).toarray()
print(encoded)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(encoder, 'get_feature_names_out'):
    cols = encoder.get_feature_names_out(input_features=XV_cat.columns)
else:
    cols = encoder.get_feature_names(input_features=XV_cat.columns)
print(cols)

onehot_encoded1 = pd.DataFrame(encoded, columns=cols)
onehot_encoded1.head()

In [None]:
# Join the scaled numeric and one-hot validation columns. reset_index()
# returns a new frame, so its result must be assigned to take effect.
XV = pd.concat([XV_normalized, onehot_encoded1], axis=1).reset_index(drop=True)
XV.head()

In [None]:
# Rerun the test with cleaned validation data

In [None]:
# XV_train, XV_test, yv_train, yv_test = train_test_split(XV, yv, test_size=0.2, random_state=42) 

In [None]:
# lm_validation = linear_model.LinearRegression()

# lm_validation.fit(XV_train,yv_train)

In [None]:
# predictionsV = lm_validation.predict(XV_train)
# Score the validation features with lm2, the model fitted without 'Hits'.
predictions_testV = lm2.predict(XV)

In [None]:
# Metrics of lm2 on the validation set, rounded to three decimals.
print('Results for lm_V')
print('  R2 SCORE: Test', round(r2_score(y_true=yv, y_pred=predictions_testV), 3))
print(' MSE SCORE: Test', round(mean_squared_error(y_true=yv, y_pred=predictions_testV), 3))
print('RMSE SCORE: Test', round(np.sqrt(mean_squared_error(y_true=yv, y_pred=predictions_testV)), 3))
print(' MAE SCORE: Test', round(mean_absolute_error(y_true=yv, y_pred=predictions_testV), 3))