In [79]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import lime
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from lime import lime_tabular

# **1.** 
At the beginning I remove columns which are irrelevant and change strings to numerical variables. Then I split the data into the training and test sets. Next, I train XGBoost model for predicting value of a player.

In [3]:
df_ = pd.read_csv('Fifa 23 Players Data.csv')
df_.head()

Unnamed: 0,Known As,Full Name,Overall,Potential,Value(in Euro),Positions Played,Best Position,Nationality,Image Link,Age,...,LM Rating,CM Rating,RM Rating,LWB Rating,CDM Rating,RWB Rating,LB Rating,CB Rating,RB Rating,GK Rating
0,L. Messi,Lionel Messi,91,91,54000000,RW,CAM,Argentina,https://cdn.sofifa.net/players/158/023/23_60.png,35,...,91,88,91,67,66,67,62,53,62,22
1,K. Benzema,Karim Benzema,91,91,64000000,"CF,ST",CF,France,https://cdn.sofifa.net/players/165/153/23_60.png,34,...,89,84,89,67,67,67,63,58,63,21
2,R. Lewandowski,Robert Lewandowski,91,91,84000000,ST,ST,Poland,https://cdn.sofifa.net/players/188/545/23_60.png,33,...,86,83,86,67,69,67,64,63,64,22
3,K. De Bruyne,Kevin De Bruyne,91,91,107500000,"CM,CAM",CM,Belgium,https://cdn.sofifa.net/players/192/985/23_60.png,31,...,91,91,91,82,82,82,78,72,78,24
4,K. Mbappé,Kylian Mbappé,91,95,190500000,"ST,LW",ST,France,https://cdn.sofifa.net/players/231/747/23_60.png,23,...,92,84,92,70,66,70,66,57,66,21


In [5]:
non_important_columns = ["Full Name", "Known As", "Image Link", "National Team Image Link"]

df = df_.drop(non_important_columns, axis=1, inplace=False)
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1

            df[column] = list(map(convert_to_int, df[column]))

    return df
df = handle_non_numerical_data(df).dropna()

feature_names = sorted([feature for feature in df.keys() if feature != "Value(in Euro)"])

y = pd.DataFrame(df['Value(in Euro)'])
X = df[feature_names]
data_dmatrix = xgb.DMatrix(data=X,label=y)
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=123)

Training of XGBoost

In [6]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)



Predictions for some players

In [15]:
print(f'Predictions of value for 5-th, 0-th and 10th players are {preds[0]}, {preds[1]}, {preds[1]}')

Predictions of value for 5-th, 0-th and 10th players are 864751.0, 1269038.875, 1269038.875


# **2.** 
Calculaction of the decompositions of chosen predictions

In [None]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=feature_names, class_names=['Value(in Euro)'], verbose=True, mode='regression')
def show_lime(id):
  exp = explainer.explain_instance(X_test[id], xg_reg.predict, num_features=10)
  _ = exp.as_pyplot_figure()
[show_lime(i) for i in range(3)]

# **3.** Stability of explanations: comparison of LIME decompositions


In [None]:
for i in range(3):
  show_lime(i)
  show_lime(i)

# **4.** 
Comparison of the LIME and SHAP explanations


In [None]:
import shap
shap.initjs()
shap_explainer = shap.TreeExplainer(xg_reg, X_train)
def show_shap(id):
  df_id = X_test[id:id+1]
  shap_values = shap_explainer(df_id)[0,:]
  shap.plots.waterfall(shap_values)

for id in range(3):
  show_lime(id)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

shap_explainer = shap.Explainer(xg_reg, X_train)

def show_shap(id):
  df_id = X_test[id:id+1]
  shap_values = shap_explainer(df_id)[0,:]
  shap.plots.waterfall(shap_values)
for id in range(3):
  show_shap(id)

# **5.** 
Comparison of LIME for XGBoost and simple linear regression


Training of linear regression

In [75]:

X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(X, y, test_size=0.2, random_state=123)

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_lin = scaler.transform(X_train_lin)
y_train_lin = y_train_lin.values.reshape(-1,1)
X_test_lin = scaler.transform(X_test)
y_test_lin = y_test.values.reshape(-1,1)
reg = LinearRegression().fit(X_train_lin, y_train_lin.reshape(-1,1))

y_pred = reg.predict(X_test_lin)

Explanations for XGBoost and linear model

In [None]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train_lin, feature_names=feature_names, class_names=['Value(in Euro)'], verbose=True, mode='regression')

def show_lime_linear(id):
  exp = explainer.explain_instance(X_test_lin[id], reg.predict, num_features=10)
  _ = exp.as_pyplot_figure()

for id in range(3):
  show_lime_linear(id)