<h1>FIFA 23 ML Project  <img src="https://img.icons8.com/color/48/null/football-team.png"/></h1>

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("players_fifa23.csv")
data.head()

Unnamed: 0,ID,Name,FullName,Age,Height,Weight,PhotoUrl,Nationality,Overall,Potential,...,LMRating,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating
0,158023,L. Messi,Lionel Messi,35,169,67,https://cdn.sofifa.net/players/158/023/23_60.png,Argentina,91,91,...,91,88,91,67,66,67,62,53,62,22
1,165153,K. Benzema,Karim Benzema,34,185,81,https://cdn.sofifa.net/players/165/153/23_60.png,France,91,91,...,89,84,89,67,67,67,63,58,63,21
2,188545,R. Lewandowski,Robert Lewandowski,33,185,81,https://cdn.sofifa.net/players/188/545/23_60.png,Poland,91,91,...,86,83,86,67,69,67,64,63,64,22
3,192985,K. De Bruyne,Kevin De Bruyne,31,181,70,https://cdn.sofifa.net/players/192/985/23_60.png,Belgium,91,91,...,91,91,91,82,82,82,78,72,78,24
4,231747,K. Mbappé,Kylian Mbappé,23,182,73,https://cdn.sofifa.net/players/231/747/23_60.png,France,91,95,...,92,84,92,70,66,70,66,57,66,21


In [4]:
data.shape

(18539, 90)

In [5]:
needed_columns = ['Name', 'Age', 'Height', 'Weight', "BestPosition",
       'Overall', 'Potential', 'Growth', 'TotalStats',
       'BaseStats', 'ValueEUR', 'WageEUR',
       'ReleaseClause', 'ContractUntil', 'OnLoad',
       'PreferredFoot', 'IntReputation', 'WeakFoot',
       'SkillMoves', 'AttackingWorkRate', 'DefensiveWorkRate', 'PaceTotal',
       'ShootingTotal', 'PassingTotal', 'DribblingTotal', 'DefendingTotal',
       'PhysicalityTotal', 'Crossing', 'Finishing', 'HeadingAccuracy',
       'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
       'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
       'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength',
       'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
       'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
       'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes']


data = data[needed_columns]

# Data Preprocessing:

<h3>1. Handle the missing values:</h3>

In [6]:
for i in data.columns:
    if data[i].isnull().sum() > 0:
        print(i, " ", data[i].isnull().sum())

ContractUntil   92


<h4>ContractUntil Column:</h4>

In [7]:
print("The percentage of the null values is: ", (data["ContractUntil"].isnull().sum()/data.shape[0])*100, "%")

The percentage of the null values is:  0.49625114623226707 %


As the percentage of nulls is small so we can drop the values of nulls in that column.

In [8]:
data.drop(data[data["ContractUntil"].isnull()].index, axis = 0, inplace =True)

In [9]:
data["ContractUntil"].isnull().sum()

0

<h3>2. Handle The Categorical Columns:</h3>

In [10]:
for i in data.columns:
    if data[i].dtype == 'object':
        print(i)

Name
BestPosition
PreferredFoot
AttackingWorkRate
DefensiveWorkRate


In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

data["PreferredFoot"] = le.fit_transform(data["PreferredFoot"])
data["AttackingWorkRate"] = le.fit_transform(data["AttackingWorkRate"])
data["DefensiveWorkRate"] = le.fit_transform(data["DefensiveWorkRate"])

<h3>Merge Some Players Positions to reduce the number of classes:</h3>

In [12]:
merge_pos = {'LWB' : 'LW', 'RWB': 'RW', 'ST': 'CF', 'CAM': 'CM', 'CDM': 'CM'}

data = data.replace({'BestPosition': merge_pos})

In [13]:
mapping = {'CF': 0, 'CM': 1, 'RW': 2, 'GK': 3, 'CB': 4, 'LW': 5, 'LM': 6, 'LB': 7,'RM': 8, 'RB': 9}

data = data.replace({'BestPosition': mapping})

<h2>Split the Data to Train and Test sets:</h2>

In [14]:
X = data.drop(["BestPosition"], axis = 1)
Y = pd.DataFrame(data["BestPosition"])

top = data.sort_values(by=["Overall"], ascending=False).head(20)

In [15]:
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [16]:
X_Train = X_Train.drop(["Name"], axis = 1)
test_names = X_Test["Name"]
X_Test = X_Test.drop(["Name"], axis = 1)

top_pos = top["BestPosition"]
top_names = top["Name"]
top = top.drop(["Name", "BestPosition"], axis = 1)

<h3>3. Handle the Imbalanced Data:</h3>

In [17]:
from imblearn.over_sampling import SMOTE 
from imblearn.over_sampling  import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
X_Train, Y_Train = oversample.fit_resample(X_Train, Y_Train)
X_Train, Y_Train = oversample.fit_resample(X_Train, Y_Train)
X_Train, Y_Train = oversample.fit_resample(X_Train, Y_Train)
X_Train, Y_Train = oversample.fit_resample(X_Train, Y_Train)

In [18]:
print(f' X_shape: {X_Train.shape} \n y_shape: {Y_Train.shape}')

 X_shape: (27732, 59) 
 y_shape: (27732, 1)


<h3>4. Feature Scaling:</h3>

In [19]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()

X_Train = mms.fit_transform(X_Train)
X_Test = mms.fit_transform(X_Test)

top = mms.fit_transform(top)

# Modeling

## A. Predict the Position of the Player Using 8 Classification Algorithms:

<h3>Light GBM:</h3>

In [21]:
!pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/ca/b4/57f3f253721e0a16ea28c49acca92c5b1198eb94fbbb8328d6dabc61d2e0/lightgbm-4.4.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.4.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.4.0-py3-none-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.4 MB 1.7 MB/s eta 0:00:01
   --- ------------------------------------ 0.1/1.4 MB 1.7 MB/s eta 0:00:01
   --- ------------------------------------ 0.1/1.4 MB 1.7 MB/s eta 0:00:01
   --- ------------------------------------ 0.1/1.4 MB 722.1 kB/s eta 0:00:02
   ----- ---------------------------------- 0.2/1.4 MB 958.4 kB/s eta 0:00:02
   ------- -------------------------------- 0.3/1.4 MB 1.1 MB/s eta 0:00:02
   ---------- ----------------------------- 0.4/1.4 MB 1.3 MB/s eta 0:00:01
   -------------- ----------------------

In [22]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(max_depth = 15)

lgbm.fit(X_Train, Y_Train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4439
[LightGBM] [Info] Number of data points in the train set: 27732, number of used features: 59
[LightGBM] [Info] Start training from score -2.579222
[LightGBM] [Info] Start training from score -1.977115
[LightGBM] [Info] Start training from score -1.977115
[LightGBM] [Info] Start training from score -2.821206
[LightGBM] [Info] Start training from score -2.247926
[LightGBM] [Info] Start training from score -1.977115
[LightGBM] [Info] Start training from score -1.977115
[LightGBM] [Info] Start training from score -1.977115
[LightGBM] [Info] Start training from score -3.206583
[LightGBM] [Info] Start training from score -3.642792


In [46]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_Train, Y_Train)
knn.score(X_Train, Y_Train)

0.9309461993365066

In [56]:
import pickle

pickle.dump(knn, open("model20.pkl", "wb"))

In [57]:
top = pd.DataFrame(top)

# Iterate through each row in the 'top' DataFrame
for i in range(top.shape[0]):
    # Get the distances and indices of the three nearest neighbors
    distances, indices = knn.kneighbors(top.iloc[[i]], n_neighbors=3)
    
    # Get the predicted positions from the indices
    predicted_positions = [knn._y[idx] for idx in indices[0]]
    
    # Map the predicted positions to their names
    pred_names = [list(mapping.keys())[list(mapping.values()).index(pos)] for pos in predicted_positions]
    
    # Get the true position name
    true_pos = list(mapping.keys())[list(mapping.values()).index(top_pos.iloc[i])]
    
    # Print the results
    print(f'Name: {top_names.iloc[i]}\nPredicted Positions: {pred_names}\nTrue Best Pos: {true_pos}\n')

Name: L. Messi
Predicted Positions: ['CM', 'CM', 'CM']
True Best Pos: CM

Name: R. Lewandowski
Predicted Positions: ['CF', 'CF', 'CF']
True Best Pos: CF

Name: K. De Bruyne
Predicted Positions: ['CM', 'CM', 'CM']
True Best Pos: CM

Name: K. Mbappé
Predicted Positions: ['CF', 'LW', 'LW']
True Best Pos: CF

Name: K. Benzema
Predicted Positions: ['CF', 'CM', 'CF']
True Best Pos: CF

Name: M. Salah
Predicted Positions: ['RW', 'RW', 'RW']
True Best Pos: RW

Name: T. Courtois
Predicted Positions: ['GK', 'GK', 'GK']
True Best Pos: GK

Name: M. Neuer
Predicted Positions: ['GK', 'GK', 'GK']
True Best Pos: GK

Name: Cristiano Ronaldo
Predicted Positions: ['CF', 'CF', 'CF']
True Best Pos: CF

Name: V. van Dijk
Predicted Positions: ['CB', 'CB', 'CB']
True Best Pos: CB

Name: S. Mané
Predicted Positions: ['LW', 'LW', 'LW']
True Best Pos: LM

Name: N. Kanté
Predicted Positions: ['CM', 'RB', 'CM']
True Best Pos: CM

Name: J. Kimmich
Predicted Positions: ['CM', 'CM', 'CM']
True Best Pos: CM

Name: Ede

In [58]:
model = pickle.load(open("model20.pkl", "rb"))

In [59]:
import pandas as pd

# Assuming 'top', 'top_names', 'top_pos', 'model', and 'mapping' are already defined

# Create DataFrame for 'top'
top = pd.DataFrame(top)

for i in range(top.shape[0]):
    # Get the distances and indices of the three nearest neighbors
    distances, indices = model.kneighbors(top.iloc[[i]], n_neighbors=3)
    
    # Get the predicted positions from the indices
    predicted_positions = [model._y[idx] for idx in indices[0]]
    
    # Map the predicted positions to their names
    pred_names = [list(mapping.keys())[list(mapping.values()).index(pos)] for pos in predicted_positions]
    
    # Get the true position name
    true_pos = list(mapping.keys())[list(mapping.values()).index(top_pos.iloc[i])]
    
    # Print the results
    print(f'Name: {top_names.iloc[i]}\nPredicted Positions: {pred_names}\nTrue Best Pos: {true_pos}\n')


Name: L. Messi
Predicted Positions: ['CM', 'CM', 'CM']
True Best Pos: CM

Name: R. Lewandowski
Predicted Positions: ['CF', 'CF', 'CF']
True Best Pos: CF

Name: K. De Bruyne
Predicted Positions: ['CM', 'CM', 'CM']
True Best Pos: CM

Name: K. Mbappé
Predicted Positions: ['CF', 'LW', 'LW']
True Best Pos: CF

Name: K. Benzema
Predicted Positions: ['CF', 'CM', 'CF']
True Best Pos: CF

Name: M. Salah
Predicted Positions: ['RW', 'RW', 'RW']
True Best Pos: RW

Name: T. Courtois
Predicted Positions: ['GK', 'GK', 'GK']
True Best Pos: GK

Name: M. Neuer
Predicted Positions: ['GK', 'GK', 'GK']
True Best Pos: GK

Name: Cristiano Ronaldo
Predicted Positions: ['CF', 'CF', 'CF']
True Best Pos: CF

Name: V. van Dijk
Predicted Positions: ['CB', 'CB', 'CB']
True Best Pos: CB

Name: S. Mané
Predicted Positions: ['LW', 'LW', 'LW']
True Best Pos: LM

Name: N. Kanté
Predicted Positions: ['CM', 'RB', 'CM']
True Best Pos: CM

Name: J. Kimmich
Predicted Positions: ['CM', 'CM', 'CM']
True Best Pos: CM

Name: Ede

In [45]:
top = pd.DataFrame(top)

for i in range(top.shape[0]):
    pred_pos = model.predict(top.iloc[[i]])[0]
    
    pred = list(mapping.keys())[list(mapping.values()).index(pred_pos)]
    true_pos = list(mapping.keys())[list(mapping.values()).index(top_pos.iloc[i])]

    print('Name: {}\nPrediction: {}\n True Best Pos: {}\n'.format(top_names.iloc[i], pred, true_pos))

Name: L. Messi
Prediction: CM
 True Best Pos: CM

Name: R. Lewandowski
Prediction: CF
 True Best Pos: CF

Name: K. De Bruyne
Prediction: CM
 True Best Pos: CM

Name: K. Mbappé
Prediction: LW
 True Best Pos: CF

Name: K. Benzema
Prediction: CF
 True Best Pos: CF

Name: M. Salah
Prediction: RW
 True Best Pos: RW

Name: T. Courtois
Prediction: GK
 True Best Pos: GK

Name: M. Neuer
Prediction: GK
 True Best Pos: GK

Name: Cristiano Ronaldo
Prediction: CF
 True Best Pos: CF

Name: V. van Dijk
Prediction: CB
 True Best Pos: CB

Name: S. Mané
Prediction: LW
 True Best Pos: LM

Name: N. Kanté
Prediction: CM
 True Best Pos: CM

Name: J. Kimmich
Prediction: CM
 True Best Pos: CM

Name: Ederson
Prediction: GK
 True Best Pos: GK

Name: Alisson
Prediction: GK
 True Best Pos: GK

Name: J. Oblak
Prediction: GK
 True Best Pos: GK

Name: Casemiro
Prediction: CB
 True Best Pos: CM

Name: H. Son
Prediction: LM
 True Best Pos: LW

Name: H. Kane
Prediction: CF
 True Best Pos: CF

Name: Neymar Jr
Prediction

In [36]:
import pandas as pd

# Assuming 'top', 'knn', 'mapping', 'top_pos', and 'top_names' are already defined

# Convert 'top' to DataFrame if it's not already one



Name: L. Messi
Predicted Positions: ['CM', 'CM', 'CM']
True Best Pos: CM

Name: R. Lewandowski
Predicted Positions: ['CF', 'CF', 'CF']
True Best Pos: CF

Name: K. De Bruyne
Predicted Positions: ['CM', 'CM', 'CM']
True Best Pos: CM

Name: K. Mbappé
Predicted Positions: ['CF', 'LW', 'LW']
True Best Pos: CF

Name: K. Benzema
Predicted Positions: ['CF', 'CM', 'CF']
True Best Pos: CF

Name: M. Salah
Predicted Positions: ['RW', 'RW', 'RW']
True Best Pos: RW

Name: T. Courtois
Predicted Positions: ['GK', 'GK', 'GK']
True Best Pos: GK

Name: M. Neuer
Predicted Positions: ['GK', 'GK', 'GK']
True Best Pos: GK

Name: Cristiano Ronaldo
Predicted Positions: ['CF', 'CF', 'CF']
True Best Pos: CF

Name: V. van Dijk
Predicted Positions: ['CB', 'CB', 'CB']
True Best Pos: CB

Name: S. Mané
Predicted Positions: ['LW', 'LW', 'LW']
True Best Pos: LM

Name: N. Kanté
Predicted Positions: ['CM', 'RB', 'CM']
True Best Pos: CM

Name: J. Kimmich
Predicted Positions: ['CM', 'CM', 'CM']
True Best Pos: CM

Name: Ede