In [71]:
# Importing relevant libraries

import pickle
import warnings
import numpy as np
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
warnings.simplefilter(action='ignore', category=Warning) # Suppress warnings
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score




In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### Data Preparation & Feature Extraction

In [3]:
# To display all the columns of the dataset:
pd.set_option('display.max_columns', None)
data_male_players= pd.read_csv('/content/drive/My Drive/Veronica/Assignment 2/New/male_players.csv')
data_players_22= pd.read_csv('/content/drive/My Drive/Veronica/Assignment 2/New/players_22.csv')


In [4]:
male_players = pd.DataFrame(data_male_players)
players_22 = pd.DataFrame(data_players_22)

In [None]:
male_players.head()

In [None]:
players_22.head()

In [None]:
male_players.describe()

In [None]:
# gives a description of the dataset player_21
players_22.describe()

In [None]:
# getting information about the male_players_legacy dataset

male_players.info()

In [None]:
# getting information about the players_22 dataset
players_22.info()

In [None]:
male_players.shape

(161583, 110)

In [None]:
players_22.shape


(19239, 110)

In [None]:
male_players.columns

Index(['player_id', 'player_url', 'fifa_version', 'fifa_update',
       'fifa_update_date', 'short_name', 'long_name', 'player_positions',
       'overall_rating', 'potential',
       ...
       'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
       'player_face_url'],
      dtype='object', length=101)

In [None]:
# viewing the columns of the dataset player_22
players_22.columns

Index(['sofifa_id', 'player_url', 'short_name', 'long_name',
       'player_positions', 'overall', 'potential', 'value_eur', 'wage_eur',
       'age',
       ...
       'lcb', 'cb', 'rcb', 'rb', 'gk', 'player_face_url', 'club_logo_url',
       'club_flag_url', 'nation_logo_url', 'nation_flag_url'],
      dtype='object', length=110)

In [None]:
players_22.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
male_players.hist(bins=50, figsize=(20,15))
plt.show()

Rename the overall column to overall rating

In [5]:
# Rename the column: overall to overall_rating

male_players.rename(columns={'overall':'overall_rating', },inplace=True)
players_22.rename(columns={'overall':'overall_rating', },inplace=True)

Dropping some redundant columns





In [6]:
# Removing columns with more than 30% missing values and also selecting columns without any NaN values
male_players = male_players.loc[:, (male_players.isnull().mean() <= 0.3) | (male_players.isnull().sum() == 0)]
players_22 = players_22.loc[:, (players_22.isnull().mean() <= 0.3) | (players_22.isnull().sum() == 0)]

In [7]:
# Drop the following columns in male_players:

male_players.drop(['mentality_composure'], axis=1, inplace=True)
male_players.drop(['player_id'], axis=1, inplace=True)
male_players.drop(['nationality_id'], axis=1, inplace=True)
male_players.drop(['club_team_id'], axis=1, inplace=True)
male_players.drop(['league_id'], axis=1, inplace=True)

In [8]:
# Drop the following columns in players_22:
players_22.drop(['sofifa_id'], axis=1, inplace=True)

In [9]:
male_players.shape

(161583, 97)

In [10]:
players_22.shape

(19239, 101)

Checking for null values

In [None]:
# for col in male_players:
male_players.isnull().sum()

player_id           0
player_url          0
fifa_version        0
fifa_update         0
fifa_update_date    0
                   ..
cb                  0
rcb                 0
rb                  0
gk                  0
player_face_url     0
Length: 98, dtype: int64

In [None]:
# for col in players_22:
players_22.isnull().sum()

sofifa_id            0
player_url           0
short_name           0
long_name            0
player_positions     0
                    ..
gk                   0
player_face_url      0
club_logo_url       61
club_flag_url       61
nation_flag_url      0
Length: 102, dtype: int64

In [11]:
# Print the columns names containing null values or missing values in the form of a list

# For male_players
missing_values_for_male_players = male_players.isnull().sum()
columns_with_missing_values_for_male_players = missing_values_for_male_players[missing_values_for_male_players > 0].index.tolist()

# For players_22
missing_values_for_players_22 = players_22.isnull().sum()
columns_with_missing_values_for_players_22 = missing_values_for_players_22[missing_values_for_players_22 > 0].index.tolist()

print(columns_with_missing_values_for_male_players,"\n")
print(columns_with_missing_values_for_players_22)

['value_eur', 'wage_eur', 'league_name', 'league_level', 'club_name', 'club_position', 'club_jersey_number', 'club_joined_date', 'club_contract_valid_until_year', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic'] 

['value_eur', 'wage_eur', 'club_team_id', 'club_name', 'league_name', 'league_level', 'club_position', 'club_jersey_number', 'club_joined', 'club_contract_valid_until', 'release_clause_eur', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'club_logo_url', 'club_flag_url']


Extracting Non-Categorical Features

In [12]:
# Keep the following categorical features for male_players
preferred_foot_for_male_players = male_players['preferred_foot']
work_rate_for_male_players = male_players['work_rate']

# Keep the following categorical features for players_22
preferred_foot_for_players_22 = players_22['preferred_foot']
work_rate_for_players_22 = players_22['work_rate']

In [13]:
# Extract numerical features

num_cols_for_male_players = male_players.select_dtypes(include=['number']).columns
num_cols_for_players_22 = players_22.select_dtypes(include=['number']).columns
print(num_cols_for_male_players, "\n")
print(num_cols_for_players_22)

Index(['fifa_version', 'fifa_update', 'overall_rating', 'potential',
       'value_eur', 'wage_eur', 'age', 'height_cm', 'weight_kg',
       'league_level', 'club_jersey_number', 'club_contract_valid_until_year',
       'weak_foot', 'skill_moves', 'international_reputation', 'pace',
       'shooting', 'passing', 'dribbling', 'defending', 'physic',
       'attacking_crossing', 'attacking_finishing',
       'attacking_heading_accuracy', 'attacking_short_passing',
       'attacking_volleys', 'skill_dribbling', 'skill_curve',
       'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
       'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'defending_marking_awareness', 'def

In [14]:
male_players = male_players[num_cols_for_male_players].head()
male_players

Unnamed: 0,fifa_version,fifa_update,overall_rating,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,club_jersey_number,club_contract_valid_until_year,weak_foot,skill_moves,international_reputation,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,15,2,93,95,100500000.0,550000.0,27,169,67,1.0,10.0,2018.0,3,4,5,93.0,89.0,86.0,96.0,27.0,63.0,84,94,71,89,85,96,89,90,76,96,96,90,94,94,95,80,73,77,60,88,48,22,92,90,76,25,21,20,6,11,15,14,8
1,15,2,92,92,79000000.0,375000.0,29,185,80,1.0,7.0,2018.0,4,5,5,93.0,93.0,81.0,91.0,32.0,79.0,83,95,86,82,87,93,88,79,72,92,91,94,93,90,63,94,94,89,79,93,63,24,91,81,85,22,31,23,7,11,15,14,11
2,15,2,90,90,54500000.0,275000.0,30,180,80,1.0,10.0,2017.0,2,4,5,93.0,86.0,83.0,92.0,32.0,64.0,80,85,50,86,86,93,85,83,76,90,93,93,93,89,91,86,61,78,65,90,47,39,89,84,80,29,26,26,10,8,11,5,15
3,15,2,90,90,52500000.0,275000.0,32,195,95,1.0,10.0,2016.0,4,4,5,76.0,91.0,81.0,86.0,34.0,86.0,76,91,76,84,92,88,80,80,76,90,74,77,86,85,41,93,72,78,93,88,84,20,86,83,91,25,41,27,13,15,10,9,12
4,15,2,90,90,63500000.0,300000.0,28,193,92,1.0,1.0,2019.0,4,1,5,,,,,,,25,25,25,42,25,25,25,25,41,31,58,61,43,89,35,42,78,44,83,25,29,30,25,20,37,25,25,25,87,85,92,90,86


In [15]:
players_22 = players_22[num_cols_for_players_22].head()
players_22

Unnamed: 0,overall_rating,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,club_jersey_number,club_contract_valid_until,nationality_id,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,93,93,78000000.0,320000.0,34,170,72,73.0,1.0,30.0,2023.0,52,4,4,5,144300000.0,85.0,92.0,91.0,95.0,34.0,65.0,85,95,70,91,88,96,93,94,91,96,91,80,91,94,95,86,68,72,69,94,44,40,93,95,75,96,20,35,24,6,11,15,14,8
1,92,92,119500000.0,270000.0,32,185,81,21.0,1.0,9.0,2023.0,37,4,4,5,197200000.0,78.0,92.0,79.0,86.0,44.0,82.0,71,95,90,85,89,85,79,85,70,88,77,79,77,93,82,90,85,76,86,87,81,49,95,81,90,88,35,42,19,15,6,12,8,10
2,91,91,45000000.0,270000.0,36,187,83,11.0,1.0,7.0,2023.0,38,4,5,5,83300000.0,87.0,94.0,80.0,88.0,34.0,75.0,87,95,90,80,86,88,81,84,77,88,85,88,86,94,74,94,95,77,77,93,63,29,95,76,88,95,24,32,24,7,11,15,14,11
3,91,91,129000000.0,270000.0,29,175,68,73.0,1.0,10.0,2025.0,54,5,5,5,238700000.0,91.0,83.0,86.0,94.0,37.0,63.0,85,83,63,86,86,95,88,87,81,95,93,89,96,89,84,80,64,81,53,81,63,37,86,90,93,93,35,32,29,9,9,15,15,11
4,91,91,125500000.0,350000.0,30,181,70,10.0,1.0,17.0,2025.0,7,5,4,4,232200000.0,76.0,86.0,93.0,88.0,64.0,78.0,94,82,55,94,82,88,85,83,93,91,76,76,79,91,78,91,63,89,74,91,76,66,88,94,83,89,68,65,53,15,13,5,10,13


In [16]:
# Print the columns names containing null values or missing values in the form of a list

# For male_players
missing_values_for_male_players = male_players.isnull().sum()
columns_with_missing_values_for_male_players = missing_values_for_male_players[missing_values_for_male_players > 0].index.tolist()

print(columns_with_missing_values_for_male_players,"\n")

['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic'] 



In [17]:
# Print the columns names containing null values or missing values in the form of a list

# For male_players
missing_values_for_players_22 = players_22.isnull().sum()
columns_with_missing_values_for_players_22 = missing_values_for_players_22[missing_values_for_players_22 > 0].index.tolist()

print(columns_with_missing_values_for_players_22)

[]


In [18]:
# Fill NaN values with the mean of the column


male_players = male_players.fillna(male_players.mean())

In [19]:
# Again, Print the columns names containing null values or missing values in the form of a list

# For male_players
missing_values_for_male_players = male_players.isnull().sum()
columns_with_missing_values_for_male_players = missing_values_for_male_players[missing_values_for_male_players > 0].index.tolist()

print(columns_with_missing_values_for_male_players,"\n")

[] 



In [None]:
male_players.head()

In [None]:
players_22.head()

In [21]:
# Convert categorical features into numeric for male_players
preferred_foot_for_male_players = pd.get_dummies(preferred_foot_for_male_players, prefix='preferred_foot')
work_rate_for_male_players  = pd.get_dummies(work_rate_for_male_players , prefix='work_rate')


# Convert categorical features into numeric for players_22
preferred_foot_for_players_22 = pd.get_dummies(preferred_foot_for_players_22, prefix='preferred_foot')
work_rate_for_players_22 = pd.get_dummies(work_rate_for_players_22, prefix='work_rate')

In [None]:
preferred_foot_for_male_players

In [None]:
work_rate_for_male_players

(161583, 9)

In [None]:
preferred_foot_for_players_22

In [None]:
work_rate_for_players_22

In [None]:
male_players

In [None]:
players_22

In [22]:
# Again, Print the columns names containing null values or missing values in the form of a list

# For male_players
missing_values_for_male_players = male_players.isnull().sum()
columns_with_missing_values_for_male_players = missing_values_for_male_players[missing_values_for_male_players > 0].index.tolist()

print(columns_with_missing_values_for_male_players,"\n")

[] 



In [23]:
# Join both transformed numeric and non-numeric columns to form fully numeric columns
male_players = pd.concat([male_players, preferred_foot_for_male_players, work_rate_for_male_players,], axis=1)

players_22 = pd.concat([players_22, preferred_foot_for_players_22, work_rate_for_players_22], axis=1)


In [None]:
male_players.shape

(161583, 65)

In [None]:
players_22.shape

(19239, 67)

In [None]:
## At this point male_players only has numeric features in the dataset. Lets impute the missing values if any.

# for male_players
imp = SimpleImputer()

# Fit and transform the data
try:
    imputed_data_for_male_players = imp.fit_transform(male_players)
    male_players = pd.DataFrame(imputed_data_for_male_players, columns=male_players.columns)
    print(male_players)
except ValueError as e:
    print("Error during imputation:", e)



In [25]:
## At this point players_21 and players_22 only have numeric features in the dataset. Lets impute the missing values.

# for players_22
imp = SimpleImputer(strategy='most_frequent')
imputed_data_22 = imp.fit_transform(players_22)
players_22 = pd.DataFrame(imputed_data_22, columns=players_22.columns)


In [26]:
# Print the columns names containing null values or missing values in the form of a list

# For male_players
missing_values_for_male_players = male_players.isnull().sum()
columns_with_missing_values_for_male_players = missing_values_for_male_players[missing_values_for_male_players > 0].index.tolist()

print(columns_with_missing_values_for_male_players,"\n")

[] 



In [27]:
# Print the columns names containing null values or missing values in the form of a list

# For players_22
missing_values_for_players_22 = players_22.isnull().sum()
columns_with_missing_values_for_players_22 = missing_values_for_players_22[missing_values_for_players_22 > 0].index.tolist()

print(columns_with_missing_values_for_players_22)

[]


In [None]:
players_22

In [None]:
male_players

Feature Selection

In [None]:
# Calculate the correlation between each feature and the target variable
correlation_matrix = male_players.corr()
target_correlation = correlation_matrix['overall_rating'].drop('overall_rating')

# Select the top-k features with the highest absolute correlation
k = 20
top_k_features = target_correlation[target_correlation > 0.5].abs().nlargest(k).index

# Collect the column names of top_k_features in a list and store it in the 'selected_features' variable
selected_features = top_k_features.tolist()

# Display the selected features and their correlation with the target variable
print(f"\nTop {k} features with maximum correlation with the target variable:\n")
print(male_players[top_k_features])

'''
#The correlation matrix produces the some results for players_22

# Calculate the correlation between each feature and the target variable
correlation_matrix = players_22.corr()
target_correlation = correlation_matrix['overall_rating'].drop('overall_rating')

# Select the top-k features with the highest absolute correlation
k = 20
top_k_features = target_correlation[target_correlation > 0.5].abs().nlargest(k).index

# Collect the column names of top_k_features in a list and store it in the 'selected_features' variable
selected_features = top_k_features.tolist()

# Display the selected features and their correlation with the target variable
print(f"\nTop {k} features with maximum correlation with the target variable:\n")
print(players_22[top_k_features])

print(selected_features)

'''

In [32]:
# List of features that would be trained
selected_features

['potential',
 'value_eur',
 'wage_eur',
 'movement_reactions',
 'dribbling',
 'movement_acceleration',
 'attacking_heading_accuracy',
 'movement_balance',
 'movement_sprint_speed',
 'passing',
 'pace']

In [61]:
# Select the the independent (X) and dependent (Y) variables
X = male_players[selected_features]
y = male_players['overall_rating']

In [36]:
X.head()

Unnamed: 0,potential,value_eur,wage_eur,movement_reactions,dribbling,movement_acceleration,attacking_heading_accuracy,movement_balance,movement_sprint_speed,passing,pace
0,95.0,100500000.0,550000.0,94.0,96.0,96.0,71.0,95.0,90.0,86.0,93.0
1,92.0,79000000.0,375000.0,90.0,91.0,91.0,86.0,63.0,94.0,81.0,93.0
2,90.0,54500000.0,275000.0,89.0,92.0,93.0,50.0,91.0,93.0,83.0,93.0
3,90.0,52500000.0,275000.0,85.0,86.0,74.0,76.0,41.0,77.0,81.0,76.0
4,90.0,63500000.0,300000.0,89.0,91.25,58.0,25.0,35.0,61.0,82.75,88.75


In [37]:
y.head()

0    93.0
1    92.0
2    90.0
3    90.0
4    90.0
Name: overall_rating, dtype: float64

In [38]:
X.shape

(161583, 11)

In [39]:
y.shape

(161583,)

**Feature Scaling and Training**

In [62]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [63]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
## View the dimension of the training data and testing data
x_train.shape, y_test.shape, y_train.shape, x_test.shape

((129266, 11), (32317,), (129266,), (32317, 11))

**Ensemble Models**

In [68]:
# Model 1: Train RandomForest, XGBoost and Gradient Boost Regressors models with cv and grid search:
models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'GradientBoost': GradientBoostingRegressor()
}

params = {
    'RandomForest': {'n_estimators': [4, 5], 'max_depth': [None, 5]},
    'XGBoost': {'n_estimators': [5, 6], 'learning_rate': [0.01, 0.1]},
    'GradientBoost': {'n_estimators': [4, 6], 'learning_rate': [0.01, 0.1]}
}

for name, model in models.items():
    gs = GridSearchCV(model, params[name], cv=5)
    gs.fit(x_train, y_train)
    # Make predictions using the best model from GridSearchCV
    y_pred = gs.predict(x_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Validation score for {name}: {gs.score(x_test, y_test)}")
    print(f"mean_absolute_error for {name}: {mae}")
    print(f"mean_squared_error for {name}: {mse}",)
    print(f"r2_score for {name}: {r2}\n")


Best parameters for RandomForest: {'max_depth': 5, 'n_estimators': 5}
Validation score for RandomForest: 0.5359971284285554
mean_absolute_error for RandomForest: 6.188693257418696e-05
mean_squared_error for RandomForest: 7.178884178605716e-05
r2_score for RandomForest: 0.5359971284285554


Best parameters for XGBoost: {'learning_rate': 0.1, 'n_estimators': 6}
Validation score for XGBoost: 0.3104973362891621
mean_absolute_error for XGBoost: 8.177288792107289e-05
mean_squared_error for XGBoost: 0.00010667735194949285
r2_score for XGBoost: 0.3104973362891621


Best parameters for GradientBoost: {'learning_rate': 0.1, 'n_estimators': 4}
Validation score for GradientBoost: 0.11389817925153545
mean_absolute_error for GradientBoost: 8.726420728174488e-05
mean_squared_error for GradientBoost: 0.00013709446064549634
r2_score for GradientBoost: 0.11389817925153545



In [72]:
'''
Mean Squared Error (MSE) is a common metric for regression problems,
and it measures the average of the squared differences between the
predicted values and the actual values. A lower MSE indicates that
the model's predictions are, on average, closer to the true values.
Lower MSE often corresponds to better model performance.
'''

rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='auto',
    random_state=42
)

# Train the model
rf_model.fit(x_train, y_train)
y_pred = rf_model.predict(x_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mean_absolute_error:",mae)
print("mean_squared_error:",mse)
print("r2_score:",r2)


mean_absolute_error: 6.529071386576722e-05
mean_squared_error: 9.00609586285856e-05
r2_score: 0.41789639753936014


In [73]:
# Model 2: VotingClassifier

decision_tree = DecisionTreeClassifier(random_state=42, criterion='entropy')
knn = KNeighborsClassifier(n_neighbors=8)
svm = SVC(probability=True, random_state=42)

voting_classifier = VotingClassifier(estimators=[
    ('decision_tree', decision_tree),
    ('knn', knn),
    ('svm', svm)
], voting='soft')

for model in (decision_tree, knn, svm,voting_classifier):
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test)
  print(model.__class__.__name__,accuracy_score(y_pred,y_test))

DecisionTreeClassifier 0.9999690565337129
KNeighborsClassifier 0.9999381130674259
SVC 0.9999690565337129
VotingClassifier 0.9999690565337129


In [74]:
# Model 3: RandomForestClassifier

rfc=RandomForestClassifier(n_estimators=20, max_depth=3, criterion='entropy')

# Perform cross-validation
cv_scores = cross_val_score(rfc, x_train, y_train, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
print(f"\nMean cross-validation score: {cv_scores.mean()}")

# Fit the model
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test)
accuracy_score(y_pred,y_test)
print('\nAccuracy of the model:',accuracy_score(y_pred,y_test))

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nmean_absolute_error of the model:",mae)
print("\nmean_squared_error of the model:",mse)
print("\nr2_score of the model:",r2)

# Fine-tune the model (RandomForestClassifier) with GridSearchCV
n_estimators_range = list(range(1, 31))

# Create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_estimators=n_estimators_range)

grid = GridSearchCV(RandomForestClassifier(max_depth=3, criterion='entropy'), param_grid, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)

rfc=RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'], max_depth=3, criterion='entropy')
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test)
accuracy_score(y_pred,y_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Examine the best model
print("\ngrid.best_score:",grid.best_score_)
print("\ngrid.best_params:",grid.best_params_)
print("\ngrid.best_estimator:",grid.best_estimator_)
print("\nAccuracy of the best model:", accuracy_score(y_pred,y_test))
print("\nmean_absolute_error of the best model:",mae)
print("\nmean_squared_error of the best model:",mse)
print("\nr2_score of the best model:",r2)


Cross-validation scores: [0.99996132 1.         1.         0.99996132 0.99996132]

Mean cross-validation score: 0.9999767921609

Accuracy of the model: 0.9999381130674259

mean_absolute_error of the model: 9.283039886128045e-05

mean_squared_error of the model: 0.0001547173314354674

r2_score of the model: -6.188731557665861e-06

grid.best_score: 0.9999845285062273

grid.best_params: {'n_estimators': 2}

grid.best_estimator: RandomForestClassifier(criterion='entropy', max_depth=3, n_estimators=2)

Accuracy of the best model: 0.9999690565337129

mean_absolute_error of the best model: 3.094346628709348e-05

mean_squared_error of the best model: 3.094346628709348e-05

r2_score of the best model: 0.7999987622536885


In [75]:
# correlation between variables in selected_features and the target variable; overall_rating
for name, score in zip(x_train.columns, rfc.feature_importances_):
  print(name, score)

potential 0.0
value_eur 0.17292422931890836
wage_eur 0.0
movement_reactions 0.3506130000786736
dribbling 0.0
movement_acceleration 0.0
attacking_heading_accuracy 0.0
movement_balance 0.0
movement_sprint_speed 0.0
passing 0.0
pace 0.4764627706024181


**Testing the models using players_22**

In [76]:
x_test_22 = players_22[selected_features]
y_test_22 = players_22['overall_rating']

scaler = StandardScaler()
x_test_22_scaled = scaler.fit_transform(x_test_22)
x_test_22 = pd.DataFrame(x_test_22_scaled, columns=x_test_22.columns)

In [77]:
# Model 1: Testing players_22 on RandomForest, XGBoost, Gradient Boost Regressors
models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'GradientBoost': GradientBoostingRegressor()
}

params = {
    'RandomForest': {'n_estimators': [4, 5], 'max_depth': [None, 5]},
    'XGBoost': {'n_estimators': [5, 6], 'learning_rate': [0.01, 0.1]},
    'GradientBoost': {'n_estimators': [4, 6], 'learning_rate': [0.01, 0.1]}
}

for name, m in models.items():
    gs = GridSearchCV(m, params[name], cv=5)
    gs.fit(x_train, y_train)
    # Make predictions using the best model from GridSearchCV
    y_pred = gs.predict(x_test_22)

    # Evaluate the model
    mae = mean_absolute_error(y_test_22, y_pred)
    mse = mean_squared_error(y_test_22, y_pred)
    r2 = r2_score(y_test_22, y_pred)

    #print('\nAccuracy: {}'.format(nb_model.score(x_test, y_test)))
    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Validation score for {name}: {gs.score(x_test_22, y_test_22)}")
    #print()
    print(f"mean_absolute_error for {name}: {mae}")
    print(f"mean_squared_error for {name}: {mse}",)
    print(f"r2_score for {name}: {r2}\n")


Best parameters for RandomForest: {'max_depth': 5, 'n_estimators': 5}
Validation score for RandomForest: 0.23992888777992905
mean_absolute_error for RandomForest: 0.0001559332605644784
mean_squared_error for RandomForest: 0.00019751546338167203
r2_score for RandomForest: 0.23992888777992905


Best parameters for XGBoost: {'learning_rate': 0.1, 'n_estimators': 6}
Validation score for XGBoost: -441.85074790056836
mean_absolute_error for XGBoost: 0.3390747840429667
mean_squared_error for XGBoost: 0.11508116710949906
r2_score for XGBoost: -441.85074790056836


Best parameters for GradientBoost: {'learning_rate': 0.1, 'n_estimators': 4}
Validation score for GradientBoost: -0.0027240757450004427
mean_absolute_error for GradientBoost: 0.00016942818195456107
mean_squared_error for GradientBoost: 0.00026057234287755457
r2_score for GradientBoost: -0.0027240757450004427



In [78]:
# Model 2: Testing players_22 on VotingClassifier

decision_tree = DecisionTreeClassifier(random_state=42, criterion='entropy')
knn = KNeighborsClassifier(n_neighbors=8)
svm = SVC(probability=True, random_state=42)

voting_classifier = VotingClassifier(estimators=[
    ('decision_tree', decision_tree),
    ('knn', knn),
    ('svm', svm)
], voting='soft')

for model in (decision_tree, knn, svm,voting_classifier):
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test_22)
  print(model.__class__.__name__,accuracy_score(y_pred,y_test_22))

DecisionTreeClassifier 0.999792088985914
KNeighborsClassifier 0.999896044492957
SVC 0.9997401112323925
VotingClassifier 0.999792088985914


In [79]:
# Model 3: Testing players_22 on RandomForestClassifier

rfc=RandomForestClassifier(n_estimators=20, max_depth=3, criterion='entropy')

# Perform cross-validation
cv_scores = cross_val_score(rfc, x_train, y_train, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
print(f"\nMean cross-validation score: {cv_scores.mean()}")

# Fit the model
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test_22)
accuracy_score(y_pred,y_test_22)
print('\nAccuracy of the model:',accuracy_score(y_pred,y_test_22))

# Evaluate the model
mae = mean_absolute_error(y_test_22, y_pred)
mse = mean_squared_error(y_test_22, y_pred)
r2 = r2_score(y_test_22, y_pred)
print("\nmean_absolute_error of the model:",mae)
print("\nmean_squared_error of the model:",mse)
print("\nr2_score of the model:",r2)

# Fine-tune the model (RandomForestClassifier) with GridSearchCV
n_estimators_range = list(range(1, 31))

# Create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_estimators=n_estimators_range)

grid = GridSearchCV(RandomForestClassifier(max_depth=3, criterion='entropy'), param_grid, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)

rfc=RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'], max_depth=3, criterion='entropy')
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test_22)
accuracy_score(y_pred,y_test_22)

# Evaluate the model
mae = mean_absolute_error(y_test_22, y_pred)
mse = mean_squared_error(y_test_22, y_pred)
r2 = r2_score(y_test_22, y_pred)

# Examine the best model
print("\ngrid.best_score:",grid.best_score_)
print("\ngrid.best_params:",grid.best_params_)
print("\ngrid.best_estimator:",grid.best_estimator_)
print("\nAccuracy of the best model:", accuracy_score(y_pred,y_test_22))
print("\nmean_absolute_error of the best model:",mae)
print("\nmean_squared_error of the best model:",mse)
print("\nr2_score of the best model:",r2)


Cross-validation scores: [0.99996132 1.         1.         0.99996132 0.99996132]

Mean cross-validation score: 0.9999767921609

Accuracy of the model: 0.999896044492957

mean_absolute_error of the model: 0.0001559332605644784

mean_squared_error of the model: 0.00025988876760746403

r2_score of the model: -9.356871062271566e-05

grid.best_score: 0.9999845285062273

grid.best_params: {'n_estimators': 1}

grid.best_estimator: RandomForestClassifier(criterion='entropy', max_depth=3, n_estimators=1)

Accuracy of the best model: 0.999896044492957

mean_absolute_error of the best model: 0.0001559332605644784

mean_squared_error of the best model: 0.00025988876760746403

r2_score of the best model: -9.356871062271566e-05


In [None]:
# correlation between variables in selected_features and the target variable; overall_rating
for name, score in zip(x_train.columns, rfc.feature_importances_):
  print(name, score)

**Saving RandomForestRegressor model using pickle**

In [None]:
filename = 'player_rating_predictor.pkl'
pickle.dump(rf_model, open(filename, 'wb'))

In [None]:
loaded_model = pickle.load(open(filename, 'rb'))


In [None]:
y_pred = loaded_model.predict(x_test)

In [None]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mean_absolute_error:",mae)
print("mean_squared_error:",mse)
print("r2_score:",r2)

In [None]:
if isinstance(loaded_model, RandomForestRegressor):
    print("The model is a RandomForestRegressor.")
else:
    print("The model is not a RandomForestRegressor.")