# QUESTION 4

4. Random Forest: For this question also, you have to use the same football dataset. The
aim is to use the Random Forest model to do classification and regression both.
(a) Classification: Your target column is ‘contribution type’. Process the data as you want,
modify/drop any columns that you want, and play around with the hyperparameters.
Try to understand and observe the difference in results. Try different losses (or quality
criterion) – ‘gini’, ‘entropy’, ‘log loss’. After training the model, report test accuracy
and f1 score.
(b) Regression: Your target column is ‘overall’. Again, you are free to process the dataset
and encouraged to try different hyperparameters. Use MSE and MAE one by one to
train the models, and report test MSE and MAE for both models.
Note: You can use sklearn library to get Random Forest implementations

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
from sklearn.model_selection import train_test_split

In [6]:
# from pandas_profiling import ProfileReport
from ydata_profiling import ProfileReport

In [7]:
filePath="E:\\MTech\\ML_sem1\\Assingment\\Assignment1\\football.csv"
df=pd.read_csv(filePath)
df.shape

(7772, 66)

In [8]:
df.head()

Unnamed: 0,sofifa_id,short_name,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,...,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,contribution_type
0,158023,L. Messi,93,93,78000000.0,320000.0,34,170,72,73.0,...,20,35,24,6,11,15,14,8,,1
1,188545,R. Lewandowski,92,92,119500000.0,270000.0,32,185,81,21.0,...,35,42,19,15,6,12,8,10,,1
2,20801,Cristiano Ronaldo,91,91,45000000.0,270000.0,36,187,83,11.0,...,24,32,24,7,11,15,14,11,,1
3,190871,Neymar Jr,91,91,129000000.0,270000.0,29,175,68,73.0,...,35,32,29,9,9,15,15,11,,1
4,192985,K. De Bruyne,91,91,125500000.0,350000.0,30,181,70,10.0,...,68,65,53,15,13,5,10,13,,1


In [9]:
### Target value spliting
X=df.drop('overall',axis=1)
y=df['overall']

In [10]:
#### For random forest normalization and missing values and standardization is not important

## taking only important columns

In [11]:
feature_cols=['potential', 'value_eur', 'wage_eur', 
       'preferred_foot', 'weak_foot', 'skill_moves',
       'international_reputation', 'work_rate', 'body_type',
       'pace',
        'shooting', 'passing', 'dribbling', 'defending', 'physic',
       'attacking_crossing', 'attacking_finishing',
       'attacking_heading_accuracy', 'attacking_short_passing',
       'attacking_volleys', 'skill_dribbling', 'skill_curve',
       'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
       'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking_awareness',
       'defending_standing_tackle', 'defending_sliding_tackle',
       'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
       'goalkeeping_positioning', 'goalkeeping_reflexes',
       'contribution_type']

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [13]:
X_train=X_train[feature_cols]
X_test=X_test[feature_cols]

In [14]:
X_train=pd.get_dummies(X_train)
X_test=pd.get_dummies(X_test)

In [15]:
### Filling NULL values

fill_pace=X_train['pace'].min()
fill_shooting=X_train['shooting'].min()
fill_passing=X_train['passing'].min()
fill_dribbling=X_train['dribbling'].min()
fill_defending=X_train['defending'].min()
fill_physic=X_train['physic'].min()
fill_value=X_train['value_eur'].mean()
fill_wage=X_train['wage_eur'].mean()

X_train['pace']=fill_pace
X_train['shooting']=fill_shooting
X_train['passing']=fill_passing
X_train['dribbling']=fill_dribbling
X_train['defending']=fill_defending
X_train['physic']=fill_physic

X_train['value_eur']=fill_value
X_train['wage_eur']=fill_wage





In [16]:
X_train.isna().sum()

potential                     0
value_eur                     0
wage_eur                      0
weak_foot                     0
skill_moves                   0
                             ..
body_type_Normal (185+)       0
body_type_Stocky (170-)       0
body_type_Stocky (170-185)    0
body_type_Stocky (185+)       0
body_type_Unique              0
Length: 68, dtype: int64

In [17]:
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

In [18]:
# Create a Random Forest classifier
rf_classifier = RandomForestRegressor(random_state=42)

# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(10, 200),        # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'] + list(range(1, X_train.shape[1] + 1)), 
    'max_depth': [None] + list(np.arange(1, 20)),
    'min_samples_split': randint(2, 20),    # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 20),     # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]              # Whether to bootstrap samples
}


### Training the regression model for MSE

In [16]:

start = time.time()

# Create RandomizedSearchCV
random_search = RandomizedSearchCV(
    rf_classifier, param_distributions=param_dist, n_iter=5,
    cv=5, verbose=2, n_jobs=-1, random_state=42, scoring='neg_mean_squared_error'
)

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

end = time.time()
print(end - start)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
10.792119026184082


In [17]:
### preparing test data

X_test['pace']=fill_pace
X_test['shooting']=fill_shooting
X_test['passing']=fill_passing
X_test['dribbling']=fill_dribbling
X_test['defending']=fill_defending
X_test['physic']=fill_physic

X_test['value_eur']=fill_value
X_test['wage_eur']=fill_wage

In [18]:
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error

In [19]:

final_model=best_rf_model = random_search.best_estimator_
final_model.fit(X_train,y_train)
y_pred=final_model.predict(X_test)
r2=r2_score(y_test, y_pred)
mse=mean_squared_error(y_test, y_pred)
mae=mean_absolute_error(y_test, y_pred)
print('r2:score',r2)
print('mse error',mse)
print('mean absolution error:',mae)

r2:score 0.9610465188255433
mse error 1.46446489040457
mean absolution error: 0.8697107089792554


### Training the regression model for MAE

In [20]:

start = time.time()
    
# Create RandomizedSearchCV
random_search = RandomizedSearchCV(
    rf_classifier, param_distributions=param_dist, n_iter=5,
    cv=5, verbose=2, n_jobs=-1, random_state=42, scoring='neg_mean_absolute_error'
)

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

end = time.time()
print(end - start)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
7.802634239196777


In [21]:
final_model= random_search.best_estimator_
final_model.fit(X_train,y_train)
y_pred=final_model.predict(X_test)
r2=r2_score(y_test, y_pred)
mse=mean_squared_error(y_test, y_pred)
mae=mean_absolute_error(y_test, y_pred)
print('r2:score',r2)
print('mse error',mse)
print('mean absolution error:',mae)

r2:score 0.9610465188255433
mse error 1.46446489040457
mean absolution error: 0.8697107089792554


In [22]:

# Use MSE when you want to prioritize reducing the impact of large errors or when you believe that squared errors are a more appropriate measure of model performance.

    
# Use MAE when you want predictions that are more robust to outliers or when you prefer a more balanced approach to errors, treating all observations equally.



### Classification

In [23]:
### Target value spliting
X=df.drop('contribution_type',axis=1)
y=df['contribution_type']

In [24]:
feature_cols=['potential', 'value_eur', 'wage_eur', 
       'preferred_foot', 'weak_foot', 'skill_moves',
       'international_reputation', 'work_rate', 'body_type',
       'pace',
        'shooting', 'passing', 'dribbling', 'defending', 'physic',
       'attacking_crossing', 'attacking_finishing',
       'attacking_heading_accuracy', 'attacking_short_passing',
       'attacking_volleys', 'skill_dribbling', 'skill_curve',
       'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
       'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking_awareness',
       'defending_standing_tackle', 'defending_sliding_tackle',
       'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
       'goalkeeping_positioning', 'goalkeeping_reflexes',
       'overall']

In [25]:
# Split the data into training and testing sets
## we will do stratified train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,random_state=42,)



In [26]:
X_train=X_train[feature_cols]
X_test=X_test[feature_cols]

In [27]:
X_train=pd.get_dummies(X_train)
X_test=pd.get_dummies(X_test)

In [28]:
### Filling NULL values

fill_pace=X_train['pace'].min()
fill_shooting=X_train['shooting'].min()
fill_passing=X_train['passing'].min()
fill_dribbling=X_train['dribbling'].min()
fill_defending=X_train['defending'].min()
fill_physic=X_train['physic'].min()
fill_value=X_train['value_eur'].mean()
fill_wage=X_train['wage_eur'].mean()

X_train['pace']=fill_pace
X_train['shooting']=fill_shooting
X_train['passing']=fill_passing
X_train['dribbling']=fill_dribbling
X_train['defending']=fill_defending
X_train['physic']=fill_physic

X_train['value_eur']=fill_value
X_train['wage_eur']=fill_wage


### With gini impurity

In [29]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(criterion='gini',random_state=42)

# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(10, 200),        # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'] + list(range(1, X_train.shape[1] + 1)), 
    'max_depth': [None] + list(np.arange(1, 20)),
    'min_samples_split': randint(2, 20),    # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 20),     # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]              # Whether to bootstrap samples
}

In [30]:

start = time.time()

# Create RandomizedSearchCV
random_search = RandomizedSearchCV(
    rf_classifier, param_distributions=param_dist, n_iter=5,
    cv=5, verbose=2, n_jobs=-1, random_state=42, scoring='accuracy'
)

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

end = time.time()
print(end - start)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
7.0349647998809814


In [32]:
### preparing test data

X_test['pace']=fill_pace
X_test['shooting']=fill_shooting
X_test['passing']=fill_passing
X_test['dribbling']=fill_dribbling
X_test['defending']=fill_defending
X_test['physic']=fill_physic

X_test['value_eur']=fill_value
X_test['wage_eur']=fill_wage

In [33]:
final_model = random_search.best_estimator_
final_model.fit(X_train,y_train)
y_pred=final_model.predict(X_test)


from sklearn.metrics import accuracy_score, f1_score

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1-score:", f1)

Accuracy: 0.8861736334405145
F1-score: 0.8703296703296705


### With ENTROPY

In [38]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(criterion='entropy',random_state=42)

# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(10, 200),        # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'] + list(range(1, X_train.shape[1] + 1)), 
    'max_depth': [None] + list(np.arange(1, 20)),
    'min_samples_split': randint(2, 20),    # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 20),     # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]              # Whether to bootstrap samples
}

In [39]:

start = time.time()

# Create RandomizedSearchCV
random_search = RandomizedSearchCV(
    rf_classifier, param_distributions=param_dist, n_iter=5,
    cv=5, verbose=2, n_jobs=-1, random_state=42, scoring='accuracy'
)

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

end = time.time()
print(end - start)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
6.617023944854736


In [44]:
random_search.best_estimator_

In [41]:
final_model = random_search.best_estimator_
final_model.fit(X_train,y_train)
y_pred=final_model.predict(X_test)


from sklearn.metrics import accuracy_score, f1_score

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1-score:", f1)

Accuracy: 0.8868167202572347
F1-score: 0.8709677419354839


### With log loss

In [47]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(criterion='log_loss',random_state=42)

# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(10, 200),        # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'] + list(range(1, X_train.shape[1] + 1)), 
    'max_depth': [None] + list(np.arange(1, 20)),
    'min_samples_split': randint(2, 20),    # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 20),     # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]              # Whether to bootstrap samples
}

In [48]:

start = time.time()

# Create RandomizedSearchCV
random_search = RandomizedSearchCV(
    rf_classifier, param_distributions=param_dist, n_iter=5,
    cv=5, verbose=2, n_jobs=-1, random_state=42, scoring='accuracy'
)

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

end = time.time()
print(end - start)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
6.622352123260498


In [49]:
random_search.best_estimator_

In [50]:
final_model = random_search.best_estimator_
final_model.fit(X_train,y_train)
y_pred=final_model.predict(X_test)


from sklearn.metrics import accuracy_score, f1_score

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1-score:", f1)

Accuracy: 0.8868167202572347
F1-score: 0.8709677419354839


# END