# Random Forest Classifier

In [1]:
# Load the packages
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
# Load the data
train_df = pd.read_csv('./../../../../data/train/train.csv')
test_df = pd.read_csv('./../../../../data/test/test.csv')

In [3]:
# Load the feature selection result
feature_selector = pd.read_csv('./../../../../data/feature_ranking.csv')
feature_selector.set_index('Unnamed: 0', inplace=True)

In [4]:
# Separate feature space from target variable
y_train = train_df['Attrition']
X_train = train_df.drop('Attrition', axis=1)
y_test = test_df['Attrition']
X_test = test_df.drop('Attrition', axis=1)

We will be running models for different set of features and evaluate their performances. We start with complete dataset and then start with meaximum feature score of 8 to 5.

In [5]:
# Declare the model paramters for searching
param_grid = dict(
    n_estimators = [50, 100, 200, 400],
    max_depth = [10, 20, 40],
    min_samples_split = [2, 5, 10]
)

In [6]:
# Declare and train the model
rf_clf = RandomForestClassifier(class_weight="balanced", max_features=None, bootstrap=False)
rf = GridSearchCV(estimator=rf_clf, param_grid=param_grid, scoring='f1', n_jobs=-1)

## Complete data

In [7]:
# Train the model
rf.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(bootstrap=False,
                                              class_weight='balanced',
                                              max_features=None),
             n_jobs=-1,
             param_grid={'max_depth': [10, 20, 40],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200, 400]},
             scoring='f1')

In [8]:
# Get the parameters for the best model
rf.best_estimator_

RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=10,
                       max_features=None)

In [9]:
# Predict using model
y_pred = rf.predict(X_test)

In [10]:
# Make the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.88      0.89      0.88       255
        True       0.24      0.23      0.23        39

    accuracy                           0.80       294
   macro avg       0.56      0.56      0.56       294
weighted avg       0.80      0.80      0.80       294



The results not better than that of logistic regression. The precision, recall and f1 of attrition is not at all good.

## Feature score of 8

In [11]:
# Create the new dataset

# Get features with feature score of 8
features = feature_selector[feature_selector['Total']==8].index.tolist()
X_train_8 = X_train.loc[:, features]
X_test_8 = X_test.loc[:, features]

In [12]:
# Train the model
rf.fit(X_train_8, y_train)

GridSearchCV(estimator=RandomForestClassifier(bootstrap=False,
                                              class_weight='balanced',
                                              max_features=None),
             n_jobs=-1,
             param_grid={'max_depth': [10, 20, 40],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200, 400]},
             scoring='f1')

In [13]:
# Predict with model
y_pred_8 = rf.predict(X_test_8)

In [14]:
# Make the report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.88      0.89      0.88       255
        True       0.24      0.23      0.23        39

    accuracy                           0.80       294
   macro avg       0.56      0.56      0.56       294
weighted avg       0.80      0.80      0.80       294



There is no improvement in the result. But since this model uses less number of features, it better to use it in production in order to improve the retraining and inferencing with huge load of data.

Since the least number of features that could be used gave the same performance as all the features, it is better to skip the other scores since the chance of improvement in result is quite less.