In [2]:
# import the neccessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder

# read the data
numerical = pd.read_csv('numerical.csv', nrows=10000)
categorical = pd.read_csv('categorical.csv', nrows=10000)
target = pd.read_csv('target.csv', nrows=10000)

In [None]:
# encode the categorical variables with one-hot encoding
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_categorical = encoder.fit_transform(categorical)
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical.columns))

In [None]:
# merge data
data = pd.concat([numerical, encoded_categorical_df], axis=1)
X = data
y = target['TARGET_B']

In [None]:
# unsample for the class imbalance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# upsmaple minority class
X_train_majority = X_train[y_train == 0]
X_train_minority = X_train[y_train == 1]
X_train_minority_upsampled = resample(X_train_minority,
                                      replace=True,
                                      n_samples=len(X_train_majority),
                                      random_state=42)
X_train_upsampled = pd.concat([X_train_majority, X_train_minority_upsampled])
y_train_upsampled = y_train.loc[X_train_upsampled.index]

In [None]:
# model train with GridSearchCV using F1 Score
clf = RandomForestClassifier()
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
f1_scorer = make_scorer(f1_score)
grid_search = GridSearchCV(clf, param_grid, scoring=f1_scorer, cv=5)
grid_search.fit(X_train_upsampled, y_train_upsampled)
# evaluate model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [5]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# show best parameters
print("Best Parameters:", grid_search.best_params_)

# confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)

# classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# general accuracy
print("\nAccuracy:", accuracy_score(y_test, y_pred))

# business Impact Analysis 

cost_fp = 100  # examample cost for a false positive
cost_fn = 200  # example cost for a false negative

#extracting false positives and false negatives from the confusion matrix
fp = cm[0][1]
fn = cm[1][0]

lost_earnings = (fp * cost_fp) + (fn * cost_fn)
print(f"\nEstimated Lost Earnings due to Misclassifications: EUR {lost_earnings}")


Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}

Confusion Matrix:
 [[2355    3]
 [ 142    0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97      2358
           1       0.00      0.00      0.00       142

    accuracy                           0.94      2500
   macro avg       0.47      0.50      0.49      2500
weighted avg       0.89      0.94      0.92      2500


Accuracy: 0.942

Estimated Lost Earnings due to Misclassifications: EUR 28700


Given the failure of the model to be able to predict the minority class (1) even after addressing the class imbalance we see that relying only on accuracy to evaluate the model's performance may be misleading. The failure to detect any positives is worrying, especially if the costs of false negatives are significant.