In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

## Read Data

In [None]:
# Load the dataset and separate the target column from the feature columns.
df = pd.read_csv('../Dataset/brazilian_ecommerce_cleaned.csv')
y = df["is_delayed"]
X = df.drop(columns=["is_delayed"])

# Keep 80% of the rows for development and the rest for testing. Stratifying
# on y preserves the class ratio in both parts; the fixed seed makes the
# split repeatable.
X_dev, X_test, y_dev, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    stratify=y,
    random_state=19,
)

In [None]:
# Report the size and the class balance of the development split.
print("Number of development data:", len(X_dev))
print("Distribution of development data:", y_dev.value_counts(), sep="\n")

In [None]:
# Report the size and the class balance of the held-out test split.
print("Number of test data:", len(X_test))
print("Distribution of test data:", y_test.value_counts(), sep="\n")

## Random Forest Parameters

In [None]:
# Hyperparameters passed to every forest in this notebook so that the
# resampling strategies are compared with identically configured models.
#   ne -> n_estimators, md -> max_depth, rs -> random_state
ne, md, rs = 30, 5, 19

## Stratified sampling

In [None]:
# Baseline model: trained on the development split exactly as produced by the
# stratified train/test split, with no further resampling.
rf_ss = RandomForestClassifier(
    n_estimators=ne,
    max_depth=md,
    random_state=rs,
)
rf_ss.fit(X_dev, y_dev)

# Per-class probabilities for the test rows (one column per class).
y_pred_prob_ss = rf_ss.predict_proba(X_test)

## Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the development split (sampling without replacement);
# the seed keeps the selected rows reproducible.
rus = RandomUnderSampler(
    random_state=19,
    replacement=False,
)
X_dev_us, y_dev_us = rus.fit_resample(X_dev, y_dev)

In [None]:
# Display the per-class row counts of the undersampled development target.
y_dev_us.value_counts()

In [None]:
# Same forest configuration as the baseline, trained on the undersampled data.
rf_us = RandomForestClassifier(
    n_estimators=ne,
    max_depth=md,
    random_state=rs,
)
rf_us.fit(X_dev_us, y_dev_us)

# Evaluate on the untouched test split (one probability column per class).
y_pred_prob_us = rf_us.predict_proba(X_test)

## SMOTE sampling

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the development split with SMOTE; only the development data is
# resampled, the test split is left as-is. The seed keeps the result reproducible.
smote = SMOTE(
    random_state=19,
)
X_dev_smote, y_dev_smote = smote.fit_resample(X_dev, y_dev)

In [None]:
# Display the per-class row counts of the SMOTE-resampled development target.
y_dev_smote.value_counts()

In [None]:
# Same forest configuration as the baseline, trained on the SMOTE-resampled data.
rf_smote = RandomForestClassifier(
    n_estimators=ne,
    max_depth=md,
    random_state=rs,
)
rf_smote.fit(X_dev_smote, y_dev_smote)

# Evaluate on the untouched test split (one probability column per class).
y_pred_prob_smote = rf_smote.predict_proba(X_test)

## Ensemble resampling

In [None]:
# Balanced random forest from imbalanced-learn, fitted directly on the
# original (not pre-resampled) development split with the shared hyperparameters.
rf_es = BalancedRandomForestClassifier(
    n_estimators=ne,
    max_depth=md,
    random_state=rs,
)
rf_es.fit(X_dev, y_dev)

# Evaluate on the test split (one probability column per class).
y_pred_prob_es = rf_es.predict_proba(X_test)

## Plotting

### Precision Recall Curve

In [None]:
# NOTE: plot_precision_recall_curve was deprecated in scikit-learn 1.0 and
# removed in 1.2, so importing it fails on current versions. It is replaced by
# PrecisionRecallDisplay.from_estimator (requires scikit-learn >= 1.0).
from sklearn.metrics import roc_curve, RocCurveDisplay, PrecisionRecallDisplay
from matplotlib import pyplot as plt

# The default figure size must be set BEFORE the figure is created; setting it
# after plotting (as before) only affected figures created later.
plt.rcParams["figure.figsize"] = (12, 9)

# Draw all four precision-recall curves on one shared Axes so the resampling
# strategies can be compared directly on the test split.
fig, ax = plt.subplots()
for pr_label, pr_model in [
    ("Stratified", rf_ss),
    ("Undersampling", rf_us),
    ("SMOTE", rf_smote),
    ("Ensemble Resampling", rf_es),
]:
    PrecisionRecallDisplay.from_estimator(pr_model, X_test, y_test, ax=ax, name=pr_label)

ax.set_title('Precision-Recall curve')
plt.show()

### ROC Curve

In [None]:
# roc_curve takes (y_true, y_score): the true labels and a 1-D score for the
# positive class. The previous calls passed the feature matrix X_test as
# y_true and the full 2-D predict_proba output as y_score.
# predict_proba returns one column per class, so column 1 is used as the
# positive-class score (assumes binary labels where class 1 is the second
# entry of classes_, e.g. 0/1 -- confirm against the dataset).
fpr_ss, tpr_ss, thresholds_ss = roc_curve(y_test, y_pred_prob_ss[:, 1], pos_label=1)
fpr_us, tpr_us, thresholds_us = roc_curve(y_test, y_pred_prob_us[:, 1], pos_label=1)
fpr_smote, tpr_smote, thresholds_smote = roc_curve(y_test, y_pred_prob_smote[:, 1], pos_label=1)
fpr_es, tpr_es, thresholds_es = roc_curve(y_test, y_pred_prob_es[:, 1], pos_label=1)

# Overlay the four ROC curves on a single figure for comparison.
plt.figure()
plt.plot(fpr_ss, tpr_ss, label="Stratified")
plt.plot(fpr_us, tpr_us, label="Undersampling")
plt.plot(fpr_smote, tpr_smote, label="SMOTE")
plt.plot(fpr_es, tpr_es, label="Ensemble Resampling")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
# Without a legend the per-line labels are never rendered.
plt.legend()
plt.show()