# Imbalanced data handling

## Imports

In [2]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, recall_score, precision_score, roc_auc_score,
    roc_curve, precision_recall_curve, auc
)
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss, TomekLinks
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

pd.options.display.float_format = '{:,.2f}'.format

In [3]:
import imblearn
imblearn.show_versions()


System:
     python: 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
 executable: c:\Users\Joel\Python Envs\enedis\Scripts\python.exe
    machine: Windows-10-10.0.19045-SP0

Python dependencies:
imbalanced-learn: 0.13.0
        pip: 24.2
 setuptools: 75.1.0
      numpy: 1.26.3
      scipy: 1.11.4
scikit-learn: 1.5.2
     Cython: None
     pandas: 2.2.3
      keras: 3.6.0
 tensorflow: 2.17.0
     joblib: 1.4.2


## Classifiers used

In [4]:
classifiers = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=500),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # "k-NN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
}

## Resampling method

In [5]:
resampling_methods = {
    "None (Original Data)": None,
    "Random OverSampler": RandomOverSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
    "Borderline-SMOTE": BorderlineSMOTE(random_state=42),
    "ADASYN": ADASYN(random_state=42),
    "Random Undersampling": RandomUnderSampler(random_state=42),
    "ClusterCentroids": ClusterCentroids(random_state=42),
    # "CondensedNearestNeighbour": CondensedNearestNeighbour(random_state=42),
    "NearMiss": NearMiss(),
    "Tomek Links": TomekLinks()
}

## Metrics

In [6]:
def evaluate_model(y_true, y_pred, y_prob):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall (Sensitivity)": recall_score(y_true, y_pred),
        "ROC AUC": roc_auc_score(y_true, y_prob[:, 1] if y_prob is not None else None),
    }

## ROC/PRC Curves

In [7]:
def save_curves(y_test, y_prob, model_name, resampling_name):
    fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])
    precision, recall, _ = precision_recall_curve(y_test, y_prob[:, 1])
    

    roc_auc = roc_auc_score(y_test, y_prob[:, 1])
    pr_auc = auc(recall, precision)

    curve_data = {
                    "Classifier": model_name,
                    "Resampling Method": resampling_name,
                    "ROC": {"fpr": fpr.tolist(), "tpr": tpr.tolist(), "auc": roc_auc},
                    "PR": {"precision": precision.tolist(), "recall": recall.tolist(), "auc": pr_auc},
                }

    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.plot(fpr, tpr, label=f'ROC AUC = {roc_auc:.2f}')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Model')
    plt.title(f'ROC Curve: {model_name} ({resampling_name})')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(recall, precision, label=f'PR AUC = {pr_auc:.2f}')
    plt.title(f'Precision-Recall Curve: {model_name} ({resampling_name})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()

    plt.tight_layout()
    os.makedirs("./figures", exist_ok=True)
    filename = f"./figures/{model_name}_{resampling_name.replace(' ', '_')}.png"
    plt.savefig(filename)
    plt.close()
    os.makedirs("./results/json", exist_ok=True)
    filename = f"./results/json/{model_name}_{resampling_name.replace(' ', '_')}_curves.json"
    with open(filename, "w") as f:
        json.dump(curve_data, f)

## Import dataset

In [8]:
PATH_TO_DATA = "data\guillaume.txt"

df = pd.read_csv(PATH_TO_DATA, sep=";", parse_dates=[4], decimal=',',header=0)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4646773 entries, 0 to 4646772
Data columns (total 23 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   ZIBZIN                    object        
 1   IDAvisAutorisationCheque  int64         
 2   FlagImpaye                int64         
 3   Montant                   float64       
 4   DateTransaction           datetime64[ns]
 5   CodeDecision              int64         
 6   VerifianceCPT1            int64         
 7   VerifianceCPT2            int64         
 8   VerifianceCPT3            int64         
 9   D2CB                      int64         
 10  ScoringFP1                float64       
 11  ScoringFP2                float64       
 12  ScoringFP3                float64       
 13  TauxImpNb_RB              float64       
 14  TauxImpNB_CPM             float64       
 15  EcartNumCheq              int64         
 16  NbrMagasin3J              int64         
 17  DiffDate

In [9]:
df.duplicated().sum()

834734

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
for col in df.columns:
    print(f"{col}: {df.loc[:,col].value_counts()}")

ZIBZIN: ZIBZIN
A013010041908004184100729    179
A033013306908002131943000    149
A010010002908870005472117    147
A075000041908023367242120    144
A013013369908000035048787    143
                            ... 
A075001315908004682437348      1
A034010041908007943451330      1
A007013379900000000160340      1
A039012506908013030694000      1
A078010004908211306074580      1
Name: count, Length: 1280126, dtype: int64
IDAvisAutorisationCheque: IDAvisAutorisationCheque
78643044    1
81927417    1
81927390    1
81927391    1
81927392    1
           ..
79927764    1
79927765    1
79927766    1
79927767    1
84966405    1
Name: count, Length: 3812039, dtype: int64
FlagImpaye: FlagImpaye
0    3788984
1      23055
Name: count, dtype: int64
Montant: Montant
30.00       29928
20.00       25929
40.00       19584
50.00       17578
25.00       13222
            ...  
348.24          1
428.85          1
393.56          1
467.49          1
2,705.62        1
Name: count, Length: 42863, dtype: int64


extract year, month, day from date.

hh:mm:ss is already as timestamp the column heure

In [12]:
df['annee'] = pd.DatetimeIndex(df["DateTransaction"]).year
df['mois'] = pd.DatetimeIndex(df["DateTransaction"]).month
df['jour'] = pd.DatetimeIndex(df["DateTransaction"]).day
df.head()

Unnamed: 0,ZIBZIN,IDAvisAutorisationCheque,FlagImpaye,Montant,DateTransaction,CodeDecision,VerifianceCPT1,VerifianceCPT2,VerifianceCPT3,D2CB,...,NbrMagasin3J,DiffDateTr1,DiffDateTr2,DiffDateTr3,CA3TRetMtt,CA3TR,Heure,annee,mois,jour
0,A013010004908126703060931,78643044,0,20.0,2017-02-01 07:32:14,1,0,0,0,551,...,1,4.0,4.0,4.0,20.0,0.0,27134,2017,2,1
1,A013011306908024927155000,78643045,0,20.0,2017-02-01 07:43:37,1,0,0,0,551,...,2,1.8,4.0,4.0,28.61,8.61,27817,2017,2,1
2,A013010002908283134592527,78643046,0,57.64,2017-02-01 07:47:38,1,0,0,0,549,...,1,4.0,4.0,4.0,57.64,0.0,28058,2017,2,1
3,A011010002908105209831316,78643047,0,54.29,2017-02-01 07:48:48,0,1,1,1,267,...,1,4.0,4.0,4.0,54.29,0.0,28128,2017,2,1
4,A013010041908000125652029,78643048,0,26.9,2017-02-01 08:13:27,1,0,0,0,549,...,1,2.0,4.0,4.0,59.15,32.25,29607,2017,2,1


1er modèles : DROP ZIBZIN, IDAvisAutorisationCheque, DateTransaction

In [13]:
train = df.loc[df["DateTransaction"] < "2017-09-01",:].copy()
test =  df.loc[df["DateTransaction"] >= "2017-09-01",:].copy()

train.drop(columns=["ZIBZIN", "IDAvisAutorisationCheque", "DateTransaction"], inplace=True)
test.drop(columns=["ZIBZIN", "IDAvisAutorisationCheque",  "DateTransaction"], inplace=True)

In [14]:
X_train = train.drop(columns="FlagImpaye")
y_train = train.loc[:,"FlagImpaye"]

X_test = test.drop(columns="FlagImpaye")
y_test = test["FlagImpaye"]

In [None]:
%%time

import pandas as pd

results_file = "classification_resampling_results.csv"
columns = [
    "Classifier", "Resampling Method", "Accuracy", "F1 Score",
    "Precision", "Recall (Sensitivity)", "ROC AUC"
]
pd.DataFrame(columns=columns).to_csv(f"./results/{results_file}", index=False)

roc_curves = []
pr_curves = []

for resampler_name, resampler in resampling_methods.items():
    if resampler:
        X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)
    else:
        X_resampled, y_resampled = X_train, y_train

    for clf_name, clf in classifiers.items():
        print(f"Resampler: {resampler_name}, Model: {clf_name}")
        clf.fit(X_resampled, y_resampled)
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test) if hasattr(clf, "predict_proba") else None

        metrics = evaluate_model(y_test, y_pred, y_prob)
        metrics["Classifier"] = clf_name
        metrics["Resampling Method"] = resampler_name

        result_df = pd.DataFrame([metrics])
        
        result_df.to_csv(f"./results/{results_file}", mode='a', header=False, index=False)

        if y_prob is not None:
            save_curves(y_test, y_prob, clf_name, resampler_name)

Resampler: None (Original Data), Model: Random Forest
Resampler: None (Original Data), Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Resampler: None (Original Data), Model: Decision Tree
Resampler: None (Original Data), Model: XGBoost


Parameters: { "use_label_encoder" } are not used.



Resampler: Random OverSampler, Model: Random Forest
Resampler: Random OverSampler, Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Resampler: Random OverSampler, Model: Decision Tree
Resampler: Random OverSampler, Model: XGBoost


Parameters: { "use_label_encoder" } are not used.



Resampler: SMOTE, Model: Random Forest
Resampler: SMOTE, Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Resampler: SMOTE, Model: Decision Tree
Resampler: SMOTE, Model: XGBoost


Parameters: { "use_label_encoder" } are not used.



Resampler: Borderline-SMOTE, Model: Random Forest
Resampler: Borderline-SMOTE, Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Resampler: Borderline-SMOTE, Model: Decision Tree
Resampler: Borderline-SMOTE, Model: XGBoost


Parameters: { "use_label_encoder" } are not used.



Resampler: ADASYN, Model: Random Forest
Resampler: ADASYN, Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Resampler: ADASYN, Model: Decision Tree
Resampler: ADASYN, Model: XGBoost


Parameters: { "use_label_encoder" } are not used.



Resampler: Random Undersampling, Model: Random Forest
Resampler: Random Undersampling, Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Resampler: Random Undersampling, Model: Decision Tree
Resampler: Random Undersampling, Model: XGBoost


Parameters: { "use_label_encoder" } are not used.



Resampler: ClusterCentroids, Model: Random Forest
Resampler: ClusterCentroids, Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Resampler: ClusterCentroids, Model: Decision Tree
Resampler: ClusterCentroids, Model: XGBoost


Parameters: { "use_label_encoder" } are not used.



Resampler: NearMiss, Model: Random Forest
Resampler: NearMiss, Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Resampler: NearMiss, Model: Decision Tree
Resampler: NearMiss, Model: XGBoost


Parameters: { "use_label_encoder" } are not used.

