<a href="https://colab.research.google.com/github/pchernic/Imbalanced_data/blob/main/%5BAlgorithms_for_Imbalanced_data%5D_Credit_Card_Fraud_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [None]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, CondensedNearestNeighbour

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [None]:
# Mount Google Drive at /content/drive; the dataset loaded below is read from that mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading Data


In [None]:
# Location of the credit-card fraud CSV on the mounted Drive.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Imbalanced Dataset /Exercício/fraude_em_cartao_de_credito.csv'

# Read the full dataset and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f11,f12,f13,f14,f15,f16,f17,f18,f19,class
0,-0.969781,-1.650574,-0.628913,-1.691092,-1.477751,-0.628768,-0.046908,-1.049127,0.466186,-0.942794,...,-1.214985,-0.197611,-1.564459,0.06409,-0.847784,-0.411003,-0.250578,-0.93602,0.769469,0
1,-1.224213,0.534225,-0.951029,0.375252,-0.186034,0.712699,-0.202158,0.838773,-0.053232,1.040443,...,0.697091,-0.068655,0.748391,-0.208193,-0.289533,-1.173952,0.993719,-0.060707,-0.965607,1
2,0.594308,0.491569,-1.394929,1.206783,1.147946,0.571385,-0.941333,1.246614,-0.239523,-0.663464,...,0.685957,0.535532,0.119008,1.154772,-0.504898,-0.847183,-0.689269,0.768352,-1.070108,1
3,1.104081,0.253214,1.398726,1.208175,0.443822,0.102821,-0.615884,-0.580218,-1.917046,0.761122,...,-1.216294,0.643695,-0.726431,1.646161,-0.090221,-0.665464,0.6025,0.429398,0.748708,0
4,0.203884,-0.164055,-0.628322,0.236933,-0.598334,0.448331,-1.246621,-1.120051,-0.91775,0.888408,...,1.137012,0.106594,-2.681995,-1.673327,1.295357,1.022215,-0.897329,-0.374499,1.072438,0


# Checking Distribution


In [None]:
# Share of rows per target label (fractions rather than raw counts).
class_share = df['class'].value_counts(normalize=True)
class_share

0    0.6804
1    0.3196
Name: class, dtype: float64

# Data Understanding

In [None]:
# Column dtypes; the preprocessing step further down selects columns by dtype.
df.dtypes

f0       float64
f1       float64
f2       float64
f3       float64
f4       float64
f5       float64
f6       float64
f7       float64
f8       float64
f9       float64
f10      float64
f11      float64
f12      float64
f13      float64
f14      float64
f15      float64
f16      float64
f17      float64
f18      float64
f19      float64
class      int64
dtype: object

# Checking nulls.

In [None]:
# Number of missing values in each column.
missing_mask = df.isna()
missing_mask.sum()

f0       0
f1       0
f2       0
f3       0
f4       0
f5       0
f6       0
f7       0
f8       0
f9       0
f10      0
f11      0
f12      0
f13      0
f14      0
f15      0
f16      0
f17      0
f18      0
f19      0
class    0
dtype: int64

# Train-test-split

In [None]:
# Target vector first, then the feature matrix (every column except the target).
y = df['class']
X = df.drop(columns='class')

In [None]:
# Hold-out split stratified on the target, with a fixed seed for reproducibility.
# No test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)

# Data Pre-processing Pipeline

In [None]:
# Shared preprocessing step: standardise the columns whose dtype is float64.
preprocessing = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), selector(dtype_include=['float64'])),
    ],
)

# Modeling

## with Imbalanced data


### Logistic Regression

In [None]:
# Baseline (no resampling): preprocessing followed by logistic regression.
pipeline_lr = Pipeline(steps=[
    ('pp', preprocessing),
    ('lr', LogisticRegression(random_state=42)),
])

### $k$-nearest neighbors

In [None]:
# Baseline (no resampling): preprocessing followed by k-nearest neighbours.
pipeline_knn = Pipeline(steps=[
    ('pp', preprocessing),
    ('knn', KNeighborsClassifier()),
])

### Decision Tree

In [None]:
# Baseline (no resampling): preprocessing followed by a decision tree.
pipeline_dt = Pipeline(steps=[
    ('pp', preprocessing),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

## Random Oversampling

### Logistic Regression

In [None]:
# Random oversampling between preprocessing and logistic regression.
pipeline_ros_lr = Pipeline(steps=[
    ('pp', preprocessing),
    ('ros', RandomOverSampler(random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
])

### $k$-nearest neighbors

In [None]:
# Random oversampling between preprocessing and k-nearest neighbours.
pipeline_ros_knn = Pipeline(steps=[
    ('pp', preprocessing),
    ('ros', RandomOverSampler(random_state=42)),
    ('knn', KNeighborsClassifier()),
])

### Decision Tree

In [None]:
# Random oversampling between preprocessing and a decision tree.
pipeline_ros_dt = Pipeline(steps=[
    ('pp', preprocessing),
    ('ros', RandomOverSampler(random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

## SMOTE - Synthetic Minority **Over-sampling** Technique.


A concise summary of the SMOTE algorithm:

**SMOTE (Synthetic Minority Over-sampling Technique)** is a data augmentation technique used to address class imbalance in machine learning. **It generates synthetic samples for the minority class by creating new data points that are similar to existing ones.** This helps balance the class distribution and improves the performance of machine learning models, **particularly in cases where one class is significantly smaller than the other.** SMOTE is a valuable tool for enhancing the fairness and effectiveness of machine learning algorithms in imbalanced datasets.

### Logistic Regression

In [None]:
# SMOTE oversampling between preprocessing and logistic regression.
pipeline_smt_lr = Pipeline(steps=[
    ('pp', preprocessing),
    ('smt', SMOTE(random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
])

### $k$-nearest neighbors

In [None]:
# SMOTE oversampling between preprocessing and k-nearest neighbours.
pipeline_smt_knn = Pipeline(steps=[
    ('pp', preprocessing),
    ('smt', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier()),
])

### Decision Tree

In [None]:
# SMOTE oversampling between preprocessing and a decision tree.
pipeline_smt_dt = Pipeline(steps=[
    ('pp', preprocessing),
    ('smt', SMOTE(random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

## ADASYN - Adaptive Synthetic Sampling.

ADASYN stands for "Adaptive Synthetic Sampling." It is a data preprocessing technique used to address class imbalance in machine learning datasets, particularly in classification problems.

Class imbalance occurs when one class **(the minority class) has significantly fewer examples than another class (the majority class). This imbalance can lead to biased model performance**, where the model **may perform well on the majority class but poorly on the minority class.**

ADASYN is designed to alleviate this issue by **generating synthetic examples for the minority class, making it more balanced with the majority class.** Unlike traditional oversampling techniques that create duplicates of existing minority class examples, **ADASYN creates synthetic samples that are similar to existing examples but not identical. It does this in an adaptive manner, focusing more on the minority class examples that are difficult to classify correctly.**

Here's a simplified overview of how ADASYN works:

1. **Identify Minority Class:** ADASYN starts by identifying the minority class in the dataset.

2. **Calculating the Imbalance Ratio:** It calculates the imbalance ratio, which is the ratio of the number of majority class examples to the number of minority class examples.

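3. **Adaptive Sampling:** ADASYN then focuses on the minority class examples that are harder to classify by computing a density distribution: for each minority example it measures the share of majority class examples among its nearest neighbors. It gives more attention to examples whose neighborhoods are dominated by the majority class, generating more synthetic samples around them.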

4. **Generating Synthetic Examples:** For each selected minority class example, ADASYN generates synthetic samples by interpolating between that example and its nearest neighbors within the minority class. These synthetic examples are added to the dataset.

By generating synthetic examples for the minority class, **ADASYN aims to balance the class distribution and improve the performance of machine learning models on imbalanced datasets.**

ADASYN is a popular technique for handling class imbalance, **especially in situations where the imbalance is severe or when traditional methods like random oversampling or undersampling may not be as effective.**

### Logistic Regression

In [None]:
# ADASYN oversampling between preprocessing and logistic regression.
pipeline_asy_lr = Pipeline(steps=[
    ('pp', preprocessing),
    ('asy', ADASYN(random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
])

### $k$-nearest neighbors

In [None]:
# ADASYN oversampling between preprocessing and k-nearest neighbours.
pipeline_asy_knn = Pipeline(steps=[
    ('pp', preprocessing),
    ('asy', ADASYN(random_state=42)),
    ('knn', KNeighborsClassifier()),
])

### Decision Tree

In [None]:
# ADASYN oversampling between preprocessing and a decision tree.
pipeline_asy_dt = Pipeline(steps=[
    ('pp', preprocessing),
    ('asy', ADASYN(random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

## Random Undersampling

### Logistic Regression

In [None]:
# Random undersampling between preprocessing and logistic regression.
pipeline_rus_lr = Pipeline(steps=[
    ('pp', preprocessing),
    ('rus', RandomUnderSampler(random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
])

### $k$-nearest neighbors

In [None]:
# Random undersampling between preprocessing and k-nearest neighbours.
pipeline_rus_knn = Pipeline(steps=[
    ('pp', preprocessing),
    ('rus', RandomUnderSampler(random_state=42)),
    ('knn', KNeighborsClassifier()),
])

### Decision Tree

In [None]:
# Random undersampling between preprocessing and a decision tree.
pipeline_rus_dt = Pipeline(steps=[
    ('pp', preprocessing),
    ('rus', RandomUnderSampler(random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

## NearMiss


**Near Miss** is a data undersampling technique used to address class imbalance in machine learning. It reduces the size of the majority class by keeping only a subset of its instances — those that are "near" the minority class examples according to a nearest-neighbour distance heuristic — and discarding the rest. The three versions (NearMiss-1, NearMiss-2 and NearMiss-3) differ in how that closeness is measured. This method helps balance the class distribution, making the dataset more suitable for training machine learning models.

### NearMiss-1

#### Logistic Regression

In [None]:
# NearMiss (version 1) undersampling between preprocessing and logistic regression.
pipeline_nmi_1_lr = Pipeline(steps=[
    ('pp', preprocessing),
    ('nmi_1', NearMiss(version=1)),
    ('lr', LogisticRegression(random_state=42)),
])

#### $k$-nearest neighbors

In [None]:
# NearMiss (version 1) undersampling between preprocessing and k-nearest neighbours.
pipeline_nmi_1_knn = Pipeline(steps=[
    ('pp', preprocessing),
    ('nmi_1', NearMiss(version=1)),
    ('knn', KNeighborsClassifier()),
])

#### Decision Tree

In [None]:
# NearMiss (version 1) undersampling between preprocessing and a decision tree.
pipeline_nmi_1_dt = Pipeline(steps=[
    ('pp', preprocessing),
    ('nmi_1', NearMiss(version=1)),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

### NearMiss-2

#### Logistic Regression

In [None]:
# NearMiss (version 2) undersampling between preprocessing and logistic regression.
pipeline_nmi_2_lr = Pipeline(steps=[
    ('pp', preprocessing),
    ('nmi_2', NearMiss(version=2)),
    ('lr', LogisticRegression(random_state=42)),
])

#### $k$-nearest neighbors

In [None]:
# NearMiss (version 2) undersampling between preprocessing and k-nearest neighbours.
pipeline_nmi_2_knn = Pipeline(steps=[
    ('pp', preprocessing),
    ('nmi_2', NearMiss(version=2)),
    ('knn', KNeighborsClassifier()),
])

#### Decision Tree

In [None]:
# NearMiss (version 2) undersampling between preprocessing and a decision tree.
pipeline_nmi_2_dt = Pipeline(steps=[
    ('pp', preprocessing),
    ('nmi_2', NearMiss(version=2)),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

### NearMiss-3

#### Logistic Regression

In [None]:
# NearMiss (version 3) undersampling between preprocessing and logistic regression.
pipeline_nmi_3_lr = Pipeline(steps=[
    ('pp', preprocessing),
    ('nmi_3', NearMiss(version=3)),
    ('lr', LogisticRegression(random_state=42)),
])

#### $k$-nearest neighbors

In [None]:
# NearMiss (version 3) undersampling between preprocessing and k-nearest neighbours.
pipeline_nmi_3_knn = Pipeline(steps=[
    ('pp', preprocessing),
    ('nmi_3', NearMiss(version=3)),
    ('knn', KNeighborsClassifier()),
])

#### Decision Tree

In [None]:
# NearMiss (version 3) undersampling between preprocessing and a decision tree.
pipeline_nmi_3_dt = Pipeline(steps=[
    ('pp', preprocessing),
    ('nmi_3', NearMiss(version=3)),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

## CNN - Condensed Nearest Neighbours

### Logistic Regression

In [None]:
# Condensed Nearest Neighbour undersampling between preprocessing and logistic regression.
pipeline_cnn_lr = Pipeline(steps=[
    ('pp', preprocessing),
    ('cnn', CondensedNearestNeighbour(random_state=42, n_jobs=-1)),
    ('lr', LogisticRegression(random_state=42)),
])

### $k$-nearest neighbors

In [None]:
# Condensed Nearest Neighbour undersampling between preprocessing and k-nearest neighbours.
pipeline_cnn_knn = Pipeline(steps=[
    ('pp', preprocessing),
    ('cnn', CondensedNearestNeighbour(random_state=42, n_jobs=-1)),
    ('knn', KNeighborsClassifier()),
])

### Decision Tree

In [None]:
# Condensed Nearest Neighbour undersampling between preprocessing and a decision tree.
pipeline_cnn_dt = Pipeline(steps=[
    ('pp', preprocessing),
    ('cnn', CondensedNearestNeighbour(random_state=42, n_jobs=-1)),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

# Assessing Models

In [None]:
# Cross-validation scheme reused by every evaluation cell below:
# 10 shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

## Imbalanced data

In [None]:
# Baseline logistic regression: per-fold F1 on the training split, then the mean.
res = cross_val_score(
    estimator=pipeline_lr,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.8093817602174468

In [None]:
# Baseline k-nearest neighbours: per-fold F1 on the training split, then the mean.
res = cross_val_score(
    estimator=pipeline_knn,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7250687770995903

In [None]:
# Baseline decision tree: per-fold F1 on the training split, then the mean.
res = cross_val_score(
    estimator=pipeline_dt,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7697192549892733

## Random Oversampling

In [None]:
# Random oversampling + logistic regression: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_ros_lr,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7953494626770745

In [None]:
# Random oversampling + k-nearest neighbours: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_ros_knn,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7243561438185071

In [None]:
# Random oversampling + decision tree: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_ros_dt,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.762497049863646

## SMOTE

In [None]:
# SMOTE + logistic regression: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_smt_lr,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7960658655650577

In [None]:
# SMOTE + k-nearest neighbours: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_smt_knn,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.729924881268291

In [None]:
# SMOTE + decision tree: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_smt_dt,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7591965332201513

## ADASYN

In [None]:
# ADASYN + logistic regression: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_asy_lr,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7886695606299362

In [None]:
# ADASYN + k-nearest neighbours: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_asy_knn,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7185772080158916

In [None]:
# ADASYN + decision tree: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_asy_dt,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7606064757933844

## Random Undersampling

In [None]:
# Random undersampling + logistic regression: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_rus_lr,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7946931528697699

In [None]:
# Random undersampling + k-nearest neighbours: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_rus_knn,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7515877819011731

In [None]:
# Random undersampling + decision tree: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_rus_dt,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7629057650658486

## NearMiss

### NearMiss-1

In [None]:
# NearMiss-1 + logistic regression: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_nmi_1_lr,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.8115848250943181

In [None]:
# NearMiss-1 + k-nearest neighbours: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_nmi_1_knn,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7259710343593008

In [None]:
# NearMiss-1 + decision tree: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_nmi_1_dt,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.6984638745509862

### NearMiss-2

In [None]:
# NearMiss-2 + logistic regression: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_nmi_2_lr,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.8081179918725347

In [None]:
# NearMiss-2 + k-nearest neighbours: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_nmi_2_knn,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7337825854408814

In [None]:
# NearMiss-2 + decision tree: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_nmi_2_dt,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.6807891049172505

### NearMiss-3

In [None]:
from warnings import simplefilter

# Silence UserWarning for the rest of the session (the filter is global,
# so it also applies to every cell executed after this one).
simplefilter('ignore', category=UserWarning)

# NearMiss-3 + logistic regression: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_nmi_3_lr,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.807326452489813

In [None]:
# NearMiss-3 + k-nearest neighbours: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_nmi_3_knn,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.7217220767607773

In [None]:
# NearMiss-3 + decision tree: per-fold F1, then the mean.
res = cross_val_score(
    estimator=pipeline_nmi_3_dt,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
)
res.mean()

0.6935638552977393

## CNN

In [None]:
# Condensed Nearest Neighbour + logistic regression: per-fold F1, then the mean.
# Folds are evaluated in parallel (n_jobs=-1).
res = cross_val_score(
    estimator=pipeline_cnn_lr,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
res.mean()

0.7903485797981264

In [None]:
# Condensed Nearest Neighbour + k-nearest neighbours: per-fold F1, then the mean.
# Folds are evaluated in parallel (n_jobs=-1).
res = cross_val_score(
    estimator=pipeline_cnn_knn,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
res.mean()

0.7278781791313544

In [None]:
# Condensed Nearest Neighbour + decision tree: per-fold F1, then the mean.
# Folds are evaluated in parallel (n_jobs=-1).
res = cross_val_score(
    estimator=pipeline_cnn_dt,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
res.mean()

0.7300744757707818