In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [13]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4


In [14]:
fraud = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/card_transdata.csv")
fraud.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


1. Distribution of our target variable

In [15]:
fraud_distribution = fraud['fraud'].value_counts(normalize=True)
print(fraud_distribution)

fraud
0.0    0.912597
1.0    0.087403
Name: proportion, dtype: float64


2. LogisticRegression

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X = fraud.drop('fraud', axis=1)
y = fraud['fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

3. Evaluation of the model

In [17]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC-AUC Score: {roc_auc}")

              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98    273779
         1.0       0.90      0.60      0.72     26221

    accuracy                           0.96    300000
   macro avg       0.93      0.80      0.85    300000
weighted avg       0.96      0.96      0.96    300000

ROC-AUC Score: 0.7982889861008081


4. Oversampling

In [18]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train_scaled, y_train)

model.fit(X_train_ros, y_train_ros)
y_pred_ros = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred_ros))
roc_auc_ros = roc_auc_score(y_test, y_pred_ros)
print(f"ROC-AUC Score (Oversampled): {roc_auc_ros}")

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    273779
         1.0       0.58      0.95      0.72     26221

    accuracy                           0.93    300000
   macro avg       0.79      0.94      0.84    300000
weighted avg       0.96      0.93      0.94    300000

ROC-AUC Score (Oversampled): 0.9407530072287246


5. Undersampling

In [19]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train_scaled, y_train)

model.fit(X_train_rus, y_train_rus)
y_pred_rus = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred_rus))
roc_auc_rus = roc_auc_score(y_test, y_pred_rus)
print(f"ROC-AUC Score (Undersampled): {roc_auc_rus}")

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    273779
         1.0       0.58      0.95      0.72     26221

    accuracy                           0.93    300000
   macro avg       0.79      0.94      0.84    300000
weighted avg       0.96      0.93      0.94    300000

ROC-AUC Score (Undersampled): 0.9413561147301891


6. SMOTE

In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

model.fit(X_train_smote, y_train_smote)
y_pred_smote = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred_smote))
roc_auc_smote = roc_auc_score(y_test, y_pred_smote)
print(f"ROC-AUC Score (SMOTE): {roc_auc_smote}")

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    273779
         1.0       0.58      0.95      0.72     26221

    accuracy                           0.93    300000
   macro avg       0.79      0.94      0.84    300000
weighted avg       0.96      0.93      0.94    300000

ROC-AUC Score (SMOTE): 0.9408024786613405
