In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve

In [2]:
# Synthetic binary problem generated with a 90/10 class weighting (class 0 is the majority).
x, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.9, 0.1],
    random_state=42,
)

# Tally the labels once and reuse the counts for both printouts.
class_counts = pd.Series(y).value_counts()

print("Class distribution:")
print(class_counts)

# Accuracy of a trivial classifier that always predicts the most frequent class.
majority_share = class_counts.max() / len(y)
print(f"\nBaseline (always predict majority): {majority_share:.3f}")

Class distribution:
0    897
1    103
Name: count, dtype: int64

Baseline (always predict majority): 0.897


In [3]:
# Hold out 20% of the rows for evaluation. The split is not stratified, so the
# minority-class share of the test set can differ from that of the full dataset.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Unweighted logistic regression trained on the imbalanced training split.
model = LogisticRegression(random_state=42).fit(x_train, y_train)
y_pred = model.predict(x_test)

print(
    "\nClassification Report (imbalanced):",
    classification_report(y_test, y_pred),
    sep="\n",
)


Classification Report (imbalanced):
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       180
           1       0.46      0.30      0.36        20

    accuracy                           0.90       200
   macro avg       0.69      0.63      0.65       200
weighted avg       0.88      0.90      0.88       200



**Solution 1: Class Weights**
- Option 1: Balanced weights

In [None]:
# 'balanced' lets scikit-learn derive per-class weights from the label
# frequencies in y_train instead of treating every sample equally.
model_balanced = LogisticRegression(class_weight='balanced', random_state=42).fit(
    x_train, y_train
)
y_pred_balanced = model_balanced.predict(x_test)

report_balanced = classification_report(y_test, y_pred_balanced)
print("\nWith balanced class weights:", report_balanced, sep="\n")


With balanced class weights:
              precision    recall  f1-score   support

           0       0.97      0.88      0.92       180
           1       0.41      0.75      0.53        20

    accuracy                           0.86       200
   macro avg       0.69      0.81      0.72       200
weighted avg       0.91      0.86      0.88       200



- Option 2: Custom weights

In [None]:
# Hand-picked weighting: samples of class 1 count ten times as much as
# samples of class 0 when the model is fitted.
minority_weighting = {0: 1, 1: 10}
model_custom = LogisticRegression(class_weight=minority_weighting, random_state=42)
model_custom.fit(x_train, y_train)

y_pred_custom = model_custom.predict(x_test)
report_custom = classification_report(y_test, y_pred_custom)

print(f"\nWith Custom class weights:\n{report_custom}")


With Custom class weights:
              precision    recall  f1-score   support

           0       0.97      0.87      0.91       180
           1       0.38      0.75      0.51        20

    accuracy                           0.85       200
   macro avg       0.68      0.81      0.71       200
weighted avg       0.91      0.85      0.87       200



**Solution 2: Resampling**
- Oversampling (SMOTE)

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

# Oversample the minority class on the training split only; the test split is
# left untouched. The seed is fixed so the synthetic samples (and every metric
# derived from them) are identical after a kernel restart, matching the
# random_state=42 used by the other stochastic steps in this notebook.
smote = SMOTE(random_state=42)
x_resample, y_resample = smote.fit_resample(x_train, y_train)

# Both classes should now have the same number of training rows.
print('After Resample :')
print(pd.Series(y_resample).value_counts())

After Resample :
0    717
1    717
Name: count, dtype: int64


In [8]:
# Plain logistic regression, trained on the SMOTE-resampled training data.
model_smote = LogisticRegression(random_state=42).fit(x_resample, y_resample)

# Predictions are made on the original (non-resampled) test split.
y_pred_smote = model_smote.predict(x_test)

print(
    "\nWith SMOTE:",
    classification_report(y_test, y_pred_smote),
    sep="\n",
)


With SMOTE:
              precision    recall  f1-score   support

           0       0.97      0.88      0.92       180
           1       0.42      0.75      0.54        20

    accuracy                           0.87       200
   macro avg       0.69      0.82      0.73       200
weighted avg       0.91      0.87      0.89       200



- Undersampling

In [9]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

# Randomly under-sample the training split with a fixed seed, then show the
# resulting per-class row counts.
undersampler = RandomUnderSampler(random_state=42)
x_under, y_under = undersampler.fit_resample(x_train, y_train)

under_counts = pd.Series(y_under).value_counts()
print("After undersampling:", under_counts, sep="\n")

After undersampling:
0    83
1    83
Name: count, dtype: int64


- Combined (SMOTE + Tomek Links)

In [10]:
from imblearn.combine import SMOTETomek

# Combined resampler (SMOTE + Tomek links) applied to the training split only,
# seeded for reproducibility.
smote_tomek = SMOTETomek(random_state=42)
x_combined, y_combined = smote_tomek.fit_resample(x_train, y_train)

combined_counts = pd.Series(y_combined).value_counts()
print("After SMOTE + Tomek Links:", combined_counts, sep="\n")

After SMOTE + Tomek Links:
0    716
1    716
Name: count, dtype: int64


**Solution 3: Threshold Tuning**

In [15]:
# Positive-class (label 1) probabilities from the unweighted baseline model.
y_pred_proba = model.predict_proba(x_test)[:, 1]

# Precision/recall at every candidate decision threshold.
# NOTE: the threshold is selected using y_test itself, so any metric computed on
# the same test split with this threshold is optimistic; pick it on a separate
# validation split for an unbiased estimate.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# precision/recall contain one more entry than thresholds: the final point
# (precision=1, recall=0) has no threshold attached. Drop it so that every F1
# value lines up with a threshold and the argmax is always a valid index.
precision_at_thr = precision[:-1]
recall_at_thr = recall[:-1]

# F1 per threshold. Where precision + recall == 0 the score is set to 0 rather
# than NaN, because a NaN would otherwise be picked up by np.argmax.
# The array is named f1_scores (plural) so it does not shadow
# sklearn.metrics.f1_score, which the comparison cell imports and calls.
pr_sum = precision_at_thr + recall_at_thr
f1_scores = np.divide(
    2 * precision_at_thr * recall_at_thr,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)

optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold: {optimal_threshold:.3f}")

Optimal threshold: 0.127


In [17]:
# Label a sample as class 1 when its predicted probability reaches the tuned
# threshold (inclusive), otherwise class 0.
y_pred_optimal = np.greater_equal(y_pred_proba, optimal_threshold).astype(int)

print(
    "\nWith optimal threshold:",
    classification_report(y_test, y_pred_optimal),
    sep="\n",
)


With optimal threshold:
              precision    recall  f1-score   support

           0       0.97      0.90      0.93       180
           1       0.45      0.75      0.57        20

    accuracy                           0.89       200
   macro avg       0.71      0.82      0.75       200
weighted avg       0.92      0.89      0.90       200



**Comparison of Methods**

In [21]:
from sklearn.metrics import f1_score

# Fitted estimators to compare, keyed by the label shown in the printout.
methods = {'Baseline': model, 'Class Weights': model_balanced, 'SMOTE': model_smote}

# label -> F1 score of that estimator's predictions on the test split.
results = {}

for name, model_method in methods.items():
    results[name] = f1_score(y_test, model_method.predict(x_test))
    print(f"{name:15s}: F1-Score = {results[name]:.3f}")

Baseline       : F1-Score = 0.364
Class Weights  : F1-Score = 0.526
SMOTE          : F1-Score = 0.536
