# Handling Imbalanced Datasets
This notebook demonstrates a baseline model and four approaches to handling imbalanced data for spam vs. ham classification.<br>
The notebook provides a step-by-step process, from dataset loading, exploration, and preprocessing to training models with several imbalanced-data handling techniques, with a performance evaluation after each. It uses the classic SMS Spam Collection dataset, a popular benchmark for spam detection tasks with class imbalance. The notebook covers dataset loading, exploration, preprocessing, train-test splitting, baseline model training, and handling imbalance with undersampling, oversampling, SMOTE, and class weighting, with evaluation after each step:

In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Load the SMS Spam Collection dataset.
# Source: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
# The file is expected locally as 'spam.csv'; only columns v1 and v2 are kept
# and renamed to 'label' and 'message'.
df = pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']].rename(
    columns={'v1': 'label', 'v2': 'message'})

# Peek at the first rows
print("Dataset sample:")
display(df.head())

# How many messages fall into each class?
label_counts = df['label'].value_counts()
print("\nClass distribution:")
print(label_counts)

# Bar chart of the same class counts
count_ax = sns.countplot(data=df, x='label')
count_ax.set_title("Spam vs Ham Class Distribution")
plt.show()

# Numeric target: ham -> 0, spam -> 1
label_to_int = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_to_int)

# Minimal text preprocessing: lowercase every message (can be extended)
df['message'] = df['message'].str.lower()

# Stratified 70/30 train/test split so both sets keep the class ratio
messages = df['message']
targets = df['label_num']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    targets,
    test_size=0.3,
    random_state=42,
    stratify=targets,
)

print(f"\nTrain set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# TF-IDF features: the vocabulary is learned from the training messages only,
# then the same fitted vectorizer is applied to the test messages.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Function to evaluate model
def evaluate_model(model, X_test, y_test, title='Confusion Matrix'):
    """Print a classification report and plot the confusion matrix for a model.

    Args:
        model: Fitted classifier exposing ``predict`` (and, normally,
            ``classes_``).
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels for the rows of ``X_test``.
        title: Title drawn above the confusion-matrix plot.

    Returns:
        None. The report is printed and the plot is shown.
    """
    y_pred = model.predict(X_test)

    # The display names are fixed to two classes ('Ham', 'Spam'), so pin the
    # label set to the classes the model was trained on. Without this, the
    # labels are inferred from y_test/y_pred, and a split or prediction set
    # that happens to contain a single class raises a ValueError because the
    # inferred label count no longer matches the two names.
    # Falls back to inference (labels=None) if the model has no `classes_`.
    labels = getattr(model, 'classes_', None)
    class_names = ['Ham', 'Spam']

    print(classification_report(y_test, y_pred, labels=labels, target_names=class_names))
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(title)
    plt.show()

# Baseline: logistic regression fitted on the original, unresampled training data
print("Baseline Model (Imbalanced Data):")
baseline_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)
evaluate_model(baseline_model, X_test_tfidf, y_test, title='Baseline Confusion Matrix')

# Class counts of the training labels, as a reference point for the
# resampled distributions
print("\nTraining label distribution before resampling:", Counter(y_train), sep="\n")



In [None]:
# Handling imbalance techniques
#
# Each technique changes only the training data (or the class weights used
# during fitting); every model is evaluated on the same TF-IDF test split.


def _fit_and_report(X_fit, y_fit, cm_title, **lr_options):
    """Fit a LogisticRegression on (X_fit, y_fit), evaluate it on the test split, return it."""
    clf = LogisticRegression(max_iter=1000, random_state=42, **lr_options)
    clf.fit(X_fit, y_fit)
    evaluate_model(clf, X_test_tfidf, y_test, cm_title)
    return clf


# 1. Random undersampling applied to the training set
print("\n1. Random Undersampling:")
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train_tfidf, y_train)
print("Resampled label distribution:", Counter(y_rus))
rus_model = _fit_and_report(X_rus, y_rus, 'Random Undersampling Confusion Matrix')

# 2. Random oversampling applied to the training set
print("\n2. Random Oversampling:")
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train_tfidf, y_train)
print("Resampled label distribution:", Counter(y_ros))
ros_model = _fit_and_report(X_ros, y_ros, 'Random Oversampling Confusion Matrix')

# 3. SMOTE (Synthetic Minority Over-sampling Technique) applied to the training set
print("\n3. SMOTE Oversampling:")
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_tfidf, y_train)
print("Resampled label distribution:", Counter(y_smote))
smote_model = _fit_and_report(X_smote, y_smote, 'SMOTE Confusion Matrix')

# 4. No resampling: the original training data with class_weight='balanced'
print("\n4. Class Weighting:")
class_weight_model = _fit_and_report(
    X_train_tfidf,
    y_train,
    'Class Weighting Confusion Matrix',
    class_weight='balanced',
)

# Summary of what was demonstrated
summary_lines = (
    "\nSummary:",
    "This notebook demonstrated baseline model and four approaches to handle imbalanced data for Spam vs Ham classification:",
    "- Random Undersampling",
    "- Random Oversampling",
    "- SMOTE Oversampling",
    "- Class Weighting in model training",
    "Choose the technique depending on specific dataset needs and constraints.",
)
print(*summary_lines, sep="\n")