# Evaluating Algorithms for Imbalanced Data

* Student Name: Michael Rideout
* Student Number: 225065259
* E-mail: s225065259@deakin.edu.au
* Student Course Code: SIT731
---

# Introduction

In [19]:
!pip install sdv
from sdv.datasets.demo import download_demo, get_available_demos
from sdv.sampling import Condition
from sdv.single_table import GaussianCopulaSynthesizer
import pandas as pd
from pandas.core.frame import DataFrame
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

!pip install imblearn
from imblearn.ensemble import BalancedRandomForestClassifier




## TODO
- Generate synthetic datasets
- Test algorithms for imbalanced data
- Evaluate performance
- Report Findings

# Generate Synthetic datasets
- Create datasets with varying class imbalance ratios (5, 10, 20, 50, 100) based on the size of the majority class relative to the minority class

In [20]:
class SyntheticDatasetGenerator():

    def __init__(self, sample_dataset_name, target_column_name, majority_class_value, minority_class_value) -> None:
        self.majority_class_value = majority_class_value
        self.minority_class_value = minority_class_value
        self.target_column_name = target_column_name
        sample_data, sample_metadata = download_demo(modality='single_table', dataset_name=sample_dataset_name)
        self.synthesiser = GaussianCopulaSynthesizer(sample_metadata)
        self.synthesiser.fit(data=sample_data)
        

    def generate_dataset(self, row_count, imbalance_ratio):
        minority_class_row_count = int(row_count / (imbalance_ratio + 1))
        majority_class_row_count = imbalance_ratio * minority_class_row_count
        minority_condition  = Condition(num_rows = minority_class_row_count, column_values={self.target_column_name: self.minority_class_value})
        majority_condition  = Condition(num_rows = majority_class_row_count, column_values={self.target_column_name: self.majority_class_value})
        return self.synthesiser.sample_from_conditions(conditions=[minority_condition, majority_condition])

## Classifiers

In [21]:
#define constants
TARGET_COLUMN_NAME = "label"


class BaseClassifier:

    def __init__(self, algorithm_name, target_column_name) -> None:
        self.algorithm_name = algorithm_name
        self.target_column_name = target_column_name

    
    def fit(self, train_data: pd.DataFrame):
        pass

    def predict(self, test_data: pd.DataFrame):
        pass


    def calculate_kappa(self, y_true, y_pred):
        """
        Calculate the Kappa metric for evaluating classifier performance in imbalanced settings.
        
        The metric computes the inter-rater agreement between predictions and true labels,
        correcting for agreements that occur by chance. Values range from -100 (total disagreement)
        through 0 (random classification) to 100 (perfect agreement).
        
        Parameters:
        -----------
        y_true : array-like
            Ground truth (correct) labels
        y_pred : array-like
            Predicted labels, as returned by a classifier
            
        Returns:
        --------
        float
            Kappa score multiplied by 100 (ranging from -100 to 100)
            
        Notes:
        ------
        The implementation follows the formula:
        Kappa = (n∑xii - ∑xi.x.i)/(n² - ∑xi.x.i) * 100
        where:
        - xii is the count of cases in the main diagonal of confusion matrix
        - n is the number of examples
        - xi. and x.i are the row and column total counts respectively
        """
        # Ensure inputs are numpy arrays
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        
        # Compute confusion matrix
        conf_matrix = confusion_matrix(y_true, y_pred)
        
        # Get total number of examples
        n = len(y_true)
        
        # Calculate sum of main diagonal (∑xii)
        sum_diag = np.sum(np.diag(conf_matrix))
        
        # Calculate row and column sums
        row_sums = np.sum(conf_matrix, axis=1)  # xi.
        col_sums = np.sum(conf_matrix, axis=0)  # x.i
        
        # Calculate ∑xi.x.i
        sum_product = np.sum(row_sums * col_sums)
        
        # Calculate Kappa according to the formula
        numerator = n * sum_diag - sum_product
        denominator = n * n - sum_product
        
        # Handle division by zero case
        if denominator == 0:
            return 0.0
            
        kappa = (numerator / denominator) * 100
        
        return kappa
    
    def evaluate(self, dataset: pd.DataFrame):
        # split dataset
        
        self.fit(train_data)
        predictions = self.predict(test_data)
        kappa = self.calculate_kappa(test_data[self.target_column_name], predictions)
        print(f"Kappa is {kappa}")



In [22]:
class BalancedRandomForestClassifierWrapper(BaseClassifier):

    def __init__(self, algorithm_name, target_column_name) -> None:
        super().__init__(algorithm_name, target_column_name)
        self.clf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)

    ...
    def fit(self, train_data: pd.DataFrame):
        train_df = train_data
        # We need to encode categorical features
        self.encoders = {}
        for col in train_df.select_dtypes(include=['object']).columns:
            le = LabelEncoder()
            train_df[col] = le.fit_transform(train_df[col])
            train_df[col] = le.transform(train_df[col])
            self.encoders[col] = le
        x_train = train_df.drop(columns=[self.target_column_name])
        y_train = train_df[self.target_column_name]
        self.clf.fit(x_train, y_train)

    def predict(self, test_data: pd.DataFrame):
        test_df = test_data
        for col, encoder in self.encoders.items():
            print(f"encoding: {col}")
            test_df[col] = encoder.transform(test_df[col])
        return self.clf.predict(test_df.drop(columns=[self.target_column_name]))

## Evaluation

In [23]:
# Generate all train and test datasets ratios 5, 10, 20, 50, 100

synth_generator = SyntheticDatasetGenerator("adult", "label", "<=50K", ">50K")

TOTAL_ROWS = 10000


datasets = {}
for ratio in [5, 10, 20, 50, 100]:
    datasets[ratio] = synth_generator.generate_dataset(TOTAL_ROWS, ratio)


    


Sampling conditions: 100%|██████████| 9996/9996 [00:02<00:00, 3650.70it/s]
Sampling conditions: 100%|██████████| 1998/1998 [00:00<00:00, 2480.60it/s]
Sampling conditions: 100%|██████████| 9999/9999 [00:02<00:00, 3725.14it/s]
Sampling conditions: 100%|██████████| 1991/1991 [00:00<00:00, 2553.72it/s]
Sampling conditions: 100%|██████████| 9996/9996 [00:02<00:00, 3728.48it/s]
Sampling conditions: 100%|██████████| 1995/1995 [00:00<00:00, 2530.82it/s]
Sampling conditions: 100%|██████████| 9996/9996 [00:02<00:00, 3758.14it/s]
Sampling conditions: 100%|██████████| 1989/1989 [00:00<00:00, 2613.90it/s]
Sampling conditions: 100%|██████████| 9999/9999 [00:02<00:00, 3750.24it/s]
Sampling conditions: 100%|██████████| 1919/1919 [00:00<00:00, 2476.27it/s]


In [24]:
# Run Classifiers




classifiers = [
    BalancedRandomForestClassifierWrapper("BRFC", TARGET_COLUMN_NAME)
]

for classifier in classifiers:
    for ratio, dataset in datasets.items():
        print (f"Running ratio {ratio}")
        classifier.evaluate(dataset)

Running ratio 5


ValueError: y contains previously unseen labels: 4