# Evaluating Algorithms for Imbalanced Data

* Student Name: Michael Rideout
* Student Number: 225065259
* E-mail: s225065259@deakin.edu.au
* Student Course Code: SIT731
---

# Introduction

In [19]:
import pandas as pd
from pandas.core.frame import DataFrame
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils import compute_sample_weight
from sklearn.datasets import make_classification


!pip install imblearn
from imblearn.ensemble import BalancedRandomForestClassifier

!pip install river
from river import datasets
from river import evaluate
from river import metrics
from river import preprocessing
from river import compose
from river import tree, ensemble, forest




## TODO
- Generate synthetic datasets
- Test algorithms for imbalanced data
- Evaluate performance
- Report Findings

# Generate Synthetic datasets
- Create datasets with varying class imbalance ratios (5, 10, 20, 50, 100) based on the size of the majority class relative to the minority class

In [20]:
class SyntheticDatasetGenerator():

    def generate_dataset(self, row_count, imbalance_ratio):
        majority_class_weight = imbalance_ratio / (imbalance_ratio + 1)
        minority_class_weight = 1 / (imbalance_ratio + 1)
        X, y = make_classification(n_samples=row_count, n_features=20, random_state=42, weights=[minority_class_weight, majority_class_weight])

        # Create a DataFrame
        df = pd.DataFrame(X, columns=[f'feature_{i+1}' for i in range(X.shape[1])]) 
        df['label'] = y
        return df
        

In [21]:
syn = SyntheticDatasetGenerator()
df = syn.generate_dataset(10000, 100)
df['label'].value_counts()

label
1    9848
0     152
Name: count, dtype: int64

## Classifiers

In [22]:
#define constants


class BaseClassifier:

    def __init__(self, algorithm_name, target_column_name) -> None:
        self.algorithm_name = algorithm_name
        self.target_column_name = target_column_name

    
    def fit(self, full_dataset: pd.DataFrame, train_data: pd.DataFrame):
        pass

    def predict(self, test_data: pd.DataFrame):
        pass

    def calculate_g_mean(self, y_true, y_pred):
        """
        Calculate the geometric mean for evaluating classifier performance.
        
        Parameters:
        -----------
        y_true : array-like
            Ground truth (correct) labels
        y_pred : array-like
            Predicted labels, as returned by a classifier
            
        Returns:
        --------
        float
            G-mean score ranging from 0 to 1
        """
        # Ensure inputs are numpy arrays
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        
        # Calculate the number of positive and negative samples
        num_positives = np.sum(y_true)
        num_negatives = len(y_true) - num_positives
        
        # Calculate the number of true positives, true negatives, false positives, and false negatives
        true_positives = np.sum(np.logical_and(y_true == 1, y_pred == 1))
        true_negatives = np.sum(np.logical_and(y_true == 0, y_pred == 0))
        false_positives = np.sum(np.logical_and(y_true == 0, y_pred == 1))
        false_negatives = np.sum(np.logical_and(y_true == 1, y_pred == 0))
        
        # Calculate the sensitivity (true positive rate) and specificity (true negative rate)
        sensitivity = true_positives / num_positives
        specificity = true_negatives / num_negatives
        
        # Calculate the g-mean using the sensitivity and specificity
        g_mean = np.sqrt(sensitivity * specificity)
        
        return g_mean

    def calculate_auc(self, y_true, y_pred):
        """
        Calculate the AUC (Area Under the Curve) for evaluating classifier performance.
        
        Parameters:
        -----------
        y_true : array-like
            Ground truth (correct) labels
        y_pred : array-like
            Predicted probabilities or scores, as returned by a classifier
                
        Returns:
        --------
        float
            AUC score ranging from 0 to 1
        """
        return roc_auc_score(y_true, y_pred)


    def calculate_kappa(self, y_true, y_pred):
        """
        Calculate the Cohen's kappa coefficient for evaluating classifier performance.
        
        Parameters:
        -----------
        y_true : array-like
            Ground truth (correct) labels
        y_pred : array-like
            Predicted labels, as returned by a classifier
            
        Returns:
        --------
        float
            Kappa coefficient ranging from -1 to 1
        """
        # Ensure inputs are numpy arrays
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        
        # Calculate the confusion matrix
        confusion = confusion_matrix(y_true, y_pred)
        
        # Calculate the observed agreement
        observed_agreement = np.trace(confusion) / np.sum(confusion)
        
        # Calculate the expected agreement
        expected_agreement = np.sum(np.sum(confusion, axis=0) * np.sum(confusion, axis=1)) / np.sum(confusion)**2
        
        # Calculate the kappa coefficient
        kappa = (observed_agreement - expected_agreement) / (1 - expected_agreement)
        
        return kappa * 100

    
    def evaluate(self, dataset: pd.DataFrame):
        # split the dataset
        train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)
        self.fit(dataset, train_data)
        y_true = test_data[self.target_column_name]
        y_pred = self.predict(test_data)
        kappa = self.calculate_kappa(y_true, y_pred)
        auc = self.calculate_auc(y_true, y_pred)
        gmean = self.calculate_g_mean(y_true, y_pred)
        print(f"Model: {self.algorithm_name} \t AUC: {auc:.2f} Kappa:{kappa:.2f} g-mean: {gmean:.2f}")



In [23]:
class BalancedRandomForestClassifierWrapper(BaseClassifier):

    def __init__(self, algorithm_name, target_column_name) -> None:
        super().__init__(algorithm_name, target_column_name)
        self.clf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
    
    def fit(self, full_dataset: pd.DataFrame, train_data: pd.DataFrame):
        train_df = train_data
        x_train = train_df.drop(columns=[self.target_column_name])
        y_train = train_df[self.target_column_name]
        self.clf.fit(x_train, y_train)

    def predict(self, test_data: pd.DataFrame):
        return self.clf.predict(test_data.drop(columns=[self.target_column_name]))

In [27]:
class ARFWrapper(BaseClassifier):
    def __init__(self, algorithm_name, target_column_name) -> None:
        super().__init__(algorithm_name, target_column_name)
        self.clf = forest.ARFClassifier(n_models=10)

    def fit(self, full_dataset: pd.DataFrame, train_data: pd.DataFrame):
        train_df = train_data
        x_train = train_df.drop(columns=[self.target_column_name])
        y_train = train_df[self.target_column_name]
        self.clf = self.clf.learn_one(x_train, y_train)

    def predict(self, test_data: pd.DataFrame):
        test_df = test_data.drop(columns=[self.target_column_name])
        return test_df.apply(lambda x: self.clf.predict_one(x))

## Evaluation

In [25]:
# Generate all train and test datasets ratios 5, 10, 20, 50, 100

TARGET_COLUMN_NAME = "label"

synth_generator = SyntheticDatasetGenerator()

TOTAL_ROWS = 10000

datasets = {}
for ratio in [5, 10, 20, 50, 100]:
    datasets[ratio] = synth_generator.generate_dataset(TOTAL_ROWS, ratio)


    


In [28]:
# Run Classifiers




classifiers = [
    BalancedRandomForestClassifierWrapper("BRFC", TARGET_COLUMN_NAME),
    ARFWrapper("ARF", TARGET_COLUMN_NAME)

]

for classifier in classifiers:
    for ratio, dataset in datasets.items():
        print (f"Running ratio {ratio}")
        classifier.evaluate(dataset)

Running ratio 5
Model: BRFC 	 AUC: 0.92 Kappa:80.66 g-mean: 0.92
Running ratio 10
Model: BRFC 	 AUC: 0.91 Kappa:71.77 g-mean: 0.91
Running ratio 20
Model: BRFC 	 AUC: 0.90 Kappa:57.12 g-mean: 0.90
Running ratio 50
Model: BRFC 	 AUC: 0.86 Kappa:31.05 g-mean: 0.86
Running ratio 100
Model: BRFC 	 AUC: 0.77 Kappa:13.61 g-mean: 0.76
Running ratio 5


TypeError: unhashable type: 'Series'

In [None]:
import pandas as pd
from sklearn.datasets import make_classification

# Generate the dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Create a DataFrame
df = pd.DataFrame(X, columns=[f'feature_{i+1}' for i in range(X.shape[1])])
df['label'] = y

print(df.head())

clf = BalancedRandomForestClassifierWrapper("BRFC", 'label')
clf.evaluate(df)


   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0  -0.669356  -1.495778  -0.870766   1.141831   0.021606   1.730630   
1   0.093372   0.785848   0.105754   1.272354  -0.846316  -0.979093   
2  -0.905797  -0.608341   0.295141   0.943716   0.092936   1.370397   
3  -0.585793   0.389279   0.698816   0.436236  -0.315082   0.459505   
4   1.146441   0.515579  -1.222895  -0.396230  -1.293508  -0.352428   

   feature_7  feature_8  feature_9  feature_10  ...  feature_12  feature_13  \
0  -1.251698   0.289305   0.357163   -0.196811  ...    0.154850   -0.219970   
1   1.263707   0.264020   2.411677   -0.960046  ...    0.199810    0.288724   
2  -0.064772   0.287273  -0.533004   -0.032504  ...   -0.510064   -0.868768   
3   1.448820   0.505558  -1.440982   -1.134020  ...    1.466783    0.678728   
4   0.071254   1.239584   1.007133   -1.479444  ...   -0.918127    0.604121   

   feature_14  feature_15  feature_16  feature_17  feature_18  feature_19  \
0   -0.739137    1.80

In [None]:
df.info()
df['label'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   feature_1   1000 non-null   float64
 1   feature_2   1000 non-null   float64
 2   feature_3   1000 non-null   float64
 3   feature_4   1000 non-null   float64
 4   feature_5   1000 non-null   float64
 5   feature_6   1000 non-null   float64
 6   feature_7   1000 non-null   float64
 7   feature_8   1000 non-null   float64
 8   feature_9   1000 non-null   float64
 9   feature_10  1000 non-null   float64
 10  feature_11  1000 non-null   float64
 11  feature_12  1000 non-null   float64
 12  feature_13  1000 non-null   float64
 13  feature_14  1000 non-null   float64
 14  feature_15  1000 non-null   float64
 15  feature_16  1000 non-null   float64
 16  feature_17  1000 non-null   float64
 17  feature_18  1000 non-null   float64
 18  feature_19  1000 non-null   float64
 19  feature_20  1000 non-null   

label
1    500
0    500
Name: count, dtype: int64