# Machine Learning Semester Project
## Murtaza Hussain (29449) and Muhammad Asad ur Rehman (29456)

### Class Imbalance Problem

The below code solves the prevalent problem of imbalanced dataset, where one class dominates the dataset as compared to the other. Such is the case for the following dataset for Credit Card Transactions to detect Fraudulent Transactions. We will evaluate the following methods to resolve Class Imbalance:
1. Random Under Sampling
2. Algorithmic Methods (Using Random Forest as well as modifying Class Weights)
3. Anomaly Detection Method

For the following Dataset, we will use the following 5 Algorithms to draw a comparision between different methods:
1. Logistic Regression
2. K-Nearest Neighbors (KNN)
3. Random Forest
4. Support Vector Machines (SVM)
5. Artificial Neural Network (ANN)

In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, KFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import make_scorer, recall_score, precision_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN

pd.options.display.float_format = '{:,.4f}'.format

In [10]:
# Data Loader loads data from CSV Files
def load_dataset():
    dataset = pd.read_csv("./Source.CreditCardFraud.csv")
    return dataset

# df = load_dataset()

In [3]:
# This function performs a missing value analysis on each column of the dataset, helps you decide on what to do in cleaning process
def null_check(df):
    null_columns = []
    for column in df.columns:
        print("Column Name:", column)
        print("Column DataType:", df[column].dtype)
        if df[column].dtype != 'float64' and df[column].dtype != 'int64':
            print("Column unique values:", df[column].unique())
        print("Column has null:", df[column].isnull().any())

        
        if df[column].isnull().any() == True:
            print("Column Null Count:", df[column].isnull().sum())
            null_columns.append(column)
        print("\n")
    return null_columns

# null_check(df)

Column Name: Time
Column DataType: int64
Column has null: False


Column Name: V1
Column DataType: float64
Column has null: False


Column Name: V2
Column DataType: float64
Column has null: False


Column Name: V3
Column DataType: float64
Column has null: False


Column Name: V4
Column DataType: float64
Column has null: False


Column Name: V5
Column DataType: float64
Column has null: False


Column Name: V6
Column DataType: float64
Column has null: False


Column Name: V7
Column DataType: float64
Column has null: False


Column Name: V8
Column DataType: float64
Column has null: False


Column Name: V9
Column DataType: float64
Column has null: False


Column Name: V10
Column DataType: float64
Column has null: False


Column Name: V11
Column DataType: float64
Column has null: False


Column Name: V12
Column DataType: float64
Column has null: False


Column Name: V13
Column DataType: float64
Column has null: False


Column Name: V14
Column DataType: float64
Column has null: False


Colum

[]

In [None]:
# This function drops any null columns and missing values
# This is where you decide whether to remove NULL rows (which will reduce the size of Dataset) or remove NULL columns entirely. You can also choose a combination of both.
def clean_data(df, drop_columns, missing_value = False):
    # Remove unnecessary columns
    df.drop(drop_columns, axis=1, inplace=True)
    # Drop rows with any missing values
    if missing_value == False:
        df.dropna(inplace=True)
    else:
        df.fillna(missing_value, inplace=True)
    return df

In [11]:
# Prints a summary of class instances and distribution
def data_summary(df, target=None):
    if isinstance(df, pd.DataFrame) and target!=None:
        a = df[target].value_counts()
    else:
        a = df.value_counts()
    class0 = format(100 * a[0]/sum(a), ".2f")
    class1 = format(100 * a[1]/sum(a), ".2f")

    meta = pd.DataFrame([{ "%": class0, "count": a[0]},
                         { "%": class1, "count": a[1]}])
    print("\nClass Distribution:\n", meta, "\n")

# data_summary(df,'Class')

Class Distribution:

        %  count
0  99.78  99776
1   0.22    223


In [12]:
# Transforms categorical and numberical data into numerical data
def transform_data(df):
    # Encode categorical variables
    label_encoder = LabelEncoder()
    print("Categorical columns:", df.select_dtypes(include=['object', 'int64']).columns)
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = label_encoder.fit_transform(df[col])
    
    # Standardize numerical features
    scaler = StandardScaler()
    print("Numerical columns:", df.select_dtypes(include=['float64']).columns)
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numerical_cols) > 0:
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

# df['Class'] = df['Class'].astype(str)
# df = transform_data(df)

Categorical columns: Index(['Time', 'Class'], dtype='object')
Numerical columns: Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')


In [16]:
# Runs Baseline Model for All 5 Algorithms
def BaselineRunAll(df, target_name, k=10):

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []

    print("Class Distribution for Baseline Run:")
    data_summary(y)

    # Initialize the classifiers
    lr_classifier = LogisticRegression()
    rf_classifier = RandomForestClassifier()
    knn_classifier = KNeighborsClassifier()
    svm_classifier = SVC()
    ann_classifier = MLPClassifier(max_iter=1000)
    

    # Initialize k-fold cross-validation where folds = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)     # The reasoning behind k = 10 is so as to strike a balance between test and train samples of minority class

    # Define a recall and precision scorer specifically focusing on the minority class
    recall_precision_scorer = {'recall': make_scorer(recall_score, pos_label=1), # As the majority class has 99.81% presence, accuracy cannot be used as a metric to evaluate performance
                               'precision': make_scorer(precision_score, pos_label=1)}
    
    classifiers = {
        'Logistic Regression': lr_classifier,
        'Random Forest': rf_classifier,
        'K-Nearest Neighbours': knn_classifier,
        'Support Vector Machines': svm_classifier,
        'Artificial Neural Networks': ann_classifier
    }

    for clf_name, clf in classifiers.items():
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=recall_precision_scorer)
        print(f"{clf_name} Model Training Completed")
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()

        results.append({
            'Method': 'Baseline',
            'Classifier': clf_name,
            'Class 1 Recall': mean_recall,
            'Class 1 Precision': mean_precision
        })

    df = pd.DataFrame(results)
    return df

results = BaselineRunAll(df, 'Class')
print(results)

Class Distribution for Baseline Run:
Class Distribution:

        %  count
0  99.78  99776
1   0.22    223
Logistic Regression Model Training Completed
Random Forest Model Training Completed
K-Nearest Neighbours Model Training Completed
Support Vector Machines Model Training Completed
Artificial Neural Networks Model Training Completed
     Method                  Classifier  Class 1 Recall  Class 1 Precision
0  Baseline         Logistic Regression          0.5648             0.7965
1  Baseline               Random Forest          0.8383             0.9645
2  Baseline        K-Nearest Neighbours          0.8298             0.9415
3  Baseline     Support Vector Machines          0.7265             0.9697
4  Baseline  Artificial Neural Networks          0.8385             0.9559


In [17]:
# Performs Undersampling of Majority Class followed by Oversampling of Minority class using SMOTE and tests on all 5 Algorithms
def RandomSamplingSMOTE(df, target_name, k=10):

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []

    # Undersample Majority Class
    rus = RandomUnderSampler(sampling_strategy={0: 50000, 1: 223}, random_state=42)
    X, y = rus.fit_resample(X, y)
    print("Class Distribution after Undersampling Majority Class:")
    data_summary(y)

    # Oversample using SMOTE
    smote = SMOTE(random_state=42)
    X, y = smote.fit_resample(X, y)
    print("Class Distribution after Oversampling Minority Class using SMOTE:")
    data_summary(y)

    # Initialize the classifiers
    lr_classifier = LogisticRegression()
    rf_classifier = RandomForestClassifier()
    knn_classifier = KNeighborsClassifier()
    svm_classifier = SVC()
    ann_classifier = MLPClassifier(max_iter=1000)

    # Initialize k-fold cross-validation where folds = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Define a recall and precision scorer specifically focusing on the minority class
    recall_precision_scorer = {'recall': make_scorer(recall_score, pos_label=1), 
                               'precision': make_scorer(precision_score, pos_label=1)}

    classifiers = {
        'Logistic Regression': lr_classifier,
        'Random Forest': rf_classifier,
        'K-Nearest Neighbours': knn_classifier,
        'Support Vector Machines': svm_classifier,
        'Artificial Neural Networks': ann_classifier
    }

    for clf_name, clf in classifiers.items():
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=recall_precision_scorer)
        print(f"{clf_name} Model Training Completed")
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()

        results.append({
            'Method': 'Undersampling + SMOTE',
            'Classifier': clf_name,
            'Class 1 Recall': mean_recall,
            'Class 1 Precision': mean_precision
        })
    
    df = pd.DataFrame(results)
    return df

results = RandomSamplingSMOTE(df, 'Class')
print(results)

Class Distribution after Undersampling Majority Class:
Class Distribution:

        %  count
0  99.56  50000
1   0.44    223
Class Distribution after Oversampling Minority Class using SMOTE:
Class Distribution:

        %  count
0  50.00  50000
1  50.00  50000
Logistic Regression Model Training Completed
Random Forest Model Training Completed
K-Nearest Neighbours Model Training Completed
Support Vector Machines Model Training Completed


In [None]:
# Performs Undersampling of Majority Class followed by Oversampling of Minority class using ADASYN and tests on all 5 Algorithms
def RandomSamplingADASYN(df, target_name, k=10):

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []

    # Undersample Majority Class
    rus = RandomUnderSampler(sampling_strategy={0: 50000, 1: 223}, random_state=42)
    X, y = rus.fit_resample(X, y)
    print("Class Distribution after Undersampling Majority Class:")
    data_summary(y)

    # Oversample using ADASYN
    adasyn = ADASYN(random_state=42)
    X, y = adasyn.fit_resample(X, y)
    print("Class Distribution after Oversampling Minority Class using ADASYN:")
    data_summary(y)

    # Initialize the classifiers
    lr_classifier = LogisticRegression()
    rf_classifier = RandomForestClassifier()
    knn_classifier = KNeighborsClassifier()
    svm_classifier = SVC()
    ann_classifier = MLPClassifier(max_iter=1000)

    # Initialize k-fold cross-validation where folds = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Define a recall and precision scorer specifically focusing on the minority class
    recall_precision_scorer = {'recall': make_scorer(recall_score, pos_label=1), 
                               'precision': make_scorer(precision_score, pos_label=1)}

    classifiers = {
        'Logistic Regression': lr_classifier,
        'Random Forest': rf_classifier,
        'K-Nearest Neighbours': knn_classifier,
        'Support Vector Machines': svm_classifier,
        'Artificial Neural Networks': ann_classifier
    }

    for clf_name, clf in classifiers.items():
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=recall_precision_scorer)
        print(f"{clf_name} Model Training Completed")
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()

        results.append({
            'Method': 'Undersampling + ADASYN',
            'Classifier': clf_name,
            'Class 1 Recall': mean_recall,
            'Class 1 Precision': mean_precision
        })
    
    df = pd.DataFrame(results)
    return df

results = RandomSamplingADASYN(df, 'Class')
print(results)

Class Distribution after Undersampling Majority Class:
Class Distribution:

        %  count
0  99.56  50000
1   0.44    223
Class Distribution after Oversampling Minority Class using SMOTE:
Class Distribution:

        %  count
0  50.01  50000
1  49.99  49985
LR CV Completed
RF CV Completed
KNN CV Completed
SVM CV Completed
ANN CV Completed
                 Method  Logistic Regression  Random Forest  \
0  SMOTE Class 1 Recall               0.9141         1.0000   

   K-Nearest Neighbours  Support Vector Machines  Artificial Neural Networks  
0                1.0000                   0.9997                      1.0000  


In [None]:
# Plot a Model vs Recall graph for Classification Dataset for Each Method
def plot_model_recall_graph(df):
    plt.figure(figsize=(10, 6))
    
    # Plotting lines for each Method
    sns.lineplot(data=df, x='Classifier', y='Class 1 Recall', hue='Method', marker='o')

    plt.title('Classifier vs Recall')
    plt.xlabel('Classifier')
    plt.ylabel('Recall')
    plt.legend(title='Method')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Plot a Model vs Precision graph for Classification Dataset for Each Method
def plot_model_precision_graph(df):
    plt.figure(figsize=(10, 6))
    
    # Plotting lines for each Method
    sns.lineplot(data=df, x='Classifier', y='Class 1 Precision', hue='Method', marker='o')

    plt.title('Classifier vs Precision')
    plt.xlabel('Classifier')
    plt.ylabel('Precision')
    plt.legend(title='Method')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
def master_workflow():
    # Load Dataset
    df = load_dataset()
    # No need for Data Cleaning and EDA as Data is already clean
    # Evaluate Class Distribution of the cleaned Dataset
    data_summary(df)
    # Transform and Encode Data
    df['Class'] = df['Class'].astype(str)
    df = transform_data(df)
    # Run Baseline Models using k = 10
    baseline_results = BaselineRunAll(df, 'Class')
    # Evaluate Models using SMOTE Oversampling Technique
    smote_results = RandomSamplingSMOTE(df, 'Class')
    # Evaluate Models using ADASYN Oversampling Technique
    adasyn_results = RandomSamplingADASYN(df, 'Class')
    # Concatenate the results
    results_df = pd.concat(baseline_results, smote_results, adasyn_results)
    # Plot a Classifier vs Recall Graph -> To evaluate how well the model is performing to detect the fraudulent transactions (minority class)
    plot_model_recall_graph(results_df)
    # Plot a Classifier vs Precision Graph -> To evaluate how precise the model is to detect the minority class (can be used as a secondary metric for evaluation)
    plot_model_precision_graph(results_df)

In [None]:
master_workflow()