# Machine Learning Semester Project
## Murtaza Hussain (29449) and Muhammad Asad ur Rehman (29456)

### Class Imbalance Problem

The below code solves the prevalent problem of imbalanced dataset, where one class dominates the dataset as compared to the other. Such is the case for the following dataset for Credit Card Transactions to detect Fraudulent Transactions. We will evaluate the following methods to resolve Class Imbalance:
1. Random Under Sampling
2. Algorithmic Methods (Using Random Forest as well as modifying Class Weights)
3. Anomaly Detection Method

For the following Dataset, we will use the following 5 Algorithms to draw a comparision between different methods:
1. Logistic Regression
2. K-Nearest Neighbors (KNN)
3. Random Forest
4. Support Vector Machines (SVM)
5. Artificial Neural Network (ANN)

In [13]:
# Import necessary libraries
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

pd.options.display.float_format = '{:,.4f}'.format

In [14]:
# Data Loader loads data from CSV Files
def load_dataset():
    dataset = pd.read_csv("./Source.LoanData.csv")
    return dataset

df = load_dataset()

In [15]:
# This function performs a missing value analysis on each column of the dataset, helps you decide on what to do in cleaning process
def null_check(df):
    null_columns = []
    for column in df.columns:
        print("Column Name:", column)
        print("Column DataType:", df[column].dtype)
        if df[column].dtype != 'float64' and df[column].dtype != 'int64':
            print("Column unique values:", df[column].unique())
        print("Column has null:", df[column].isnull().any())

        
        if df[column].isnull().any() == True:
            print("Column Null Count:", df[column].isnull().sum())
            null_columns.append(column)
        print("\n")
    return null_columns

# null_check(df)

In [16]:
# This function drops any null columns and missing values
# This is where you decide whether to remove NULL rows (which will reduce the size of Dataset) or remove NULL columns entirely. You can also choose a combination of both.
def clean_data(df, drop_columns, missing_value = False):
    # Remove unnecessary columns
    df.drop(drop_columns, axis=1, inplace=True)
    # Drop rows with any missing values
    if missing_value == False:
        df.dropna(inplace=True)
    else:
        df.fillna(missing_value, inplace=True)
    return df

In [17]:
# Prints a summary of class instances and distribution
def data_summary(df, target=None):
    if isinstance(df, pd.DataFrame) and target!=None:
        a = df[target].value_counts()
    else:
        a = df.value_counts()
    class0 = format(100 * a[0]/sum(a), ".2f")
    class1 = format(100 * a[1]/sum(a), ".2f")

    meta = pd.DataFrame([{ "%": class0, "count": a[0]},
                         { "%": class1, "count": a[1]}])
    print("\nClass Distribution:\n", meta, "\n")

data_summary(df,'not.fully.paid')


Class Distribution:
        %  count
0  90.01   8045
1   9.99    893 



  class0 = format(100 * a[0]/sum(a), ".2f")
  class1 = format(100 * a[1]/sum(a), ".2f")
  meta = pd.DataFrame([{ "%": class0, "count": a[0]},
  { "%": class1, "count": a[1]}])


In [18]:
# Transforms categorical and numberical data into numerical data
def transform_data(df):
    # Encode categorical variables
    label_encoder = LabelEncoder()
    print("Categorical columns:", df.select_dtypes(include=['object', 'int64']).columns)
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = label_encoder.fit_transform(df[col])
    
    # Standardize numerical features
    scaler = StandardScaler()
    print("Numerical columns:", df.select_dtypes(include=['float64']).columns)
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numerical_cols) > 0:
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

df = transform_data(df)

Categorical columns: Index(['credit.policy', 'purpose', 'fico', 'revol.bal', 'inq.last.6mths',
       'delinq.2yrs', 'pub.rec', 'not.fully.paid'],
      dtype='object')
Numerical columns: Index(['int.rate', 'installment', 'log.annual.inc', 'dti', 'days.with.cr.line',
       'revol.util'],
      dtype='object')


In [23]:
# As the results for Baseline Model were not promising for the original dataset, Feature selection is required
def select_best_features_by_rfe(X, y, model=None, step=1, min_features=1, max_features=None):
    if model is None:
        model = LogisticRegression(max_iter=1000)
    if max_features is None:
        max_features = X.shape[1]

    best_score = 0
    best_num_features = min_features
    best_features = []

    # Loop over possible number of features from max_features to min_features
    for n_features_to_select in range(max_features, min_features - 1, -step):
        selector = RFE(model, n_features_to_select=n_features_to_select, step=step)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        selector.fit(X_train, y_train)
        selected_features = X.columns[selector.support_]
        y_pred = selector.predict(X_test)
        score = f1_score(y_test, y_pred, pos_label=1)
        
        print(f"Testing {n_features_to_select} features: F1 Score = {score}")

        if score > best_score:
            best_score = score
            best_num_features = n_features_to_select
            best_features = selected_features.tolist()

    print(f"Best F1 Score: {best_score} with {best_num_features} features. {best_features}")

    return best_features, best_num_features

In [25]:
# Runs Baseline Model for All 5 Algorithms
def BaselineRunAll(df, target_name, k=5):

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []

    features = select_best_features_by_rfe(X,y, SVC(probability=True))


    # pca = PCA(n_components=9)
    # X = pca.fit_transform(X)

    print("Class Distribution for Baseline Run:")
    data_summary(y)

    # Initialize the classifiers
    lr_classifier = LogisticRegression(max_iter=1000)
    rf_classifier = RandomForestClassifier()
    knn_classifier = KNeighborsClassifier()
    svm_classifier = SVC(probability=True)
    nb_classifier = GaussianNB()
    

    # Initialize k-fold cross-validation where folds = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)     # The reasoning behind k = 10 is so as to strike a balance between test and train samples of minority class

    # Define a recall and precision scorer specifically focusing on the minority class
    recall_precision_scorer = {'recall': make_scorer(recall_score, pos_label=1), # As the majority class has 99.81% presence, accuracy cannot be used as a metric to evaluate performance
                               'precision': make_scorer(precision_score, pos_label=1)}
    
    classifiers = {
        'Logistic Regression': lr_classifier,
        'Random Forest': rf_classifier,
        'K-Nearest Neighbours': knn_classifier,
        'Support Vector Machines': svm_classifier,
        'Naive Bayes': nb_classifier
    }

    for clf_name, clf in classifiers.items():
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=recall_precision_scorer)
        print(f"{clf_name} Model Training Completed")
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()

        results.append({
            'Method': 'Baseline',
            'Classifier': clf_name,
            'Class 1 Recall': mean_recall,
            'Class 1 Precision': mean_precision
        })

    df = pd.DataFrame(results)
    return df

results = BaselineRunAll(df, 'not.fully.paid')
print(results)

Testing 13 features: F1 Score = 0.0


ValueError: when `importance_getter=='auto'`, the underlying estimator SVC should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [6]:
# Runs Baseline Model for All 5 Algorithms
def BaselineRunAll(df, target_name, k=5, n_features_to_select=None):

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []

    print("Class Distribution for Baseline Run:")
    data_summary(y)

    # If n_features_to_select is not set, use half of the available features
    if n_features_to_select is None:
        n_features_to_select = X.shape[1] // 2

    # Initialize the classifiers
    lr_classifier = LogisticRegression(max_iter=1000)
    rf_classifier = RandomForestClassifier()
    knn_classifier = KNeighborsClassifier()
    svm_classifier = SVC(kernel='linear')  # Ensure linear kernel for feature ranking in SVC
    nb_classifier = GaussianNB()
    
    # Wrap classifiers with RFE
    classifiers = {
        'Logistic Regression': RFE(lr_classifier, n_features_to_select=n_features_to_select),
        'Random Forest': RFE(rf_classifier, n_features_to_select=n_features_to_select),
        'K-Nearest Neighbours': RFE(knn_classifier, n_features_to_select=n_features_to_select),
        'Support Vector Machines': RFE(svm_classifier, n_features_to_select=n_features_to_select),
        'Naive Bayes': RFE(nb_classifier, n_features_to_select=n_features_to_select, step=1)
    }

    # Initialize k-fold cross-validation where folds = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Define a recall and precision scorer specifically focusing on the minority class
    recall_precision_scorer = {'recall': make_scorer(recall_score, pos_label=1),
                               'precision': make_scorer(precision_score, pos_label=1)}
    
    for clf_name, clf in classifiers.items():
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=recall_precision_scorer)
        print(f"{clf_name} Model Training Completed")
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()

        results.append({
            'Method': 'Baseline with RFE',
            'Classifier': clf_name,
            'Class 1 Recall': mean_recall,
            'Class 1 Precision': mean_precision
        })

    df_results = pd.DataFrame(results)
    return df_results

results = BaselineRunAll(df, 'not.fully.paid')
print(results)

Class Distribution for Baseline Run:

Class Distribution:
        %  count
0  90.01   8045
1   9.99    893 

Logistic Regression Model Training Completed


  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Model Training Completed


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\murta\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\murta\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_selection\_rfe.py", line 235, in fit
    return self._fit(X, y, **fit_params)
  File "c:\Users\murta\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_selection\_rfe.py", line 299, in _fit
    importances = _get_feature_importances(
  File "c:\Users\murta\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_selection\_base.py", line 208, in _get_feature_importances
    raise ValueError(
ValueError: when `importance_getter=='auto'`, the underlying estimator KNeighborsClassifier should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.


In [None]:
# Applies Class Weighting (A method to add additional weightage to learn patterns from minority class)
def ClassWeightingMethod(df, target_name, k=5):

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []

    pca = PCA(n_components=20)
    X = pca.fit_transform(X)

    print("Class Distribution for Class Weighting Method:")
    data_summary(y)

    # Calculating priors for GaussianNB
    priors = y.value_counts()
    priors = priors/sum(priors)

    # Initialize the classifiers with settings
    lr_classifier = LogisticRegression(max_iter=1000, class_weight='balanced')
    rf_classifier = RandomForestClassifier(class_weight='balanced')
    knn_classifier = KNeighborsClassifier() # KNN does not support class_weight as the algorithm is mainly based on clustering
    svm_classifier = SVC(class_weight='balanced')
    nb_classifier = GaussianNB(priors=priors)

    # Initialize k-fold cross-validation where folds = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Define a recall and precision scorer specifically focusing on the minority class
    recall_precision_scorer = {'recall': make_scorer(recall_score, pos_label=1), 
                               'precision': make_scorer(precision_score, pos_label=1)}

    classifiers = {
        'Logistic Regression': lr_classifier,
        'Random Forest': rf_classifier,
        'K-Nearest Neighbours': knn_classifier,
        'Support Vector Machines': svm_classifier,
        'Naive Bayes': nb_classifier
    }

    for clf_name, clf in classifiers.items():
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=recall_precision_scorer)
        print(f"{clf_name} Model Training Completed")
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()

        results.append({
            'Method': 'Class Weighting',
            'Classifier': clf_name,
            'Class 1 Recall': mean_recall,
            'Class 1 Precision': mean_precision
        })
    
    df = pd.DataFrame(results)
    return df

# results = ClassWeightingMethod(df, 'not.fully.paid')
# print(results)

In [None]:
# Applies Bagging (Algorithmic Methods) to learn minority class
def BaggingAlgorithmicMethod(df, target_name, k=5):

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []

    pca = PCA(n_components=20)
    X = pca.fit_transform(X)

    print("Class Distribution for Bagging Method:")
    data_summary(y)


    # Initialize the classifiers with Bagging
    lr_classifier = BaggingClassifier(estimator=LogisticRegression(max_iter=1000), random_state=42)
    rf_classifier = RandomForestClassifier() # Already an ensemble method (Bagging Algorithm) 
    knn_classifier = BaggingClassifier(estimator=KNeighborsClassifier(), random_state=42)
    svm_classifier = BaggingClassifier(estimator=SVC(), random_state=42)
    nb_classifier = BaggingClassifier(estimator=GaussianNB(), random_state=42)

    # Initialize k-fold cross-validation where folds = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Define a recall and precision scorer specifically focusing on the minority class
    recall_precision_scorer = {'recall': make_scorer(recall_score, pos_label=1), 
                               'precision': make_scorer(precision_score, pos_label=1)}

    classifiers = {
        'Logistic Regression': lr_classifier,
        'Random Forest': rf_classifier,
        'K-Nearest Neighbours': knn_classifier,
        'Support Vector Machines': svm_classifier,
        'Naive Bayes': nb_classifier
    }

    for clf_name, clf in classifiers.items():
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=recall_precision_scorer)
        print(f"{clf_name} Model Training Completed")
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()

        results.append({
            'Method': 'Bagging (Algorithmic Method)',
            'Classifier': clf_name,
            'Class 1 Recall': mean_recall,
            'Class 1 Precision': mean_precision
        })
    
    df = pd.DataFrame(results)
    return df

# results = BaggingAlgorithmicMethod(df, 'not.fully.paid')
# print(results)

In [None]:
# Performs Boosting (Algorithmic Method) to learn minority class
def BoostingAlgorithmicMethod(df, target_name, k=5):

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []

    pca = PCA(n_components=20)
    X = pca.fit_transform(X)

    print("Class Distribution for Boosting Run:")
    data_summary(y)

    # Initialize the classifiers with Boosting
    lr_classifier = AdaBoostClassifier(estimator=LogisticRegression(max_iter=1000), random_state=42, algorithm="SAMME")
    rf_classifier = RandomForestClassifier() # Already an ensemble method (Bagging Algorithm) 
    knn_classifier = KNeighborsClassifier() # Does not support weighting samples which is required for Boosting
    svm_classifier = SVC() # Is slow to train with Boosting Algorithm and requires probability estimation hence can be extremely time consuming as Boosting Algorithm performs multiple iterations
    nb_classifier = AdaBoostClassifier(estimator=GaussianNB(), random_state=42, algorithm="SAMME")

    # Initialize k-fold cross-validation where folds = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Define a recall and precision scorer specifically focusing on the minority class
    recall_precision_scorer = {'recall': make_scorer(recall_score, pos_label=1), 
                               'precision': make_scorer(precision_score, pos_label=1)}

    classifiers = {
        'Logistic Regression': lr_classifier,
        'Random Forest': rf_classifier,
        'K-Nearest Neighbours': knn_classifier,
        'Support Vector Machines': svm_classifier,
        'Naive Bayes': nb_classifier
    }

    for clf_name, clf in classifiers.items():
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=recall_precision_scorer)
        print(f"{clf_name} Model Training Completed")
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()

        results.append({
            'Method': 'Boosting (Algorithmic Method)',
            'Classifier': clf_name,
            'Class 1 Recall': mean_recall,
            'Class 1 Precision': mean_precision
        })
    
    df = pd.DataFrame(results)
    return df

# results = RandomSamplingADASYN(df, 'not.fully.paid')
# print(results)

In [None]:
# Plot a Model vs Recall graph for Classification Dataset for Each Method
def plot_model_recall_graph(df):
    plt.figure(figsize=(10, 6))
    
    # Plotting lines for each Method
    sns.lineplot(data=df, x='Classifier', y='Class 1 Recall', hue='Method', marker='o')

    plt.title('Classifier vs Recall')
    plt.xlabel('Classifier')
    plt.ylabel('Recall')
    plt.legend(title='Method')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Plot a Model vs Precision graph for Classification Dataset for Each Method
def plot_model_precision_graph(df):
    plt.figure(figsize=(10, 6))
    
    # Plotting lines for each Method
    sns.lineplot(data=df, x='Classifier', y='Class 1 Precision', hue='Method', marker='o')

    plt.title('Classifier vs Precision')
    plt.xlabel('Classifier')
    plt.ylabel('Precision')
    plt.legend(title='Method')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
def master_workflow():
    # Load Dataset
    df = load_dataset()
    # No need for Data Cleaning and EDA as Data is already clean
    # Evaluate Class Distribution of the cleaned Dataset
    data_summary(df,'not.fully.paid')
    # Transform and Encode Data
    df = transform_data(df)
    # Run Baseline Models using k = 5
    baseline_results = BaselineRunAll(df, 'not.fully.paid')
    # Evaluate Models using Class Weight Adjustment Technique
    class_weighting_results = ClassWeightingMethod(df, 'not.fully.paid')
    # Evaluate Models using Bagging Algorithm
    bagging_results = BaggingAlgorithmicMethod(df, 'not.fully.paid')
    # Evaluate Models using Boosting Algorithm
    boosting_results = BoostingAlgorithmicMethod(df, 'not.fully.paid')
    # Concatenate the results
    results_df = pd.concat([baseline_results, class_weighting_results, bagging_results, boosting_results])
    # Print results
    print(results_df)
    results_df.to_csv('Results/CreditCardFraud.AlgorithmicMethodsResults.csv', index=False)
    # Plot a Classifier vs Recall Graph -> To evaluate how well the model is performing to detect the fraudulent transactions (minority class)
    plot_model_recall_graph(results_df)
    # Plot a Classifier vs Precision Graph -> To evaluate how precise the model is to detect the minority class (can be used as a secondary metric for evaluation)
    plot_model_precision_graph(results_df)

In [None]:
# As the pipeline was not running in one go, we had to split it into smaller parts
# master_workflow() 

### Broken down pipeline:

In [None]:
# Load Dataset
df = load_dataset()
# No need for Data Cleaning and EDA as Data is already clean
# Evaluate Class Distribution of the cleaned Dataset
data_summary(df,'not.fully.paid')
# Transform and Encode Data
df = transform_data(df)

In [None]:
# Run Baseline Models using k = 10
baseline_results = BaselineRunAll(df, 'not.fully.paid')

In [None]:
# Evaluate Models using Class Weight Adjustment Technique
class_weighting_results = ClassWeightingMethod(df, 'not.fully.paid')

In [None]:
# Evaluate Models using Bagging Algorithm
bagging_results = BaggingAlgorithmicMethod(df, 'not.fully.paid')

In [None]:
# Evaluate Models using Boosting Algorithm
boosting_results = BoostingAlgorithmicMethod(df, 'not.fully.paid')

In [None]:
# Concatenate the results
results_df = pd.concat([baseline_results, class_weighting_results, bagging_results, boosting_results])
# Print results
print(results_df)
results_df.to_csv('Results/LoanData.AlgorithmicMethodsResults.csv', index=False)
# Plot a Classifier vs Recall Graph -> To evaluate how well the model is performing to detect the fraudulent transactions (minority class)
plot_model_recall_graph(results_df)
# Plot a Classifier vs Precision Graph -> To evaluate how precise the model is to detect the minority class (can be used as a secondary metric for evaluation)
plot_model_precision_graph(results_df)