In [None]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Viz Imports
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Increase the maximum number of columns displayed in Pandas to 200
pd.set_option('display.max_columns', 200)
pd.set_option('display.precision', 2)
# pd.options.display.float_format = '{:.0f}'.format
# Set the default style of Matplotlib plots to "ggplot"
plt.style.use('ggplot')
# Define custom color palette
my_palette = sns.color_palette("husl", 2)
sns.set_style("whitegrid")

# All imports here
from sklearn.compose import make_column_selector

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score # Library for model evaluation
from sklearn.model_selection import train_test_split # Library to split datset into test and train

from sklearn.dummy import DummyClassifier
from sklearn.linear_model  import LogisticRegression # Logistic Regression Classifier
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent Classifier
from sklearn.tree import DecisionTreeClassifier # Decision Tree Classifier
from sklearn.ensemble  import RandomForestClassifier # Random Forest Classifier
from sklearn.neighbors import KNeighborsClassifier # K Nearest neighbors Classifier
from sklearn.naive_bayes import GaussianNB #Naive Bayes Classifier
from sklearn.svm import SVC #Support vector Machine Classifier
from sklearn.ensemble import AdaBoostClassifier # Ada Boost Classifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import StackingClassifier, VotingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib

In [None]:
# CSV files to combine: the base file followed by variant_1 .. variant_5
file_list = ['Base.csv'] + [f'variant_{variant_no}.csv' for variant_no in range(1, 6)]

In [None]:
# Read every CSV, report its size and class balance, then combine them.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside the loop re-copies
# all previously loaded rows on every iteration.
loaded_frames = []

for file in file_list:
    input_df = pd.read_csv(f"../data/{file}")
    print('File Name : ', file)
    print(f'Rows - {input_df.shape[0]}, Columns - {input_df.shape[1]}')
    print(input_df['fraud_bool'].value_counts())

    loaded_frames.append(input_df)

# ignore_index gives the combined frame unique row labels instead of
# repeating each file's own labels.
intellifraud_dataset = pd.concat(loaded_frames, ignore_index=True)

print('Shape of intellifraud_dataset : ', intellifraud_dataset.shape)

In [None]:
# input_df = pd.read_csv(f"../data/Base.csv")

In [None]:
# Inspect the column names and dtypes of the combined dataset
intellifraud_dataset.info()

In [None]:
# Split the columns by dtype: object columns are treated as categorical,
# every other column as continuous.
object_columns = intellifraud_dataset.select_dtypes(include=['object']).columns
cat_cols = list(object_columns)
cont_cols = [column for column in intellifraud_dataset.columns if column not in object_columns]
print(
    f'Categorical Columns - {cat_cols}',
    "=========================================",
    f'Continuous Columns - {cont_cols}',
    sep='\n',
)

In [None]:
# Summary of the object and bool columns, transposed so each column becomes a row
intellifraud_dataset.describe(include=["object", "bool"]).transpose()

In [None]:
# Show the distinct values found in each categorical column
for cat_col in cat_cols:
    distinct_values = intellifraud_dataset[cat_col].unique()
    print(cat_col, '-', distinct_values)

In [None]:
# Descriptive statistics of the numeric columns, transposed so each column becomes a row
intellifraud_dataset.describe().T

In [None]:
# Number of distinct values in every column of the DataFrame
# (nunique() leaves missing values out of the count by default)
intellifraud_dataset.nunique()

In [None]:
# Non-object columns with fewer than 15 distinct values are treated as
# discrete: print their values and collect their names.
discreet_column = []
for cont_col in cont_cols:
    distinct_values = intellifraud_dataset[cont_col].unique()
    if len(distinct_values) >= 15:
        continue
    discreet_column.append(cont_col)
    print(cont_col, '-', distinct_values)
discreet_column

In [None]:
# EDA: class balance of the target (Fraud vs Not Fraud) over all loaded files
fraud_count = intellifraud_dataset["fraud_bool"].map({1:'Fraud', 0:'Not Fraud'}).to_frame()
fraud_labels = fraud_count['fraud_bool']

# Absolute counts and percentage shares, both ordered smallest class first
abs_values = fraud_labels.value_counts(ascending=True)
rel_values = fraud_labels.value_counts(ascending=True, normalize=True).values * 100
lbls = [f'{count:,.0f} ({share:.0f}%)' for count, share in zip(abs_values, rel_values)]

# Bars are drawn in the same order as the labels built above
ax = sns.countplot(x=fraud_labels, order=abs_values.index)
ax.bar_label(container=ax.containers[0], labels=lbls)

# Strip the chart down to a title and the labelled bars
ax.set(title ='Fraud vs Non Fraud Count (All Variants)')
ax.grid(False)
sns.despine(left=True)
ax.set(xlabel=None, ylabel=None)
ax.set_yticklabels([])

## Convert All Columns to Numeric

In [None]:
def map_categorical_column(df):
    '''
    Encode the five categorical columns as integer codes.

    The input frame is left untouched and a new frame is returned, so the
    cell that calls this function can be re-run safely. (Mapping a column
    that has already been encoded would turn every value into NaN, because
    the integer codes are not keys of the mapping dictionaries.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``payment_type``, ``employment_status``,
        ``housing_status``, ``source`` and ``device_os``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with those five columns replaced by their codes.
        Values that are not listed in a mapping become NaN.
    '''
    column_mappings = {
        "payment_type":      {'AA':0, 'AB':1, 'AC':2, 'AD':3, 'AE':4},
        "employment_status": {'CA':0, 'CB':1, 'CC':2, 'CD':3, 'CE':4, 'CF':5, 'CG':6},
        "housing_status":    {'BA':0, 'BB':1, 'BC':2, 'BD':3, 'BE':4, 'BF':5, 'BG':6},
        "source":            {'INTERNET':0, 'TELEAPP':1},
        "device_os":         {'windows':0, 'other':1, 'linux':2, 'macintosh':3, 'x11':4},
    }

    # assign() builds a new DataFrame; the caller's frame is not modified
    encoded_columns = {
        column: df[column].map(mapping)
        for column, mapping in column_mappings.items()
    }
    return df.assign(**encoded_columns)

In [None]:
# Encode the categorical columns as integers and preview the result
intellifraud_dataset_num = map_categorical_column(intellifraud_dataset)
intellifraud_dataset_num.head()

In [None]:
# Class balance of the target column in the encoded dataset
intellifraud_dataset_num.fraud_bool.value_counts()

## Features Selection

In [None]:
# Import the necessary libraries for feature selection
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectFromModel, chi2, mutual_info_classif
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.under_sampling import NearMiss
from collections import Counter

##### Variance Threshold Test - Removes all low-variance features.

In [None]:
# Fit a VarianceThreshold with its default settings and list the rejected columns.
# get_support() is True for the columns that are kept and False for the rest,
# so the inverted mask selects the low-variance columns.
selector = VarianceThreshold()
selector.fit(intellifraud_dataset_num)
kept_mask = selector.get_support()
low_variance_col = intellifraud_dataset_num.columns[~kept_mask].tolist()
low_variance_col


##### Pearson's Correlation Matrix - Remove highly correlated features

In [None]:
# Multicollinearity test: pairwise correlation between the features
# (the target and device_fraud_count are left out)
corr = intellifraud_dataset_num.drop(columns=['device_fraud_count', 'fraud_bool']).corr()

# Mask the upper triangle (diagonal included) so each pair is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(11, 9))

# Only correlations with an absolute value of at least 0.3 are shown;
# weaker ones become NaN and are left blank in the heatmap.
sns.heatmap(
            corr[(corr >= 0.3) | (corr <= -0.3)],
            mask=mask,
            cmap='coolwarm',
            vmax=.3,
            center=0,
            square=True,
            linewidths=.5,
            cbar_kws={"shrink": .5},
            annot=True,
            annot_kws={"fontsize": 8},
            fmt=".2f",
            ax=ax,
)

ax.set_title('Correlation Heatmap')
plt.show()

velocity_4w can be removed: it shows strong collinearity with Month.

In [None]:
# ##### Feature importance from a random forest fitted on a balanced (1:1) subsample

# Fraud Transactions
intellifraud_df_fraud = intellifraud_dataset_num[intellifraud_dataset_num.fraud_bool == 1]
display(f'Shape of train_df_fraud {intellifraud_df_fraud.shape}')

# Non Fraud Transactions: as many rows as there are fraud rows.
# The draw is seeded so the importances below are reproducible across re-runs.
intellifraud_df_non_fraud = intellifraud_dataset_num[intellifraud_dataset_num.fraud_bool == 0].sample(
    intellifraud_df_fraud.shape[0], random_state=42
)
display(f'Shape of train_df_non_fraud {intellifraud_df_non_fraud.shape}')

# Merge Fraud & Non Fraud
train_df_merged = pd.concat([intellifraud_df_fraud, intellifraud_df_non_fraud])
display(f'Shape of train_df_merged {train_df_merged.shape}')

# Fit Model: the target and the excluded columns are dropped from the features
X                 = train_df_merged.drop(columns=['fraud_bool', 'device_fraud_count', 'velocity_4w', 'x1', 'x2'])
y                 = train_df_merged['fraud_bool']

clf = RandomForestClassifier(random_state=42, max_depth=10).fit(X, y)

In [None]:
# Importance of every feature used by the random forest, highest first
feature_importances = pd.DataFrame(
    clf.feature_importances_,
    index=X.columns,
    columns=['Feature_Importance'],
).sort_values('Feature_Importance', ascending=False)

# Eleven bars are plotted, so the title states eleven
ax = feature_importances.head(11).plot(kind="bar")
ax.set(title ='Top 11 Features')
ax.grid(False)

In [None]:
# First 11 rows of the feature-importance table
feature_importances.head(11)

### Define Initial Models for Effective Attributes

In [None]:
# Classification metrics for one set of predictions
def calc_classfier_metric(classifier, y_test, y_pred):
    '''
    Score predictions against the true labels.

    `classifier` is accepted for call-site symmetry but is not used.
    Every metric, including the ROC AUC and the ROC curve, is computed
    from `y_pred` exactly as it is passed in.

    Returns a tuple:
    (accuracy, precision, recall, F1, ROC AUC, confusion matrix, fpr, tpr)
    '''
    headline_scores = (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_pred),
        confusion_matrix(y_test, y_pred),
    )
    # The thresholds returned by roc_curve are not needed by the callers
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, y_pred)

    return (*headline_scores, false_pos_rate, true_pos_rate)

In [None]:
# Build Classification Model
def build_individual_classifier_model(X_train, X_test, y_train, y_test, classifier_model, under_sample_size):
    '''
    Fit, evaluate and save every classifier in `classifier_model`.

    Each classifier is fitted on the training data, scored on the test data
    with calc_classfier_metric, cross-validated on the training data
    (5 folds, weighted F1) and saved under
    ../model/sample_1_<under_sample_size>/<ClassName>.pkl.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and labels.
    classifier_model : iterable of estimators
        Classifiers to fit; they are fitted in place.
    under_sample_size
        Only used to name the output directory.

    Returns
    -------
    pandas.DataFrame
        One row per classifier, sorted by F1_Score in descending order.
    '''
    print('Into build_individual_classifier_model')

    # joblib.dump does not create missing directories, so make sure the
    # target exists before any expensive fitting is done.
    model_dir = f'../model/sample_1_{under_sample_size}'
    os.makedirs(model_dir, exist_ok=True)

    classifier_performance = []

    for classifier in classifier_model:
        model_name = classifier.__class__.__name__

        # Fitting the training set into classification model
        classifier.fit(X_train, y_train)

        # Predicting the output on test dataset
        y_pred = classifier.predict(X_test)

        # Cross validation score on the training set (5 folds)
        scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1_weighted')
        cv_score_mean = scores.mean()

        # Classification score
        accuracy, precision, recall, F1_score, roc_auc_scr, conf_mat, fpr, tpr = calc_classfier_metric(classifier, y_test, y_pred)
        classifier_performance.append([model_name, conf_mat, accuracy, precision, recall, F1_score, roc_auc_scr, cv_score_mean, fpr, tpr])

        # Store the model into pkl
        joblib.dump(classifier, f'{model_dir}/{model_name}.pkl')

    class_perf_df = pd.DataFrame(
        classifier_performance,
        columns=['Classifier', 'Conf_Mtrx', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'ROC_AUC_Scr', 'CV_Score', 'FPR', 'TPR'],
    ).sort_values('F1_Score', ascending = False)

    return class_perf_df

In [None]:
def build_voting_classifier_model(X_train, X_test, y_train, y_test, classifier_model, ind_class_model_df, under_sample_size):
    '''
    Fit, evaluate and save a soft-voting ensemble of three classifiers.

    The first three entries of `classifier_model` are combined under the
    names 'ada', 'xgb' and 'lgb'. The ensemble is fitted on the training
    data, scored on the test data, cross-validated on the training data
    (5 folds, weighted F1) and saved under
    ../model/sample_1_<under_sample_size>/VotingClassifier.pkl.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and labels.
    classifier_model : sequence of estimators
        Must hold at least three classifiers.
    ind_class_model_df : pandas.DataFrame
        Earlier performance rows; the ensemble's row is appended to them.
    under_sample_size
        Only used to name the output directory.

    Returns
    -------
    pandas.DataFrame
        `ind_class_model_df` followed by the voting classifier's row.
    '''
    print('Into build_voting_classifier_model')

    # joblib.dump does not create missing directories, so make sure the
    # target exists before any expensive fitting is done.
    model_dir = f'../model/sample_1_{under_sample_size}'
    os.makedirs(model_dir, exist_ok=True)

    classifier_performance = []

    # Voting Classifier
    clf1 = classifier_model[0]
    clf2 = classifier_model[1]
    clf3 = classifier_model[2]

    vote_classifier = VotingClassifier(
                                        estimators=[('ada', clf1),('xgb', clf2), ('lgb', clf3)],
                                        voting='soft'
                                    )
    model_name = vote_classifier.__class__.__name__

    # Fitting the training set into classification model
    vote_classifier.fit(X_train, y_train)

    # Predicting the output on test dataset
    y_pred = vote_classifier.predict(X_test)

    # Cross validation score on the training set (5 folds)
    scores = cross_val_score(vote_classifier, X_train, y_train, cv=5, scoring='f1_weighted')
    cv_score_mean = scores.mean()

    # Classification score
    accuracy, precision, recall, F1_score, roc_auc_scr, conf_mat, fpr, tpr = calc_classfier_metric(vote_classifier, y_test, y_pred)
    classifier_performance.append([model_name, conf_mat, accuracy, precision, recall, F1_score, roc_auc_scr, cv_score_mean, fpr, tpr])

    # Store the model into pkl
    joblib.dump(vote_classifier, f'{model_dir}/{model_name}.pkl')

    class_perf_df = pd.DataFrame(
        classifier_performance,
        columns=['Classifier', 'Conf_Mtrx', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'ROC_AUC_Scr', 'CV_Score', 'FPR', 'TPR'],
    ).sort_values('F1_Score', ascending = False)

    voting_class_df = pd.concat([ind_class_model_df, class_perf_df])

    return voting_class_df

In [None]:
# Build Stacking Classification Model
def build_stacking_classifier_model(X_train, X_test, y_train, y_test, classifier_model, prev_class_model_df, under_sample_size):
    '''
    Fit, evaluate and save a stacking ensemble of three classifiers.

    The first three entries of `classifier_model` are stacked under the
    names 'ada', 'xgb' and 'lgb', with a LogisticRegression as the final
    estimator and cv=5 inside the stack. The ensemble is fitted on the
    training data, scored on the test data, cross-validated on the training
    data (5 folds, weighted F1) and saved under
    ../model/sample_1_<under_sample_size>/StackingClassifier.pkl.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and labels.
    classifier_model : sequence of estimators
        Must hold at least three classifiers.
    prev_class_model_df : pandas.DataFrame
        Earlier performance rows; the ensemble's row is appended to them.
    under_sample_size
        Only used to name the output directory.

    Returns
    -------
    pandas.DataFrame
        `prev_class_model_df` followed by the stacking classifier's row.
    '''
    print('Into build_stacking_classifier_model')

    # joblib.dump does not create missing directories, so make sure the
    # target exists before any expensive fitting is done.
    model_dir = f'../model/sample_1_{under_sample_size}'
    os.makedirs(model_dir, exist_ok=True)

    classifier_performance = []

    # Stacking Classifier
    clf1 = classifier_model[0]
    clf2 = classifier_model[1]
    clf3 = classifier_model[2]

    stacking_classifier = StackingClassifier(
                                                estimators = [('ada', clf1),('xgb', clf2), ('lgb', clf3)],
                                                final_estimator = LogisticRegression(),
                                                cv = 5
                                    )
    model_name = stacking_classifier.__class__.__name__

    # Fitting the training set into classification model
    stacking_classifier.fit(X_train, y_train)

    # Predicting the output on test dataset
    y_pred = stacking_classifier.predict(X_test)

    # Cross validation score on the training set (5 folds)
    scores = cross_val_score(stacking_classifier, X_train, y_train, cv=5, scoring='f1_weighted')
    cv_score_mean = scores.mean()

    # Classification score
    accuracy, precision, recall, F1_score, roc_auc_scr, conf_mat, fpr, tpr = calc_classfier_metric(stacking_classifier, y_test, y_pred)
    classifier_performance.append([model_name, conf_mat, accuracy, precision, recall, F1_score, roc_auc_scr, cv_score_mean, fpr, tpr])

    # Store the model into pkl
    joblib.dump(stacking_classifier, f'{model_dir}/{model_name}.pkl')

    class_perf_df = pd.DataFrame(
        classifier_performance,
        columns=['Classifier', 'Conf_Mtrx', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'ROC_AUC_Scr', 'CV_Score', 'FPR', 'TPR'],
    ).sort_values('F1_Score', ascending = False)

    stacking_class_df = pd.concat([prev_class_model_df, class_perf_df])

    return stacking_class_df

In [None]:
# Function for Confusion Matrix
def view_confusion_matrix(class_perf_df, columns):
    '''
    Draw one confusion-matrix heatmap per row of `class_perf_df`.

    Parameters
    ----------
    class_perf_df : pandas.DataFrame
        Needs a 'Classifier' column (used as subplot titles) and a
        'Conf_Mtrx' column holding the confusion matrices.
    columns : int
        Number of subplots per row of the grid.

    Rows are read by position, so frames whose index is unsorted or has
    repeated labels (for example after sort_values or pd.concat) are fine.
    '''
    n_plots = class_perf_df.shape[0]
    # Ceiling division: a partly filled last row still needs a grid row
    rows = -(-n_plots // columns)
    plt.figure(figsize=(15,13))

    for i in range(n_plots):
        plt.subplot(rows, columns, i+1)
        plt.title(class_perf_df['Classifier'].iloc[i])
        ax = sns.heatmap(class_perf_df['Conf_Mtrx'].iloc[i],
                    annot=True,
                    cmap="coolwarm",
                    fmt="d",
                    cbar=False,
                    annot_kws={"size": 12},
                    linewidths=1.2,
                    linecolor='w',
                   )
        ax.set_xticklabels(ax.get_xticklabels(), rotation = 0, fontsize = 10)
        ax.set_yticklabels(ax.get_yticklabels(), rotation = 25, fontsize = 10)
        # The matrices in this notebook come from
        # confusion_matrix(y_test, y_pred): rows are the true classes,
        # columns the predicted ones. The heatmap puts rows on the y axis.
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')

    return

In [None]:
def create_sample_set(train_df, non_fraud_sample_sizse, random_state=42):
    '''
    Build an under-sampled train/test split from the selected features.

    All fraud rows are kept, and `non_fraud_sample_sizse` times as many
    non-fraud rows are drawn at random. The combined rows are then split
    80/20 into train and test sets.

    Both the draw and the split are seeded with `random_state`, so calling
    the function again with the same arguments returns the same split.
    Other cells rely on this when they call the function again to obtain
    the test set for models that were saved earlier.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the selected feature columns and 'fraud_bool'.
    non_fraud_sample_sizse : int
        Number of non-fraud rows drawn per fraud row.
    random_state : int, default 42
        Seed for the non-fraud draw and for the train/test split.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Note the order: the training pair comes first, then the test pair.
    '''
    selected_columns = [
        'housing_status',
        'device_os',
        'credit_risk_score',
        'current_address_months_count',
        'has_other_cards',
        'keep_alive_session',
        'prev_address_months_count',
        'phone_home_valid',
        'proposed_credit_limit',
        'name_email_similarity',
        'income',
        'fraud_bool',
    ]
    train_df = train_df[selected_columns]

    # Fraud Transactions
    train_df_fraud = train_df[train_df.fraud_bool == 1]

    # Non Fraud Transactions (seeded draw)
    n_non_fraud = train_df_fraud.shape[0] * non_fraud_sample_sizse
    train_df_non_fraud = train_df[train_df.fraud_bool == 0].sample(n_non_fraud, random_state=random_state)

    # Merge Fraud & Non Fraud
    train_df_merged = pd.concat([train_df_fraud, train_df_non_fraud])

    # X & Y
    X = train_df_merged.drop(columns=['fraud_bool'])
    y = train_df_merged['fraud_bool']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=random_state)

    return X_train, y_train, X_test, y_test

In [None]:
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay

def create_performance_graphs(X_test, y_test, sample_size):
    '''
    Plot ROC and precision-recall curves for the models saved for one ratio.

    The five pickled models are loaded from ../model/sample_1_<sample_size>/
    and drawn on a shared pair of axes.

    The curves are built with `from_estimator`, which scores X_test with
    the model's probability or decision output. Hard predict() labels
    would give only a single operating point per model.

    Parameters
    ----------
    X_test, y_test
        Test features and labels to evaluate the saved models on.
    sample_size
        Sampling ratio; selects the model directory and appears in the titles.
    '''
    # File names of the saved models for the requested ratio
    classifier_list = ['AdaBoostClassifier.pkl', 'LGBMClassifier.pkl', 'XGBClassifier.pkl', 'StackingClassifier.pkl', 'VotingClassifier.pkl']

    fig, [ax_roc, ax_prc] = plt.subplots(1, 2, figsize=(11, 5))

    for classifiers in classifier_list:
        classifier = joblib.load(f'../model/sample_1_{sample_size}/{classifiers}')
        model_name = classifier.__class__.__name__
        RocCurveDisplay.from_estimator(classifier, X_test, y_test, ax=ax_roc, name=model_name)
        PrecisionRecallDisplay.from_estimator(classifier, X_test, y_test, ax=ax_prc, name=model_name)

    ax_roc.set_title(f"ROC-AUC Curve (1:{sample_size}) Ratio")
    ax_prc.set_title(f"Precision Recall Curve (1:{sample_size}) Ratio")

    ax_roc.grid(linestyle="--")
    ax_prc.grid(linestyle="--")

    return

In [None]:
%%time
# Train Model with Different Sample Size

performance_dataset = pd.DataFrame()

sample_size = 1

X_train, y_train, X_test, y_test = create_sample_set(intellifraud_dataset_num, sample_size)
print('In Sample Size {}, Traing Set - {} and Test Set - {}'.format(sample_size, X_train.shape[0], X_test.shape[0]))

# Machine Learning Model Build
classifier_model = [
                    AdaBoostClassifier(learning_rate = 0.1, n_estimators=500, random_state=42), 
                    XGBClassifier(colsample_bytree=1.0, gamma=5, learning_rate=1.0, max_depth=5, min_child_weight=1,    n_estimators=10, subsample=1.0, random_state=42),
                    LGBMClassifier(boosting_type = 'dart', colsample_bytree=1.0, learning_rate = 0.1, max_depth=10,n_estimators = 50, subsample=0.6, random_state=42, verbose=-1)
                ]

# Call Classification module
ind_class_model_df        = build_individual_classifier_model(X_train, X_test, y_train, y_test, classifier_model, sample_size)
ind_voting_model_df       = build_voting_classifier_model(X_train, X_test, y_train, y_test, classifier_model, ind_class_model_df, sample_size)
ind_voting_stack_model_df = build_stacking_classifier_model(X_train, X_test, y_train, y_test, classifier_model, ind_voting_model_df, sample_size)
ind_voting_stack_model_df['Sample_Size'] = f'1:{sample_size}'
ind_voting_stack_model_df.to_csv(f'performance_for_1_{sample_size}.csv', index=False)
performance_dataset = performance_dataset.append(ind_voting_stack_model_df)
    
# Show Performance Viz
create_performance_graphs(X_test, y_test, sample_size)
    

In [None]:
%%time
# Train Model with Different Sample Size

# performance_dataset = pd.DataFrame()

sample_size = 2

X_train, y_train, X_test, y_test = create_sample_set(intellifraud_dataset_num, sample_size)
print('In Sample Size {}, Traing Set - {} and Test Set - {}'.format(sample_size, X_train.shape[0], X_test.shape[0]))

# Machine Learning Model Build
classifier_model = [
                    AdaBoostClassifier(learning_rate = 0.1, n_estimators=500, random_state=42), 
                    XGBClassifier(colsample_bytree=1.0, gamma=5, learning_rate=1.0, max_depth=5, min_child_weight=1,    n_estimators=10, subsample=1.0, random_state=42),
                    LGBMClassifier(boosting_type = 'dart', colsample_bytree=1.0, learning_rate = 0.1, max_depth=10,n_estimators = 50, subsample=0.6, random_state=42, verbose=-1)
                ]

# Call Classification module
ind_class_model_df        = build_individual_classifier_model(X_train, X_test, y_train, y_test, classifier_model, sample_size)
ind_voting_model_df       = build_voting_classifier_model(X_train, X_test, y_train, y_test, classifier_model, ind_class_model_df, sample_size)
ind_voting_stack_model_df = build_stacking_classifier_model(X_train, X_test, y_train, y_test, classifier_model, ind_voting_model_df, sample_size)
ind_voting_stack_model_df['Sample_Size'] = f'1:{sample_size}'
ind_voting_stack_model_df.to_csv(f'performance_for_1_{sample_size}.csv', index=False)
performance_dataset = performance_dataset.append(ind_voting_stack_model_df)
    
# Show Performance Viz
create_performance_graphs(X_test, y_test, sample_size)

In [None]:
%%time
# Train Model with Different Sample Size

# performance_dataset = pd.DataFrame()
sample_size = 3

X_train, y_train, X_test, y_test = create_sample_set(intellifraud_dataset_num, sample_size)
print('In Sample Size {}, Traing Set - {} and Test Set - {}'.format(sample_size, X_train.shape[0], X_test.shape[0]))

# Machine Learning Model Build
classifier_model = [
                    AdaBoostClassifier(learning_rate = 0.1, n_estimators=500, random_state=42), 
                    XGBClassifier(colsample_bytree=1.0, gamma=5, learning_rate=1.0, max_depth=5, min_child_weight=1,    n_estimators=10, subsample=1.0, random_state=42),
                    LGBMClassifier(boosting_type = 'dart', colsample_bytree=1.0, learning_rate = 0.1, max_depth=10,n_estimators = 50, subsample=0.6, random_state=42, verbose=-1)
                ]

# Call Classification module
ind_class_model_df        = build_individual_classifier_model(X_train, X_test, y_train, y_test, classifier_model, sample_size)
ind_voting_model_df       = build_voting_classifier_model(X_train, X_test, y_train, y_test, classifier_model, ind_class_model_df, sample_size)
ind_voting_stack_model_df = build_stacking_classifier_model(X_train, X_test, y_train, y_test, classifier_model, ind_voting_model_df, sample_size)
ind_voting_stack_model_df['Sample_Size'] = f'1:{sample_size}'
ind_voting_stack_model_df.to_csv(f'performance_for_1_{sample_size}.csv', index=False)
performance_dataset = performance_dataset.append(ind_voting_stack_model_df)
    
# Show Performance Viz
create_performance_graphs(X_test, y_test, sample_size)

In [None]:
# Performance rows collected so far
performance_dataset

In [None]:
%%time
# Train Model with Different Sample Size

performance_dataset1 = pd.DataFrame()

for sample_size in range(1, 2):

    X_train, y_train, X_test, y_test = create_sample_set(intellifraud_dataset_num, sample_size)
    print('In Sample Size {}, Traing Set - {} and Test Set - {}'.format(sample_size, X_train.shape[0], X_test.shape[0]))

    # Machine Learning Model Build
    classifier_model = [
                        AdaBoostClassifier(learning_rate = 0.1, n_estimators=500, random_state=42), 
                        XGBClassifier(colsample_bytree=1.0, gamma=5, learning_rate=1.0, max_depth=5, min_child_weight=1,    n_estimators=10, subsample=1.0, random_state=42),
                        LGBMClassifier(boosting_type = 'dart', colsample_bytree=1.0, learning_rate = 0.1, max_depth=10,n_estimators = 50, subsample=0.6, random_state=42, verbose=-1)
                    ]

    # Call Classification module
    ind_class_model_df        = build_individual_classifier_model(X_train, X_test, y_train, y_test, classifier_model, sample_size)
    ind_voting_model_df       = build_voting_classifier_model(X_train, X_test, y_train, y_test, classifier_model, ind_class_model_df, sample_size)
    ind_voting_stack_model_df = build_stacking_classifier_model(X_train, X_test, y_train, y_test, classifier_model, ind_voting_model_df, sample_size)
    ind_voting_stack_model_df['Sample_Size'] = f'1:{sample_size}'

    performance_dataset = performance_dataset.append(ind_voting_stack_model_df)
    
    # Show Performance Viz
    create_performance_graphs(X_test, y_test, sample_size)

In [None]:
# Save the collected performance table (the row index is not written)
performance_dataset.to_csv('../model/performance.csv', index=False)

### GRID SEARCH

In [None]:
%%time
# LightGBM base model
# Initiate classifier to use
lgbm_classifier = LGBMClassifier(random_state=42)

# Grid Search for LightGBM
# 4 * 5 * 2 * 3 * 3 * 5 = 1800 parameter combinations, each evaluated with cv=4
gridParams = {
                'n_estimators': [10, 50, 100, 500],
                'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
                'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
                'colsample_bytree': [0.6, 0.8, 1.0],
                'subsample' : [0.6, 0.8, 1.0],
                'max_depth': [2, 4, 6, 8, 10]
    }

# Candidates are ranked by recall
grid = GridSearchCV(lgbm_classifier, gridParams, verbose=1, cv=4, n_jobs=-1, scoring = 'recall')
# Run the grid
# NOTE: X_train / y_train are not defined in this cell; the search uses
# whatever split is currently held in the kernel.
grid.fit(X_train, y_train)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

In [None]:
%%time
# XGBoost base model
# Initiate classifier to use
xgb_classifier = XGBClassifier(random_state=42)

# Grid Search for XGBoost
# 4 * 5 * 3 * 5 * 3 * 3 * 3 = 8100 parameter combinations, each evaluated with cv=5
gridParams = {
        'n_estimators': [10, 50, 100, 500],
        'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

# Candidates are ranked by recall
grid = GridSearchCV(xgb_classifier, gridParams, verbose=1, cv=5, n_jobs=-1, scoring = 'recall')
# Run the grid
# NOTE: X_train / y_train are not defined in this cell; the search uses
# whatever split is currently held in the kernel.
grid.fit(X_train, y_train)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

In [None]:
%%time
# AdaBoost Base Model
# Initiate classifier to use
abc_classifier = AdaBoostClassifier(random_state=42)

# Grid Search for AdaBoost
# 4 * 5 = 20 parameter combinations, each evaluated with cv=5
gridParams = {
        'n_estimators': [10, 50, 100, 500],
        'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
        }

# Candidates are ranked by recall
grid = GridSearchCV(abc_classifier, gridParams, verbose=1, cv=5, n_jobs=-1, scoring = 'recall')
# Run the grid
# NOTE: X_train / y_train are not defined in this cell; the search uses
# whatever split is currently held in the kernel.
grid.fit(X_train, y_train)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay

In [None]:
# Confusion matrix of the saved voting classifier for each sampling ratio.
# NOTE: the evaluation is only meaningful if create_sample_set reproduces
# the split that was used when the saved model was trained.
for sample_size in range(1, 2):

    X_train, y_train, X_test, y_test = create_sample_set(intellifraud_dataset_num, sample_size)
    print('In Sample Size {}, Traing Set - {} and Test Set - {}'.format(sample_size, X_train.shape[0], X_test.shape[0]))

    # Load the model saved for this ratio, so that widening the range
    # evaluates the matching model rather than always the 1:1 one.
    classifier = joblib.load(f'../model/sample_1_{sample_size}/VotingClassifier.pkl')
    predictions = classifier.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=classifier.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                display_labels=['No Fraud', 'Fraud'])
    disp.plot()

In [None]:
from sklearn.metrics import DetCurveDisplay, RocCurveDisplay

# Sample Ratio 1:1 - ROC and precision-recall curves of the saved models
classifier_list = [
    f'{saved_model}.pkl'
    for saved_model in ('AdaBoostClassifier', 'LGBMClassifier', 'XGBClassifier', 'StackingClassifier', 'VotingClassifier')
]

fig, (ax_roc, ax_prc) = plt.subplots(1, 2, figsize=(11, 5))

# Required to extract the test data (1 selects the 1:1 sample)
X_train, y_train, X_test, y_test = create_sample_set(intellifraud_dataset_num, 1)

# Draw every saved model on the two shared axes
for classifiers in classifier_list:
    classifier = joblib.load(f'../model/sample_1_1/{classifiers}')
    curve_label = classifier.__class__.__name__
    RocCurveDisplay.from_estimator(classifier, X_test, y_test, ax=ax_roc, name=curve_label)
    PrecisionRecallDisplay.from_estimator(classifier, X_test, y_test, ax=ax_prc, name=curve_label)

# Title and dashed grid for both panels
for panel, panel_title in ((ax_roc, "ROC-AUC Curve (1:1) Ratio"), (ax_prc, "Precision Recall Curve (1:1) Ratio")):
    panel.set_title(panel_title)
    panel.grid(linestyle="--")

In [None]:
# Sample Ratio 1:2 - ROC and precision-recall curves of the saved models
classifier_list = [
    f'{saved_model}.pkl'
    for saved_model in ('AdaBoostClassifier', 'LGBMClassifier', 'XGBClassifier', 'StackingClassifier', 'VotingClassifier')
]

fig, (ax_roc, ax_prc) = plt.subplots(1, 2, figsize=(11, 5))

# Required to extract the test data (2 selects the 1:2 sample)
X_train, y_train, X_test, y_test = create_sample_set(intellifraud_dataset_num, 2)

# Draw every saved model on the two shared axes
for classifiers in classifier_list:
    classifier = joblib.load(f'../model/sample_1_2/{classifiers}')
    curve_label = classifier.__class__.__name__
    RocCurveDisplay.from_estimator(classifier, X_test, y_test, ax=ax_roc, name=curve_label)
    PrecisionRecallDisplay.from_estimator(classifier, X_test, y_test, ax=ax_prc, name=curve_label)

# Title and dashed grid for both panels
for panel, panel_title in ((ax_roc, "ROC-AUC Curve (1:2) Ratio"), (ax_prc, "Precision Recall Curve (1:2) Ratio")):
    panel.set_title(panel_title)
    panel.grid(linestyle="--")

In [None]:
# Sample Ratio 1:3 - ROC and precision-recall curves of the saved models
classifier_list = [
    f'{saved_model}.pkl'
    for saved_model in ('AdaBoostClassifier', 'LGBMClassifier', 'XGBClassifier', 'StackingClassifier', 'VotingClassifier')
]

fig, (ax_roc, ax_prc) = plt.subplots(1, 2, figsize=(11, 5))

# Required to extract the test data (3 selects the 1:3 sample)
X_train, y_train, X_test, y_test = create_sample_set(intellifraud_dataset_num, 3)

# Draw every saved model on the two shared axes
for classifiers in classifier_list:
    classifier = joblib.load(f'../model/sample_1_3/{classifiers}')
    curve_label = classifier.__class__.__name__
    RocCurveDisplay.from_estimator(classifier, X_test, y_test, ax=ax_roc, name=curve_label)
    PrecisionRecallDisplay.from_estimator(classifier, X_test, y_test, ax=ax_prc, name=curve_label)

# Title and dashed grid for both panels
for panel, panel_title in ((ax_roc, "ROC-AUC Curve (1:3) Ratio"), (ax_prc, "Precision Recall Curve (1:3) Ratio")):
    panel.set_title(panel_title)
    panel.grid(linestyle="--")