In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide presentation settings: ggplot figure style and up to 200
# columns shown when a DataFrame is displayed.
plt.style.use("ggplot")
pd.options.display.max_columns = 200

In [None]:
# Raw input table, read from the Kaggle input mount.
DATA_PATH = "/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Work on an independent (deep) copy so `df` keeps the data exactly as loaded.
new_df = df.copy(deep=True)

In [None]:
# Columns stored with the object dtype are treated as categorical.
categorical_features = [
    column
    for column, dtype in new_df.dtypes.items()
    if dtype == "O"
]
categorical_features

In [None]:
# Non-object columns with more than two distinct values; two-valued columns
# are excluded here and picked up later as binary/encoded features.
numeric_features = [
    column
    for column, dtype in new_df.dtypes.items()
    if dtype != "O" and new_df[column].nunique() > 2
]
numeric_features

In [None]:
# One-hot encode exactly the columns listed in `categorical_features`.
# Passing them through `columns=` keeps the encoded columns and their name
# prefixes aligned by construction (the default prefix is the column name),
# instead of relying on a `prefix=` list whose length must match whatever
# columns get_dummies auto-detects. get_dummies already returns a DataFrame,
# so no extra constructor call is needed.
# NOTE: this rebinds `new_df`; re-run the copy cell above before re-running this one.
new_df = pd.get_dummies(new_df, columns=categorical_features)

In [None]:
# Quick look at the first rows of the encoded frame.
new_df.head(n=5)

In [None]:
# Target is the fraud flag; the feature matrix is everything except the
# target and `device_fraud_count`.
y = new_df['fraud_bool']
X = new_df.drop(columns=['fraud_bool', 'device_fraud_count'])

In [None]:
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTENC

In [None]:
from collections import Counter

In [None]:
%%time

# NearMiss Undersampling
print(f'Test dataset samples per class {Counter(y)}')

nm = NearMiss(sampling_strategy=0.25, n_jobs=-1)
X_nm, y_nm = nm.fit_resample(X, y)

print('Resampled dataset shape %s' % Counter(y_nm))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold

In [None]:
# Stratified 75/25 split of the resampled data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_nm,
    y_nm,
    test_size=0.25,
    stratify=y_nm,
    random_state=42,
)

In [None]:
# https://stackoverflow.com/questions/48370150/how-to-implement-smote-in-cross-validation-and-gridsearchcv
# https://stackoverflow.com/questions/69388476/pipeline-and-gridsearch-pipeline-fully-recomputed

def train_classifier(classifier, param_dist, X_train, y_train, encoded_features,
                     *, n_iter=10, random_state=42):
    """Tune ``classifier`` with a randomized search over a SMOTENC + classifier pipeline.

    SMOTENC is placed inside the pipeline so the oversampling is part of the
    estimator that the search fits on each cross-validation split.

    Parameters
    ----------
    classifier : estimator
        Unfitted classifier used as the final pipeline step.
    param_dist : dict
        Search space handed to ``RandomizedSearchCV``; keys must address the
        pipeline steps created by ``make_pipeline``.
    X_train, y_train :
        Training features and labels.
    encoded_features :
        Forwarded to ``SMOTENC(categorical_features=...)``.
    n_iter : int, default 10
        Number of parameter settings sampled by the search.
    random_state : int, default 42
        Seed shared by SMOTENC, the CV splitter and the parameter sampling,
        so repeated runs evaluate the same candidates on the same folds.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object (not just its best estimator).
    """
    smote_nc = SMOTENC(categorical_features=encoded_features,
                       sampling_strategy='minority',
                       random_state=random_state)

    pipeline = make_pipeline(smote_nc, classifier)

    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)

    # Without `random_state` the sampled candidates change on every run.
    rand_search = RandomizedSearchCV(estimator=pipeline,
                                     param_distributions=param_dist,
                                     n_iter=n_iter,
                                     scoring="roc_auc",
                                     n_jobs=-1,
                                     cv=cv,
                                     random_state=random_state)

    rand_search.fit(X_train, y_train)

    return rand_search

In [None]:
def test_classifier(classifier, X_test, y_test):
    
    preds = classifier.predict(X_test)

    labels = ['No Fraud', 'Fraud']
    
    metrics = classification_report(y_test, preds, target_names=labels)
    matrix = confusion_matrix(y_test, preds)
    
    return metrics, matrix

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scale the numeric columns; every other column passes through unchanged.
numeric_transformer = MinMaxScaler()

preprocessor = ColumnTransformer(
    transformers=[('numeric_transformer', numeric_transformer, numeric_features)],
    remainder='passthrough',
)

# ColumnTransformer emits the transformed columns first and the passthrough
# columns after them, so the column labels are rebuilt in that order. Wrapping
# the output in DataFrames lets later cells use `.columns`.
passthrough_features = [col for col in X_train.columns if col not in numeric_features]
scaled_columns = list(numeric_features) + passthrough_features

# Fit the scaler on the training split only.
X_train_scaled = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=scaled_columns,
    index=X_train.index,
)
# The test split is only transformed. Calling `fit` here would refit the
# scaler on test data and return the transformer object instead of features.
X_test_scaled = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=scaled_columns,
    index=X_test.index,
)

In [None]:
# for scaled numerics
# Positional indices of the two-valued columns in the scaled training data.
# Going through pd.DataFrame(...) makes this work whether `X_train_scaled` is
# a DataFrame or the plain array a ColumnTransformer returns (which has no
# `.columns`).
_unique_counts = pd.DataFrame(X_train_scaled).nunique()
encoded_features = [position for position, count in enumerate(_unique_counts) if count == 2]
encoded_features

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

# Search space; each key carries the `logisticregression__` prefix that
# addresses the classifier step of the pipeline.
lr_params = dict(
    logisticregression__C=[0.01, 0.1, 1, 10, 100, 1000],
    logisticregression__solver=['lbfgs', 'liblinear', 'newton-cg', 'sag'],
    logisticregression__max_iter=[1000, 2500, 5000],
)

In [None]:
%%time

lr_model = train_classifier(lr, lr_params, X_train_scaled, y_train, encoded_features)

In [None]:
lr_metrics, lr_matrix = test_classifier(lr_model, X_test_scaled, y_test)

In [None]:
print(lr_metrics)
print('   ' * 18)
print('---' * 18)
print('   ' * 18)
print(lr_matrix)

In [None]:
# Support Vector Classifier

from sklearn.svm import SVC

svc = SVC()

# Search space; each key carries the `svc__` prefix that addresses the
# classifier step of the pipeline.
svc_params = dict(
    svc__C=[0.01, 0.1, 1, 10, 100, 1000],
    svc__kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    svc__max_iter=[1000, 2500, 5000],
)

In [None]:
%%time

svc_model = train_classifier(svc, svc_params, X_train_scaled, y_train, encoded_features)

In [None]:
# K-Neighbors Classifier

from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier()

# Search space; each key carries the `kneighborsclassifier__` prefix that
# addresses the classifier step of the pipeline.
# (A comma was missing after the `algorithm` entry, which made this cell a SyntaxError.)
kn_params = {'kneighborsclassifier__n_neighbors': [2, 4, 6, 8, 10],
             'kneighborsclassifier__weights': ['uniform', 'distance'],
             'kneighborsclassifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
             'kneighborsclassifier__leaf_size': [20, 25, 30, 35, 40]
            }

In [None]:
%%time

kn_model = train_classifier(kn, kn_params, X_train_scaled, y_train, encoded_features)

In [None]:
# for unscaled numerics
# Positional indices of the two-valued columns in the unscaled training frame.
# NOTE: this rebinds `encoded_features`; the cells above that train on the
# scaled data must run before this one.
encoded_features = [
    position
    for position, column in enumerate(X_train.columns)
    if X_train[column].nunique() == 2
]
encoded_features

In [None]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# Seeded like the split, SMOTENC and CV elsewhere in this notebook: with
# `max_features` in the search the tree draws random feature subsets, so an
# unseeded estimator gives different results on every run.
dt = DecisionTreeClassifier(random_state=42)

# Search space; each key carries the `decisiontreeclassifier__` prefix that
# addresses the classifier step of the pipeline.
dt_params = {'decisiontreeclassifier__criterion': ['gini', 'entropy'],
             'decisiontreeclassifier__max_depth': [2, 4, 6, 8, 10],
             'decisiontreeclassifier__max_features': ['sqrt', 'log2']
            }

In [None]:
%%time

dt_model = train_classifier(dt, dt_params, X_train, y_train, encoded_features)

In [None]:
# RandomForest

from sklearn.ensemble import RandomForestClassifier

# Seeded like the split, SMOTENC and CV elsewhere in this notebook so the
# forest's bootstrap samples and feature subsets are reproducible.
rf = RandomForestClassifier(random_state=42)

# Search space; each key carries the `randomforestclassifier__` prefix that
# addresses the classifier step of the pipeline.
rf_params = {'randomforestclassifier__n_estimators': [20, 40, 60, 80, 100],
             'randomforestclassifier__criterion': ['gini', 'entropy'],
             'randomforestclassifier__max_depth': [2, 4, 6, 8, 10],
             'randomforestclassifier__max_features': ['sqrt', 'log2']
            }

In [None]:
%%time

rf_model = train_classifier(rf, rf_params, X_train, y_train, encoded_features)

In [None]:
rf_metrics, rf_matrix = test_classifier(rf_model, X_test, y_test)

In [None]:
print(rf_metrics)
print('   ' * 18)
print('---' * 18)
print('   ' * 18)
print(rf_matrix)

In [None]:
# XGBoost

from xgboost import XGBClassifier

# Seeded like the split, SMOTENC and CV elsewhere in this notebook: the search
# includes row and column subsampling, which are random draws.
xgb = XGBClassifier(random_state=42)

# Search space; each key carries the `xgbclassifier__` prefix that addresses
# the classifier step of the pipeline.
xgb_params = {'xgbclassifier__n_estimators': [20, 40, 60, 80, 100],
              'xgbclassifier__max_depth': [2, 4, 6, 8, 10],
              'xgbclassifier__learning_rate': [0.05, 0.1, 0.15, 0.20],
              'xgbclassifier__min_child_weight': [1, 2, 3, 4],
              'xgbclassifier__subsample': [0.6, 0.8, 1.0],
              'xgbclassifier__colsample_bytree': [0.6, 0.8, 1.0]
             }

In [None]:
%%time

xgb_model = train_classifier(xgb, xgb_params, X_train, y_train, encoded_features)

In [None]:
xgb_metrics, xgb_matrix = test_classifier(xgb_model, X_test, y_test)

In [None]:
print(xgb_metrics)
print('   ' * 18)
print('---' * 18)
print('   ' * 18)
print(xgb_matrix)

In [None]:
# LightGBM

from lightgbm import LGBMClassifier

# `subsample_freq=1`: LightGBM only applies row subsampling when the bagging
# frequency is > 0 (its default of 0 disables it), so without this the
# `subsample` values searched below would have no effect.
# `random_state=42` matches the seed used elsewhere in this notebook.
lgbm = LGBMClassifier(subsample_freq=1, random_state=42)

# Search space; each key carries the `lgbmclassifier__` prefix that addresses
# the classifier step of the pipeline.
lgbm_params = {'lgbmclassifier__max_depth': [2, 4, 6, 8, 10],
               'lgbmclassifier__learning_rate': [0.05, 0.1, 0.15, 0.20],
               'lgbmclassifier__n_estimators': [20, 40, 60, 80, 100],
               'lgbmclassifier__min_child_weight': [1, 2, 3, 4],
               'lgbmclassifier__subsample': [0.6, 0.8, 1.0],
               'lgbmclassifier__colsample_bytree': [0.6, 0.8, 1.0]
              }

In [None]:
%%time

lgbm_model = train_classifier(lgbm, lgbm_params, X_train, y_train, encoded_features)

In [None]:
lgbm_metrics, lgbm_matrix = test_classifier(lgbm_model, X_test, y_test)

In [None]:
print(lgbm_metrics)
print('   ' * 18)
print('---' * 18)
print('   ' * 18)
print(lgbm_matrix)

In [None]:
# Disabled CatBoost experiment: the cell body is wrapped in a bare
# triple-quoted string, so nothing in it is executed.
'''# CatBoost

from catboost import CatBoostClassifier

cbc = CatBoostClassifier()

cbc_params = {'catboostclassifier__iterations': [20, 40, 60, 80, 100],
              'catboostclassifier__learning_rate': [0.05, 0.1, 0.15, 0.20],
              'catboostclassifier__depth': [2, 4, 6, 8, 10]
             }'''

In [None]:
# Disabled: CatBoost randomized search (string literal, not executed; the
# `%%time` inside it is plain text, not a cell magic).
'''%%time

cbc_model = train_classifier(cbc, cbc_params, X_train, y_train, encoded_features)'''

In [None]:
# Disabled: CatBoost evaluation on the held-out split (string literal, not executed).
'''cbc_metrics, cbc_matrix = test_classifier(cbc_model, X_test, y_test)'''

In [None]:
# Disabled: CatBoost report printing (string literal, not executed).
'''print(cbc_metrics)
print('   ' * 18)
print('---' * 18)
print('   ' * 18)
print(cbc_matrix)'''