In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the 'ggplot' style sheet to matplotlib figures drawn after this point.
plt.style.use('ggplot')
# Raise pandas' display limit so frames with up to 200 columns are shown in full.
pd.set_option('display.max_columns', 200)

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Base.csv")

In [4]:
# Independent copy of the loaded frame; later cells reassign `new_df`, not `df`.
new_df = df.copy()

In [5]:
# Names of the object-dtype columns, in the frame's column order.
categorical_features = [
    column for column, dtype in new_df.dtypes.items() if dtype == "O"
]
categorical_features

['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']

In [6]:
# Non-object columns with more than two distinct values; columns with one or
# two distinct values are left out of this list.
numeric_features = [
    column
    for column, dtype in new_df.dtypes.items()
    if not (dtype == "O" or new_df[column].nunique() <= 2)
]
numeric_features

['income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'bank_months_count',
 'proposed_credit_limit',
 'session_length_in_minutes',
 'device_distinct_emails_8w',
 'month']

In [7]:
# One-hot encode the categorical columns (dummy columns are named
# "<column>_<value>"). Encoding starts from the untouched `df` instead of from
# `new_df` itself, so re-running this cell rebuilds the same frame rather than
# failing because the object columns were already consumed by a previous run.
new_df = pd.get_dummies(df, columns=categorical_features)

In [8]:
# Preview the first rows of the one-hot encoded frame.
new_df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,credit_risk_score,email_is_free,phone_home_valid,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month,payment_type_AA,payment_type_AB,payment_type_AC,payment_type_AD,payment_type_AE,employment_status_CA,employment_status_CB,employment_status_CC,employment_status_CD,employment_status_CE,employment_status_CF,employment_status_CG,housing_status_BA,housing_status_BB,housing_status_BC,housing_status_BD,housing_status_BE,housing_status_BF,housing_status_BG,source_INTERNET,source_TELEAPP,device_os_linux,device_os_macintosh,device_os_other,device_os_windows,device_os_x11
0,1,0.9,0.166828,-1,88,50,0.020925,-1.331345,769,10650.765523,3134.31963,3863.64774,1,6,185,0,1,0,24,0,500.0,0,3.888115,0,1,0,7,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
1,1,0.9,0.296286,-1,144,50,0.005418,-0.816224,366,534.047319,2670.918292,3124.298166,718,3,259,1,0,0,15,0,1500.0,0,31.798819,0,1,0,7,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
2,1,0.9,0.044985,-1,132,40,3.108549,-0.755728,870,4048.534263,2893.621498,3159.590679,1,14,177,1,0,1,-1,0,200.0,0,4.728705,0,1,0,7,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
3,1,0.9,0.159511,-1,22,50,0.019079,-1.205124,810,3457.064063,4054.908412,3022.261812,1921,6,110,1,0,1,31,1,200.0,0,2.047904,0,1,0,7,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0
4,1,0.9,0.596414,-1,218,50,0.004441,-0.773276,890,5020.341679,2728.237159,3087.670952,1990,2,295,1,1,0,31,0,1500.0,0,3.775225,1,1,0,7,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0


In [9]:
# Target is the fraud flag; the feature matrix is every remaining column
# except `device_fraud_count`.
y = new_df['fraud_bool']
X = new_df.drop(columns=['fraud_bool', 'device_fraud_count'])

In [10]:
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTENC

In [11]:
from collections import Counter

In [12]:
%%time

# NearMiss Undersampling
# Class counts of the full label vector `y` before resampling.
# NOTE(review): the message says "Test dataset", but `y` is the complete label
# column, not a test split.
print(f'Test dataset samples per class {Counter(y)}')

# sampling_strategy=0.25 asks for a minority:majority ratio of 1:4 after the
# majority class is undersampled; n_jobs=-1 requests all available cores.
# NOTE(review): resampling runs on the full X, y, and the train/test split in a
# later cell is drawn from X_nm, y_nm, so the held-out split is undersampled as
# well — confirm this is intended.
nm = NearMiss(sampling_strategy=0.25, n_jobs=-1)
X_nm, y_nm = nm.fit_resample(X, y)

print('Resampled dataset shape %s' % Counter(y_nm))

Test dataset samples per class Counter({0: 988971, 1: 11029})
Resampled dataset shape Counter({0: 44116, 1: 11029})
Wall time: 5min 2s


In [67]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold

In [68]:
# Stratified 75/25 split of the undersampled data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_nm,
    y_nm,
    test_size=0.25,
    stratify=y_nm,
    random_state=42,
)

In [69]:
# https://stackoverflow.com/questions/48370150/how-to-implement-smote-in-cross-validation-and-gridsearchcv
# https://stackoverflow.com/questions/69388476/pipeline-and-gridsearch-pipeline-fully-recomputed

def train_classifier(classifier, param_dist, X_train, y_train, encoded_features,
                     n_iter=10, scoring="accuracy", n_splits=4, random_state=42):
    """Tune ``classifier`` with a randomized search over a SMOTENC + classifier pipeline.

    The oversampler and the classifier are chained with imblearn's
    ``make_pipeline`` and the whole pipeline is handed to the search, so both
    steps are refitted for every cross-validation split.

    Parameters
    ----------
    classifier : estimator
        Unfitted classifier used as the final pipeline step.
    param_dist : dict
        Search space. Keys must carry the pipeline step prefix, e.g.
        ``"logisticregression__C"``.
    X_train, y_train : array-like
        Training features and labels.
    encoded_features : list
        Columns that SMOTENC must treat as categorical.
    n_iter : int, default 10
        Number of parameter settings sampled by the search.
    scoring : str, default "accuracy"
        Metric used to rank candidates.
        NOTE(review): accuracy can be uninformative on imbalanced labels;
        consider passing e.g. ``"roc_auc"``.
    n_splits : int, default 4
        Number of stratified folds.
    random_state : int, default 42
        Seed shared by SMOTENC, the fold shuffling and the parameter sampling.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    smote_nc = SMOTENC(categorical_features=encoded_features,
                       sampling_strategy='minority',
                       random_state=random_state)

    pipeline = make_pipeline(smote_nc, classifier)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Seeding the search itself makes the sampled candidates repeatable; without
    # it every run would draw a different set of parameter combinations.
    rand_search = RandomizedSearchCV(estimator=pipeline,
                                     param_distributions=param_dist,
                                     n_iter=n_iter,
                                     scoring=scoring,
                                     n_jobs=-1,
                                     cv=cv,
                                     random_state=random_state)

    rand_search.fit(X_train, y_train)

    return rand_search

In [70]:
def test_classifier(classifier, X_test, y_test, max_fpr=0.05):
    """Evaluate a fitted classifier at the ROC operating point just below ``max_fpr``.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose ``predict_proba``; column 1 is used as the positive-class score.
    X_test, y_test : array-like
        Hold-out features and binary labels.
    max_fpr : float, default 0.05
        Exclusive upper bound on the false-positive rate of the chosen
        operating point.

    Returns
    -------
    fprs, tprs : ndarray
        Full ROC curve as returned by ``roc_curve``.
    recall : float
        True-positive rate at the chosen operating point.
    cls_report : str
        ``classification_report`` for the thresholded predictions.
    con_matrix : ndarray
        ``confusion_matrix`` for the thresholded predictions.

    Raises
    ------
    ValueError
        If no ROC point has a false-positive rate below ``max_fpr``.
    """
    preds = classifier.predict_proba(X_test)[:, 1]

    fprs, tprs, thresholds = roc_curve(y_test, preds)

    within_budget = fprs < max_fpr
    if not within_budget.any():
        raise ValueError(f"no ROC point has a false-positive rate below {max_fpr}")

    # Largest achievable FPR that still respects the budget; several ROC points
    # can share it, so take the lowest threshold / highest TPR among them.
    operating_fpr = np.max(fprs[within_budget])
    at_operating_point = fprs == operating_fpr
    threshold = np.min(thresholds[at_operating_point])
    recall = np.max(tprs[at_operating_point])

    # roc_curve counts a sample as positive when score >= threshold, so the same
    # inclusive comparison is needed for the report and matrix to match `recall`.
    preds_binary = (preds >= threshold).astype(int)

    cls_report = classification_report(y_test, preds_binary, target_names=['No Fraud', 'Fraud'])
    con_matrix = confusion_matrix(y_test, preds_binary)

    return fprs, tprs, recall, cls_report, con_matrix

In [71]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

In [72]:
# Min-max scale only the numeric columns. The remaining columns are passed
# through unchanged.
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(
    transformers=[('numeric_transformer', numeric_transformer, numeric_features)],
    remainder='passthrough',
)

# The scaler is fitted on the training split and then reused for the test split.
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [73]:
# Wrap both transformed outputs in DataFrames (columns get default integer labels).
X_train_scaled, X_test_scaled = (
    pd.DataFrame(transformed) for transformed in (X_train_scaled, X_test_scaled)
)

In [74]:
# Column labels of the scaled training frame that hold exactly two distinct
# values (for scaled numerics).
is_two_valued = (X_train_scaled.nunique() == 2).to_numpy()
encoded_features = X_train_scaled.columns[is_two_valued].tolist()
encoded_features

[19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50]

In [75]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

# Search space; keys use the "logisticregression__<param>" form.
# NOTE(review): the following cell reassigns both `lr` and `lr_params` with a
# smaller grid before either is used, so this definition is overwritten.
lr_params = {'logisticregression__C': [0.01, 0.1, 1, 10, 100, 1000],
             'logisticregression__solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag'],
             'logisticregression__max_iter': [1000, 2500, 5000]
            }

In [76]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

# Seed the estimator: the 'sag' and 'liblinear' solvers in the grid shuffle the
# data, so an unseeded model can give a different fit on every run. The seed
# matches the random_state=42 used elsewhere in this notebook.
lr = LogisticRegression(random_state=42)

# Search space; keys use the "logisticregression__<param>" form.
lr_params = {
    'logisticregression__C': [0.1, 1, 10, 100],
    'logisticregression__solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag'],
    'logisticregression__max_iter': [1000, 2500],
}

In [77]:
%%time

# Randomized search for logistic regression on the scaled training split.
lr_model = train_classifier(
    classifier=lr,
    param_dist=lr_params,
    X_train=X_train_scaled,
    y_train=y_train,
    encoded_features=encoded_features,
)

Wall time: 4min 32s


In [78]:
# Evaluate the fitted search on the scaled hold-out split.
(
    lr_fprs,
    lr_tprs,
    lr_recall,
    lr_cls_report,
    lr_con_matrix,
) = test_classifier(lr_model, X_test_scaled, y_test)

In [None]:
# The evaluation cell stores its results as `lr_cls_report` and `lr_con_matrix`;
# the `lr_metrics` / `lr_matrix` names used here before were never assigned.
print(lr_cls_report)
print('   ' * 18)
print('---' * 18)
print('   ' * 18)
print(lr_con_matrix)

In [None]:
# Positional indices of the two-valued columns in the unscaled training frame
# (for unscaled numerics).
encoded_features = [
    position
    for position, column in enumerate(X_train.columns)
    if X_train[column].nunique() == 2
]
encoded_features

In [None]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# Seed the tree: with max_features set to 'sqrt' or 'log2' the candidate
# features at each split are drawn at random, so an unseeded tree is not
# repeatable. The seed matches the random_state=42 used elsewhere in this notebook.
dt = DecisionTreeClassifier(random_state=42)

# Search space; keys use the "decisiontreeclassifier__<param>" form.
dt_params = {
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
    'decisiontreeclassifier__max_depth': [2, 4, 6, 8, 10],
    'decisiontreeclassifier__max_features': ['sqrt', 'log2'],
}

In [None]:
%%time

# Randomized search for the decision tree on the unscaled training split.
dt_model = train_classifier(
    classifier=dt,
    param_dist=dt_params,
    X_train=X_train,
    y_train=y_train,
    encoded_features=encoded_features,
)

In [None]:
# RandomForest

from sklearn.ensemble import RandomForestClassifier

# Seed the forest: bootstrap sampling and per-split feature subsets are random,
# so an unseeded forest gives different results on every run. The seed matches
# the random_state=42 used elsewhere in this notebook.
rf = RandomForestClassifier(random_state=42)

# Search space; keys use the "randomforestclassifier__<param>" form.
rf_params = {
    'randomforestclassifier__n_estimators': [20, 40, 60, 80, 100],
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__max_depth': [2, 4, 6, 8, 10],
    'randomforestclassifier__max_features': ['sqrt', 'log2'],
}

In [None]:
%%time

# Randomized search for the random forest on the unscaled training split.
rf_model = train_classifier(
    classifier=rf,
    param_dist=rf_params,
    X_train=X_train,
    y_train=y_train,
    encoded_features=encoded_features,
)

In [None]:
# test_classifier returns five values (fprs, tprs, recall, cls_report, con_matrix);
# unpacking only two raised ValueError. The report and matrix keep the
# `rf_metrics` / `rf_matrix` names that the printing cell reads.
rf_fprs, rf_tprs, rf_recall, rf_metrics, rf_matrix = test_classifier(rf_model, X_test, y_test)

In [None]:
# Report, a dashed divider padded by whitespace-only lines, then the matrix.
print(rf_metrics, '   ' * 18, '---' * 18, '   ' * 18, rf_matrix, sep='\n')

In [None]:
# XGBoost

from xgboost import XGBClassifier

# Seed the booster: the grid searches `subsample` and `colsample_bytree`, which
# draw random row/column subsets, so an unseeded model is not repeatable. The
# seed matches the random_state=42 used elsewhere in this notebook.
xgb = XGBClassifier(random_state=42)

# Search space; keys use the "xgbclassifier__<param>" form.
xgb_params = {
    'xgbclassifier__n_estimators': [20, 40, 60, 80, 100],
    'xgbclassifier__max_depth': [2, 4, 6, 8, 10],
    'xgbclassifier__learning_rate': [0.05, 0.1, 0.15, 0.20],
    'xgbclassifier__min_child_weight': [1, 2, 3, 4],
    'xgbclassifier__subsample': [0.6, 0.8, 1.0],
    'xgbclassifier__colsample_bytree': [0.6, 0.8, 1.0],
}

In [None]:
%%time

# Randomized search for XGBoost on the unscaled training split.
xgb_model = train_classifier(
    classifier=xgb,
    param_dist=xgb_params,
    X_train=X_train,
    y_train=y_train,
    encoded_features=encoded_features,
)

In [None]:
# test_classifier returns five values (fprs, tprs, recall, cls_report, con_matrix);
# unpacking only two raised ValueError. The report and matrix keep the
# `xgb_metrics` / `xgb_matrix` names that the printing cell reads.
xgb_fprs, xgb_tprs, xgb_recall, xgb_metrics, xgb_matrix = test_classifier(xgb_model, X_test, y_test)

In [None]:
# Report, a dashed divider padded by whitespace-only lines, then the matrix.
print(xgb_metrics, '   ' * 18, '---' * 18, '   ' * 18, xgb_matrix, sep='\n')

In [None]:
# LightGBM

from lightgbm import LGBMClassifier

# Seeded for repeatability, matching the random_state=42 used elsewhere in this
# notebook. subsample_freq=1 turns row bagging on at every iteration: LightGBM
# ignores `subsample` while subsample_freq is 0 (its default), which would make
# the `subsample` values searched below a no-op.
lgbm = LGBMClassifier(random_state=42, subsample_freq=1)

# Search space; keys use the "lgbmclassifier__<param>" form.
lgbm_params = {
    'lgbmclassifier__max_depth': [2, 4, 6, 8, 10],
    'lgbmclassifier__learning_rate': [0.05, 0.1, 0.15, 0.20],
    'lgbmclassifier__n_estimators': [20, 40, 60, 80, 100],
    'lgbmclassifier__min_child_weight': [1, 2, 3, 4],
    'lgbmclassifier__subsample': [0.6, 0.8, 1.0],
    'lgbmclassifier__colsample_bytree': [0.6, 0.8, 1.0],
}

In [None]:
%%time

# Randomized search for LightGBM on the unscaled training split.
lgbm_model = train_classifier(
    classifier=lgbm,
    param_dist=lgbm_params,
    X_train=X_train,
    y_train=y_train,
    encoded_features=encoded_features,
)

In [None]:
# test_classifier returns five values (fprs, tprs, recall, cls_report, con_matrix);
# unpacking only two raised ValueError. The report and matrix keep the
# `lgbm_metrics` / `lgbm_matrix` names that the printing cell reads.
lgbm_fprs, lgbm_tprs, lgbm_recall, lgbm_metrics, lgbm_matrix = test_classifier(lgbm_model, X_test, y_test)

In [None]:
# Report, a dashed divider padded by whitespace-only lines, then the matrix.
print(lgbm_metrics, '   ' * 18, '---' * 18, '   ' * 18, lgbm_matrix, sep='\n')