In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from utils import eda
from random import randint

In [2]:
data = pd.read_csv("./ecd1-2023/dataset_train.csv")

In [3]:
SEED = randint(0, 1000)
COLUMNS_TO_DROP = ["ID"]
TARGET = "fraud_bool"
FEATURES = list(set(data.columns) - set([TARGET]) - set(COLUMNS_TO_DROP))
FEATURES_CATEGORICAL = [col for col in FEATURES if data[col].dtype == 'object']
FEATURES_NUMERIC = list(set(FEATURES) - set(FEATURES_CATEGORICAL))

print("SEED: ", SEED)
print("Features: ", FEATURES)
print("Target: ", TARGET)
print("Categorical features: ", FEATURES_CATEGORICAL)
print("Numeric features: ", FEATURES_NUMERIC)

SEED:  194
Features:  ['employment_status', 'prev_address_months_count', 'velocity_24h', 'has_other_cards', 'email_is_free', 'days_since_request', 'device_os', 'zip_count_4w', 'name_email_similarity', 'session_length_in_minutes', 'month', 'credit_risk_score', 'phone_home_valid', 'current_address_months_count', 'payment_type', 'recent_loan_approval_ratio', 'velocity_4w', 'housing_status', 'customer_age', 'velocity_6h', 'bank_branch_count_8w', 'income', 'transaction_amount_ratio', 'phone_mobile_valid', 'date_of_birth_distinct_emails_4w', 'device_distinct_emails_8w', 'foreign_request', 'proposed_credit_limit', 'bank_months_count', 'keep_alive_session', 'intended_balcon_amount', 'distance_to_nearest_bank_branch', 'device_fraud_count', 'credit_utilization_ratio', 'source']
Target:  fraud_bool
Categorical features:  ['employment_status', 'device_os', 'payment_type', 'housing_status', 'source']
Numeric features:  ['prev_address_months_count', 'velocity_24h', 'has_other_cards', 'email_is_free'

In [4]:
data = eda.handle_numeric_missing_values(data=data, columns=FEATURES_NUMERIC)
data = eda.handle_categorical_missing_values(data=data, columns=FEATURES_CATEGORICAL)

In [5]:
from sklearn.preprocessing import MinMaxScaler

data, new_cols = eda.handle_one_hot_encoding(data=data, columns=FEATURES_CATEGORICAL)
FEATURES_CATEGORICAL = new_cols
data = eda.handle_scaling(data=data, columns=FEATURES_NUMERIC, scaler=MinMaxScaler())

Encoding column:  employment_status
Encoding column:  device_os
Encoding column:  payment_type
Encoding column:  housing_status
Encoding column:  source


In [6]:
# Training and Testing Sets
from collections import Counter
from sklearn.model_selection import train_test_split

x = data[FEATURES_NUMERIC + FEATURES_CATEGORICAL]

y = data[TARGET]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=SEED,
    stratify=y,
)

print(f"x_train: {x_train.shape}; {Counter(y_train)} imb: {Counter(y_train)[1]/Counter(y_train)[0]}")
print(f"x_test: {x_test.shape}; {Counter(y_test)} imb: {Counter(y_test)[1]/Counter(y_test)[0]}")

# x_train: (56000, 56); Counter({0: 45473, 1: 10527}) imb: 0.2315000109955358
# x_test: (14000, 56); Counter({0: 11368, 1: 2632}) imb: 0.2315270935960591

x_train: (56000, 56); Counter({0: 45473, 1: 10527}) imb: 0.2315000109955358
x_test: (14000, 56); Counter({0: 11368, 1: 2632}) imb: 0.2315270935960591


In [7]:
# from sklearn.ensemble import IsolationForest


# train = pd.concat([x_train, y_train], axis=1)
# x_train_0 = train[train[TARGET] == 0].drop(TARGET, axis=1)
# x_train_1 = train[train[TARGET] == 1].drop(TARGET, axis=1)
# print(x_train_0.shape)
# print(x_train_1.shape)

# od0 = IsolationForest(random_state=SEED, bootstrap=True).fit(x_train_0)
# od1 = IsolationForest(random_state=SEED, bootstrap=True).fit(x_train_1)

# x_train_0["is_outlier"] = od0.predict(x_train_0)
# x_train_0["is_outlier"] = x_train_0["is_outlier"].apply(lambda x: 1 if x == -1 else 0)
# x_train_1["is_outlier"] = od1.predict(x_train_1)
# x_train_1["is_outlier"] = x_train_1["is_outlier"].apply(lambda x: 1 if x == -1 else 0)

# print("X_train_0:", x_train_0["is_outlier"].value_counts())
# print("X_train_1:", x_train_1["is_outlier"].value_counts())

# x_train_0 = x_train_0[x_train_0["is_outlier"] == 1].drop("is_outlier", axis=1)
# x_train_0[TARGET] = 0
# x_train_1 = x_train_1[x_train_1["is_outlier"] == 1].drop("is_outlier", axis=1)
# x_train_1[TARGET] = 1

# print("X_train_0:", x_train_0.shape)
# print("X_train_1:", x_train_1.shape)

# x_train_new = pd.concat([x_train_0, x_train_1], axis=0)
# # mix rows in _new
# x_train_new = x_train_new.sample(frac=1).reset_index(drop=True)
# x_train_new.head()

# y_train = x_train_new[TARGET]
# x_train = x_train_new.drop(TARGET, axis=1)


# # clf = IsolationForest(random_state=SEED, bootstrap=True).fit(x_train)
# # x_train["is_outlier"] = clf.predict(x_train)
# # x_train["is_outlier"].value_counts()
# # out = x_train[x_train["is_outlier"] == -1].index
# # y_train[out].value_counts()

In [8]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=5, random_state=SEED)
# pca.fit(x_train_new)
# x_train_pca = pca.transform(x_train_new)
# plt.figure(figsize=(3, 3))
# plt.scatter(x_train_pca[:, 1], x_train_pca[:, 3], c=y_train_new, cmap="plasma", s=1)
# plt.xlabel("PCA 1")
# plt.ylabel("PCA 2")
# plt.grid(True)
# plt.show()

In [9]:
# x_train = x_train.drop(out)
# x_train = x_train.drop("is_outlier", axis=1)
# y_train = y_train.drop(out)
# y_train.value_counts()

In [10]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

counter = Counter(y_train)
print('Before', counter)

rus = RandomUnderSampler(random_state=SEED)
x_train_un, y_train_un = rus.fit_resample(x_train, y_train)

counter = Counter(y_train_un)
print('After', counter)

# Before Counter({0: 45473, 1: 10527})
# After Counter({0: 10527, 1: 10527})

Before Counter({0: 45473, 1: 10527})
After Counter({0: 10527, 1: 10527})


In [11]:
# from utils.score import ScoreClassification
# clf = IsolationForest(random_state=SEED, bootstrap=True).fit(x_test)
# y_pred = clf.predict(x_test)
# y_pred = np.where(y_pred == -1, 1, 0)
# print(sum(y_pred))
# print(len(y_pred))

# score = ScoreClassification()
# score.calculate(y_test, y_pred)
# print(score)

In [12]:
# from sklearn.feature_selection import SelectKBest, f_regression

# feature_selector = SelectKBest(k=20, score_func=f_regression)

# feature_selector.fit(x_train, y_train)
# x_train = feature_selector.transform(x_train)
# x_test = feature_selector.transform(x_test)

In [13]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=10, random_state=SEED)
# pca.fit(x_train)
# x_train = pca.transform(x_train)
# x_test = pca.transform(x_test)

In [14]:
# plt.figure(figsize=(3, 3))
# plt.scatter(x_train[:, 3], x_train[:, 5], c=y_train, cmap="plasma", s=1)
# plt.xlabel("PCA 1")
# plt.ylabel("PCA 2")
# plt.grid(True)
# plt.show()

In [15]:
from xgboost import XGBClassifier
from utils.model import ModelOptimizer, SeachMethod

search_space = {
    "n_estimators": (50, 500),
    "max_depth": (1, 20),
    "learning_rate": (0.01, 0.5, "log-uniform"),
    "reg_alpha": (0, 1),
}

# search_space_grid = {
#     "n_estimators": [10, 50, 100, 200],
#     "max_depth": [1, 5, 10, 20, None],
#     "learning_rate": [0.01, 0.1, 0.5],
#     "reg_alpha": [0, 0.5, 1],
# }

opt = ModelOptimizer(
    name="xgboost",
    model=XGBClassifier(),
    search_space=search_space,
    search_method=SeachMethod.BAYES,
    k_fold=5,
)
opt.train(x_train, y_train)
opt.evaluate(x_test, y_test)
print(opt.scores)

Training model: xgboost
Hyper tuning: SeachMethod.BAYES
Best params: OrderedDict({'learning_rate': 0.18869253516173146, 'max_depth': 4, 'n_estimators': 216, 'reg_alpha': 1})
ScoreClassification(accuracy=0.8885714285714286, precision=0.8884057971014493, recall=0.46580547112462006, f1=0.6111665004985045, auc_roc=0.7261293365475316, kappa=0.5534078417002565, confusion_matrix=array([[11214,   154],
       [ 1406,  1226]]))


In [16]:
## Results XGBoost
## --------------------
# ScoreClassification(
#     accuracy=0.8890714285714286,
#     precision=0.8912255257432923,
#     recall=0.46694528875379937,
#     f1=0.612814759411618,
#     auc_roc=0.7268751778040637,
#     kappa=0.5553323468373336,
#     confusion_matrix=array([[11218, 150], [1403, 1229]]),
# )

# Best params: {'learning_rate': 0.5, 'max_depth': None, 'n_estimators': 10, 'reg_alpha': 0.5}
# ScoreClassification(
#     accuracy=0.8868571428571429,
#     precision=0.8847283406754772,
#     recall=0.45782674772036475,
#     f1=0.6034051076614922,
#     auc_roc=0.7220080255139474,
#     kappa=0.5450743964194138,
#     confusion_matrix=array([[11211, 157], [1427, 1205]]),
# )

# Best params: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 200, 'reg_alpha': 0.5}
# ScoreClassification(
#     accuracy=0.8547857142857143,
#     precision=0.9009370816599732,
#     recall=0.25569908814589665,
#     f1=0.3983427049422906,
#     auc_roc=0.6245947938970159,
#     kappa=0.3437972866195328,
#     confusion_matrix=array([[11294, 74], [1959, 673]]),
# )

# Best params: {'learning_rate': 0.5, 'max_depth': 1, 'n_estimators': 200, 'reg_alpha': 1}
# ScoreClassification(
#     accuracy=0.8654285714285714,
#     precision=0.9065217391304348,
#     recall=0.3168693009118541,
#     f1=0.46959459459459457,
#     auc_roc=0.6546521029541678,
#     kappa=0.41236650364307814,
#     confusion_matrix=array([[11282, 86], [1798, 834]]),
# )

# Best params: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'reg_alpha': 1}
# ScoreClassification(
#     accuracy=0.8872857142857142,
#     precision=0.8818840579710145,
#     recall=0.4623860182370821,
#     f1=0.6066799601196411,
#     auc_roc=0.7240237621093926,
#     kappa=0.5482548552583364,
#     confusion_matrix=array([[11205, 163], [1415, 1217]]),
# )

# Best params: OrderedDict({'learning_rate': 0.18869253516173146, 'max_depth': 4, 'n_estimators': 216, 'reg_alpha': 1})
# ScoreClassification(
#     accuracy=0.8885714285714286,
#     precision=0.8884057971014493,
#     recall=0.46580547112462006,
#     f1=0.6111665004985045,
#     auc_roc=0.7261293365475316,
#     kappa=0.5534078417002565,
#     confusion_matrix=array([[11214, 154], [1406, 1226]]),
# )

In [17]:
# from sklearn.ensemble import RandomForestClassifier
# from utils.model import ModelOptimizer, SeachMethod

# # Create the random grid
# search_space = {
#     "n_estimators": [400],  # based on other experiments
#     "max_features": ["sqrt"],  # based on other experiments
#     "max_depth": [40], # based on other experiments
#     "min_samples_split": [2],  # based on other experiments
#     "min_samples_leaf": [1],  # based on other experiments
#     "bootstrap": [True],  # based on other experiments
# }

# opt = ModelOptimizer(
#     name="random_forest",
#     model=RandomForestClassifier(),
#     search_space=search_space,
#     search_method=SeachMethod.GRID,
#     k_fold=2,
# )
# opt.train(x_train, y_train)
# opt.evaluate(x_test, y_test)
# print(opt.scores)


In [18]:
## Results Random Forest
## --------------------
# Best params: {'bootstrap': True, 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
# ScoreClassification(
#     accuracy=0.8892857142857142,
#     precision=0.8948905109489051,
#     recall=0.46580547112462006,
#     f1=0.6126936531734134,
#     auc_roc=0.7265691676523874,
#     kappa=0.5554765291607398,
#     confusion_matrix=array([[11224, 144], [1406, 1226]]),
# )
# ScoreClassification(
#     accuracy=0.8892857142857142,
#     precision=0.8948905109489051,
#     recall=0.46580547112462006,
#     f1=0.6126936531734134,
#     auc_roc=0.7265691676523874,
#     kappa=0.5554765291607398,
#     confusion_matrix=array([[11224, 144], [1406, 1226]]),
# )

# Best params: {'bootstrap': True, 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
# ScoreClassification(
#     accuracy=0.8866428571428572,
#     precision=0.891972993248312,
#     recall=0.45174772036474165,
#     f1=0.5997477931904162,
#     auc_roc=0.7195402922724484,
#     kappa=0.5418316111360035,
#     confusion_matrix=array([[11224, 144], [1443, 1189]]),
# )

# Best params: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
# ScoreClassification(
#     accuracy=0.8707857142857143,
#     precision=0.7377238590410168,
#     recall=0.4851823708206687,
#     f1=0.5853770341508137,
#     auc_roc=0.722622853249884,
#     kappa=0.5126805425651592,
#     confusion_matrix=array([[10914, 454], [1355, 1277]]),
# )

# Best params: {'bootstrap': True, 'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
# ScoreClassification(
#     accuracy=0.8678571428571429,
#     precision=0.7196629213483146,
#     recall=0.4867021276595745,
#     f1=0.5806890299184043,
#     auc_roc=0.721403491697486,
#     kappa=0.5057070793433653,
#     confusion_matrix=array([[10869, 499], [1351, 1281]]),
# )

# Best params: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
# ScoreClassification(
#     accuracy=0.8724285714285714,
#     precision=0.7411630558722919,
#     recall=0.4939209726443769,
#     f1=0.5927952576379388,
#     auc_roc=0.7269921541617381,
#     kappa=0.5207293929739896,
#     confusion_matrix=array([[10914, 454], [1332, 1300]]),
# )

# Best params: {'bootstrap': True, 'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
# ScoreClassification(
#     accuracy=0.8743571428571428,
#     precision=0.7539267015706806,
#     recall=0.49240121580547114,
#     f1=0.5957251206619167,
#     auc_roc=0.727595752167338,
#     kappa=0.5251920826117189,
#     confusion_matrix=array([[10945, 423], [1336, 1296]]),
# )

# Best params: {'bootstrap': True, 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
# ScoreClassification(
#     accuracy=0.8886428571428572,
#     precision=0.8953574060427414,
#     recall=0.4616261398176292,
#     f1=0.6091752318876912,
#     auc_roc=0.7245674682198632,
#     kappa=0.551852839855949,
#     confusion_matrix=array([[11226, 142], [1417, 1215]]),
# )