<a href="https://colab.research.google.com/github/vitaliy-sharandin/data_science_projects/blob/master/portfolio/classification/Fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Datasets
* https://www.kaggle.com/datasets/ealaxi/paysim1

# Tutorial
* https://thecleverprogrammer.com/2022/02/22/online-payments-fraud-detection-with-machine-learning/

#EDA

In [2]:
!pip install -U -q datasets
!pip install -U -q ydata-profiling
!pip install -U -q feature_engine
!pip install -U -q optuna

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/519.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.3/357.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

In [3]:
from datasets import load_dataset
from ydata_profiling import ProfileReport
import pandas as pd
from feature_engine.encoding import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, fbeta_score
import optuna
from imblearn.over_sampling import SMOTE

In [4]:
fraud_dataset = load_dataset("vitaliy-sharandin/synthetic-fraud-detection")
fraud_df = fraud_dataset['train'].to_pandas()

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/494M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# profile = ProfileReport(fraud_df, title="Fraud data report", dark_mode=True)
# profile.to_notebook_iframe()

Phase results
* Analysis
  * No missing values.
  * No feature is highly correlated with target.
  * Some features are highly correlated among themselves.



# Feature selection and engineering

In [6]:
def categorize_variables(target, df_train, cat_numeric_unique_threshold=10):
  target = target
  categorical_numeric = [var for var in df_train.columns if df_train[var].dtype!='O' and var!=target and df_train[var].nunique()<=cat_numeric_unique_threshold]
  continuous = [var for var in df_train.columns if df_train[var].dtype!='O' and var!=target and var not in categorical_numeric]
  mixed = [var for var in df_train.columns if pd.api.types.infer_dtype(df_train[var]) == 'mixed']
  categorical_object = [var for var in df_train.columns if df_train[var].dtype=='O' and var not in mixed]
  sorted_features = [target]+categorical_numeric+continuous+categorical_object+mixed
  print('Total columns: '+str(df_train.columns.size)+'\nColumns after sorting: '+str(len(sorted_features)))
  return target, categorical_numeric, continuous, mixed, categorical_object
target, categorical_numeric, continuous, mixed, categorical_object = categorize_variables('isFraud', fraud_df)

Total columns: 11
Columns after sorting: 11


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(fraud_df, fraud_df[target], test_size=0.2, stratify=fraud_df[target], random_state=42)

In [8]:
encoder = OrdinalEncoder(
    variables=categorical_object,
    encoding_method='ordered'
)

X_train = encoder.fit_transform(X_train, y_train)
X_test = encoder.fit_transform(X_test, y_test)

In [9]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Model selection

In [12]:
def objective(trial):

    classifier_name = trial.suggest_categorical("classifier", ['XGBoost', 'LGBM'])

    if classifier_name =="XGBoost":
      params = {
          # "tree_method": 'gpu_hist',
          "objective": "binary:logistic",
          "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
          "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
          "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
          "max_depth" : trial.suggest_int("max_depth", 1, 9),
          "eta" : trial.suggest_float("eta", 1e-8, 1.0, log=True),
          "gamma" : trial.suggest_float("gamma", 1e-8, 1.0, log=True),
          "grow_policy" : trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
      }
      model = XGBClassifier(**params)

    else:
      params = {
          "objective": "binary",
          "metric": "binary_logloss",
          "verbosity": -1,
          "boosting_type": "gbdt",
          # "device" : "gpu",
          "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
          "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
          "num_leaves": trial.suggest_int("num_leaves", 2, 256),
          "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
          "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
          "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
          "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
      }
      model = LGBMClassifier(**params)

    f2_scorer = make_scorer(fbeta_score, beta=2)
    score = cross_val_score(model, X_train, y_train, cv=3, scoring=f2_scorer)
    average_score = score.mean()

    return average_score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

display(study.best_params)
display(study.best_value)

[I 2023-08-26 15:35:36,532] A new study created in memory with name: no-name-7f362f39-f0be-47fe-a336-f19fdae1461c


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2023-08-26 15:39:43,150] Trial 0 finished with value: 1.0 and parameters: {'classifier': 'LGBM', 'lambda_l1': 0.0059994204267344, 'lambda_l2': 6.022935098323057e-08, 'num_leaves': 142, 'feature_fraction': 0.6563740267256111, 'bagging_fraction': 0.5453040342453932, 'bagging_freq': 5, 'min_child_samples': 13}. Best is trial 0 with value: 1.0.
[W 2023-08-26 15:41:06,747] Trial 1 failed with parameters: {'classifier': 'XGBoost', 'booster': 'gbtree', 'lambda': 1.6373717710478635e-06, 'alpha': 0.0005299112401273169, 'max_depth': 8, 'eta': 0.45783161830119046, 'gamma': 1.2474816594327758e-06, 'grow_policy': 'depthwise'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-12-7d23ebea52cf>", line 37, in objective
    score = cross_val_score(model, X_train, y_train, cv=3, scoring=f2_scorer)
  File "/usr

KeyboardInterrupt: ignored