<a href="https://colab.research.google.com/github/vitaliy-sharandin/data_science_projects/blob/master/portfolio/classification/Fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Datasets
* https://www.kaggle.com/datasets/ealaxi/paysim1

# Tutorial
* https://thecleverprogrammer.com/2022/02/22/online-payments-fraud-detection-with-machine-learning/

# EDA

In [11]:
!pip install -U -q datasets
!pip install -U -q ydata-profiling
!pip install -U -q feature_engine
!pip install -U -q optuna
!pip install -U -q boruta

In [12]:
from datasets import load_dataset
from ydata_profiling import ProfileReport
import pandas as pd
from feature_engine.encoding import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, fbeta_score
import optuna
from imblearn.over_sampling import SMOTE
from boruta import BorutaPy

ImportError: ignored

In [7]:
# Fetch the synthetic fraud-detection dataset and convert its 'train' split
# into a pandas DataFrame for the rest of the notebook.
fraud_dataset = load_dataset(path="vitaliy-sharandin/synthetic-fraud-detection")
fraud_df = fraud_dataset["train"].to_pandas()

NameError: ignored

In [None]:
# profile = ProfileReport(fraud_df, title="Fraud data report", dark_mode=True)
# profile.to_notebook_iframe()

In [None]:
# Remove the 'isFlaggedFraud' column. `errors='ignore'` keeps this cell idempotent:
# because it overwrites `fraud_df`, a second run would otherwise raise KeyError.
fraud_df = fraud_df.drop(columns='isFlaggedFraud', errors='ignore')

In [None]:
# Target number of rows in the reduced dataset.
new_df_length = 2000000

# Keep every row with isFraud == 1 and downsample only the isFraud == 0 rows.
# (The variable names used to be swapped relative to the filters they held.)
fraud_samples = fraud_df[fraud_df['isFraud'] == 1]
non_fraud_samples = fraud_df[fraud_df['isFraud'] == 0]

# Clamp the sample size to [0, rows available]: DataFrame.sample raises if asked
# for a negative count or (without replacement) for more rows than exist.
non_fraud_samples_to_keep = min(
    max(new_df_length - len(fraud_samples), 0),
    len(non_fraud_samples),
)

non_fraud_downsampled = non_fraud_samples.sample(n=non_fraud_samples_to_keep, random_state=42)

# Same concatenation order as before (isFraud == 0 rows first), so the seeded
# shuffle below yields the same frame whenever no clamping was needed.
df_downsampled = pd.concat([non_fraud_downsampled, fraud_samples], axis=0)

# Shuffle the rows and rebuild a clean 0..n-1 index.
fraud_df = df_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

Phase results
* Analysis
  * No missing values.
  * No feature is highly correlated with target.
  * Some features are highly correlated among themselves.



# Feature selection and engineering

In [None]:
def categorize_variables(target, df_train, cat_numeric_unique_threshold=10):
  """Group the columns of ``df_train`` by kind.

  Args:
    target: Name of the target column; it is never placed in the
      non-object groups.
    df_train: DataFrame whose columns are classified.
    cat_numeric_unique_threshold: A non-object column with at most this many
      distinct values is treated as categorical-numeric, otherwise continuous.

  Returns:
    Tuple ``(target, categorical_numeric, continuous, mixed, categorical_object)``
    where the last four items are lists of column names:
      * categorical_numeric: non-object dtype, ``nunique() <= threshold``
      * continuous: remaining non-object dtype columns
      * mixed: columns for which ``pd.api.types.infer_dtype`` reports 'mixed'
      * categorical_object: object dtype columns that are not in ``mixed``

  Also prints the total column count next to the number of columns that were
  assigned to a group (target included) as a quick sanity check.
  """
  column_names = list(df_train.columns)

  # Split the non-object, non-target columns by cardinality in a single pass.
  categorical_numeric = []
  continuous = []
  for name in column_names:
    if df_train[name].dtype != 'O' and name != target:
      if df_train[name].nunique() <= cat_numeric_unique_threshold:
        categorical_numeric.append(name)
      else:
        continuous.append(name)

  mixed = [
      name for name in column_names
      if pd.api.types.infer_dtype(df_train[name]) == 'mixed'
  ]
  categorical_object = [
      name for name in column_names
      if df_train[name].dtype == 'O' and name not in mixed
  ]

  # 1 accounts for the target column itself.
  assigned_count = (
      1 + len(categorical_numeric) + len(continuous)
      + len(categorical_object) + len(mixed)
  )
  print('Total columns: '+str(df_train.columns.size)+'\nColumns after sorting: '+str(assigned_count))
  return target, categorical_numeric, continuous, mixed, categorical_object
# Classify the columns of the fraud frame, with 'isFraud' as the target.
(
    target,
    categorical_numeric,
    continuous,
    mixed,
    categorical_object,
) = categorize_variables(target='isFraud', df_train=fraud_df)

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the target is a feature.
feature_columns = fraud_df.columns.difference([target])
labels = fraud_df[target]

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    fraud_df[feature_columns],
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [None]:
# Ordinal-encode the object-typed columns. The 'ordered' method is fitted
# together with the target, hence y_train is passed to fit_transform below.
encoder = OrdinalEncoder(
    variables=categorical_object,
    encoding_method='ordered'
)

# Learn the category -> integer mapping from the training split only.
X_train = encoder.fit_transform(X_train, y_train)

# Apply that same mapping to the test split. Re-fitting on (X_test, y_test), as
# this cell did before, leaked test labels into the features and gave the train
# and test splits different encodings of the same categories.
# NOTE(review): categories absent from X_train are handled according to the
# encoder's `unseen` option - confirm the default behaviour is acceptable.
X_test = encoder.transform(X_test)

In [None]:
# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)
# X_test, y_test = smote.fit_resample(X_test, y_test)

In [None]:
# %%time
# params = {
#     'objective': 'binary:logistic',  # binary classification
#     'eval_metric': 'logloss',        # log-likelihood loss
#     'eta': 0.1,                      # learning rate
#     'max_depth': 6,                  # maximum depth of a tree
#     'subsample': 0.7,                # subsample ratio of the training instances
#     'colsample_bytree': 0.7,          # subsample ratio of columns when constructing each tree .
#     'tree_method': 'hist'
# }

# model = XGBClassifier(**params)

# feat_selector = BorutaPy(model, n_estimators='auto', random_state=42)
# feat_selector.fit(X_train.values, y_train.values)
# selected_rf_features = pd.DataFrame({'Feature':list(X_train.columns),
#                                        'Ranking':feat_selector.ranking_}).sort_values(by='Ranking')
# selected_rf_features.nsmallest(40, 'Ranking').plot.barh(x='Feature',figsize=(24,5))

# Model selection

In [None]:
def _build_xgboost(trial):
    """Sample XGBoost hyperparameters from ``trial`` and return the classifier."""
    xgb_params = {"objective": "binary:logistic"}
    # Optional GPU training: xgb_params["device"] = 'cuda'
    xgb_params["lambda"] = trial.suggest_float("lambda", 1e-8, 1.0, log=True)
    xgb_params["alpha"] = trial.suggest_float("alpha", 1e-8, 1.0, log=True)
    xgb_params["max_depth"] = trial.suggest_int("max_depth", 1, 9)
    xgb_params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
    xgb_params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
    xgb_params["grow_policy"] = trial.suggest_categorical(
        "grow_policy", ["depthwise", "lossguide"]
    )
    return XGBClassifier(**xgb_params)


def _build_lgbm(trial):
    """Sample LightGBM hyperparameters from ``trial`` and return the classifier."""
    lgbm_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }
    # Optional GPU training: lgbm_params["device"] = "gpu"
    lgbm_params["lambda_l1"] = trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True)
    lgbm_params["lambda_l2"] = trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True)
    lgbm_params["num_leaves"] = trial.suggest_int("num_leaves", 2, 256)
    lgbm_params["feature_fraction"] = trial.suggest_float("feature_fraction", 0.4, 1.0)
    lgbm_params["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.4, 1.0)
    lgbm_params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)
    lgbm_params["min_child_samples"] = trial.suggest_int("min_child_samples", 5, 100)
    return LGBMClassifier(**lgbm_params)


def objective(trial):
    """Optuna objective: mean 3-fold cross-validated F2 score on the training split.

    The trial first picks the model family ('XGBoost' or 'LGBM'), then samples
    that family's hyperparameters. The resulting classifier is scored with
    ``cross_val_score`` on ``X_train`` / ``y_train`` using an F-beta scorer with
    ``beta=2``.

    Args:
        trial: The Optuna trial used to draw the hyperparameters.

    Returns:
        The mean of the three fold scores.
    """
    chosen = trial.suggest_categorical("classifier", ['XGBoost', 'LGBM'])
    model = _build_xgboost(trial) if chosen == "XGBoost" else _build_lgbm(trial)

    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=3,
        scoring=make_scorer(fbeta_score, beta=2),
    )
    return fold_scores.mean()

# Maximise the objective (mean cross-validated F2). The sampler is seeded, like
# the other random steps in this notebook (random_state=42), so the sequence of
# suggested hyperparameters can be reproduced on a re-run. TPESampler is the
# sampler Optuna uses by default for single-objective studies, so only the seed
# changes here.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Best hyperparameter set found and the score it achieved.
display(study.best_params)
display(study.best_value)