In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
import gc
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import space_eval
import time
import math
from hyperopt.pyll.base import scope
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
import lightgbm as lgb
import pprint
pp = pprint.PrettyPrinter(indent=4)
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import KFold

In [None]:
# Directory (inside the Kaggle input mount) that holds the credit-card fraud dataset.
data_dir= "/kaggle/input/creditcardfraud"

In [None]:
# Load the transactions CSV found under data_dir into a DataFrame.
df = pd.read_csv(data_dir + "/" + "creditcard.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

This dataset is provided in a cleaned format: the features are the result of PCA applied to the original variables, which are not disclosed to the public because of their sensitive nature.

In [None]:
# Model inputs: the 28 PCA components V1..V28 followed by the transaction amount.
input_cols = list(map("V{}".format, range(1, 29))) + ["Amount"]

In [None]:
# Feature matrix: only the columns listed in input_cols.
X = df[input_cols]

In [None]:
# Target column; the notebook's narrative treats value 1 as the positive (fraud) class.
y = df["Class"]

In [None]:
# Count the samples per class to inspect the class balance.
y.value_counts()

As we can see, the dataset is heavily imbalanced: there are far fewer samples with target class value 1 than with value 0.

In [None]:
# Hold out 20% of the rows for the final evaluation.
# stratify=y keeps the (very small) share of class-1 rows the same in both
# splits; a purely random split of such an imbalanced target can leave the
# test set with noticeably more or fewer positives than the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=7
)

We will balance the training set with SMOTE, which oversamples the minority class by generating new synthetic samples whose input values differ slightly from those of existing minority-class samples.

In [None]:
# Balance dataset with SMOTE
# Only the training split is resampled; X_test / y_test are left untouched.
sm = SMOTE(random_state=7)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)
# Re-wrap the resampled outputs as pandas objects carrying the original feature names.
X_train_bal = pd.DataFrame(X_train_bal, columns=input_cols)
y_train_bal = pd.Series(y_train_bal)

Next, let's find the best hyperparameters for the LightGBM classifier. We use several tuning rounds, and each round narrows the search range of the hyperparameter space around the best values found so far. Within each round, the Tree-structured Parzen Estimator (TPE) algorithm from the Hyperopt library explores the space; the objective function returns the negative F1 score, which Hyperopt minimizes while searching for the optimal hyperparameter values. Finally, with the learning rate reduced from 0.2 to 0.1, we determine the best number of boosting iterations to use when training on the entire training dataset, before evaluating the model's performance on the test dataset.

In [None]:
# Number of hyperparameter combinations Hyperopt evaluates in each tuning round.
number_of_evals = 100


def _fit_fold(params, features, labels, feature_names, train_index, val_index,
              num_boost_round=10000, early_stopping_rounds=100):
    """Train one booster on a CV fold, early-stopping on the held-out part.

    Returns the trained booster together with the validation features and
    labels so the caller can score it.
    """
    X_val, y_val = features[val_index], labels[val_index]
    train_data = lgb.Dataset(features[train_index],
                             label=labels[train_index],
                             feature_name=feature_names)
    validation_data = lgb.Dataset(X_val,
                                  label=y_val,
                                  feature_name=feature_names)
    bst = lgb.train(params, train_data,
                    valid_sets=[train_data, validation_data],
                    valid_names=['train', 'val'],
                    num_boost_round=num_boost_round,
                    early_stopping_rounds=early_stopping_rounds,
                    verbose_eval=None)
    return bst, X_val, y_val


def _narrow_search_ranges(full_ranges, best_params, tuning_round, number_of_tuning_rounds):
    """Return the (low, high) search range per parameter for one tuning round.

    Without a previous best point the full ranges are used. Otherwise each
    range is centred on the previous best value, its half-width is scaled by
    tuning_round / number_of_tuning_rounds, and it is clipped to the full range.
    """
    if best_params is None:
        return dict(full_ranges)

    shrink = tuning_round / number_of_tuning_rounds
    narrowed = {}
    for name, (lower_val, upper_val) in full_ranges.items():
        prev_best = best_params[name]
        range_one_side = shrink * ((upper_val - lower_val) / 2.0)
        narrowed[name] = (max(lower_val, prev_best - range_one_side),
                          min(upper_val, prev_best + range_one_side))
    return narrowed


def find_best_params_for_lgb(X, y):
    """Search LightGBM hyperparameters with Hyperopt (TPE) and return the best set.

    The search runs three tuning rounds of ``number_of_evals`` evaluations
    each; every round after the first narrows each parameter's range around
    the best value found so far. Each candidate is scored by the mean F1
    (threshold 0.5) over a shuffled 2-fold split, and Hyperopt minimises its
    negative. Afterwards the learning rate is set to 0.1 and the number of
    boosting iterations is taken as the mean early-stopping iteration over a
    shuffled 5-fold split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names are used as LightGBM feature names.
    y : pandas.Series or pandas.DataFrame
        Binary target aligned with ``X``.

    Returns
    -------
    dict
        LightGBM parameters, including ``"learning_rate"`` (0.1) and
        ``"num_iterations"``.

    Notes
    -----
    NOTE(review): when X/y were oversampled before being passed in (this
    notebook passes the SMOTE-balanced training set), the validation folds
    contain synthetic rows, so the CV F1 is presumably optimistic.
    """
    # Hoisted once: the fold loops below only need positional NumPy views.
    features = X.values
    labels = y.values.ravel()
    feature_names = list(X.columns)

    # Cache of already evaluated parameter points -> loss (negative mean F1).
    evaluated_point_scores = {}

    def objective(params):
        """Hyperopt loss: negative mean F1 over a shuffled 2-fold split."""
        gc.collect()
        cache_key = str(params)
        if cache_key in evaluated_point_scores:
            return evaluated_point_scores[cache_key]

        # shuffle=True is required for random_state to take effect (newer
        # scikit-learn raises otherwise). It also matters for oversampled
        # input: SMOTE appears to append its synthetic minority rows after the
        # original ones, so contiguous folds would have very skewed labels.
        kf = KFold(n_splits=2, shuffle=True, random_state=7)
        fold_scores = []
        for train_index, val_index in kf.split(features):
            bst, X_val, y_val = _fit_fold(params, features, labels, feature_names,
                                          train_index, val_index)
            y_val_preds = np.where(bst.predict(X_val) > 0.5, 1, 0)
            fold_scores.append(f1_score(y_val, y_val_preds))

        # Use the mean over all folds (the previous code mistakenly returned
        # only the last fold's score because of a misspelled variable).
        loss = -float(np.mean(fold_scores))
        evaluated_point_scores[cache_key] = loss
        return loss

    # Full search range (low, high) of every tuned parameter.
    parameters_tuned = {
        "num_leaves": (32, 1024),
        "max_depth": (6, 64),
        "feature_fraction": (0.9, 1.0),
        "max_bin": (50, 250),
        "bagging_fraction": (0.7, 1.0),
        "lambda_l1": (1.0, 10.0),
        "lambda_l2": (1.0, 100.0),
    }
    best_params = None
    number_of_tuning_rounds = 3
    # tuning_round counts down 3, 2, 1 so the search window shrinks every round.
    for tuning_round in range(number_of_tuning_rounds, 0, -1):
        parameter_space_range = _narrow_search_ranges(
            parameters_tuned, best_params, tuning_round, number_of_tuning_rounds)

        # Single-option hp.choice entries are fixed settings, kept in the space
        # so that space_eval returns them as part of the parameter dict.
        param_space = {
            'objective': hp.choice("objective", ["binary"]),
            "max_depth": scope.int(hp.quniform("max_depth", parameter_space_range["max_depth"][0], parameter_space_range["max_depth"][1], 1)),
            "learning_rate": hp.choice("learning_rate", [0.2]),
            "num_leaves": scope.int(hp.quniform("num_leaves", parameter_space_range["num_leaves"][0], parameter_space_range["num_leaves"][1], 10)),
            "max_bin": scope.int(hp.quniform("max_bin", parameter_space_range["max_bin"][0], parameter_space_range["max_bin"][1], 10)),
            "bagging_fraction": hp.quniform('bagging_fraction', parameter_space_range["bagging_fraction"][0], parameter_space_range["bagging_fraction"][1], 0.05),
            "feature_fraction": hp.uniform("feature_fraction", parameter_space_range["feature_fraction"][0], parameter_space_range["feature_fraction"][1]),
            "bagging_freq": hp.choice("bagging_freq", [1]),
            "lambda_l1": hp.quniform('lambda_l1', parameter_space_range["lambda_l1"][0], parameter_space_range["lambda_l1"][1], 1),
            "lambda_l2": hp.quniform('lambda_l2', parameter_space_range["lambda_l2"][0], parameter_space_range["lambda_l2"][1], 5),
            "loss_function": hp.choice("loss_function", ["binary_error"]),
            "eval_metric": hp.choice("eval_metric", ["binary_error"]),
            "metric": hp.choice("metric", ["binary_error"]),
            "random_state": hp.choice("random_state", [7]),
            "verbose": hp.choice("verbose", [None])
        }
        # fmin returns hp.choice results as indices; space_eval maps them back
        # to the actual parameter values.
        best_params = space_eval(
            param_space,
            fmin(objective,
                 param_space,
                 algo=hyperopt.tpe.suggest,
                 max_evals=number_of_evals))

    # Finding best number of iterations with learning rate 0.1
    best_params["learning_rate"] = 0.1

    # Shuffled for the same reason as in objective(): contiguous folds on
    # oversampled data would not be representative.
    kf = KFold(n_splits=5, shuffle=True, random_state=7)
    num_iterations_array = []
    for train_index, val_index in kf.split(features):
        bst, _, _ = _fit_fold(best_params, features, labels, feature_names,
                              train_index, val_index)
        num_iterations_array.append(bst.best_iteration)

    # Guard against a degenerate 0 so the final model always trains at least one round.
    best_params["num_iterations"] = max(1, int(np.mean(num_iterations_array)))
    print ("Best Hyperparameters found:")
    pp.pprint(best_params)
    return best_params

In [None]:
# Run the hyperparameter search on the SMOTE-balanced training data.
# This is the expensive cell: several tuning rounds of cross-validated training.
best_params = find_best_params_for_lgb(X=X_train_bal, y=y_train_bal)

In [None]:
# Wrap the full balanced training set as a LightGBM Dataset, keeping the column names as feature names.
train_data = lgb.Dataset(X_train_bal.values, 
                            label=y_train_bal.values.ravel(),
                            feature_name=list(X_train_bal.columns),
                        )

In [None]:
# Fit the final booster; best_params carries the tuned "num_iterations" and learning rate 0.1.
bst = lgb.train(best_params, train_data)

In [None]:
# Score the held-out test split; the outputs are thresholded at 0.5 and fed to ROC AUC below.
y_probs = bst.predict(X_test)

Calculating AUC ROC score

In [None]:
# ROC AUC of the predicted scores against the true test labels.
test_score = roc_auc_score(y_test, y_probs)

In [None]:
# Display the ROC AUC obtained on the test split.
test_score

Calculating the F1 score, treating a sample that represents a **fraudulent** transaction as the positive class

In [None]:
# Turn the predicted scores into hard 0/1 labels: strictly above 0.5 means class 1.
y_preds = (y_probs > 0.5).astype(int)

In [None]:
# F1 of the thresholded predictions against the true test labels.
f1 = f1_score(y_test, y_preds)

In [None]:
# Display the F1 score obtained on the test split.
f1

The performance of the model can be further improved by exploring the hyperparameter space at a finer level of granularity. This can be achieved by:

1. Increasing the number of hyperparameter value combinations evaluated in each tuning round (currently we evaluate 100 combinations per tuning round)
2. Increasing the number of tuning rounds, where each tuning round narrows down the parameter value range (currently we use 3 tuning rounds)

This will take more execution time to explore the hyperparameter space to find the optimal parameters.

A Bayesian optimization technique can also be used to narrow down the search space of the hyperparameters.