# Credit card fraud detection problem

Kaggle dataset & problem description: https://www.kaggle.com/mlg-ulb/creditcardfraud

## Dataset

In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
import pandas as pd
import numpy as np

In [3]:
data_df = pd.read_csv('/content/gdrive/MyDrive/creditcard.csv')

## Data preprocessing

In [4]:
Q1 = data_df.quantile(0.25)
Q3 = data_df.quantile(0.75)

In [5]:
Q1 = Q1[1:-1]
Q3 = Q3[1:-1]
IQR = Q3 - Q1

In [7]:
columns_list = data_df.columns[1:-1]
for m in zip(columns_list, Q1, Q3, IQR):
  data_df_o_iqr = data_df[(data_df[m[0]] > (m[1] - 1.5 * m[3])) & (data_df[m[0]] < (m[2] + 1.5 * m[3]))]

In [8]:
y_iqr = data_df_o_iqr["Class"]
X_iqr = data_df_o_iqr.copy().drop("Class",axis=1)

In [None]:
print('Original imbalanced dataset shape %s' % Counter(y_iqr))

### data balancing:

#### ADASYN (Adaptive Synthetic Sampling Method for Imbalanced Data)

In [11]:
from imblearn.over_sampling import ADASYN
from collections import Counter

In [None]:
ada = ADASYN(random_state=42)
X_iqr_ada, y_iqr_ada = ada.fit_resample(X_iqr, y_iqr)

In [13]:
print('Resampled dataset after ADASYN shape %s' % Counter(y_iqr_ada))

Resampled dataset after ADASYN shape Counter({0: 252502, 1: 252461})


In [15]:
X_iqr_ada.shape

(504963, 30)

In [16]:
y_iqr_ada.shape

(504963,)

#### sklearn의 model selection train_test_split

shuffle=True설정 후 9:1로 split

In [17]:
from sklearn.model_selection import train_test_split

X_train_iqr_ada, X_test_iqr_ada, y_train_iqr_ada, y_test_iqr_ada = train_test_split(X_iqr_ada, y_iqr_ada,
    shuffle=True, random_state=42, test_size=0.1)

In [18]:
X_train_iqr_ada.shape

(454466, 30)

In [19]:
X_test_iqr_ada.shape

(50497, 30)

In [20]:
y_train_iqr_ada.shape

(454466,)

In [21]:
y_test_iqr_ada.shape

(50497,)

## Modeling & Evaluation

### 1. Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver="lbfgs", random_state=42)
log_reg.fit(X_train_iqr_ada, y_train_iqr_ada)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
y_pred_proba = log_reg.predict_proba(X_test_iqr_ada)

y_pred_proba

array([[0.00000000e+00, 1.00000000e+00],
       [3.92348279e-06, 9.99996077e-01],
       [3.65552033e-12, 1.00000000e+00],
       ...,
       [9.93159641e-01, 6.84035933e-03],
       [1.32544125e-08, 9.99999987e-01],
       [2.62006162e-04, 9.99737994e-01]])

In [26]:
y_pred = log_reg.predict(X_test_iqr_ada)

y_pred

array([1, 1, 1, ..., 0, 1, 1])

In [28]:
np.sum(y_test_iqr_ada == y_pred) / len(y_test_iqr_ada) 

0.9733251480285957

### 2. AdaBoosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train_iqr_ada, y_train_iqr_ada)

In [None]:
y_pred = ada_clf.predict(X_test_iqr_ada)
np.sum(y_test_iqr_ada == y_pred) / len(y_test_iqr_ada) 

### 3. XGBoost


In [None]:
from sklearn.metrics import mean_squared_error
try:
    import xgboost
except ImportError as ex:
    print("에러: xgboost not installed.")
    xgboost = None

if xgboost is not None:  
    xgb_reg = xgboost.XGBRegressor(random_state=42)
    xgb_reg.fit(X_train_iqr_ada, y_train_iqr_ada)
    y_pred_xg = xgb_reg.predict(X_test_iqr_ada)
    val_error_xg = mean_squared_error(y_test_iqr_ada, y_pred_xg) 
    print("Validation MSE:", val_error_xg)

Validation MSE: 0.013497367648783948


### 4. lightgbm

In [None]:
from lightgbm import LGBMRegressor

lgb_reg = LGBMRegressor(random_state=42)
lgb_reg.fit(X_train_iqr_ada, y_train_iqr_ada)

y_pred_lgbm = lgb_reg.predict(X_test_iqr_ada)
val_error_lgbm = mean_squared_error(y_test_iqr_ada, y_pred_lgbm)
val_error_lgbm

0.0024182977222669388