# (연구&보람) CTGAN – 신용카드

신록예찬  
2023-07-30

# imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

In [4]:
# from ctgan import CTGAN
# from ctgan import load_demo

In [5]:
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics 

In [6]:
def down_sample_textbook(df, random_state=42):
    """Balance ``df`` by down-sampling its ``is_fraud == 0`` rows.

    Every row with ``is_fraud == 1`` is kept. Rows with ``is_fraud == 0`` are
    sampled without replacement until there are exactly as many of them as
    there are ``is_fraud == 1`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``is_fraud`` column (compared against 0 and 1).
    random_state : int, default 42
        Seed forwarded to ``sklearn.utils.resample``; the default reproduces
        the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The ``is_fraud == 1`` rows followed by the sampled ``is_fraud == 0``
        rows. Index labels are not reset.

    Raises
    ------
    ValueError
        Propagated from ``resample`` when there are fewer ``is_fraud == 0``
        rows than ``is_fraud == 1`` rows (sampling is without replacement).
    """
    # Imported locally: the notebook only runs `from sklearn import ...`,
    # which never binds the bare name `sklearn` that the old body relied on.
    from sklearn.utils import resample

    df_majority = df[df.is_fraud == 0].copy()
    df_minority = df[df.is_fraud == 1].copy()
    df_majority_downsampled = resample(
        df_majority,
        n_samples=len(df_minority),
        replace=False,
        random_state=random_state,
    )
    return pd.concat([df_minority, df_majority_downsampled])

# Overview

## 데이터 종류

-   fraudTrain.csv: (1048575, 23), 기본데이터
-   df02: (214520, 22), is_fraud==0 에서는 20퍼의 샘플만, is_fraud==1
    에서는 모든 샘플을 뽑아서 정리한 새로운 자료
-   df50: (12012, 22), df02에서 is_fraud==0 와 is_fraud==1 의 비율을
    맞추어서 샘플을 뽑은 것

| 데이터        | shape         | 사기거래빈도 | 설명                                                                                       |
|------------------|------------------|------------------|------------------|
| fraudTrain    | (1048575, 22) | 0.00573      | 원래자료                                                                                   |
| df02          | (214520, 22)  | 0.028        | is_fraud==0 에서는 20퍼의 샘플만, is_fraud==1 에서는 모든 샘플을 뽑아서 정리한 새로운 자료 |
| df50          | (12012, 22)   | 0.5          | df02에서 사기비율을 50퍼로 맞추어 샘플링한 자료                                            |
| df50_tr       | (9009, 22)    | 0.49828      | df50에서 랜덤으로 train/test를 분리하여 얻은 train dataset                                 |
| df50_test     | (3003, 22)    | 0.50516      | df50에서 랜덤으로 train/test를 분리하여 얻은 test dataset                                  |
| df02_tr       | (211517, 22)  | 0.02122      | df02에서 df50_test에 해당하는 인덱스를 제외                                                |
| fraudTrain_tr | (1045572, 22) | 0.00429      | fraudTrain에서 df50_test에 해당하는 인덱스를 제외                                          |

`-` fraudTrain

In [7]:
# Load the raw transactions and drop the first CSV column
# (presumably a saved row index -- TODO confirm against the file).
fraudTrain = pd.read_csv("fraudTrain.csv").iloc[:,1:]

In [8]:
# Display the column labels of fraudTrain.
fraudTrain.columns

In [9]:
# Display the is_fraud label column.
fraudTrain['is_fraud']

In [10]:
# Mean of is_fraud, rounded to 5 decimals (the fraud share, assuming 0/1 coding).
fraudTrain.is_fraud.mean().round(5)

`-` df02

In [11]:
# df02 = a seeded 20% random sample of the is_fraud == 0 rows
#        stacked on top of every is_fraud == 1 row.
_df1 = fraudTrain.loc[fraudTrain["is_fraud"].eq(0)].sample(
    frac=0.20,
    random_state=42,
)
_df2 = fraudTrain.loc[fraudTrain["is_fraud"].eq(1)]
df02 = pd.concat([_df1, _df2], axis=0)
df02.shape

In [12]:
# Mean of is_fraud in df02, rounded to 5 decimals.
df02.is_fraud.mean().round(5)

`-` df50

In [13]:
# Balance df02 with down_sample_textbook (equal counts of is_fraud 0 and 1),
# then display the resulting shape.
df50 = down_sample_textbook(df02)
df50.shape

In [14]:
# Display the balanced frame.
df50

In [17]:
# Mean of is_fraud in df50, rounded to 5 decimals.
df50.is_fraud.mean().round(5)

`-` df50_tr, df50_test

In [18]:
# Seeded random train/test split of df50. No test_size is given, so
# scikit-learn's default split proportion applies.
# `model_selection` is the module imported above; the bare name `sklearn`
# is never bound by `from sklearn import ...` and would raise NameError.
df50_tr, df50_test = model_selection.train_test_split(df50, random_state=42)

In [19]:
# Mean of is_fraud in the train and test halves of df50, rounded to 5 decimals.
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)

`-` df02_tr, fraudTrain_tr

In [20]:
# Remove every row whose index label appears in df50_test, so the larger
# training frames never contain a held-out test row.
# Index.isin builds the boolean mask in one vectorized call instead of a
# Python-level loop over every label.
df02_tr = df02.loc[~df02.index.isin(df50_test.index), :].copy()
fraudTrain_tr = fraudTrain.loc[~fraudTrain.index.isin(df50_test.index), :].copy()

In [21]:
# Display the shapes of both training frames after the test rows were removed.
df02_tr.shape, fraudTrain_tr.shape

In [22]:
# Mean of is_fraud in both training frames, rounded to 5 decimals.
df02_tr.is_fraud.mean().round(5), fraudTrain_tr.is_fraud.mean().round(5)

# 분석방법정리

|           | Train   | Test      | 모형             | 설명변수 | 참고     |
|-----------|---------|-----------|------------------|----------|----------|
| 분석1     | df02_tr | df50_test | 그레디언트부스팅 | amt      | base     |
| **분석2** | df02_tr | df50_test | 그레디언트부스팅 | amt      | 가장좋음 |

# 분석1

`-` step1: data

In [26]:
# Train on df02_tr, evaluate on df50_test; `amt` is the only feature.
# The double brackets keep X / XX two-dimensional (n_rows, 1).
X = df02_tr[['amt']].to_numpy(copy=True)
y = df02_tr['is_fraud'].to_numpy(copy=True)
XX = df50_test[['amt']].to_numpy(copy=True)
yy = df50_test['is_fraud'].to_numpy(copy=True)

`-` step2: lrnr 생성

In [27]:
# Gradient-boosting classifier with all default hyper-parameters.
lrnr = ensemble.GradientBoostingClassifier()

`-` step3: fit

In [28]:
# Fit the learner on the training arrays.
lrnr.fit(X,y)

`-` step4: evaluate

In [29]:
# Cut-off = mean of the training labels, used in place of the labels
# returned by `lrnr.predict(XX)`.
thresh = y.mean()
# The last predict_proba column is compared against the cut-off.
yyhat = lrnr.predict_proba(XX)[:, -1] > thresh

In [30]:
# Import the scoring functions directly: the bare name `sklearn` is never
# bound by `from sklearn import ...`, and once the list below has been
# assigned, `metrics` no longer refers to the sklearn.metrics module.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Scoring functions applied to (yy, yyhat) in the results cell.
metrics = [accuracy_score,
           precision_score,
           recall_score,
           f1_score]

In [31]:
# One-row summary for 분석1. The row label and variable name now say 분석1;
# they were copy-pasted as 분석2 / `_results2`.
# Built-in round() works whether the metric returns a NumPy scalar or a
# plain Python float (the latter has no `.round` method).
_results1 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석1'],
)
_results1

# 분석2

`-` step1: data

In [None]:
# Names of the columns that are discrete
# Column names passed to CTGAN as discrete (categorical) columns.
discrete_columns = ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'job', 'dob', 'trans_num', 'is_fraud']
# NOTE(review): `CTGAN` is not in scope -- the `from ctgan import CTGAN` line
# in the imports section is commented out.
ctgan = CTGAN(epochs=10)
# NOTE(review): `df` is not defined in the visible part of this notebook;
# presumably one of the training frames (e.g. df02_tr) is intended -- confirm.
ctgan.fit(df, discrete_columns)

# Draw 1000 synthetic rows from the fitted model.
df2 = ctgan.sample(1000)

In [37]:
# Display the column labels of df02_tr.
df02_tr.columns

In [None]:
# Scratch cell: a bare list literal repeating the discrete column names.
# It is not assigned to anything, so evaluating it only displays the list.
['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'job', 'dob', 'trans_num', 'is_fraud']

In [36]:
# Display the training frame.
df02_tr

In [26]:
# Feature matrix (`amt` only, kept 2-D via the list selector) and label
# vector for the training frame, followed by the same pair for the test frame.
X = np.array(df02_tr[['amt']])
y = np.array(df02_tr['is_fraud'])
XX = np.array(df50_test[['amt']])
yy = np.array(df50_test['is_fraud'])

`-` step2: lrnr 생성

In [27]:
# Gradient-boosting classifier with all default hyper-parameters.
lrnr = ensemble.GradientBoostingClassifier()

`-` step3: fit

In [28]:
# Fit the learner on the training arrays.
lrnr.fit(X,y)

`-` step4: evaluate

In [29]:
# Decision threshold: the mean of the training labels, applied to the last
# predict_proba column (alternative to calling `lrnr.predict(XX)`).
thresh = y.mean()
yyhat = np.greater(lrnr.predict_proba(XX)[:, -1], thresh)

In [30]:
# Import the scoring functions directly: the bare name `sklearn` is never
# bound by `from sklearn import ...`, and `metrics` may already have been
# rebound to a list by an earlier cell, so `metrics.accuracy_score` is not
# reliable here either.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Scoring functions applied to (yy, yyhat) in the results cell.
metrics = [accuracy_score,
           precision_score,
           recall_score,
           f1_score]

In [31]:
# One-row summary for 분석2: each scoring function in `metrics` evaluated on
# (yy, yyhat) and rounded to 6 decimals.
# Built-in round() works whether the metric returns a NumPy scalar or a
# plain Python float (the latter has no `.round` method).
_results2 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석2'],
)
_results2