# imports 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

In [3]:
# from ctgan import CTGAN
# from ctgan import load_demo

In [4]:
from sklearn import ensemble  # RF, GBM
from sklearn import metrics
from sklearn import model_selection  # train/test split helper
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils import resample

In [5]:
def down_sample_textbook(df, label="is_fraud", random_state=42):
    """Balance a 0/1-labelled frame by down-sampling the ``label == 0`` rows.

    Every row with ``label == 1`` is kept. Rows with ``label == 0`` are sampled
    without replacement down to the same count, so the result has a 1:1 ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a column named ``label`` with values 0 and 1.
    label : str, default "is_fraud"
        Name of the binary class column.
    random_state : int, default 42
        Seed forwarded to ``resample`` so the drawn subset is reproducible.

    Returns
    -------
    pandas.DataFrame
        The ``label == 1`` rows followed by the sampled ``label == 0`` rows.
        Index labels of ``df`` are preserved (no re-indexing is done).

    Raises
    ------
    ValueError
        Raised by ``resample`` when there are fewer ``label == 0`` rows than
        ``label == 1`` rows, since sampling is done without replacement.
    """
    # Boolean-mask selection already yields new frames, so no explicit copy is needed.
    df_majority = df[df[label] == 0]
    df_minority = df[df[label] == 1]
    # `resample` is imported by name: the bare name `sklearn` is never bound in
    # this notebook, so `sklearn.utils.resample` raised NameError.
    df_maj_downsampled = resample(
        df_majority,
        replace=False,
        n_samples=len(df_minority),
        random_state=random_state,
    )
    return pd.concat([df_minority, df_maj_downsampled])

# Overview

## 데이터 종류 

- fraudTrain: (1048575, 22), 기본데이터 (fraudTrain.csv에서 첫 번째 열을 제외하고 읽은 것)
- df02: (214520, 22), is_fraud==0 에서는 20퍼의 샘플만, is_fraud==1 에서는 모든 샘플을 뽑아서 정리한 새로운 자료
- df50: (12012, 22), df02에서 is_fraud==0 와 is_fraud==1 의 비율을 맞추어서 샘플을 뽑은 것 

|데이터|shape|사기거래빈도|설명|
|-|-|-|-|
|fraudTrain|(1048575, 22)|0.00573|원래자료|
|df02|(214520, 22)|0.028| is_fraud==0 에서는 20퍼의 샘플만, is_fraud==1 에서는 모든 샘플을 뽑아서 정리한 새로운 자료|
|df50|(12012, 22)|0.5| df02에서 사기비율을 50퍼로 맞추어 샘플링한 자료|
|df50_tr|(9009, 22)|0.49828| df50에서 랜덤으로 train/test를 분리하여 얻은 train dataset|
|df50_test|(3003, 22)|0.50516| df50에서 랜덤으로 train/test를 분리하여 얻은 test dataset|
|df02_tr|(211517, 22)|0.02122| df02에서 df50_test에 해당하는 인덱스를 제외|
|fraudTrain_tr|(1045572, 22)|0.00429| fraudTrain에서 df50_test에 해당하는 인덱스를 제외|

`-` fraudTrain

In [35]:
# Load the raw transactions and drop the first CSV column by position.
# NOTE(review): presumably that column is a saved row index — confirm against the file.
fraudTrain = pd.read_csv("fraudTrain.csv").iloc[:, 1:]

In [36]:
fraudTrain.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

In [37]:
fraudTrain['is_fraud']

0          0
1          0
2          0
3          0
4          0
          ..
1048570    0
1048571    0
1048572    0
1048573    0
1048574    0
Name: is_fraud, Length: 1048575, dtype: int64

In [38]:
fraudTrain.is_fraud.mean().round(5)

0.00573

`-` df02

In [13]:
# df02 = a seeded 20% sample of the non-fraud rows plus every fraud row.
_df1 = fraudTrain.loc[fraudTrain.is_fraud.eq(0)].sample(frac=0.20, random_state=42)
_df2 = fraudTrain.loc[fraudTrain.is_fraud.eq(1)]
df02 = pd.concat([_df1, _df2])
df02.shape

(214520, 22)

In [14]:
df02.is_fraud.mean().round(5)

0.028

`-` df50

In [15]:
# Balance df02 to a 1:1 fraud ratio by down-sampling its non-fraud rows.
df50 = down_sample_textbook(df02)
df50.shape

(12012, 22)

In [16]:
df50

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
2449,2019-01-02 1:06,4.613310e+12,fraud_Rutherford-Mertz,grocery_pos,281.06,Jason,Murphy,M,542 Steve Curve Suite 011,Collettsville,...,35.9946,-81.7266,885,Soil scientist,1988-09-15,e8a81877ae9a0a7f883e15cb39dc4022,1325466397,36.430124,-81.179483,1
2472,2019-01-02 1:47,3.401870e+14,"fraud_Jenkins, Hauck and Friesen",gas_transport,11.52,Misty,Hart,F,27954 Hall Mill Suite 575,San Antonio,...,29.4400,-98.4590,1595797,Horticultural consultant,1960-10-28,bc7d41c41103877b03232f03f1f8d3f5,1325468849,29.819364,-99.142791,1
2523,2019-01-02 3:05,3.401870e+14,fraud_Goodwin-Nitzsche,grocery_pos,276.31,Misty,Hart,F,27954 Hall Mill Suite 575,San Antonio,...,29.4400,-98.4590,1595797,Horticultural consultant,1960-10-28,b98f12f4168391b2203238813df5aa8c,1325473523,29.273085,-98.836360,1
2546,2019-01-02 3:38,4.613310e+12,fraud_Erdman-Kertzmann,gas_transport,7.03,Jason,Murphy,M,542 Steve Curve Suite 011,Collettsville,...,35.9946,-81.7266,885,Soil scientist,1988-09-15,397894a5c4c02e3c61c784001f0f14e4,1325475483,35.909292,-82.091010,1
2553,2019-01-02 3:55,3.401870e+14,fraud_Koepp-Parker,grocery_pos,275.73,Misty,Hart,F,27954 Hall Mill Suite 575,San Antonio,...,29.4400,-98.4590,1595797,Horticultural consultant,1960-10-28,7863235a750d73a244c07f1fb7f0185a,1325476547,29.786426,-98.683410,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363827,2019-06-17 19:30,2.475090e+15,fraud_Frami Group,entertainment,81.13,John,Miller,M,153 Mccullough Springs Apt. 857,Lamberton,...,44.2378,-95.2739,1507,Land/geomatics surveyor,1993-10-12,c66cb411019c7dfd4d89f42a1ba4765f,1339961448,44.212695,-95.661879,0
140154,2019-03-17 14:33,2.131550e+14,fraud_Bahringer-Streich,food_dining,55.00,Christopher,Sheppard,M,39218 Baker Shoals,Bristow,...,38.1981,-86.6821,965,Horticultural therapist,1982-02-10,316b9d25b9fa7d08a6831b7dab6634cd,1331994839,38.394240,-86.413557,0
860597,2019-12-17 12:31,2.280870e+15,fraud_Lubowitz-Walter,kids_pets,8.12,Katherine,Cooper,F,3854 Lauren Springs Suite 648,Oakford,...,40.0994,-89.9601,530,Transport planner,1967-09-23,d92e9e63d9b24c3ccb92d05cba4cac54,1355747517,39.695248,-89.853063,0
29341,2019-01-18 9:20,4.878360e+15,fraud_Denesik and Sons,shopping_pos,3.52,Tina,Alvarez,F,1976 Tyler Underpass,Early,...,42.4483,-95.1726,885,"Pilot, airline",1949-08-14,8390ce51cfb8482b618ebc4ac370bcf7,1326878457,42.633204,-95.598143,0


In [17]:
df50.is_fraud.mean().round(5)

0.5

`-` df50_tr, df50_test 

In [18]:
# Seeded random train/test split of the balanced frame (default test share).
# `model_selection` is the submodule imported at the top of the notebook; the bare
# name `sklearn` is never bound here, so `sklearn.model_selection...` raised NameError.
df50_tr, df50_test = model_selection.train_test_split(df50, random_state=42)

In [19]:
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)

(0.49828, 0.50516)

`-` df02_tr, fraudTrain_tr

In [20]:
# Remove every row whose index label appears in df50_test, so the larger training
# frames never contain a test row. Index.isin performs the membership test in one
# vectorized call instead of a Python-level loop over each index label.
df02_tr = df02.loc[~df02.index.isin(df50_test.index)].copy()
fraudTrain_tr = fraudTrain.loc[~fraudTrain.index.isin(df50_test.index)].copy()

# Sanity check: exactly the test rows were removed from each frame.
assert len(df02_tr) == len(df02) - len(df50_test)
assert len(fraudTrain_tr) == len(fraudTrain) - len(df50_test)

In [21]:
df02_tr.shape, fraudTrain_tr.shape

((211517, 22), (1045572, 22))

In [22]:
df02_tr.is_fraud.mean().round(5), fraudTrain_tr.is_fraud.mean().round(5)

(0.02122, 0.00429)

# 분석방법정리 

||Train|Test|모형|설명변수|참고|
|-|-|-|-|-|-|
|분석1|df02_tr|df50_test|그레디언트부스팅|amt|base|
|**분석2**|df02_tr|df50_test|그레디언트부스팅|amt|가장좋음|

# 분석1

`-` step1: data 

In [26]:
# Train on df02_tr, evaluate on df50_test; `amt` is the only explanatory variable.
# The double brackets keep X / XX two-dimensional (one column).
X = df02_tr[['amt']].to_numpy(copy=True)
XX = df50_test[['amt']].to_numpy(copy=True)
y = df02_tr['is_fraud'].to_numpy(copy=True)
yy = df50_test['is_fraud'].to_numpy(copy=True)

`-` step2: lrnr 생성 

In [27]:
# Gradient-boosting classifier with default hyper-parameters. The seed is fixed to
# match the random_state=42 used by every other stochastic step in this notebook.
lrnr = ensemble.GradientBoostingClassifier(random_state=42)

`-` step3: fit 

In [28]:
# Fit on the training arrays built in step 1 (X: amt, y: is_fraud).
lrnr.fit(X,y)

`-` step4: evaluate

In [29]:
# Classify as fraud when the predicted probability in the last predict_proba
# column exceeds the fraud rate of the training labels, instead of taking the
# default labels from lrnr.predict(XX).
thresh = y.mean()
yyhat = lrnr.predict_proba(XX)[:, -1] > thresh

In [30]:
# Scorers applied to (yy, yyhat) in the next cell. They are referenced by their
# imported names because the bare name `sklearn` is never bound in this notebook,
# so `sklearn.metrics.<fn>` raised NameError.
# NOTE: this rebinds `metrics` (imported at the top as the sklearn.metrics module)
# to a list; the name is kept because the following cell iterates over it.
metrics = [
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]

In [31]:
# One-row score table for analysis 1. It was previously labelled '분석2' and stored
# in `_results2`, which the analysis-2 section overwrites; it now has its own name.
# Builtin round() is used so this works whether a scorer returns a NumPy scalar
# or a plain Python float (which has no .round method).
_results1 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석1'],
)
_results1

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
분석2,0.88678,0.910677,0.86025,0.884746


# 분석2

`-` step1: data 

In [None]:
# Sketch of a CTGAN synthetic-data step; this cell has no execution count.
# NOTE(review): `CTGAN` is only imported in a commented-out cell near the top of
# the notebook, and `df` is not defined anywhere in the visible notebook, so this
# cell cannot run as written — confirm the intended input frame before enabling it.

# Names of the columns that are discrete
discrete_columns = ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'job', 'dob', 'trans_num', 'is_fraud']
ctgan = CTGAN(epochs=10)
ctgan.fit(df, discrete_columns)

# Create synthetic data (1000 sampled rows)
df2 = ctgan.sample(1000)

In [37]:
df02_tr.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

In [None]:
# Scratch cell: a bare list literal with the same names as `discrete_columns`;
# it is not assigned to anything and only displays the list.
['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'job', 'dob', 'trans_num', 'is_fraud']

In [36]:
df02_tr

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
669418,2019-10-12 18:21,4.089100e+18,"fraud_Haley, Jewess and Bechtelar",shopping_pos,7.53,Debra,Stark,F,686 Linda Rest,Kilgore,...,32.3836,-94.8653,24536,Multimedia programmer,1983-10-14,d313353fa30233e5fab5468e852d22fc,1350066071,32.202008,-94.371865,0
32567,2019-01-20 13:06,4.247920e+12,fraud_Turner LLC,travel,3.79,Judith,Moss,F,46297 Benjamin Plains Suite 703,Washington Court House,...,39.5370,-83.4550,22305,Television floor manager,1939-03-09,88c65b4e1585934d578511e627fe3589,1327064760,39.156673,-82.930503,0
156587,2019-03-24 18:09,4.026220e+12,fraud_Klein Group,entertainment,59.07,Debbie,Payne,F,204 Ashley Neck Apt. 169,Preston,...,41.5224,-71.9934,4720,Broadcast presenter,1977-05-18,3bd9ede04b5c093143d5e5292940b670,1332612553,41.657152,-72.595751,0
1020243,2020-02-25 15:12,4.957920e+12,fraud_Monahan-Morar,personal_care,25.58,Alan,Parsons,M,0547 Russell Ford Suite 574,Kirk,...,39.6171,-102.4776,207,Network engineer,1955-12-04,19e16ee7a01d229e750359098365e321,1361805120,39.080346,-103.213452,0
116272,2019-03-06 23:19,4.178100e+15,fraud_Kozey-Kuhlman,personal_care,84.96,Jill,Flores,F,639 Cruz Islands,Baroda,...,41.9488,-86.4913,3104,"Horticulturist, commercial",1981-03-29,a0c8641ca1f5d6e243ed5a2246e66176,1331075954,42.502065,-86.732664,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1047054,2020-03-10 3:40,3.546670e+15,fraud_Kunze Inc,grocery_pos,289.37,Jordan,May,M,1626 Susan Course,Aledo,...,32.7004,-97.6039,13602,Optometrist,1984-07-05,70b8c60c7bc2ed6e82505c8018919b1f,1362886857,32.238135,-98.129466,1
1047089,2020-03-10 3:59,3.589290e+15,fraud_Kris-Weimann,misc_net,690.49,Paula,Estrada,F,350 Stacy Glens,Spencer,...,43.7557,-97.5936,343,"Development worker, international aid",1972-03-05,fb1ddd251bbec9b84c9755e856d51723,1362887989,43.254214,-98.267759,1
1047157,2020-03-10 4:31,3.546670e+15,"fraud_Casper, Hand and Zulauf",grocery_pos,324.74,Jordan,May,M,1626 Susan Course,Aledo,...,32.7004,-97.6039,13602,Optometrist,1984-07-05,4dca0549e43b7e265cae7fd8a7e563b4,1362889904,33.607221,-97.996506,1
1047208,2020-03-10 4:59,3.589290e+15,fraud_Kiehn Inc,grocery_pos,331.33,Paula,Estrada,F,350 Stacy Glens,Spencer,...,43.7557,-97.5936,343,"Development worker, international aid",1972-03-05,d18c55035998e461aa9040e254b74925,1362891561,44.228731,-98.330520,1


In [26]:
# Train on df02_tr, evaluate on df50_test; `amt` is the only explanatory variable.
# The double brackets keep X / XX two-dimensional (one column).
X = df02_tr[['amt']].to_numpy(copy=True)
XX = df50_test[['amt']].to_numpy(copy=True)
y = df02_tr['is_fraud'].to_numpy(copy=True)
yy = df50_test['is_fraud'].to_numpy(copy=True)

`-` step2: lrnr 생성 

In [27]:
# Gradient-boosting classifier with default hyper-parameters. The seed is fixed to
# match the random_state=42 used by every other stochastic step in this notebook.
lrnr = ensemble.GradientBoostingClassifier(random_state=42)

`-` step3: fit 

In [28]:
# Fit on the training arrays built in step 1 (X: amt, y: is_fraud).
lrnr.fit(X,y)

`-` step4: evaluate

In [29]:
# Classify as fraud when the predicted probability in the last predict_proba
# column exceeds the fraud rate of the training labels, instead of taking the
# default labels from lrnr.predict(XX).
thresh = y.mean()
yyhat = lrnr.predict_proba(XX)[:, -1] > thresh

In [30]:
# Scorers applied to (yy, yyhat) in the next cell. They are referenced by their
# imported names because the bare name `sklearn` is never bound in this notebook,
# so `sklearn.metrics.<fn>` raised NameError.
# NOTE: this rebinds `metrics` (imported at the top as the sklearn.metrics module)
# to a list; the name is kept because the following cell iterates over it.
metrics = [
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]

In [31]:
# One-row score table for analysis 2, one column per scorer in `metrics`.
# Builtin round() is used so this works whether a scorer returns a NumPy scalar
# or a plain Python float (which has no .round method).
_results2 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석2'],
)
_results2

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
분석2,0.88678,0.910677,0.86025,0.884746
