# **\[FRAUD\]** 신용카드 거래 사기탐지 Try1 변형

신록예찬  
2023-05-12

# imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF 
from sklearn import metrics 

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

  from .autonotebook import tqdm as notebook_tqdm

In [3]:
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
    df["from"]=df["cc_num"].apply(lambda x:mapping[x])  #엣지의 출발점
    df["to"]=df["merchant"].apply(lambda x:mapping[x])  #엣지의 도착점
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
    df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
    
    G=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  #엣지 속성 설정,각 속성의 사기 여부부     
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # 엣지 속성 설정, 각 엣지의 거래 금액

    return G


def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}
    df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"]=df["merchant"].apply(lambda x:mapping[x])
    
        
    G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")     
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")   
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")  
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")

    return G
    
    
def down_sample_textbook(df):
    df_majority = df[df.is_fraud==0].copy()
    df_minority = df[df.is_fraud==1].copy()
    df_maj_dowsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_dowsampled])
    return df_downsampled

def embedding(Graph):
    # Graph -> X (feature)
    _edgs = list(Graph.edges)
    subGraph = Graph.edge_subgraph([_edgs[x] for x in range(len(Graph.edges))]).copy()
    subGraph.add_nodes_from(list(set(Graph.nodes) - set(subGraph.nodes)))    
    embedded = AverageEmbedder(Node2Vec(subGraph, weight_key='weight').fit(window=10).wv)
    X = [embedded[str(_edgs[x][0]), str(_edgs[x][1])] for x in range(len(Graph.edges))]
    # Graph -> y (label)
    y = np.array(list(nx.get_edge_attributes(Graph, "label").values()))
    return X,y 

def anal(df):
    Graph = build_graph_bipartite(df)
    X,XX,y,yy = embedding(Graph)
    lrnr = RandomForestClassifier(n_estimators=100, random_state=42) 
    lrnr.fit(X,y)
    yyhat = lrnr.predict(XX)
    df = pd.DataFrame({
        'acc':[sklearn.metrics.accuracy_score(yy,yyhat)], 
        'pre':[sklearn.metrics.precision_score(yy,yyhat)], 
        'rec':[sklearn.metrics.recall_score(yy,yyhat)],
        'f1':[sklearn.metrics.f1_score(yy,yyhat)]}
    )    
    return df

def our_sampling1(df):
    cus_list = set(df.query('is_fraud==1').cc_num.tolist())
    return df.query("cc_num in @ cus_list")

# Overview

## 데이터 종류

-   fraudTrain.csv: (1048575, 23), 기본데이터
-   df02: (214520, 23), is_fraud==0 에서는 20퍼의 샘플만, is_fraud==1
    에서는 모든 샘플을 뽑아서 정리한 새로운 자료
-   df50 = (12012, 23), df20에서 is_fraud==0 와 is_fraud==1 의 비율을
    맞추어서 샘플을 뽑은 것

| 데이터        | shape         | 사기거래빈도 | 설명                                                                                       |
|------------------|------------------|------------------|------------------|
| fraudTrain    | (1048575, 22) | 0.00573      | 원래자료                                                                                   |
| df02          | (214520, 22)  | 0.028        | is_fraud==0 에서는 20퍼의 샘플만, is_fraud==1 에서는 모든 샘플을 뽑아서 정리한 새로운 자료 |
| df50          | (12012, 22)   | 0.5          | df02에서 사기비율을 50퍼로 맞추어 샘플링한 자료                                            |
| df50_tr       | (9009, 22)    | 0.49828      | df50에서 랜덤으로 train/test를 분리하여 얻은 train dataset                                 |
| df50_test     | (3003, 22)    | 0.50516      | df50에서 랜덤으로 train/test를 분리하여 얻은 test dataset                                  |
| df02_tr       | (211517, 22)  | 0.02122      | df02에서 df50_test에 해당하는 인덱스를 제외                                                |
| fraudTrain_tr | (1045572, 22) | 0.00429      | fraudTrain에서 df50_test에 해당하는 인덱스를 제외                                          |

`-` fraudTrain

In [4]:
fraudTrain = pd.read_csv("fraudTrain.csv").iloc[:,1:]
fraudTrain.shape

In [5]:
fraudTrain.is_fraud.mean().round(5)

`-` df20

In [6]:
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02.shape

In [7]:
df02.is_fraud.mean().round(5)

`-` df50

In [8]:
df50 = down_sample_textbook(df02)
df50.shape

In [9]:
df50

In [10]:
df50.is_fraud.mean().round(5)

`-` df50_tr, df50_test

In [11]:
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)

In [12]:
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)

`-` df02_tr, fraudTrain_tr

In [13]:
df02_tr = df02.loc[[i not in df50_test.index for i in df02.index],:].copy()
fraudTrain_tr = fraudTrain.loc[[i not in df50_test.index for i in fraudTrain.index],:].copy()

In [14]:
df02_tr.shape, fraudTrain_tr.shape

In [15]:
df02_tr.is_fraud.mean().round(5), fraudTrain_tr.is_fraud.mean().round(5)

# 분석방법정리

|       | Train     | Test     | 모형     | 설명변수   | 그래프임베딩 |
|-------|-----------|----------|----------|------------|--------------|
| 분석1 | df50train | df50test | 로지스틱 | amt        | X            |
| 분석2 | df02train | df50test | 로지스틱 | amt        | X            |
| 분석3 | df50train | df50test | 로지스틱 | amt,amtano | X            |
| 분석4 | df02train | df50test | 로지스틱 | amt,amtano | X            |

In [49]:
pd.concat([_results1,_results2,_results3,_results4])

# 분석1

`-` step1: data

In [17]:
X = np.array(df50_tr.loc[:,['amt']])
XX = np.array(df50_test.loc[:,['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

`-` step2: lrnr 생성

In [18]:
lrnr = sklearn.linear_model.LogisticRegression()

`-` step3: fit

In [19]:
lrnr.fit(X,y)

`-` step4: evaluate

In [20]:
thresh = df50_tr.is_fraud.mean()
yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
# yyhat = lrnr.predict(XX) 

In [21]:
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]

In [22]:
_results1= pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics},index=['분석1'])
_results1

# 분석2

`-` step1: data

In [23]:
X = np.array(fraudTrain_tr.loc[:,['amt']])
XX = np.array(df50_test.loc[:,['amt']])
y = np.array(fraudTrain_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

`-` step2: lrnr 생성

In [24]:
lrnr = sklearn.linear_model.LogisticRegression()

`-` step3: fit

In [25]:
lrnr.fit(X,y)

`-` step4: evaluate

In [26]:
thresh = fraudTrain_tr.is_fraud.mean()
yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
#yyhat = lrnr.predict(XX) 

In [27]:
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]

In [28]:
_results2= pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics},index=['분석2'])
_results2

# 분석3

`-` 함수

In [29]:
def amtano1(df_train):
    df = df_train.copy()
    df = df.assign(amtano=0)
    normalize = lambda arr: (arr-np.median(arr))/np.std(arr) if np.std(arr)!=0 else arr*0 
    for cc_num, sub_df in df.groupby('cc_num'):
        df.loc[df.cc_num == cc_num,['amtano']] = normalize(sub_df.amt)
    return df  

In [30]:
def amtano2(df_train,df_test):
    df = pd.concat([df_train,df_test])
    df_amtano = amtano_train(df) 
    return df_test.assign(amtano = df_amtano.loc[[i in df_test.index for i in df_amtano.index],'amtano'])

`-` step1: data

In [31]:
X = np.array(amtano1(df50_tr).loc[:,['amt','amtano']])
XX = np.array(amtano1(df50_test).loc[:,['amt','amtano']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

`-` step2: lrnr 생성

In [32]:
lrnr = sklearn.linear_model.LogisticRegression()

`-` step3: fit

In [33]:
lrnr.fit(X,y)

`-` step4: evaluate

In [34]:
thresh = df50_tr.is_fraud.mean()
yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
#yyhat = lrnr.predict(XX) 

In [35]:
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]

In [36]:
_results3= pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics},index=['분석3'])
_results3

# 분석4

`-` step1: data

In [37]:
X = np.array(amtano1(fraudTrain_tr).loc[:,['amt','amtano']])
XX = np.array(amtano1(df50_test).loc[:,['amt','amtano']])
y = np.array(fraudTrain_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

`-` step2: lrnr 생성

In [38]:
lrnr = sklearn.linear_model.LogisticRegression()

`-` step3: fit

In [39]:
lrnr.fit(X,y)

`-` step4: evaluate

In [40]:
thresh = fraudTrain_tr.is_fraud.mean()
yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
# yyhat = lrnr.predict(XX) 

In [41]:
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]

In [42]:
_results4= pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics},index=['분석4'])
_results4

# 분석5

`-` step1: data

In [43]:
Gtr = build_graph_bipartite(df50_tr)
Gtest = build_graph_bipartite(df50_test)
X,y = embedding(Gtr)
XX,yy = embedding(Gtest)

Computing transition probabilities: 100%|██████████| 1501/1501 [00:00<00:00, 16084.29it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:02<00:00,  3.53it/s]
Computing transition probabilities: 100%|██████████| 1501/1501 [00:00<00:00, 16125.78it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:02<00:00,  3.56it/s]

`-` step2: lrnr 생성

In [44]:
lrnr = sklearn.linear_model.LogisticRegression()

`-` step3: fit

In [45]:
lrnr.fit(X,y)

`-` step4: evaluate

In [46]:
#thresh = df50_tr.is_fraud.mean()
#yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
yyhat = lrnr.predict(XX) 

In [47]:
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]

In [48]:
_results5= pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics},index=['분석5'])
_results5

-   기준이 다름