# 1. Imports 

In [16]:
import torch
import torch_geometric
import pandas as pd
import numpy as np
import sklearn.metrics
import pickle
import time
import os 
import datetime

In [17]:
def evaluate(y, yhat, yhat_prob): # gcn 
    """Score binary predictions with five scikit-learn metrics.

    Parameters
    ----------
    y : array-like
        Reference labels, passed as the first argument to every metric.
    yhat : array-like
        Hard (thresholded) predictions used for accuracy, precision,
        recall and F1.
    yhat_prob : array-like
        Scores for the positive class, used only for ROC AUC.

    Returns
    -------
    dict
        Keys ``'acc'``, ``'pre'``, ``'rec'``, ``'f1'`` and ``'auc'``.
    """
    metrics = sklearn.metrics
    return {
        'acc': metrics.accuracy_score(y, yhat),
        'pre': metrics.precision_score(y, yhat),
        'rec': metrics.recall_score(y, yhat),
        'f1': metrics.f1_score(y, yhat),
        'auc': metrics.roc_auc_score(y, yhat_prob),
    }
def summarize_results(yy,yyhat,yyhat_prob,data,prev_results):
    """Build a one-row results table for a GCN run.

    The metric columns come from ``evaluate(yy, yyhat, yyhat_prob)``; the
    data-set columns are read from the ``_train_size``, ``_train_frate``,
    ``_test_size``, ``_test_frate``, ``_theta`` and ``_gamma`` attributes
    of ``data``.  ``prev_results`` is accepted but not used.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per field.
    """
    scores = evaluate(yy, yyhat, yyhat_prob)
    # Scalar view of the row; each value is wrapped in a list below so the
    # DataFrame constructor sees one-element columns.
    row = {
        'model': 'GCN',
        'time': None,
        'acc': scores['acc'],
        'pre': scores['pre'],
        'rec': scores['rec'],
        'f1': scores['f1'],
        'auc': scores['auc'],
        'graph_based': True,
        'method': 'Proposed',
        'throw_rate': None,
        'train_size': data._train_size,
        'train_cols': 'amt',
        'train_frate': data._train_frate,
        'test_size': data._test_size,
        'test_frate': data._test_frate,
        'hyper_params': None,
        'theta': data._theta,
        'gamma': data._gamma,
    }
    return pd.DataFrame({column: [value] for column, value in row.items()})

# 2. Load Data 

In [18]:
# Collect the entries of ./data whose text after the last '.' is 'pkl',
# in sorted order, and display the list.
fnames = sorted(
    entry
    for entry in os.listdir('./data')
    if entry.rsplit('.', 1)[-1] == 'pkl'
)
fnames

['torch_geometric_data1_1.0e+07_0.8.pkl',
 'torch_geometric_data1_1.0e+07_0.95.pkl',
 'torch_geometric_data2_1.0e+07_0.8.pkl',
 'torch_geometric_data2_1.0e+07_0.95.pkl',
 'torch_geometric_data3_1.0e+07_0.8.pkl',
 'torch_geometric_data3_1.0e+07_0.95.pkl',
 'torch_geometric_data4_1.0e+07_0.8.pkl',
 'torch_geometric_data4_1.0e+07_0.95.pkl',
 'torch_geometric_data5_1.0e+07_0.8.pkl',
 'torch_geometric_data5_1.0e+07_0.95.pkl',
 'torch_geometric_data6_1.0e+07_0.8.pkl',
 'torch_geometric_data6_1.0e+07_0.95.pkl',
 'torch_geometric_data7_1.0e+07_0.8.pkl',
 'torch_geometric_data7_1.0e+07_0.95.pkl',
 'torch_geometric_data8_1.0e+07_0.8.pkl',
 'torch_geometric_data8_1.0e+07_0.95.pkl']

In [4]:
# Load every pickled graph together with the training table it was built from.
# File names look like 'torch_geometric_data<k>_<theta>_<gamma>.pkl'; <k> is
# the training-set index and selects './data/df_train<k>.csv'.
fname_prefix = 'torch_geometric_data'
geodata_list = []
df_train_list = []
df_trainindex_list = []
for fname in fnames:
    # Take everything between the prefix and the next underscore so that
    # multi-digit indices (e.g. 'data10') are read in full rather than
    # truncated to their first character.
    i = fname[len(fname_prefix):].split('_', 1)[0]
    df_trainindex_list.append(i)
    df_train_list.append(pd.read_csv(f'./data/df_train{i}.csv'))
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load pickles that were produced locally by a trusted pipeline.
    with open(f'./data/{fname}', 'rb') as file:
        geodata_list.append(pickle.load(file))

In [5]:
# Print the bookkeeping attributes stored on each loaded graph object.
for i,data in enumerate(geodata_list):
    print(f"data-{i}")
    print(f"train_size = {data._train_size}")
    # The test size was previously printed under the label "train_size".
    print(f"test_size = {data._test_size}")
    print(f"train_frate = {data._train_frate:.4f}")
    print(f"test_frate = {data._test_frate:.4f}")
    print(f"theta = {data._theta}")
    print(f"gamma = {data._gamma}")
    print("---")

data-0
train_size = 734003
train_size = 314572
train_frate = 0.0057
test_frate = 0.0057
theta = 10000000.0
gamma = 0.8
---
data-1
train_size = 734003
train_size = 314572
train_frate = 0.0057
test_frate = 0.0057
theta = 10000000.0
gamma = 0.95
---
data-2
train_size = 420500
train_size = 314572
train_frate = 0.0100
test_frate = 0.0057
theta = 10000000.0
gamma = 0.8
---
data-3
train_size = 420500
train_size = 314572
train_frate = 0.0100
test_frate = 0.0057
theta = 10000000.0
gamma = 0.95
---
data-4
train_size = 84100
train_size = 314572
train_frate = 0.0500
test_frate = 0.0057
theta = 10000000.0
gamma = 0.8
---
data-5
train_size = 84100
train_size = 314572
train_frate = 0.0500
test_frate = 0.0057
theta = 10000000.0
gamma = 0.95
---
data-6
train_size = 42050
train_size = 314572
train_frate = 0.1000
test_frate = 0.0057
theta = 10000000.0
gamma = 0.8
---
data-7
train_size = 42050
train_size = 314572
train_frate = 0.1000
test_frate = 0.0057
theta = 10000000.0
gamma = 0.95
---
data-8
train_siz

# 3. Fit & save results -- one data set 

In [76]:
# Load the test-set table from CSV; the results cell below appends the
# test predictions to it before writing it back out.
df_test = pd.read_csv("./data/df_test.csv")

In [77]:
# Work on the first loaded graph, with its matching training table and
# training-set index label.
data, df_train, i = geodata_list[0], df_train_list[0], df_trainindex_list[0]

In [75]:
# Optimisation settings for the single-data-set run.
EPOCHS, LR, WEIGHT_DECAY = 1000, 0.001, 0.0005
# Output widths of the two graph-convolution layers.
FILTERS = [16, 8]
# Gamma rendered without the decimal point, used as a file-name tag.
GAMMA = str(data._gamma).replace('.', '')
#--#
class GCN(torch.nn.Module):
    """Two graph-convolution layers followed by a linear two-class read-out.

    The input has one feature per node and the layer widths are taken from
    the module-level ``FILTERS`` sequence.
    """

    def __init__(self):
        super().__init__()
        gcn_conv = torch_geometric.nn.GCNConv
        width1, width2 = FILTERS[0], FILTERS[1]
        self.conv1 = gcn_conv(1, width1)
        self.conv2 = gcn_conv(width1, width2)
        self.linr = torch.nn.Linear(width2, 2)

    def _embed(self, data, use_dropout):
        # Shared conv -> ReLU -> (dropout) -> conv -> ReLU stack.
        relu = torch.nn.functional.relu
        edge_index = data.edge_index
        z = relu(self.conv1(data.x, edge_index))
        if use_dropout:
            # Only active while the module is in training mode.
            z = torch.nn.functional.dropout(z, training=self.training)
        return relu(self.conv2(z, edge_index))

    def forward(self, data):
        """Return per-node log-probabilities over the two classes."""
        logits = self.linr(self._embed(data, use_dropout=True))
        return torch.nn.functional.log_softmax(logits, dim=1)

    def get_hidden(self,data):
        """Return the activations after the second conv layer, without dropout."""
        return self._embed(data, use_dropout=False)
#--#

In [None]:
# Full-batch training of the GCN on the nodes selected by data.train_mask.
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = GCN()
net.to(device)
# Move the graph once, before the loop; the transfer is loop-invariant.
data = data.to(device)
loss_fn = torch.nn.functional.nll_loss
optimizr = torch.optim.Adam(net.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
net.train()
for epoc in range(1,EPOCHS+1):
    netout = net(data)
    # The loss is computed on the training nodes only.
    loss = loss_fn(netout[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizr.step()
    optimizr.zero_grad()
    #--#
    if epoc % 200 == 0: 
        print(f"epoch = {epoc}/{EPOCHS}\tloss = {loss:.4f}")
# Switch to inference mode and bring model and graph back to the CPU for
# the evaluation cell.
net.eval()
net.to("cpu")
data = data.to("cpu")
#--#

epoch = 200/1000	loss = 0.0763
epoch = 400/1000	loss = 0.0610
epoch = 600/1000	loss = 0.0378
epoch = 800/1000	loss = 0.0286
epoch = 1000/1000	loss = 0.0280


In [74]:
# Evaluate the trained network and write metrics, predictions and hidden
# embeddings to ./results.
os.makedirs('./results', exist_ok=True)  # to_csv does not create directories
y = (data.y[data.train_mask])
yy = (data.y[data.test_mask])
hidden = net.get_hidden(data).detach()
h = hidden[data.train_mask]
hh = hidden[data.test_mask]
netout = net(data).detach()
#--#
# The network returns log-probabilities; exp() of the last column is the
# probability of the positive class.
yhat_prob = torch.exp(netout[data.train_mask])[:,-1]
yhat = (yhat_prob > 0.5)
yyhat_prob = torch.exp(netout[data.test_mask])[:,-1]
yyhat = (yyhat_prob > 0.5)
#--#
# Test-set metrics plus the hyper-parameters of this run.
df_result = summarize_results(yy,yyhat,yyhat_prob,data,prev_results=None)
df_result['epoch'] = EPOCHS
df_result['lr'] = LR
df_result['weight_decay'] = WEIGHT_DECAY
df_result['filters'] = f"{FILTERS[0]},{FILTERS[1]}"
date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
df_result.T.to_csv(f'./results/df_gcn_result_{date}_gamma{GAMMA}.csv')
#--#
# Convert tensors to NumPy explicitly so pandas stores plain numeric/bool
# columns regardless of how the installed version treats torch tensors.
df_train_h = pd.DataFrame(h.numpy())
df_train_h.columns = [f"h{col}" for col in df_train_h.columns]
df_test_h = pd.DataFrame(hh.numpy())
df_test_h.columns = [f"h{col}" for col in df_test_h.columns]
# NOTE(review): assumes df_train / df_test rows are in the same order as the
# nodes selected by train_mask / test_mask -- confirm against data prep.
pd.concat(
    [df_train_h, 
     df_train.assign(is_fraud_hat = yhat.numpy(),is_fraud_hat_prob = yhat_prob.numpy())]
    ,axis=1
).to_csv(f'./results/df_train{i}_{date}_gamma{GAMMA}.csv',index=False)
pd.concat(
    [df_test_h, 
     df_test.assign(is_fraud_hat = yyhat.numpy(),is_fraud_hat_prob = yyhat_prob.numpy())]
    ,axis=1
).to_csv(f'./results/df_test_{date}_gamma{GAMMA}.csv',index=False)
print("result saved")

# 4. Fit & save results -- batch

In [81]:
def run(df_test,data,df_train,i,epochs=4000,lr=0.001,weight_decay=0.0005,filters=(16,8)):
    """Train a two-layer GCN on one graph and save its results to ./results.

    Three CSV files are written, all tagged with a timestamp and the
    graph's gamma value: the transposed metrics table
    (``df_gcn_result_*``), the training table with embeddings and
    predictions (``df_train{i}_*``) and the same for the test table
    (``df_test_*``).

    Parameters
    ----------
    df_test : pandas.DataFrame
        Test-set table; predictions for the ``data.test_mask`` nodes are
        appended to it.  NOTE(review): assumed to be row-aligned with the
        masked nodes -- this is not checked here.
    data : graph object
        Must expose ``x``, ``edge_index``, ``y``, ``train_mask``,
        ``test_mask``, ``to(device)`` and the ``_gamma`` / size / rate
        attributes read by ``summarize_results``.
    df_train : pandas.DataFrame
        Training-set table, treated like ``df_test`` for the
        ``data.train_mask`` nodes.
    i : str
        Training-set index label, used only in the output file name.
    epochs, lr, weight_decay : optional
        Optimisation settings; defaults reproduce the original
        hard-coded values.
    filters : sequence of two ints, optional
        Output widths of the two graph-convolution layers.

    Returns
    -------
    None
    """
    gamma_tag = str(data._gamma).replace('.','')
    # Use the first GPU when available, otherwise fall back to the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #--#
    class GCN(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = torch_geometric.nn.GCNConv(1, filters[0])
            self.conv2 = torch_geometric.nn.GCNConv(filters[0], filters[1])
            self.linr = torch.nn.Linear(filters[1], 2)
        def forward(self, data):
            x, edge_index = data.x, data.edge_index
            x = self.conv1(x, edge_index)
            x = torch.nn.functional.relu(x)
            x = torch.nn.functional.dropout(x, training=self.training)
            x = self.conv2(x, edge_index)
            x = torch.nn.functional.relu(x)
            x = self.linr(x)
            return torch.nn.functional.log_softmax(x, dim=1)
        def get_hidden(self,data):
            # Same stack as forward() but without dropout and read-out.
            x, edge_index = data.x, data.edge_index
            x = self.conv1(x, edge_index)
            x = torch.nn.functional.relu(x)
            x = self.conv2(x, edge_index)
            x = torch.nn.functional.relu(x)
            return x
    #--#
    net = GCN()
    net.to(device)
    # Move the graph once, before the loop; the transfer is loop-invariant.
    data = data.to(device)
    loss_fn = torch.nn.functional.nll_loss
    optimizr = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
    net.train()
    for epoc in range(1,epochs+1):
        netout = net(data)
        # The loss is computed on the training nodes only.
        loss = loss_fn(netout[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizr.step()
        optimizr.zero_grad()
        #--#
        if epoc % 200 == 0:
            print(f"epoch = {epoc}/{epochs}\tloss = {loss:.4f}")
    net.eval()
    net.to("cpu")
    data = data.to("cpu")
    #--#
    yy = (data.y[data.test_mask])
    hidden = net.get_hidden(data).detach()
    h = hidden[data.train_mask]
    hh = hidden[data.test_mask]
    netout = net(data).detach()
    #--#
    # The network returns log-probabilities; exp() of the last column is
    # the probability of the positive class.
    yhat_prob = torch.exp(netout[data.train_mask])[:,-1]
    yhat = (yhat_prob > 0.5)
    yyhat_prob = torch.exp(netout[data.test_mask])[:,-1]
    yyhat = (yyhat_prob > 0.5)
    #--#
    os.makedirs('./results', exist_ok=True)  # to_csv does not create directories
    df_result = summarize_results(yy,yyhat,yyhat_prob,data,prev_results=None)
    df_result['epoch'] = epochs
    df_result['lr'] = lr
    df_result['weight_decay'] = weight_decay
    df_result['filters'] = f"{filters[0]},{filters[1]}"
    date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    df_result.T.to_csv(f'./results/df_gcn_result_{date}_gamma{gamma_tag}.csv')
    #--#
    # Convert tensors to NumPy explicitly so pandas stores plain numeric/bool
    # columns regardless of how the installed version treats torch tensors.
    df_train_h = pd.DataFrame(h.numpy())
    df_train_h.columns = [f"h{col}" for col in df_train_h.columns]
    df_test_h = pd.DataFrame(hh.numpy())
    df_test_h.columns = [f"h{col}" for col in df_test_h.columns]
    pd.concat(
        [df_train_h,
         df_train.assign(is_fraud_hat = yhat.numpy(),is_fraud_hat_prob = yhat_prob.numpy())]
        ,axis=1
    ).to_csv(f'./results/df_train{i}_{date}_gamma{gamma_tag}.csv',index=False)
    pd.concat(
        [df_test_h,
         df_test.assign(is_fraud_hat = yyhat.numpy(),is_fraud_hat_prob = yyhat_prob.numpy())]
        ,axis=1
    ).to_csv(f'./results/df_test_{date}_gamma{gamma_tag}.csv',index=False)
    print("result saved")

In [82]:
# Fit and save results for every loaded graph, pairing each with its own
# training table and index label; the test table is shared by all runs.
df_test = pd.read_csv("./data/df_test.csv")
for data, df_train, i in zip(
    geodata_list,
    df_train_list,
    df_trainindex_list,
):
    run(df_test=df_test, data=data, df_train=df_train, i=i)

epoch = 200/4000	loss = 0.0649
epoch = 400/4000	loss = 0.0564
epoch = 600/4000	loss = 0.0351
epoch = 800/4000	loss = 0.0281
epoch = 1000/4000	loss = 0.0278
epoch = 1200/4000	loss = 0.0272
epoch = 1400/4000	loss = 0.0276
epoch = 1600/4000	loss = 0.0275
epoch = 1800/4000	loss = 0.0270
epoch = 2000/4000	loss = 0.0273
epoch = 2200/4000	loss = 0.0270
epoch = 2400/4000	loss = 0.0272
epoch = 2600/4000	loss = 0.0271
epoch = 2800/4000	loss = 0.0271
epoch = 3000/4000	loss = 0.0272
epoch = 3200/4000	loss = 0.0270
epoch = 3400/4000	loss = 0.0272
epoch = 3600/4000	loss = 0.0268
epoch = 3800/4000	loss = 0.0273
epoch = 4000/4000	loss = 0.0271
result saved
epoch = 200/4000	loss = 0.1208
epoch = 400/4000	loss = 0.0897
epoch = 600/4000	loss = 0.0444
epoch = 800/4000	loss = 0.0304
epoch = 1000/4000	loss = 0.0251
epoch = 1200/4000	loss = 0.0227
epoch = 1400/4000	loss = 0.0206
epoch = 1600/4000	loss = 0.0189
epoch = 1800/4000	loss = 0.0180
epoch = 2000/4000	loss = 0.0173
epoch = 2200/4000	loss = 0.0173
epo