# 1. Imports

In [1]:
import torch
import torch_geometric
import pandas as pd
import numpy as np
import pickle 
import itertools
import multiprocessing
from functools import partial
import matplotlib.pyplot as plt

In [2]:
def throw(df, fraud_rate, random_state=42):
    """Downsample non-fraud rows so fraud rows make up ``fraud_rate`` of the result.

    Every row with ``is_fraud == 1`` is kept. Rows with ``is_fraud == 0`` are
    randomly subsampled without replacement. Rows whose ``is_fraud`` value is
    neither 0 nor 1 are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``is_fraud`` column.
    fraud_rate : float
        Target share of fraud rows in the result; must satisfy
        ``0 < fraud_rate <= 1``.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        The fraud rows followed by the sampled non-fraud rows, with the
        original index labels preserved.

    Raises
    ------
    ValueError
        If ``fraud_rate`` is outside ``(0, 1]``, or if ``df`` does not hold
        enough non-fraud rows to dilute the fraud rows down to ``fraud_rate``
        (downsampling can only raise the fraud share, never lower it).
    """
    # Also rejects NaN, because every comparison with NaN is False.
    if not 0 < fraud_rate <= 1:
        raise ValueError(f"fraud_rate must be in (0, 1], got {fraud_rate!r}")

    # Boolean indexing already returns new frames, so no explicit copy is needed.
    fraud = df[df['is_fraud'] == 1]
    normal = df[df['is_fraud'] == 0]

    if len(normal) == 0:
        # Nothing to downsample; the target is only reachable if no dilution is needed.
        if len(fraud) > 0 and fraud_rate < 1:
            raise ValueError(
                f"cannot reach fraud_rate={fraud_rate!r}: df has no non-fraud rows"
            )
        return pd.concat([fraud, normal])

    # Fraction of non-fraud rows to keep so that
    # len(fraud) / (len(fraud) + kept) == fraud_rate.
    keep_frac = (len(fraud) * (1 - fraud_rate)) / (len(normal) * fraud_rate)
    if keep_frac > 1:
        raise ValueError(
            f"cannot reach fraud_rate={fraud_rate!r}: it is below the fraud share "
            f"already present in df ({len(fraud)} fraud / {len(normal)} non-fraud rows)"
        )

    normal_down = normal.sample(frac=keep_frac, random_state=random_state)
    return pd.concat([fraud, normal_down])
    
def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3, random_state=42):
    """Split ``data_frame`` into train and test parts with a chosen test fraud share.

    The test part holds ``int(len(data_frame) * test_rate)`` rows. Of those,
    ``int(test_fraud_rate * len(data_frame) * test_rate)`` are sampled from the
    ``is_fraud == 1`` rows and the remainder from the ``is_fraud == 0`` rows,
    both without replacement and with the same ``random_state``. Every row
    whose index label does not appear in the test part goes to the train part.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with an ``is_fraud`` column.
    test_fraud_rate : float
        Desired share of fraud rows inside the test part.
    test_rate : float, default 0.3
        Share of all rows assigned to the test part.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)``; ``test_data`` lists the non-fraud rows
        first, then the fraud rows.
    """
    total_rows = len(data_frame)
    fraud_flag = data_frame['is_fraud']

    # Sizes are truncated towards zero, fraud count first.
    n_test_fraud = int(test_fraud_rate * (total_rows * test_rate))
    n_test_normal = int(total_rows * test_rate) - n_test_fraud

    sampled_fraud = data_frame[fraud_flag == 1].sample(
        n=n_test_fraud, replace=False, random_state=random_state
    )
    sampled_normal = data_frame[fraud_flag == 0].sample(
        n=n_test_normal, replace=False, random_state=random_state
    )
    test_data = pd.concat([sampled_normal, sampled_fraud])

    # Train rows are selected by index label, not by position.
    held_out = data_frame.index.isin(test_data.index)
    train_data = data_frame.loc[~held_out]

    return train_data, test_data

In [3]:
# GCN에서 mask를 위한 함수 
def _concat(df_tr, df_tst):   
    df = pd.concat([df_tr, df_tst])
    train_mask = np.concatenate((np.full(len(df_tr), True), np.full(len(df_tst), False)))    # index꼬이는거 방지하기 위해서? ★ (이거,, 훔,,?(
    test_mask =  np.concatenate((np.full(len(df_tr), False), np.full(len(df_tst), True))) 
    mask = (train_mask, test_mask)
    return df, mask

# 거래시간차이 계산 
def _compute_time_difference(sub_df):
    t = sub_df.unix_time.to_numpy() 
    t_diff = abs(t.reshape(-1,1) - t.reshape(1,-1)).reshape(-1,1)
    idx = np.array(list(itertools.product(sub_df.index,sub_df.index)))
    result = np.concatenate([idx,t_diff],axis=1)
    return result

def _parallel_apply(func, data_frames, num_processes=None):
    """
    Apply a function to a list of DataFrames in parallel.
    
    :param func: The function to apply.
    :param data_frames: The list of DataFrames to process.
    :param num_processes: The number of processes to use.
    :return: A list containing the processed results.
    """
    if num_processes is None:
        num_processes = multiprocessing.cpu_count()  # Number of available CPU cores

    with multiprocessing.Pool(processes=num_processes) as pool:
        results = pool.map(func, data_frames)

    return results
    
def df2geodata(df_train,df_test,theta,gamma):
    """Build a ``torch_geometric`` graph from the train and test transactions.

    Nodes are the rows of ``df_train`` followed by the rows of ``df_test``.
    For every ordered pair of rows sharing the same ``cc_num`` a weight
    ``exp(-|unix_time_i - unix_time_j| / theta)`` is computed; pairs with a
    zero time gap (which includes each row paired with itself) get weight 0.
    A pair becomes an edge when its weight is strictly greater than ``gamma``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with ``cc_num``, ``unix_time``, ``amt`` and ``is_fraud`` columns.
    theta : float
        Time scale of the exponential decay.
    gamma : float
        Weight threshold for keeping an edge.

    Returns
    -------
    torch_geometric.data.Data
        ``x`` is the ``amt`` column as a float ``(N, 1)`` tensor, ``y`` the
        ``is_fraud`` column as int64, ``edge_index`` a ``(2, E)`` long tensor,
        and ``train_mask`` / ``test_mask`` the boolean NumPy arrays returned
        by ``_concat``.
    """
    stacked, (train_mask, test_mask) = _concat(df_train, df_test)
    # A fresh 0..N-1 index makes each row's label equal to its node position.
    nodes = stacked.reset_index()

    # One frame per card; the pairwise time gaps are computed per card in parallel.
    per_card = [card_df for _, card_df in nodes.groupby('cc_num')]
    pair_table = np.concatenate(
        _parallel_apply(_compute_time_difference, per_card)
    ).astype(np.float64)
    node_pairs, time_gap = pair_table[:, :2], pair_table[:, 2]

    # The (time_gap != 0) factor zeroes out self-pairs and simultaneous rows.
    weight = (time_gap != 0) * np.exp(-time_gap / theta)
    keep = weight > gamma
    edge_index = torch.tensor(node_pairs[keep]).long().t()

    x = torch.tensor(nodes['amt'].values, dtype=torch.float).reshape(-1, 1)
    y = torch.tensor(nodes['is_fraud'].values, dtype=torch.int64)

    return torch_geometric.data.Data(
        x=x,
        edge_index=edge_index,
        y=y,
        train_mask=train_mask,
        test_mask=test_mask,
    )

# 2. Load Data 

In [4]:
# Load the fraudTrain DataFrame from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only run this cell against a fraudTrain.pkl that comes from a trusted source.
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    

In [5]:
# Sum of the is_fraud column, i.e. the number of rows flagged as fraud.
fraudTrain.is_fraud.sum()

6006

In [6]:
# Mean of the is_fraud column over the full data set; passed to
# split_dataframe as the fraud share of the test set.
FRAUD_RATE = fraudTrain.is_fraud.mean()
FRAUD_RATE

0.005727773406766326

# 3. 데이터의 정리 

## A. 분리 

In [7]:
# Hold out a test set (default test_rate=0.3) whose fraud share is FRAUD_RATE;
# all remaining rows form df_train1.
df_train1, df_test = split_dataframe(fraudTrain,test_fraud_rate=FRAUD_RATE)

In [8]:
# Build seven more training sets from df_train1, one per target fraud rate,
# by downsampling its non-fraud rows.
(
    df_train2, df_train3, df_train4, df_train5,
    df_train6, df_train7, df_train8,
) = (
    throw(df_train1, fraud_rate=rate)
    for rate in (0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5)
)

In [9]:
# Summarise each training set (shape, fraud rate, fraud count) next to the
# shared test set. Every section reports the statistics of its own training
# frame and is labelled with that frame's number.
print("".join(
    f"<df{k}>\n"
    f"training set := {tr.shape}, {tr.is_fraud.mean():.4f}, {tr.is_fraud.sum()}\n"
    f"test set := {df_test.shape}, {df_test.is_fraud.mean():.4f}, {df_test.is_fraud.sum()}\n"
    for k, tr in enumerate(
        (
            df_train1, df_train2, df_train3, df_train4,
            df_train5, df_train6, df_train7, df_train8,
        ),
        start=1,
    )
))

<df1>
training set := (734003, 22), 0.0057, 4205
test set := (314572, 22), 0.0057, 1801
<df2>
training set := (420500, 22), 0.0100, 4205
test set := (314572, 22), 0.0057, 1801
<df3>
training set := (84100, 22), 0.0500, 4205
test set := (314572, 22), 0.0057, 1801
<df4>
training set := (42050, 22), 0.1000, 4205
test set := (314572, 22), 0.0057, 1801
<df5>
training set := (21025, 22), 0.2000, 4205
test set := (314572, 22), 0.0057, 1801
<df6>
training set := (14017, 22), 0.3000, 4205
test set := (314572, 22), 0.0057, 1801
<df2>
training set := (10512, 22), 0.4000, 4205
test set := (314572, 22), 0.0057, 1801
<df8>
training set := (8410, 22), 0.5000, 4205
test set := (314572, 22), 0.0057, 1801



## B. DF저장 

In [10]:
# Write every training frame and the test frame to ./data as CSV, without the index.
csv_targets = {
    "./data/df_train1.csv": df_train1,
    "./data/df_train2.csv": df_train2,
    "./data/df_train3.csv": df_train3,
    "./data/df_train4.csv": df_train4,
    "./data/df_train5.csv": df_train5,
    "./data/df_train6.csv": df_train6,
    "./data/df_train7.csv": df_train7,
    "./data/df_train8.csv": df_train8,
    "./data/df_test.csv": df_test,
}
for csv_path, frame in csv_targets.items():
    frame.to_csv(csv_path, index=False)

## C. Make_Graph

In [11]:
# Graph-construction settings passed to df2geodata:
# THETA is the time scale in the edge weight exp(-|time gap| / theta);
# GAMMA is the threshold an edge weight must exceed for the edge to be kept.
THETA = 1e7
GAMMA = 0.95

In [12]:
# One graph per training set; each is combined with the same df_test and
# built with the same THETA / GAMMA.
(
    torch_geometric_data1, torch_geometric_data2,
    torch_geometric_data3, torch_geometric_data4,
    torch_geometric_data5, torch_geometric_data6,
    torch_geometric_data7, torch_geometric_data8,
) = (
    df2geodata(train_frame, df_test, theta=THETA, gamma=GAMMA)
    for train_frame in (
        df_train1, df_train2, df_train3, df_train4,
        df_train5, df_train6, df_train7, df_train8,
    )
)

In [13]:
# The graphs and the training frames they were built from, in matching order.
torch_geometric_data_list = [
    torch_geometric_data1, torch_geometric_data2,
    torch_geometric_data3, torch_geometric_data4,
    torch_geometric_data5, torch_geometric_data6,
    torch_geometric_data7, torch_geometric_data8,
]
df_train_list = [
    df_train1, df_train2, df_train3, df_train4,
    df_train5, df_train6, df_train7, df_train8,
]

In [14]:
# Attach the settings each graph was built with as underscore-prefixed attributes.
for data, df_train in zip(torch_geometric_data_list, df_train_list):
    metadata = {
        '_train_size': len(df_train),
        '_train_frate': df_train.is_fraud.mean(),
        '_test_size': len(df_test),
        '_test_frate': df_test.is_fraud.mean(),
        '_theta': THETA,
        '_gamma': GAMMA,
    }
    for attr_name, attr_value in metadata.items():
        setattr(data, attr_name, attr_value)

In [15]:
# Pickle each graph to data/, numbering files from 1 and tagging them with THETA and GAMMA.
for i, data in enumerate(torch_geometric_data_list):
    out_path = f'data/torch_geometric_data{i+1}_{THETA:.1e}_{GAMMA}.pkl'
    with open(out_path, 'wb') as f:
        pickle.dump(data, f)