In [1]:

from src.baseline.data_ingestion import DataIngestorFactory, DataIngestorConfig
from src.baseline.features_engineering import PreprocessorPipeline
from torch.utils.data import Dataset
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import torch
pd.options.mode.chained_assignment = None  # default='warn'
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")



In [2]:

os.chdir(rf"c:\Users\thuhi\workspace\fraud_detection")
print(os.getcwd())

factory = DataIngestorFactory()
ingestor = factory.create_ingestor("duration_pkl")
train_df, validation_df = ingestor.ingest(
    dir_path=rf"C:\Users\thuhi\workspace\fraud_detection\data\transformed_data",
    start_train_date="2018-07-25",
    train_duration=7,
    test_duration=7,
    delay=7
)

config = DataIngestorConfig()

c:\Users\thuhi\workspace\fraud_detection


In [3]:
class SequentialFraudDetectionDataset(Dataset):
    def __init__(self,df:pd.DataFrame,seq_len =7):
        self.config = DataIngestorConfig()
        self.seq_len = seq_len
        self.num_samples = len(df)
        self.df = df.copy()
        self.df.sort_values("TX_DATETIME",inplace=True)
        self.df.reset_index(drop=True,inplace=True)
        self.df["tmp_idx"] = np.arange(len(self.df))
        for i in range( seq_len+1):
            self.df[f"tx{i}"] = self.df.groupby("CUSTOMER_ID")["tmp_idx"].shift(seq_len-i-1)
        self.df = self.df.sort_values(["CUSTOMER_ID", "TX_DATETIME"]).fillna(self.num_samples)
        # Create a -1 index row with all zero values (matching df columns)
        zero_row = pd.DataFrame({col: [0] for col in df.columns}, index=[self.num_samples ])
        self.df = pd.concat([zero_row, self.df])
        # Precompute features and targets as tensors
        self.df.sort_index(inplace=True)
        self.features = torch.tensor(self.df[self.config.input_features].values, dtype=torch.float32)
        self.targets = torch.tensor(self.df[self.config.output_feature].values, dtype=torch.int8)
        # Precompute sequence indices
        self.tx_indices = torch.tensor(self.df[[f"tx{i}" for i in range(1, seq_len + 1)]].values, dtype=torch.long)
    def __len__(self):
        return self.num_samples
    def __getitem__(self, index):
        # Use precomputed tensors with gathered indices
        #st = time.time()
        tx_ids = self.tx_indices[index]
        # Gather features for the sequence
        features = self.features[tx_ids]
        target = self.targets[index]
        #logger.info(f"time{time.time()-st}")
        return features, target

In [None]:
train_preprocessed = PreprocessorPipeline(train_df,add_method=["scale"]).process()
data = SequentialFraudDetectionDataset(train_preprocessed)
data[1]


(tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.3022e-05, 1.3523e-01, 6.8467e-01, 8.6757e-03, 5.6426e-05, 0.0000e+00],
         [3.3949e-02, 1.3523e-01, 3.6224e-01, 1.5241e-02, 4.6452e-02, 0.0000e+00]]),
 tensor(0, dtype=torch.int8))

: 

In [6]:
class FraudSequenceDataset(torch.utils.data.Dataset):
    
    def __init__(self, x,y,customer_ids, dates, seq_len, padding_mode = 'zeros', output=True):
        'Initialization'
        
        # x,y,customer_ids, and dates must have the same length
        
        # storing the features x in self.features and adding the "padding" transaction at the end
        if padding_mode == "mean":
            self.features = torch.vstack([x, x.mean(axis=0)])
        elif padding_mode == "zeros":
            self.features = torch.vstack([x, torch.zeros(x[0,:].shape)])            
        else:
            raise ValueError('padding_mode must be "mean" or "zeros"')
        self.y = y
        self.customer_ids = customer_ids
        self.dates = dates
        self.seq_len = seq_len
        self.output = output
        
        #===== computing sequences ids =====  
        
        
        df_ids_dates = pd.DataFrame({'CUSTOMER_ID':customer_ids,
        'TX_DATETIME':dates})
        
        df_ids_dates["tmp_index"]  = np.arange(len(df_ids_dates))
        df_groupby_customer_id = df_ids_dates.groupby("CUSTOMER_ID")
        sequence_indices = pd.DataFrame(
            {
                "tx_{}".format(n): df_groupby_customer_id["tmp_index"].shift(seq_len - n - 1)
                for n in range(seq_len)
            }
        )
        
        #replaces -1 (padding) with the index of the padding transaction (last index of self.features)
        self.sequences_ids = sequence_indices.fillna(len(self.features) - 1).values.astype(int)              


    def __len__(self):
        'Denotes the total number of samples'
        # not len(self.features) because of the added padding transaction
        return len(self.customer_ids)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample index
        
        tx_ids = self.sequences_ids[index]
        
        if self.output:
            #transposing because the CNN considers the channel dimension before the sequence dimension
            return self.features[tx_ids,:].transpose(0,1), self.y[index]
        else:
            return self.features[tx_ids,:].transpose(0,1)

In [7]:
x_train = torch.tensor(train_df[config.input_features_transformed].values)
y_train = torch.tensor(train_df[config.output_feature].values)

training_set = FraudSequenceDataset(x_train, y_train,
                                    train_df['CUSTOMER_ID'].values,
                                    train_df['TX_DATETIME'].values,
                                    seq_len=7,
                                    padding_mode="zeros")

In [8]:
training_set.sequences_ids[34216]

array([76792,  8365, 11818, 11842, 18540, 27439, 34216])

In [9]:
dates = train_df['TX_DATETIME'].values
customer_ids = train_df['CUSTOMER_ID'].values

In [10]:
indices_sort = np.argsort(dates)
sorted_dates = dates[indices_sort]
sorted_ids = customer_ids[indices_sort]

In [11]:
unique_customer_ids = np.unique(sorted_ids)
unique_customer_ids[0:10]

array([ 0,  1,  2,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [12]:
idx = 0
current_customer_id = unique_customer_ids[idx]
customer_mask = sorted_ids == current_customer_id
# this is the full sequence of transaction indices (after sort) for customer 0
customer_full_seq = np.where(customer_mask)[0]
# this is the full sequence of transaction indices (before sort) for customer 0
customer_full_seq_original_indices = indices_sort[customer_full_seq]
customer_full_seq_original_indices

array([ 8365, 11818, 11842, 18540, 27439, 34216, 41017, 46994, 50788,
       58551, 61935, 72196, 73411], dtype=int64)

In [13]:
def rolling_window(array, window):
    a = np.concatenate([np.ones((window-1,))*-1,array])
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides).astype(int)
customer_all_seqs = rolling_window(customer_full_seq_original_indices,7)
customer_all_seqs

array([[   -1,    -1,    -1,    -1,    -1,    -1,  8365],
       [   -1,    -1,    -1,    -1,    -1,  8365, 11818],
       [   -1,    -1,    -1,    -1,  8365, 11818, 11842],
       [   -1,    -1,    -1,  8365, 11818, 11842, 18540],
       [   -1,    -1,  8365, 11818, 11842, 18540, 27439],
       [   -1,  8365, 11818, 11842, 18540, 27439, 34216],
       [ 8365, 11818, 11842, 18540, 27439, 34216, 41017],
       [11818, 11842, 18540, 27439, 34216, 41017, 46994],
       [11842, 18540, 27439, 34216, 41017, 46994, 50788],
       [18540, 27439, 34216, 41017, 46994, 50788, 58551],
       [27439, 34216, 41017, 46994, 50788, 58551, 61935],
       [34216, 41017, 46994, 50788, 58551, 61935, 72196],
       [41017, 46994, 50788, 58551, 61935, 72196, 73411]])