In [4]:

from src.components.data_ingestion import DataIngestorFactory, DataIngestorConfig
from src.components.features_engineering import PreprocessorPipeline
from src.components.exception import CustomException
from torch.utils.data import Dataset
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import torch
import sys

pd.options.mode.chained_assignment = None  # default='warn'
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")



In [5]:
os.chdir(rf"c:\Users\thuhi\workspace\fraud_detection")
print(os.getcwd())

batch_size = 512
epoch = 20
patience = 5
pos_mul = 3
hidden_size = 128
start_date = "2018-10-01"
end_train_date = "2022-12-31"
start_test_date = "2018-10-01"
end_test_date = "2022-12-31"
c_seq_len = 7
t_seq_len = 7
lstm_hidden_size=256  # Hyperparameter to tune
combiner_hidden_size=512 
fast_dev_run=False
config = DataIngestorConfig()


factory = DataIngestorFactory()
ingestor = factory.create_ingestor("duration_pkl")
train_df, validation_df = ingestor.ingest(
    dir_path=rf"C:\Users\thuhi\workspace\fraud_detection\data\transformed_upsampled_data",
    start_train_date=start_date,
    end_train_date= end_train_date,
    end_test_date= end_test_date,
    start_test_date= start_test_date)

c:\Users\thuhi\workspace\fraud_detection
2025-06-25 11:39:25,360 - INFO - Train: 2018-10-01 - 2022-12-31
2025-06-25 11:39:26,597 - INFO - Test: 2018-10-01 - 2022-12-31


In [None]:
class SequentialFraudDetectionDataset(Dataset):
    def __init__(self,df:pd.DataFrame,seq_len =5,transformed = "agg"):
        try:
            self.df = df.copy()
            self.df.sort_values("TX_DATETIME",inplace=True)
            self.df.reset_index(drop=True,inplace=True)
            self.df["tmp_tx"] = self.df.index
            self.num_samples = len(self.df)
            config = DataIngestorConfig()
            

            self.target =  torch.tensor(df.loc[:,config.output_feature].values,dtype=torch.int8)
            self.target = torch.hstack([self.target,torch.tensor(0)])
            if transformed == "agg":
                features = config.input_features_transformed
            elif transformed == "customer":
                features =  config.input_features_customer
            elif transformed == "terminal":
                features =  config.input_features_terminal
            else:
                features =  config.input_features
            features_cols = torch.tensor(df.loc[:,features].values,dtype=torch.float32)
            padding = torch.zeros(1,features_cols.shape[1],dtype=torch.float32)
            self.features = torch.vstack([features_cols,padding])
            padding_idx = self.num_samples

            if transformed == "terminal":
                by= "TERMINAL_ID"
            else:
                by = "CUSTOMER_ID"
            df_for_indices = pd.DataFrame({
            by: self.df[by],
            'tmp_idx': np.arange(self.num_samples) # Values 0...9999
            })

            df_groupby = df_for_indices.groupby(by)
    
            sequences = {
                # shift(0) is the current transaction, shift(1) is the one before, etc.
                f"tx_step_{i}": df_groupby['tmp_idx'].shift(i)
                for i in range(seq_len - 1, -1, -1)
            }
            sequences_df = pd.DataFrame(sequences)
            self.tx_indices = torch.tensor(sequences_df.fillna(padding_idx).values, dtype=torch.long)
        except Exception as e:
            raise CustomException(e,sys)
    def __len__(self):
        return self.num_samples
    def __getitem__(self, index):
        # Use precomputed tensors with gathered indices
        #st = time.time()
        tx_ids = self.tx_indices[index]
        # Gather features for the sequence
        features = self.features[tx_ids]
        target = self.target[tx_ids]
        #logger.info(f"time{time.time()-st}")
        return features, target

train_preprocessed = PreprocessorPipeline(train_df,add_method=["scale"]).process()
data= SequentialFraudDetectionDataset(train_preprocessed)
data1 = SequentialFraudDetectionDataset(train_preprocessed,transformed='customer',seq_len=7)
data2 = SequentialFraudDetectionDataset(train_preprocessed,transformed='terminal',seq_len=7) 
# Find the first i such that the last value in data2.tx_indices[i] is 15724
data2.tx_indices[34396]

In [41]:
class BaggingSequentialFraudDetectionDataset(Dataset):
    def __init__(self, df: pd.DataFrame, c_seq_len=5,t_seq_len=5,mode=None):
        try:
            self.mode =mode
            self.config = DataIngestorConfig()
            
            self.t_historical_len = t_seq_len - 1
            self.c_historical_len = c_seq_len - 1
            self.df_source = df.copy().sort_values("TX_DATETIME").reset_index(drop=True)
            self.num_samples = len(self.df_source)

            # --- 1. Prepare Target and Current Transaction Features ---
            self.targets = torch.tensor(self.df_source[self.config.output_feature].values, dtype=torch.int8)
            target_padding = torch.zeros(1, dtype=torch.float32)
            self.targets = torch.hstack([self.targets, target_padding])
            # The "current" stream gets all transformed features
            self.current_tx_features = torch.tensor(self.df_source[self.config.input_features_transformed].values, dtype=torch.float32)

            # --- 2. Prepare Customer History Stream ---
            customer_features_tensor = torch.tensor(self.df_source[self.config.input_features_customer].values, dtype=torch.float32)
            cust_padding = torch.zeros(1, customer_features_tensor.shape[1], dtype=torch.float32)
            self.customer_features_pool = torch.vstack([customer_features_tensor, cust_padding])
            cust_padding_idx = len(self.customer_features_pool) - 1

            # --- 3. Prepare Terminal History Stream ---
            terminal_features_tensor = torch.tensor(self.df_source[self.config.input_features_terminal].values, dtype=torch.float32)
            term_padding = torch.zeros(1, terminal_features_tensor.shape[1], dtype=torch.float32)
            self.terminal_features_pool = torch.vstack([terminal_features_tensor, term_padding])
            term_padding_idx = len(self.terminal_features_pool) - 1

            # --- 4. Create Index Mappings for Both History Types ---
            df_for_indices = pd.DataFrame({
                'CUSTOMER_ID': self.df_source['CUSTOMER_ID'],
                'TERMINAL_ID': self.df_source['TERMINAL_ID'],
                'tmp_idx': np.arange(self.num_samples)
            })

            # Customer history indices (looks back 1 to `historical_len` steps)
            cust_groupby = df_for_indices.groupby('CUSTOMER_ID')
            cust_sequences = {f"tx_{i}": cust_groupby['tmp_idx'].shift(i) for i in range(1, self.c_historical_len + 1)}
            self.customer_indices = torch.tensor(pd.DataFrame(cust_sequences).fillna(cust_padding_idx).values, dtype=torch.long)
            
            # Terminal history indices (looks back 1 to `historical_len` steps)
            term_groupby = df_for_indices.groupby('TERMINAL_ID')
            term_sequences = {f"tx_{i}": term_groupby['tmp_idx'].shift(i) for i in range(1, self.t_historical_len + 1)}
            self.terminal_indices = torch.tensor(pd.DataFrame(term_sequences).fillna(term_padding_idx).values, dtype=torch.long)

        except Exception as e:
            raise CustomException(e, sys)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        # --- Assemble Customer History ---
        cust_hist_indices = self.customer_indices[index]
        customer_history_seq = self.customer_features_pool[cust_hist_indices]

        # --- Assemble Terminal History ---
        term_hist_indices = self.terminal_indices[index]
        terminal_history_seq = self.terminal_features_pool[term_hist_indices]

        # --- Get Current Transaction Info ---
        current_tx = self.current_tx_features[index]

        # --- Get Target Label ---
        target = self.targets[index]
        if not self.mode:
            return (customer_history_seq, terminal_history_seq, current_tx), target
        if self.mode =="eda":
            return (customer_history_seq, terminal_history_seq, current_tx), (self.targets[cust_hist_indices],self.targets[term_hist_indices],self.targets[index]),(term_hist_indices,cust_hist_indices)
      

In [43]:
data = BaggingSequentialFraudDetectionDataset(df = validation_df,mode="eda")
data.df_source.sort_values(["TERMINAL","TX_DATETIME"])

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DURING_WEEKEND,...,CUSTOMER_ID_NB_TX_7DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,CUSTOMER_ID_NB_TX_30DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW,TERMINAL_ID_NB_TX_1DAY_WINDOW,TERMINAL_ID_RISK_1DAY_WINDOW,TERMINAL_ID_NB_TX_7DAY_WINDOW,TERMINAL_ID_RISK_7DAY_WINDOW,TERMINAL_ID_NB_TX_30DAY_WINDOW,TERMINAL_ID_RISK_30DAY_WINDOW
3249,1375940,2018-10-07 15:29:21.064706692,0,2352,29.610961,12400251,143,1,2,1,...,7.0,49.597143,7.0,49.597143,2.0,0.0,7.0,0.0,33.0,0.00000
16755,1375940,2018-11-02 20:50:28.048204726,0,2352,25.326738,12400251,143,1,2,0,...,26.0,183.921888,110.0,200.200998,0.0,0.0,0.0,0.0,3.0,1.00000
30111,1375940,2018-11-28 15:42:09.511672811,0,2352,27.792566,12400251,143,1,2,0,...,22.0,54.860909,66.0,54.323788,2.0,0.0,8.0,0.0,26.0,0.00000
42915,1375940,2018-12-23 10:54:18.609418019,0,2352,19.919250,12400251,143,1,2,1,...,2.0,87.453758,10.0,73.547575,1.0,0.0,9.0,0.0,37.0,0.00000
56955,1375940,2019-01-19 14:21:34.514412725,0,2352,31.341608,12400251,143,1,2,1,...,1.0,19.210627,2.0,18.978355,0.0,0.0,5.0,0.0,34.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312959,861561,2020-07-10 08:23:35.560254504,4997,4876,15.904438,7752513,89,1,2,0,...,11.0,43.795455,61.0,44.518689,1.0,0.0,9.0,0.0,40.0,0.00000
322849,861561,2020-08-06 09:34:07.007461461,4997,4876,21.698386,7752513,89,1,2,0,...,15.0,106.480000,59.0,93.556102,2.0,0.0,9.0,0.0,21.0,0.00000
330867,861561,2020-08-30 10:45:57.621053821,4997,4876,17.864790,7752513,89,1,2,1,...,21.0,8.675714,89.0,13.240787,1.0,1.0,8.0,1.0,32.0,0.90625
339468,861561,2020-09-25 17:42:43.299233160,4997,4876,21.503373,7752513,89,1,2,0,...,23.0,83.436087,69.0,79.855362,2.0,0.0,10.0,0.0,50.0,0.00000


In [None]:
data[527046]

((tensor([[  1.0000,  66.7018,   2.0000, 112.8738,   4.0000, 101.7886],
          [  1.0000,  80.1800,  14.0000,  90.3614,  50.0000,  93.1520],
          [  4.0000,  39.1000,  27.0000,  36.8574,  89.0000,  39.9558],
          [  2.0000,  33.5650,  25.0000,  36.2352,  89.0000,  39.7064]]),
  tensor([[ 1.,  0.,  7.,  0., 40.,  0.],
          [ 1.,  0.,  7.,  0., 35.,  0.],
          [ 0.,  0., 10.,  0., 37.,  0.],
          [ 1.,  0.,  7.,  0., 37.,  0.]]),
  tensor([1.0205e+02, 1.0000e+00, 1.0000e+00, 2.0000e+00, 9.0703e+01, 3.0000e+00,
          9.8091e+01, 2.9000e+01, 9.5921e+01, 1.0000e+00, 0.0000e+00, 1.0000e+01,
          0.0000e+00, 4.3000e+01, 2.3256e-02])),
 (tensor([0., 1., 0., 0.]), tensor([0., 1., 0., 0.]), tensor(0.)),
 (tensor([527045, 527044, 527043, 527042]),
  tensor([527045, 527044, 527043, 527042])))

: 