In [1]:
#  This notebook performs feature engineering on transaction data:
#  Key steps:
#  1. Loads transaction data from 'combined_transactions.csv'
#  2. Creates several derived features:
#     - IS_HIGH_AMOUNT: Boolean flag for transactions > 220
#     - CUSTOMER_TX_COUNT_7D: Rolling 7-day transaction count per customer
#     - TERMINAL_FRAUD_COUNT_28D: Rolling 28-day fraud count per terminal
#     - CUSTOMER_AVG_AMOUNT_7D: Rolling 7-day average transaction amount per customer
#     - AMOUNT_TO_AVG_RATIO: Current transaction amount vs customer's 7-day average
#  3. Saves engineered features to 'engineered_transactions.csv'


In [2]:
import pandas as pd

df = pd.read_csv("../outputs/combined_transactions.csv")
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])

df = df.sort_values("TX_DATETIME").reset_index(drop=True)
print(df.head())


   TRANSACTION_ID         TX_DATETIME  CUSTOMER_ID  TERMINAL_ID  TX_AMOUNT  \
0               0 2018-04-01 00:00:31          596         3156      57.16   
1               1 2018-04-01 00:02:10         4961         3412      81.51   
2               2 2018-04-01 00:07:56            2         1365     146.00   
3               3 2018-04-01 00:09:29         4128         8737      64.49   
4               4 2018-04-01 00:10:34          927         9906      50.99   

   TX_TIME_SECONDS  TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               31             0         0                  0  
1              130             0         0                  0  
2              476             0         0                  0  
3              569             0         0                  0  
4              634             0         0                  0  


In [3]:
df['IS_HIGH_AMOUNT'] = df['TX_AMOUNT'] > 220


In [4]:
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\ADMIN\OneDrive\Desktop\internship\fraud_detection\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [5]:
df = df.sort_values(['CUSTOMER_ID', 'TX_DATETIME'])

df['CUSTOMER_TX_COUNT_7D'] = 0

from tqdm import tqdm
import numpy as np

tqdm.pandas()

df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])
customer_ids = df['CUSTOMER_ID'].unique()

for customer_id in tqdm(customer_ids, desc="Processing customers"):
    customer_df = df[df['CUSTOMER_ID'] == customer_id]
    
    tx_times = customer_df['TX_DATETIME']
    counts = []
    
    for i in range(len(tx_times)):
        current_time = tx_times.iloc[i]
        window_start = current_time - pd.Timedelta(days=7)
        count = tx_times[(tx_times > window_start) & (tx_times <= current_time)].count()
        counts.append(count)
    
    df.loc[df['CUSTOMER_ID'] == customer_id, 'CUSTOMER_TX_COUNT_7D'] = counts


Processing customers: 100%|██████████| 4990/4990 [17:14<00:00,  4.82it/s]


In [6]:
from tqdm import tqdm

df = df.sort_values(['TERMINAL_ID', 'TX_DATETIME'])
df['TERMINAL_FRAUD_COUNT_28D'] = 0

for terminal_id in tqdm(df['TERMINAL_ID'].unique(), desc="Processing terminals"):
    terminal_df = df[df['TERMINAL_ID'] == terminal_id]
    
    tx_times = terminal_df['TX_DATETIME']
    fraud_flags = terminal_df['TX_FRAUD']
    
    fraud_counts = []
    for i in range(len(terminal_df)):
        current_time = tx_times.iloc[i]
        window_start = current_time - pd.Timedelta(days=28)
        
        count = fraud_flags[(tx_times > window_start) & (tx_times <= current_time)].sum()
        fraud_counts.append(count)
    
    df.loc[df['TERMINAL_ID'] == terminal_id, 'TERMINAL_FRAUD_COUNT_28D'] = fraud_counts


Processing terminals: 100%|██████████| 10000/10000 [19:38<00:00,  8.49it/s] 


In [7]:
from tqdm import tqdm
df = df.sort_values(['CUSTOMER_ID', 'TX_DATETIME'])
df['CUSTOMER_AVG_AMOUNT_7D'] = 0.0

for customer_id in tqdm(df['CUSTOMER_ID'].unique(), desc="Processing customer averages"):
    customer_df = df[df['CUSTOMER_ID'] == customer_id]
    
    tx_times = customer_df['TX_DATETIME']
    tx_amounts = customer_df['TX_AMOUNT']
    
    avg_amounts = []
    for i in range(len(customer_df)):
        current_time = tx_times.iloc[i]
        window_start = current_time - pd.Timedelta(days=7)
        
        past_amounts = tx_amounts[(tx_times > window_start) & (tx_times <= current_time)]
        avg = past_amounts.mean() if not past_amounts.empty else 0
        avg_amounts.append(avg)
    
    df.loc[df['CUSTOMER_ID'] == customer_id, 'CUSTOMER_AVG_AMOUNT_7D'] = avg_amounts


Processing customer averages: 100%|██████████| 4990/4990 [19:21<00:00,  4.30it/s]


In [8]:
df['AMOUNT_TO_AVG_RATIO'] = df['TX_AMOUNT'] / (df['CUSTOMER_AVG_AMOUNT_7D'] + 1e-5)


In [9]:
df.to_csv("../outputs/engineered_transactions.csv", index=False)
print("✅ Feature engineering complete. Saved to outputs/engineered_transactions.csv")


✅ Feature engineering complete. Saved to outputs/engineered_transactions.csv
