In [None]:
# This code performs the following operations:
# 1. Loads all .pkl files from '../dataset/data' directory
# 2. Combines them into a single DataFrame
# 3. Converts TX_DATETIME column to datetime format
# 4. Sorts data by transaction datetime
# 5. Checks for missing values
# 6. Exports the combined data to CSV

# Key details:
# - Source: Multiple pickle files from ../dataset/data
# - Output: ../outputs/combined_transactions.csv
# - Operations: Concatenation, datetime conversion, sorting

In [None]:
import pandas as pd
import os

# Build one chronologically ordered transactions table from the per-file
# pickles in `data_folder`, report basic diagnostics, and export it as CSV.
data_folder = "../dataset/data"
output_path = "../outputs/combined_transactions.csv"

# Sort the filenames so the concatenation order is deterministic across runs
# (os.listdir returns entries in arbitrary order).
pkl_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.pkl'))
if not pkl_files:
    # pd.concat([]) would raise an opaque "No objects to concatenate";
    # fail early with the same exception type but an actionable message.
    raise ValueError(f"No .pkl files found in {data_folder!r}")

# NOTE(security): unpickling can execute arbitrary code. Only run this cell
# against pickle files that come from a trusted source.
dfs = [pd.read_pickle(os.path.join(data_folder, file)) for file in pkl_files]

df = pd.concat(dfs, ignore_index=True)

# Ensure a true datetime dtype so the sort below is chronological.
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])

# mergesort is stable: rows sharing the same TX_DATETIME keep their
# concatenation order, which makes the exported CSV reproducible.
df = df.sort_values('TX_DATETIME', kind='mergesort').reset_index(drop=True)

print(f"✅ Loaded {len(pkl_files)} files. Shape: {df.shape}")
print(df.head())
print("\nMissing values:\n", df.isnull().sum())

# Create the output directory on first run; to_csv does not create parents.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print("📁 Exported to outputs/combined_transactions.csv")


✅ Loaded 183 files. Shape: (1754155, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0               0 2018-04-01 00:00:31         596        3156      57.16   
1               1 2018-04-01 00:02:10        4961        3412      81.51   
2               2 2018-04-01 00:07:56           2        1365     146.00   
3               3 2018-04-01 00:09:29        4128        8737      64.49   
4               4 2018-04-01 00:10:34         927        9906      50.99   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0              31            0         0                  0  
1             130            0         0                  0  
2             476            0         0                  0  
3             569            0         0                  0  
4             634            0         0                  0  

Missing values:
 TRANSACTION_ID       0
TX_DATETIME          0
CUSTOMER_ID          0
TERMINAL_ID          0
TX_AMOUNT            