In [4]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import os
import duckdb
import kagglehub

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%pip install kagglehub --quiet

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Define file paths
parquet_path = "./raw_data/financial_fraud_detection_dataset.parquet"
cleaned_parquet_path = "./cleaned_data/cleaned_fraud.parquet"

# 1. Check if source Parquet exists
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

print(f"üìÅ Source Parquet: {parquet_path}")
print(f"üìÅ Target Cleaned Parquet: {cleaned_parquet_path}")
print(f"üìä Original size: {os.path.getsize(parquet_path) / (1024**3):.2f} GB")

üìÅ Source Parquet: ./raw_data/financial_fraud_detection_dataset.parquet
üìÅ Target Cleaned Parquet: ./cleaned_data/cleaned_fraud.parquet
üìä Original size: 0.19 GB


In [8]:
# CREATE A NEW DIRECTORY cleaned_parquet_path directory FIRST
os.makedirs(os.path.dirname(cleaned_parquet_path), exist_ok=True)

#
con = duckdb.connect()


In [9]:
# Create a Temporal Table for data cleaning:
con.execute(
    f"""
    CREATE TABLE raw_data AS
    SELECT * 
    FROM read_parquet('{parquet_path}') 
    """
    )

# to see the temporal table raw_data
con.execute(f"""
            SELECT *
            FROM raw_data LIMIT 5
            """).fetch_df()


Unnamed: 0,transaction_id,timestamp,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,fraud_type,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash
0,T100000,2023-08-22 09:22:43.516168,ACC877572,ACC388389,343.78,withdrawal,utilities,Tokyo,mobile,False,,,-0.21,3,0.22,card,13.101.214.112,D8536477
1,T100001,2023-08-04 01:58:02.606711,ACC895667,ACC944962,419.65,withdrawal,online,Toronto,atm,False,,,-0.14,7,0.96,ACH,172.52.47.194,D2622631
2,T100002,2023-05-12 11:39:33.742963,ACC733052,ACC377370,2773.86,deposit,other,London,pos,False,,,-1.78,20,0.89,card,185.98.35.23,D4823498
3,T100003,2023-10-10 06:04:43.195112,ACC996865,ACC344098,1666.22,deposit,online,Sydney,pos,False,,,-0.6,6,0.37,wire_transfer,107.136.36.87,D9961380
4,T100004,2023-09-24 08:09:02.700162,ACC584714,ACC497887,24.43,transfer,utilities,Toronto,mobile,False,,,0.79,13,0.27,ACH,108.161.108.255,D7637601


In [10]:
# List all columns and their types, only for visualization, this line is not necessary

con.execute(
    f"""
    PRAGMA table_info('raw_data')
    """
).fetch_df()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,transaction_id,VARCHAR,False,,False
1,1,timestamp,TIMESTAMP,False,,False
2,2,sender_account,VARCHAR,False,,False
3,3,receiver_account,VARCHAR,False,,False
4,4,amount,DOUBLE,False,,False
5,5,transaction_type,VARCHAR,False,,False
6,6,merchant_category,VARCHAR,False,,False
7,7,location,VARCHAR,False,,False
8,8,device_used,VARCHAR,False,,False
9,9,is_fraud,BOOLEAN,False,,False


In [11]:
#raw_data is the SQL temporal table. 

# feature engineering: divide timestamp in diferent columns: month, day, hour.
# Create new column for Day of the week. 

con.execute(f"""
            CREATE TABLE fraud_data_clean AS
            SELECT
                *,
                EXTRACT (YEAR FROM timestamp) AS year,
                EXTRACT (MONTH FROM timestamp) AS month,
                EXTRACT (DAY FROM timestamp)    AS day_of_month,
                EXTRACT (HOUR FROM timestamp) AS hour,
                EXTRACT (ISODOW FROM timestamp) AS day_of_week
            FROM raw_data
            """)

<_duckdb.DuckDBPyConnection at 0x1eb4c0bc2b0>

In [12]:
# to visualize the output

con.execute(f"""
            SELECT *
            FROM fraud_data_clean
            """).fetch_df()




Unnamed: 0,transaction_id,timestamp,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,...,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash,year,month,day_of_month,hour,day_of_week
0,T100000,2023-08-22 09:22:43.516168,ACC877572,ACC388389,343.78,withdrawal,utilities,Tokyo,mobile,False,...,3,0.22,card,13.101.214.112,D8536477,2023,8,22,9,2
1,T100001,2023-08-04 01:58:02.606711,ACC895667,ACC944962,419.65,withdrawal,online,Toronto,atm,False,...,7,0.96,ACH,172.52.47.194,D2622631,2023,8,4,1,5
2,T100002,2023-05-12 11:39:33.742963,ACC733052,ACC377370,2773.86,deposit,other,London,pos,False,...,20,0.89,card,185.98.35.23,D4823498,2023,5,12,11,5
3,T100003,2023-10-10 06:04:43.195112,ACC996865,ACC344098,1666.22,deposit,online,Sydney,pos,False,...,6,0.37,wire_transfer,107.136.36.87,D9961380,2023,10,10,6,2
4,T100004,2023-09-24 08:09:02.700162,ACC584714,ACC497887,24.43,transfer,utilities,Toronto,mobile,False,...,13,0.27,ACH,108.161.108.255,D7637601,2023,9,24,8,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,T5099995,2023-11-17 23:20:29.746144,ACC597319,ACC749300,10.87,withdrawal,retail,Toronto,atm,False,...,17,0.18,UPI,243.92.38.163,D4439579,2023,11,17,23,5
4999996,T5099996,2023-09-23 11:23:20.659686,ACC749625,ACC709783,181.40,payment,grocery,Sydney,atm,False,...,4,0.58,wire_transfer,28.252.18.249,D5029311,2023,9,23,11,6
4999997,T5099997,2023-11-18 00:52:34.527092,ACC629492,ACC680736,12.54,payment,utilities,New York,mobile,False,...,6,0.99,card,111.199.174.121,D6333607,2023,11,18,0,6
4999998,T5099998,2023-03-25 04:32:13.609837,ACC984720,ACC296935,376.29,deposit,restaurant,Dubai,pos,False,...,5,0.32,wire_transfer,221.110.215.14,D1551203,2023,3,25,4,6


In [13]:
# Feature excluded: from fraud_data_clean. In DuckDB, one query is needed to alter each single column,  more than one column cannot be altered with a single query.
#Drop timestamp. 
#Drop fraud_type
#Drop transaction_id


con.execute(f"ALTER TABLE fraud_data_clean DROP COLUMN timestamp")
con.execute(f"ALTER TABLE fraud_data_clean DROP COLUMN fraud_type")
con.execute(f"ALTER TABLE fraud_data_clean DROP COLUMN transaction_id")


con.execute(f"""
            SELECT *
            FROM fraud_data_clean
            """).fetch_df()


Unnamed: 0,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash,year,month,day_of_month,hour,day_of_week
0,ACC877572,ACC388389,343.78,withdrawal,utilities,Tokyo,mobile,False,,-0.21,3,0.22,card,13.101.214.112,D8536477,2023,8,22,9,2
1,ACC895667,ACC944962,419.65,withdrawal,online,Toronto,atm,False,,-0.14,7,0.96,ACH,172.52.47.194,D2622631,2023,8,4,1,5
2,ACC733052,ACC377370,2773.86,deposit,other,London,pos,False,,-1.78,20,0.89,card,185.98.35.23,D4823498,2023,5,12,11,5
3,ACC996865,ACC344098,1666.22,deposit,online,Sydney,pos,False,,-0.60,6,0.37,wire_transfer,107.136.36.87,D9961380,2023,10,10,6,2
4,ACC584714,ACC497887,24.43,transfer,utilities,Toronto,mobile,False,,0.79,13,0.27,ACH,108.161.108.255,D7637601,2023,9,24,8,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,ACC597319,ACC749300,10.87,withdrawal,retail,Toronto,atm,False,1416.524233,-0.14,17,0.18,UPI,243.92.38.163,D4439579,2023,11,17,23,5
4999996,ACC749625,ACC709783,181.40,payment,grocery,Sydney,atm,False,999.089702,-1.79,4,0.58,wire_transfer,28.252.18.249,D5029311,2023,9,23,11,6
4999997,ACC629492,ACC680736,12.54,payment,utilities,New York,mobile,False,3871.584025,-0.30,6,0.99,card,111.199.174.121,D6333607,2023,11,18,0,6
4999998,ACC984720,ACC296935,376.29,deposit,restaurant,Dubai,pos,False,-4096.765453,-1.43,5,0.32,wire_transfer,221.110.215.14,D1551203,2023,3,25,4,6


In [14]:
#Drop NULL rows from time_since_last_transaction
con.execute(f"""
    DELETE FROM fraud_data_clean
            WHERE time_since_last_transaction IS NULL
""")

con.execute(f"""
            SELECT *
            FROM fraud_data_clean
            """).fetch_df()




Unnamed: 0,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash,year,month,day_of_month,hour,day_of_week
0,ACC420214,ACC222629,318.12,withdrawal,restaurant,Tokyo,pos,False,-4797.552868,-0.94,16,0.64,UPI,88.85.250.147,D3353785,2023,4,25,14,2
1,ACC759858,ACC433871,25.03,transfer,online,Dubai,pos,False,3705.738348,-0.56,1,0.48,ACH,89.235.76.67,D4950912,2023,8,17,1,4
2,ACC702235,ACC658588,5.33,transfer,online,Toronto,pos,False,2158.906433,0.77,7,0.18,ACH,132.247.155.53,D9285320,2023,12,28,23,4
3,ACC818001,ACC846452,261.11,payment,entertainment,Tokyo,atm,False,-71.393848,0.43,12,0.41,wire_transfer,186.251.230.65,D4842173,2023,8,18,9,5
4,ACC293626,ACC440136,28.61,transfer,retail,London,pos,False,1400.413482,-1.48,18,0.53,UPI,233.115.221.14,D7106200,2023,10,30,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4103482,ACC597319,ACC749300,10.87,withdrawal,retail,Toronto,atm,False,1416.524233,-0.14,17,0.18,UPI,243.92.38.163,D4439579,2023,11,17,23,5
4103483,ACC749625,ACC709783,181.40,payment,grocery,Sydney,atm,False,999.089702,-1.79,4,0.58,wire_transfer,28.252.18.249,D5029311,2023,9,23,11,6
4103484,ACC629492,ACC680736,12.54,payment,utilities,New York,mobile,False,3871.584025,-0.30,6,0.99,card,111.199.174.121,D6333607,2023,11,18,0,6
4103485,ACC984720,ACC296935,376.29,deposit,restaurant,Dubai,pos,False,-4096.765453,-1.43,5,0.32,wire_transfer,221.110.215.14,D1551203,2023,3,25,4,6


In [15]:
# Select  name of columns
con.execute(
    f"""
    SELECT name AS column_name
    FROM pragma_table_info('fraud_data_clean')
    """
).fetch_df()

Unnamed: 0,column_name
0,sender_account
1,receiver_account
2,amount
3,transaction_type
4,merchant_category
5,location
6,device_used
7,is_fraud
8,time_since_last_transaction
9,spending_deviation_score


In [16]:
# Found columns with null values

con.execute(
    f"""
    SELECT
        COUNT(*) FILTER (WHERE sender_account IS NULL)               AS sender_account_nulls,
        COUNT(*) FILTER (WHERE receiver_account IS NULL)             AS receiver_account_nulls,
        COUNT(*) FILTER (WHERE amount IS NULL)                       AS amount_nulls,
        COUNT(*) FILTER (WHERE transaction_type IS NULL)             AS transaction_type_nulls,
        COUNT(*) FILTER (WHERE merchant_category IS NULL)            AS merchant_category_nulls,
        COUNT(*) FILTER (WHERE location IS NULL)                     AS location_nulls,
        COUNT(*) FILTER (WHERE device_used IS NULL)                  AS device_used_nulls,
        COUNT(*) FILTER (WHERE is_fraud IS NULL)                     AS is_fraud_nulls,
        COUNT(*) FILTER (WHERE time_since_last_transaction IS NULL)  AS time_since_last_transaction_nulls,
        COUNT(*) FILTER (WHERE spending_deviation_score IS NULL)     AS spending_deviation_score_nulls,
        COUNT(*) FILTER (WHERE velocity_score IS NULL)               AS velocity_score_nulls,
        COUNT(*) FILTER (WHERE geo_anomaly_score IS NULL)            AS geo_anomaly_score_nulls,
        COUNT(*) FILTER (WHERE payment_channel IS NULL)              AS payment_channel_nulls,
        COUNT(*) FILTER (WHERE ip_address IS NULL)                   AS ip_address_nulls,
        COUNT(*) FILTER (WHERE device_hash IS NULL)                  AS device_hash_nulls,
        COUNT(*) FILTER (WHERE  year IS NULL)                        AS year_nulls,
        COUNT(*) FILTER (WHERE month IS NULL)                        AS month_nulls,
        COUNT(*) FILTER (WHERE day_of_month IS NULL)                 AS day_of_month_nulls,
        COUNT(*) FILTER (WHERE hour IS NULL)                         AS hour_nulls,
        COUNT(*) FILTER (WHERE day_of_week IS NULL)                 AS day_of_week_nulls,
FROM fraud_data_clean
"""
).fetch_df()

Unnamed: 0,sender_account_nulls,receiver_account_nulls,amount_nulls,transaction_type_nulls,merchant_category_nulls,location_nulls,device_used_nulls,is_fraud_nulls,time_since_last_transaction_nulls,spending_deviation_score_nulls,velocity_score_nulls,geo_anomaly_score_nulls,payment_channel_nulls,ip_address_nulls,device_hash_nulls,year_nulls,month_nulls,day_of_month_nulls,hour_nulls,day_of_week_nulls
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
# Determine number of positive and negative fraud cases after remove null cases
# Check all distinct values and their frequencies in is_fraud
con.execute(
    f"""
    SELECT
        is_fraud,
        COUNT(*) AS cnt
    FROM fraud_data_clean
    GROUP BY is_fraud
    ORDER BY cnt DESC
    """
).fetch_df()

Unnamed: 0,is_fraud,cnt
0,False,3923934
1,True,179553


In [18]:
# save the clean table in a parquet file. path: cleaned_parquet_path = "./cleaned_data/cleaned_fraud.parquet"
con.execute(f"""
    COPY fraud_data_clean
    TO '{cleaned_parquet_path}'
    (FORMAT 'PARQUET', COMPRESSION 'zstd')
""")

# TO VERIFY IF THE FILE EXIST: 
print("‚úÖ Archivo encontrado" if os.path.exists(cleaned_parquet_path) else "‚ùå No encontrado")


‚úÖ Archivo encontrado


In [19]:
# 1. Check if source Parquet exists
if not os.path.exists(cleaned_parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {cleaned_parquet_path}")

print(f"üìÅ Target Cleaned Parquet: {cleaned_parquet_path}")
print(f"üìä Original size: {os.path.getsize(cleaned_parquet_path) / (1024**3):.2f} GB")

üìÅ Target Cleaned Parquet: ./cleaned_data/cleaned_fraud.parquet
üìä Original size: 0.14 GB


# For future consideration: DROP Time_since_last_transaction
