## Goal

The objective of this notebook is to construct meaningful and predictive features from the cleaned transactional, user, and merchant datasets. These features will be used for downstream modeling tasks such as fraud detection. The process includes:

- Merging relevant data sources
- Extracting temporal and behavioral signals
- Encoding categorical variables
- Creating domain-informed ratios and transformations
- Scaling numerical variables
- Saving the processed dataset for modeling


In [1]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

In [2]:
# Data locations, relative to the notebook: interim inputs and processed outputs.
DATA_INTERIM_DIR = Path("../data/interim")
DATA_PROCESSED_DIR = Path("../data/processed")

# Plot defaults: white-grid seaborn style and 12x6 figures.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [3]:
# Read the three interim tables in one pass; the order of the file names
# matches the order of the target variables.
transactions, users, merchants = (
    pd.read_parquet(DATA_INTERIM_DIR / parquet_name)
    for parquet_name in (
        "transaction_locations_dropped.parquet",
        "users_log_transformed.parquet",
        "merchants_binned.parquet",
    )
)

In [4]:
# Attach user and merchant attributes to every transaction (inner joins).
# validate="many_to_one" makes pandas raise MergeError if `user_id` or
# `merchant_id` is duplicated in its lookup table; without the check such
# duplicates would silently multiply transaction rows.
# NOTE(review): the `country_x` / `country_y` columns in the result appear to be
# the user's and the merchant's country respectively (default merge suffixes) —
# confirm against the interim schemas.
df = (
    transactions
    .merge(users, on="user_id", validate="many_to_one")
    .merge(merchants, on="merchant_id", validate="many_to_one")
)

In [5]:
# Report (rows, columns) of the merged frame.
print(f"Initial shape: {df.shape}")

Initial shape: (500000, 33)


In [6]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,transaction_id,timestamp,user_id,merchant_id,amount,channel,currency,device,payment_method,is_international,...,category,country_y,trust_score,number_of_alerts_last_6_months,avg_transaction_amount,account_age_months,has_fraud_history,log_avg_transaction_amount,alerts_binned,account_age_group
0,TX000000,2022-06-17 23:28:00,U14804,M0314,130.03,in-store,EUR,Android,debit_card,1,...,travel,France,0.581711,3,74.97,23,1,4.330339,1-3,Mid
1,TX000001,2022-01-04 15:39:00,U16634,M0675,132.0,online,EUR,Android,debit_card,1,...,electronics,Germany,0.568933,2,56.92,23,0,4.059063,1-3,Mid
2,TX000002,2022-09-09 21:58:00,U18005,M0479,8.65,online,EUR,Android,credit_card,1,...,gaming,Denmark,0.60807,5,98.93,75,0,4.60447,4+,Established
3,TX000003,2023-11-20 06:40:00,U13690,M0538,19.82,mobile,EUR,iOS,credit_card,0,...,electronics,Portugal,0.592656,4,21.6,82,0,3.11795,4+,Established
4,TX000004,2022-04-28 08:08:00,U04642,M0128,101.92,in-store,EUR,Android,credit_card,1,...,gaming,Spain,0.450223,4,60.61,82,0,4.120824,4+,Established


In [7]:
# List all column names of the merged frame (note the `_x` / `_y` suffixes
# produced by the merge for the duplicated `country` column).
df.columns

Index(['transaction_id', 'timestamp', 'user_id', 'merchant_id', 'amount',
       'channel', 'currency', 'device', 'payment_method', 'is_international',
       'session_length_seconds', 'is_first_time_merchant', 'is_fraud', 'age',
       'sex', 'education', 'primary_source_of_income', 'country_x',
       'signup_date', 'risk_score', 'account_age_days',
       'log_monthly_installments', 'log_monthly_expenses', 'category',
       'country_y', 'trust_score', 'number_of_alerts_last_6_months',
       'avg_transaction_amount', 'account_age_months', 'has_fraud_history',
       'log_avg_transaction_amount', 'alerts_binned', 'account_age_group'],
      dtype='object')

In [8]:
# Summary statistics for the numeric and datetime columns of the merged frame.
df.describe()

Unnamed: 0,timestamp,amount,is_international,session_length_seconds,is_first_time_merchant,is_fraud,age,signup_date,risk_score,account_age_days,log_monthly_installments,log_monthly_expenses,trust_score,number_of_alerts_last_6_months,avg_transaction_amount,account_age_months,has_fraud_history,log_avg_transaction_amount
count,500000,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,2022-12-31 16:45:04.497720576,49.915662,0.928662,614.987182,0.501248,0.084822,45.951568,2022-10-06 13:27:43.660799744,0.501592,451.439078,5.154037,6.512901,0.682945,3.056854,81.851645,61.546824,0.520824,3.898029
min,2022-01-01 00:06:00,0.0,0.0,30.0,0.0,0.0,18.0,2020-04-21 00:00:00,0.0,-446.0,0.00995,0.039221,0.0,0.0,0.12,6.0,0.0,0.113329
25%,2022-07-01 11:03:45,14.37,1.0,323.0,0.0,0.0,32.0,2021-07-09 00:00:00,0.398761,3.0,4.482211,5.87116,0.546098,2.0,24.72,32.0,0.0,3.247269
50%,2022-12-31 20:08:30,34.56,1.0,614.0,1.0,0.0,46.0,2022-10-08 00:00:00,0.500496,450.0,5.339123,6.716825,0.697761,3.0,58.29,61.0,1.0,4.082441
75%,2023-07-02 08:45:15,69.1725,1.0,907.0,1.0,0.0,60.0,2023-12-29 00:00:00,0.603615,906.0,6.037871,7.409403,0.826846,4.0,117.19,92.0,1.0,4.772293
max,2023-12-31 23:57:00,714.86,1.0,1200.0,1.0,1.0,74.0,2025-03-22 00:00:00,1.0,1350.0,8.060445,9.305226,1.0,9.0,709.04,119.0,1.0,6.565321
std,,49.990657,0.257389,338.091496,0.499999,0.278617,16.525405,,0.149917,519.505291,1.224402,1.258722,0.194412,1.722031,79.139077,33.427673,0.499567,1.165481


In [9]:
# Per-column dtype and non-null count, to confirm there are no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 33 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   transaction_id                  500000 non-null  object        
 1   timestamp                       500000 non-null  datetime64[ns]
 2   user_id                         500000 non-null  object        
 3   merchant_id                     500000 non-null  object        
 4   amount                          500000 non-null  float64       
 5   channel                         500000 non-null  object        
 6   currency                        500000 non-null  object        
 7   device                          500000 non-null  object        
 8   payment_method                  500000 non-null  object        
 9   is_international                500000 non-null  int64         
 10  session_length_seconds          500000 non-null  int64  

In [10]:
# Text columns to be treated as categoricals.
categorical_cols = [
    "channel",
    "currency",
    "device",
    "payment_method",
    "sex",
    "education",
    "primary_source_of_income",
    "alerts_binned",
    "account_age_group",
]

# Cast all of them in a single astype call via a column -> dtype mapping.
df = df.astype({name: "category" for name in categorical_cols})

df[categorical_cols].dtypes

channel                     category
currency                    category
device                      category
payment_method              category
sex                         category
education                   category
primary_source_of_income    category
alerts_binned               category
account_age_group           category
dtype: object

In [11]:
# One-hot encoding of the categorical columns declared in `categorical_cols`.
# drop_first=True omits the first level of each column, so k levels yield
# k-1 indicator columns.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(f"Shape after one-hot encoding: {df_encoded.shape}")
df_encoded.head()

Shape after one-hot encoding: (500000, 46)


Unnamed: 0,transaction_id,timestamp,user_id,merchant_id,amount,is_international,session_length_seconds,is_first_time_merchant,is_fraud,age,...,education_PhD,primary_source_of_income_Employment,primary_source_of_income_Retirement,primary_source_of_income_Savings,primary_source_of_income_Student Aid,primary_source_of_income_Unemployment,alerts_binned_1-3,alerts_binned_4+,account_age_group_Mid,account_age_group_New
0,TX000000,2022-06-17 23:28:00,U14804,M0314,130.03,1,145,0,0,23,...,False,False,False,True,False,False,True,False,True,False
1,TX000001,2022-01-04 15:39:00,U16634,M0675,132.0,1,32,1,0,53,...,True,False,False,False,False,False,True,False,True,False
2,TX000002,2022-09-09 21:58:00,U18005,M0479,8.65,1,604,1,0,58,...,False,False,False,False,False,False,False,True,False,False
3,TX000003,2023-11-20 06:40:00,U13690,M0538,19.82,0,1031,1,0,59,...,False,True,False,False,False,False,False,True,False,False
4,TX000004,2022-04-28 08:08:00,U04642,M0128,101.92,1,330,0,0,28,...,True,False,False,False,False,True,False,True,False,False


In [12]:
 print(df_encoded.columns[-10:])

Index(['education_PhD', 'primary_source_of_income_Employment',
       'primary_source_of_income_Retirement',
       'primary_source_of_income_Savings',
       'primary_source_of_income_Student Aid',
       'primary_source_of_income_Unemployment', 'alerts_binned_1-3',
       'alerts_binned_4+', 'account_age_group_Mid', 'account_age_group_New'],
      dtype='object')


In [13]:
# Numeric columns to standardize (zero mean, unit variance).
numeric_cols = [
    "amount",
    "session_length_seconds",
    "age",
    "risk_score",
    "account_age_days",
    "log_monthly_installments",
    "log_monthly_expenses",
    "trust_score",
    "number_of_alerts_last_6_months",
    "avg_transaction_amount",
    "account_age_months",
    "log_avg_transaction_amount",
]

# Scaling: learn mean/std per column, then overwrite the columns with their
# standardized values.
# NOTE(review): the scaler is fit on every row of the dataset; if a train/test
# split happens downstream, this leaks test statistics — confirm.
scaler = StandardScaler().fit(df_encoded[numeric_cols])
df_encoded[numeric_cols] = scaler.transform(df_encoded[numeric_cols])

df_encoded[numeric_cols].describe()

Unnamed: 0,amount,session_length_seconds,age,risk_score,account_age_days,log_monthly_installments,log_monthly_expenses,trust_score,number_of_alerts_last_6_months,avg_transaction_amount,account_age_months,log_avg_transaction_amount
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,8.777334000000001e-17,1.131397e-16,-1.078746e-16,6.262084e-16,8.665069e-18,-4.519478e-16,9.915766e-16,8.799859e-16,2.1374900000000002e-17,-3.187992e-16,-2.053469e-17,-7.436611e-16
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-0.9985008,-1.730265,-1.691432,-3.345797,-1.72749,-4.201309,-5.143062,-3.512875,-1.775146,-1.032761,-1.661703,-3.247333
25%,-0.7110468,-0.8636345,-0.8442505,-0.685923,-0.8632049,-0.5486977,-0.5098359,-0.7039029,-0.6137258,-0.7219152,-0.883904,-0.5583626
50%,-0.307171,-0.002919869,0.002930763,-0.007315117,-0.002770096,0.1511645,0.1620089,0.07620972,-0.03301569,-0.2977248,-0.01635844,0.1582282
75%,0.3852091,0.8637103,0.850112,0.6805316,0.8749889,0.7218497,0.7122325,0.7401876,0.5476944,0.4465353,0.9110178,0.7501329
max,13.30139,1.73034,1.697293,3.324556,1.729649,2.373738,2.218383,1.630844,3.451245,7.925149,1.718733,2.288579


In [14]:
# Columns that must not reach the model: identifiers, raw timestamps/dates and
# text columns that were not encoded. The one-hot source columns are listed as
# well; pd.get_dummies has already removed them from `df_encoded`, so they are
# simply skipped by errors="ignore" below.
drop_cols = [
    "transaction_id", "timestamp", "user_id", "merchant_id",
    "channel", "currency", "device", "payment_method", "sex", "education",
    "primary_source_of_income", "alerts_binned", "account_age_group",
    "signup_date", "country_x", "country_y", "category"
]

# Build the model frame straight from the encoded + scaled frame.
# Do NOT copy the columns of `df` back onto `df_encoded` here: `df` still holds
# the raw numeric values, so doing that would overwrite the standardized
# columns and silently undo the scaling step.
# The target column `is_fraud` is not in `drop_cols` and therefore stays in the
# output.
df_model = df_encoded.drop(columns=drop_cols, errors="ignore")

# Make sure the output directory exists so the write works on a fresh checkout.
DATA_PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
df_model.to_parquet(DATA_PROCESSED_DIR / "df_model_ready.parquet", index=False)

print(f"Final shape: {df_model.shape}")

Final shape: (500000, 38)
