In [1]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from narwhals.selectors import categorical

In [2]:
PROCESSED_DATA = Path("../data/processed/")
FIGURES_DATA = Path("../reports/figures/")
INTERIM_DATA = Path("../data/interim/")

In [3]:
df = pd.read_parquet(INTERIM_DATA / "merge_data.parquet")

In [4]:
df.columns

Index(['transaction_id', 'timestamp', 'user_id', 'merchant_id', 'amount',
       'channel', 'currency', 'device', 'location', 'payment_method',
       'is_international', 'session_length_seconds', 'is_first_time_merchant',
       'is_fraud', 'age', 'sex', 'education', 'primary_source_of_income',
       'sum_of_monthly_installments', 'sum_of_monthly_expenses',
       'country_user', 'signup_date', 'risk_score', 'category',
       'country_merchant', 'trust_score', 'number_of_alerts_last_6_months',
       'avg_transaction_amount', 'account_age_months', 'has_fraud_history'],
      dtype='object')

In [5]:
df.drop(columns=["currency", "user_id", "merchant_id", "location"], inplace=True)

In [6]:
df.columns

Index(['transaction_id', 'timestamp', 'amount', 'channel', 'device',
       'payment_method', 'is_international', 'session_length_seconds',
       'is_first_time_merchant', 'is_fraud', 'age', 'sex', 'education',
       'primary_source_of_income', 'sum_of_monthly_installments',
       'sum_of_monthly_expenses', 'country_user', 'signup_date', 'risk_score',
       'category', 'country_merchant', 'trust_score',
       'number_of_alerts_last_6_months', 'avg_transaction_amount',
       'account_age_months', 'has_fraud_history'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   transaction_id                  500000 non-null  object        
 1   timestamp                       500000 non-null  datetime64[ns]
 2   amount                          500000 non-null  float64       
 3   channel                         500000 non-null  object        
 4   device                          500000 non-null  object        
 5   payment_method                  500000 non-null  object        
 6   is_international                500000 non-null  int64         
 7   session_length_seconds          500000 non-null  int64         
 8   is_first_time_merchant          500000 non-null  int64         
 9   is_fraud                        500000 non-null  int64         
 10  age                             500000 non-null  int64  

In [9]:
df['education'] = df['education'].fillna('Missing')

In [10]:
df.drop(columns=["transaction_id"], inplace=True)

In [13]:
pd.get_dummies(df['education'], dtype=int)

Unnamed: 0,Bachelor,High School,Master,Missing,PhD
0,1,0,0,0,0
1,0,0,0,0,1
2,1,0,0,0,0
3,0,0,1,0,0
4,0,0,0,0,1
...,...,...,...,...,...
499995,1,0,0,0,0
499996,0,1,0,0,0
499997,1,0,0,0,0
499998,0,1,0,0,0


In [16]:
categorical_cols = ["channel", "device", "payment_method", "sex", "education", "primary_source_of_income", "category"]

In [28]:
df_encoded = pd.get_dummies(df, columns=categorical_cols, dtype=int, drop_first=True)

In [29]:
df_encoded.shape

(500000, 42)

In [31]:
df.columns

Index(['timestamp', 'amount', 'channel', 'device', 'payment_method',
       'is_international', 'session_length_seconds', 'is_first_time_merchant',
       'is_fraud', 'age', 'sex', 'education', 'primary_source_of_income',
       'sum_of_monthly_installments', 'sum_of_monthly_expenses',
       'country_user', 'signup_date', 'risk_score', 'category',
       'country_merchant', 'trust_score', 'number_of_alerts_last_6_months',
       'avg_transaction_amount', 'account_age_months', 'has_fraud_history'],
      dtype='object')

In [30]:
from sklearn.preprocessing import StandardScaler

In [32]:
numerical_cols = ["amount", "session_length_seconds", "age", "sum_of_monthly_installments", "risk_score", "trust_score",
				  "number_of_alerts_last_6_months", "avg_transaction_amount", "account_age_months"]

In [33]:
scaler = StandardScaler()

In [35]:
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

In [37]:
df_encoded.describe()

Unnamed: 0,timestamp,amount,is_international,session_length_seconds,is_first_time_merchant,is_fraud,age,sum_of_monthly_installments,sum_of_monthly_expenses,signup_date,...,primary_source_of_income_Retirement,primary_source_of_income_Savings,primary_source_of_income_Student Aid,primary_source_of_income_Unemployment,category_education,category_electronics,category_gaming,category_grocery,category_restaurants,category_travel
count,500000,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000,...,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,2022-12-31 16:45:04.497720576,8.754597000000001e-17,0.928662,1.131397e-16,0.501248,0.084822,-1.078746e-16,-3.996092e-17,1182.346649,2022-10-06 13:27:43.660799744,...,0.165126,0.164698,0.162216,0.173432,0.140724,0.153108,0.127132,0.156492,0.134314,0.136254
min,2022-01-01 00:06:00,-0.9985008,0.0,-1.730265,0.0,0.0,-1.691432,-0.9928511,0.04,2020-04-21 00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2022-07-01 11:03:45,-0.7110468,1.0,-0.8636345,0.0,0.0,-0.8442505,-0.7048258,353.66,2021-07-09 00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2022-12-31 20:08:30,-0.307171,1.0,-0.002919869,1.0,0.0,0.002930763,-0.3097876,825.19,2022-10-08 00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2023-07-02 08:45:15,0.3852091,1.0,0.8637103,1.0,0.0,0.850112,0.3843131,1650.44,2023-12-29 00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2023-12-31 23:57:00,13.30139,1.0,1.73034,1.0,1.0,1.697293,9.437243,10994.33,2025-03-22 00:00:00,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,,1.000001,0.257389,1.000001,0.499999,0.278617,1.000001,1.000001,1168.96796,,...,0.371295,0.370908,0.368649,0.378621,0.347737,0.360092,0.333121,0.363322,0.34099,0.343058


In [38]:
df_encoded.to_parquet(INTERIM_DATA / "encoded_data.parquet")

In [39]:
df_encoded.shape

(500000, 42)