## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### EDA

In [2]:
df = pd.read_csv("../../ETL/data/fraud_incidents.csv")

In [3]:
df.head(15)

Unnamed: 0,4086aa77-4090-4776-8cb4-2e77ffdf591f,a315527c-cf1a-4336-b954-1a752fd98482,Unauthorized Transaction,2025-03-10 06:55:10,Pending
0,867c5583-aa80-4318-93fb-302fc35c5cbb,7b3cd7d7-3bfe-4fe4-b120-58eba10e4d69,Phishing,2025-02-06 15:36:33,Pending
1,85561331-fb96-446f-bf79-01cae851103a,1b11ef20-54e3-4e02-ac10-c9dca2a13ade,Phishing,2025-01-27 07:01:10,Investigating
2,dcbe13d5-7757-4532-8709-f6347899f6fd,a9d06942-a250-4d90-89d7-2442bf0f0798,Account Breach,2025-03-07 06:11:22,Pending
3,05969593-d0e0-438f-a533-be40ba4fcfbb,a880d2c2-ddb3-40c8-9ac0-026c18827aff,Account Breach,2025-02-11 11:22:49,Pending
4,759c3d04-a30e-4f56-86db-3899ed74ada4,8a6b3a1e-7723-43b3-a3ab-ee596853d5b3,Unauthorized Transaction,2025-03-19 23:20:39,Investigating
5,6b122437-c758-44e8-80c4-07abd69623ad,a62e25b6-f970-4555-bf2f-8826e913e58e,Unauthorized Transaction,2025-01-08 15:53:03,Pending
6,aab16acc-e9b9-42ee-85f1-8d287a96620e,666b8683-e592-43ad-9c2c-42ca99bfc9be,Unauthorized Transaction,2025-03-21 00:33:59,Investigating
7,f222d9e4-ea99-4401-a4ba-dd9081c8e460,15d1d2ec-1de1-43aa-9cb0-db4a4468f05f,Account Breach,2025-01-18 23:31:10,Investigating
8,98eee8f4-a372-41b4-ae1b-c77b770b304d,cad74c47-5a10-4e36-811e-2ab581aa0217,Account Breach,2025-03-05 23:37:09,Pending
9,66d29759-6448-4eee-8bfb-f30a567d12d2,0aaba017-7275-46ff-b70b-fdf0492196b6,Account Breach,2025-03-13 04:35:53,Pending


In [4]:
# 1. Check for missing values
print(df.isnull().sum())

4086aa77-4090-4776-8cb4-2e77ffdf591f    0
a315527c-cf1a-4336-b954-1a752fd98482    0
Unauthorized Transaction                0
2025-03-10 06:55:10                     0
Pending                                 0
dtype: int64


In [5]:
# 2. Check for duplicates
print(df.duplicated().sum())

0


In [6]:
# 3. Check for data types
print(df.dtypes)

4086aa77-4090-4776-8cb4-2e77ffdf591f    object
a315527c-cf1a-4336-b954-1a752fd98482    object
Unauthorized Transaction                object
2025-03-10 06:55:10                     object
Pending                                 object
dtype: object


In [7]:
# 4. Check for unique values
print(df.nunique())

4086aa77-4090-4776-8cb4-2e77ffdf591f    39999
a315527c-cf1a-4336-b954-1a752fd98482    32898
Unauthorized Transaction                    3
2025-03-10 06:55:10                     39890
Pending                                     3
dtype: int64


In [9]:
# 5. Check for the distribution of numerical columns
df.describe()

Unnamed: 0,4086aa77-4090-4776-8cb4-2e77ffdf591f,a315527c-cf1a-4336-b954-1a752fd98482,Unauthorized Transaction,2025-03-10 06:55:10,Pending
count,39999,39999,39999,39999,39999
unique,39999,32898,3,39890,3
top,867c5583-aa80-4318-93fb-302fc35c5cbb,c8e7babb-5a65-4b3a-85ac-8660da605151,Account Breach,2025-03-17 02:01:33,Pending
freq,1,5,13395,2,13370


In [16]:
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

# Verify changes
print(df.dtypes)

account_id                 object
customer_id                object
account_type               object
account_balance           float64
currency                   object
created_at         datetime64[ns]
dtype: object


### Handle Outliers (IQR Method)

In [13]:
Q1 = df['account_balance'].quantile(0.25)
Q3 = df['account_balance'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(lower_bound, upper_bound)


-50091.94625000001 150151.48375


### Your account_balance column has no outliers based on the IQR method. The data is well-distributed.
Explanation: 
Negative Lower Bound (-50091.95): A negative balance makes no sense for a typical account balance if it's not supposed to allow overdrafts.
However, it being negative indicates that there are no negative values in your data — all positive balances fall above the lower bound.

No outliers on the lower side.

Upper Bound (150151.48):The maximum value in your data is 99,999.42, which is well below the upper bound.

No outliers on the upper side.

In [12]:
# Check numerical columns
num_cols = df.select_dtypes(include=[np.number]).columns
print("Numerical columns:", num_cols)


Numerical columns: Index(['account_balance'], dtype='object')


In [None]:
###