## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### EDA

In [2]:
# Load the fraud-incident records from the CSV under ETL/data
# (the path is relative, so it resolves against the notebook's working directory).
incidents_csv_path = "../../ETL/data/fraud_incidents.csv"
df = pd.read_csv(incidents_csv_path)

In [3]:
# Preview the first 15 rows of the frame (positional slice, same rows as head(15)).
df.iloc[:15]

Unnamed: 0,incident_id,customer_id,incident_type,incident_date,status
0,e1e7e879-633f-4a41-b772-6bedfaa7dbf2,74d9f031-8a84-4f87-a6c5-2ea1b6cfb9fc,Phishing,2025-01-29 16:34:53,Investigating
1,bfae4b1a-0377-4ff6-b21b-328d6e4ab0f0,338a469c-ccbc-4d7c-b135-ebf72818aca4,Phishing,2025-02-11 22:31:38,Resolved
2,deb8d7e9-2156-4e24-85c0-b42793ac8907,b315fa7e-d189-4487-b5dd-e4b76fea4812,Unauthorized Transaction,2025-01-16 19:32:44,Resolved
3,d5b70a52-3fad-4767-b505-f72ee10d545c,d7f96e0c-f4ea-4e84-9de0-54eeb2b2bda8,Phishing,2025-02-07 13:47:32,Resolved
4,b0ab2c00-b44a-47eb-a5dd-cc5495430e70,96a88cb3-d3e3-4de8-9beb-3a2fcefc37d6,Unauthorized Transaction,2025-02-27 07:18:02,Pending
5,fef00ac4-cc09-4ff7-92e5-63d18a9ddff1,250d707b-cdeb-4cad-a850-54fe5801e9a9,Phishing,2025-03-21 15:19:48,Resolved
6,fef766c1-7bce-443a-b795-6202557787b5,1fe641c4-7c3c-4ec0-9c6f-ff5d5306ab82,Unauthorized Transaction,2025-01-15 06:59:07,Pending
7,7bf71c51-6605-47fc-aa04-a2fb3f66360e,03422825-1990-4b8f-b524-72d34282abd0,Unauthorized Transaction,2025-01-17 02:55:32,Pending
8,df3f1073-95f4-42ec-a35c-a01365f3fea3,fa241bbc-bbd1-430b-9fdf-1121d957e808,Phishing,2025-01-16 12:28:51,Pending
9,f198373c-3958-45fe-838e-043b714d11bc,f98c1bc8-9518-4a7b-94be-b3784cdac82f,Phishing,2025-02-24 09:02:05,Investigating


In [4]:
# 1. Missing values: count the null entries in each column
null_counts = df.isna().sum()
print(null_counts)

incident_id      0
customer_id      0
incident_type    0
incident_date    0
status           0
dtype: int64


In [5]:
# 2. Duplicates: number of rows that exactly repeat an earlier row (all columns compared)
duplicate_rows = df.duplicated()
print(duplicate_rows.sum())

0


In [6]:
# 3. Data types: show the dtype pandas assigned to each column
column_types = df.dtypes
print(column_types)

incident_id      object
customer_id      object
incident_type    object
incident_date    object
status           object
dtype: object


In [7]:
# 4. Cardinality: number of distinct values in each column
distinct_counts = df.nunique()
print(distinct_counts)

incident_id      40000
customer_id      32965
incident_type        3
incident_date    39885
status               3
dtype: int64


In [8]:
# 5. Summary statistics for every column.
# No column is numeric at this point (the dtype check above shows all of them as object),
# so this table reports count / unique / top / freq rather than a numerical distribution.
# include="all" requests every column explicitly instead of relying on describe()'s
# implicit fallback when the frame has no numeric columns.
df.describe(include="all")

Unnamed: 0,incident_id,customer_id,incident_type,incident_date,status
count,40000,40000,40000,40000,40000
unique,40000,32965,3,39885,3
top,e1e7e879-633f-4a41-b772-6bedfaa7dbf2,da00d149-7902-4b5e-b12e-229e918b5345,Unauthorized Transaction,2025-01-01 11:22:34,Investigating
freq,1,6,13452,2,13446


In [12]:
# Convert incident_date from text to datetime64. errors='coerce' turns any value that
# cannot be parsed into NaT instead of raising, so parse into a temporary Series first
# and count how many previously non-null values were lost in the conversion.
raw_dates = df['incident_date']
parsed_dates = pd.to_datetime(raw_dates, errors='coerce')
coerced_count = int((parsed_dates.isna() & raw_dates.notna()).sum())

df['incident_date'] = parsed_dates

# Verify changes
print(df.dtypes)
# The earlier missing-value check ran before this conversion, so report coercions here.
print("Unparseable incident_date values coerced to NaT:", coerced_count)

incident_id              object
customer_id              object
incident_type            object
incident_date    datetime64[ns]
status                   object
dtype: object


In [14]:
# Collect the names of all columns with a numeric dtype
numeric_only = df.select_dtypes(include=[np.number])
num_cols = numeric_only.columns
print("Numerical columns:", num_cols)


Numerical columns: Index([], dtype='object')


In [None]:
### The fraud incidents dataset has no numerical columns, so outlier detection does not apply and this step is skipped.