## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### EDA

In [2]:
# Read the accounts CSV into a DataFrame. The path is relative, so it
# resolves against the notebook's current working directory.
accounts_csv_path = "../../ETL/data/accounts.csv"
df = pd.read_csv(accounts_csv_path)

In [3]:
# Preview the first 15 rows to sanity-check the columns and values.
df.head(n=15)

Unnamed: 0,account_id,customer_id,account_type,account_balance,currency,created_at
0,91c4c77d-994c-47af-a4f9-818358468803,e91b4955-3450-446e-a203-7b54e796259d,Savings,83009.85,USD,2025-03-22 17:07:12
1,d91f757c-6208-4333-a42d-6da7f74adf68,b46d5435-4f11-4fec-a8e8-569e6f608cae,Checking,95387.39,USD,2025-03-22 17:07:12
2,32da0ff3-74b0-425d-a870-f4312053a810,549b98fd-7562-4b8e-81dc-bc36b8a4bb8e,Business,48961.79,USD,2025-03-22 17:07:12
3,c571976d-4b02-4001-b1e6-6481c8d00345,ff34f1c1-5728-4395-bbad-cf7f19aedae3,Business,3611.35,USD,2025-03-22 17:07:12
4,27def753-9b09-4cc4-a5fa-b06f2e19da11,50177ad2-e8e6-42af-8541-f96079453d92,Business,92351.62,USD,2025-03-22 17:07:12
5,36e1e9f3-a79c-441d-afa6-c92b7ce2593d,ded811e2-cd52-42c5-ba89-6c4636f4578c,Savings,85434.47,USD,2025-03-22 17:07:12
6,e8594fed-8712-4fb9-b990-598962f66584,a64e3d75-86e2-4e43-a287-4a00b669e4f4,Checking,70568.1,USD,2025-03-22 17:07:12
7,4bf31b3a-3753-49bd-b6ce-a81d8e5a59d3,8c3b0d76-9e73-477a-9443-fd0d17221698,Savings,30022.16,USD,2025-03-22 17:07:12
8,7c0967cf-453f-4e05-82be-928dbfe534a2,bd41de0e-8887-400a-9aba-65fcf471e291,Business,99031.29,USD,2025-03-22 17:07:12
9,3f01d11c-6a4b-404e-b323-c3c5764de45f,2523fe1c-5d50-445c-87d4-bae294f8f773,Checking,83989.63,USD,2025-03-22 17:07:12


In [4]:
# 1. Missing values: count the null entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

account_id         0
customer_id        0
account_type       0
account_balance    0
currency           0
created_at         0
dtype: int64


In [5]:
# 2. Duplicates: count rows that repeat an earlier row across all columns
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0


In [6]:
# 3. Data types: show the dtype pandas assigned to each column on load
column_dtypes = df.dtypes
print(column_dtypes)

account_id          object
customer_id         object
account_type        object
account_balance    float64
currency            object
created_at          object
dtype: object


In [7]:
# 4. Cardinality: number of distinct (non-null) values in each column
distinct_per_column = df.nunique()
print(distinct_per_column)

account_id         100000
customer_id        100000
account_type            3
account_balance     99484
currency                1
created_at              2
dtype: int64


In [8]:
# Distinct account IDs. This should match the row count (100,000);
# a smaller number would mean some account_id values are repeated.
distinct_account_ids = df["account_id"].nunique()
print(distinct_account_ids)


100000


In [9]:
# 5. Distribution: summary statistics (count, mean, std, min, quartiles, max)
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,account_balance
count,100000.0
mean,50228.233847
std,28856.660775
min,100.0
25%,25290.2575
50%,50474.11
75%,75193.415
max,99999.93


In [10]:
# Parse created_at from text into datetime64. errors='coerce' replaces any
# unparseable entry with NaT instead of raising, so compare the null count
# before and after to surface values that were silently lost.
nulls_before = df['created_at'].isna().sum()
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
coerced_to_nat = df['created_at'].isna().sum() - nulls_before
print(f"created_at values coerced to NaT: {coerced_to_nat}")

# Verify changes
print(df.dtypes)

account_id                 object
customer_id                object
account_type               object
account_balance           float64
currency                   object
created_at         datetime64[ns]
dtype: object


### Handle Outliers (IQR Method)

In [11]:
# IQR (Tukey) fences: values more than 1.5 * IQR below Q1 or above Q3
# are treated as outliers.
Q1 = df['account_balance'].quantile(0.25)
Q3 = df['account_balance'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(lower_bound, upper_bound)

# Test the data against the fences so the outlier conclusion comes from the
# data itself rather than from comparing min/max by eye. Explicit < / >
# comparisons are used so that missing balances are not counted as outliers.
outlier_mask = (df['account_balance'] < lower_bound) | (df['account_balance'] > upper_bound)
print("Outliers outside IQR bounds:", outlier_mask.sum())


-49564.47875000002 150048.15125000002


### Your account_balance column has no outliers based on the IQR method. The data is well-distributed.
Explanation: 
Negative lower bound: The lower bound (≈ -49,564.48) is negative, which is not a meaningful balance unless overdrafts are allowed. It is negative only because 1.5 × IQR is larger than Q1.
The minimum balance in the data is 100.00, which lies well above this bound.

No outliers on the lower side.

Upper bound: The maximum value in the data is 99,999.93, which is well below the upper bound (≈ 150,048.15).

No outliers on the upper side.

In [12]:
# Identify the numeric columns (any integer or float dtype)
numeric_frame = df.select_dtypes(include="number")
num_cols = numeric_frame.columns
print("Numerical columns:", num_cols)


Numerical columns: Index(['account_balance'], dtype='object')


In [None]:
####