## Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### EDA

In [4]:
# Relative path of the accounts CSV that is read below.
ACCOUNTS_CSV_PATH = "../../ETL/data/accounts.csv"

# Load the accounts table into the working DataFrame used by every later cell.
df = pd.read_csv(ACCOUNTS_CSV_PATH)

In [14]:
# Preview the first 15 rows (positional slice; same rows as head(15)).
df.iloc[:15]

Unnamed: 0,account_id,customer_id,account_type,account_balance,currency,created_at
0,e84f0a27-3af5-4c75-83b4-ed8ac03d7818,0600cf21-8e9d-4b88-9c6f-7b4cbe1987d8,Checking,4167.68,USD,2025-03-21 15:03:26
1,edc47a1c-f967-48dc-80e6-6a53bbcc8b8b,36b44e1b-a808-45b3-996b-da1a662656d1,Checking,27912.49,USD,2025-03-21 15:03:26
2,d547d6e9-b2db-4e6e-9a95-eeff8ba48f2d,1c7a7887-f014-413a-92aa-ff953f48a80b,Checking,12375.68,USD,2025-03-21 15:03:26
3,3ac5ba04-0f8a-44c1-b071-e83ffad75974,8edf0514-7e93-40bb-920b-802a41b982eb,Savings,20239.97,USD,2025-03-21 15:03:26
4,a8412299-a9ab-4b51-a71b-ee8be4199136,01e14eee-6af8-40f1-8961-9f534ee3de3f,Business,37535.04,USD,2025-03-21 15:03:26
5,a4efcb4c-366d-4ae2-8a6b-28da35c47d50,175a3f9a-d794-4f79-81d9-0ad69dd1549e,Business,69036.53,USD,2025-03-21 15:03:26
6,5a6dd039-ec17-460b-92b1-de71274c95f1,8917d67e-09a8-4c4b-8784-895f092c1abe,Business,64065.29,USD,2025-03-21 15:03:26
7,48f9dbc7-34f6-4d95-944c-901b2dc9c708,f9d2084b-ada8-4337-af53-a9187798501c,Checking,41426.49,USD,2025-03-21 15:03:26
8,83feb658-64b1-41b1-9b35-86a4e3ba1f26,e886556f-b861-4901-8fa5-4194d3739560,Savings,48361.62,USD,2025-03-21 15:03:26
9,3fa52e9c-6987-4687-b1a6-88eda6a94f7b,c20829ec-0c1b-4f24-88c0-e119ca8da00b,Business,64205.61,USD,2025-03-21 15:03:26


In [5]:
# 1. Missing values: count the null entries in every column
null_counts = df.isna().sum()
print(null_counts)

account_id         0
customer_id        0
account_type       0
account_balance    0
currency           0
created_at         0
dtype: int64


In [6]:
# 2. Duplicates: number of rows that exactly repeat an earlier row
n_duplicate_rows = df.duplicated().sum()
print(n_duplicate_rows)

0


In [7]:
# 3. Data types: show the dtype pandas assigned to each column
column_dtypes = df.dtypes
print(column_dtypes)

account_id          object
customer_id         object
account_type        object
account_balance    float64
currency            object
created_at          object
dtype: object


In [8]:
# 4. Cardinality: number of distinct values in each column
distinct_counts = df.nunique()
print(distinct_counts)

account_id         100000
customer_id        100000
account_type            3
account_balance     99511
currency                1
created_at              2
dtype: int64


In [None]:
# account_id is expected to be unique per row: the distinct count printed here
# should be 100,000; anything lower means some ids are duplicated.
account_ids = df['account_id']
print(account_ids.nunique())


100000


In [9]:
# 5. Distribution: summary statistics (count, mean, std, min, quartiles, max)
# for the numerical columns
summary_stats = df.describe()
summary_stats

Unnamed: 0,account_balance
count,100000.0
mean,50041.675897
std,28838.76373
min,100.02
25%,24999.34
50%,50152.24
75%,75060.1975
max,99999.42


In [16]:
# Parse created_at from strings into datetime64. errors='coerce' replaces any
# value that cannot be parsed with NaT instead of raising, so compare the null
# count before and after to make sure nothing was silently blanked out.
nulls_before = df['created_at'].isna().sum()
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
coerced_to_nat = df['created_at'].isna().sum() - nulls_before
if coerced_to_nat > 0:
    print(f"WARNING: {coerced_to_nat} created_at value(s) could not be parsed and are now NaT")

# Verify changes
print(df.dtypes)

account_id                 object
customer_id                object
account_type               object
account_balance           float64
currency                   object
created_at         datetime64[ns]
dtype: object


### Handle Outliers (IQR Method)

In [13]:
# Tukey's IQR rule: values more than 1.5 * IQR below Q1 or above Q3 are
# treated as outliers.
Q1 = df['account_balance'].quantile(0.25)
Q3 = df['account_balance'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(lower_bound, upper_bound)

# Apply the bounds to the data so the outlier count is measured rather than
# inferred by eye from the min/max in describe().
outlier_mask = (df['account_balance'] < lower_bound) | (df['account_balance'] > upper_bound)
print("Outliers outside IQR bounds:", int(outlier_mask.sum()))


-50091.94625000001 150151.48375


### Your account_balance column has no outliers based on the IQR method. The data is well-distributed.
Explanation: 
Negative lower bound (-50091.95): A negative balance would only be meaningful if accounts were allowed to be overdrawn.
The bound is negative simply because 1.5 × IQR is larger than Q1; by itself it says nothing about the data. What matters is that the minimum balance in the data (100.02) lies above the lower bound.

No outliers on the lower side.

Upper bound (150151.48): The maximum value in the data is 99,999.42, which is well below the upper bound.

No outliers on the upper side.

In [12]:
# Collect the labels of every numeric-dtype column
num_cols = df.select_dtypes(include="number").columns
print("Numerical columns:", num_cols)


Numerical columns: Index(['account_balance'], dtype='object')


In [None]:
####