## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### EDA

In [3]:
# Load the customers extract (path is relative to this notebook's directory)
df = pd.read_csv(filepath_or_buffer="../../ETL/data/customers.csv")

In [4]:
# 1. Count the missing entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

customer_id    0
first_name     0
last_name      0
email          0
phone          0
dob            0
gender         0
created_at     0
dtype: int64


In [5]:
# 2. Count rows that are exact repeats of an earlier row
duplicate_mask = df.duplicated()
print(duplicate_mask.sum())

0


In [6]:
# 3. Show the dtype pandas inferred for each column
column_dtypes = df.dtypes
print(column_dtypes)

customer_id    object
first_name     object
last_name      object
email          object
phone          object
dob            object
gender         object
created_at     object
dtype: object


In [7]:
# 4. Count the distinct values held by each column
distinct_counts = df.nunique()
print(distinct_counts)

customer_id    100000
first_name        690
last_name        1000
email          100000
phone          100000
dob             17467
gender              3
created_at         24
dtype: int64


In [8]:
# 5. Summarize every column with describe().
# All columns are still object dtype at this point, so the summary reports
# count / unique / top / freq rather than numeric statistics (mean, quartiles).
df.describe()

Unnamed: 0,customer_id,first_name,last_name,email,phone,dob,gender,created_at
count,100000,100000,100000,100000,100000,100000,100000,100000
unique,100000,690,1000,100000,100000,17467,3,24
top,e91b4955-3450-446e-a203-7b54e796259d,Michael,Smith,kimberly01@example.org,601.316.7106x969,2002-02-15,Female,2025-03-22 17:06:53
freq,1,2305,2117,1,1,16,33399,4446


In [9]:
# Convert 'dob' and 'created_at' from strings to datetime format.
# errors='coerce' replaces unparseable values with NaT instead of raising, and
# the missing-value check earlier in the notebook ran before this conversion,
# so report any value that was present but could not be parsed.
for date_col in ('dob', 'created_at'):
    parsed = pd.to_datetime(df[date_col], errors='coerce')
    # Count only NaT introduced by parsing, not values that were already missing
    n_unparsed = int((parsed.isna() & df[date_col].notna()).sum())
    if n_unparsed:
        print(f"Warning: {n_unparsed} value(s) in '{date_col}' could not be parsed and are now NaT")
    df[date_col] = parsed

# Verify changes
print(df.dtypes)


customer_id            object
first_name             object
last_name              object
email                  object
phone                  object
dob            datetime64[ns]
gender                 object
created_at     datetime64[ns]
dtype: object


### Handle Outliers (IQR Method)

In [11]:
# Define function to remove outliers based on IQR
def remove_outliers(df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the column used to compute the fences and filter rows.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default 1.5 reproduces the
        conventional Tukey fences; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` within the fences, with their original index labels.

    Raises
    ------
    ValueError
        If ``factor`` is negative.

    Notes
    -----
    Rows where ``col`` is missing are dropped as well, because a missing value
    never satisfies the range comparison.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1, q3 = df[col].quantile([0.25, 0.75])
    spread = q3 - q1
    lower_bound = q1 - factor * spread
    upper_bound = q3 + factor * spread

    # between() is inclusive on both ends by default, matching >= / <=
    return df[df[col].between(lower_bound, upper_bound)]

# Names of every numeric column. The dtypes printed after the datetime
# conversion show only object and datetime64 columns, so this index is
# currently empty and the loop below leaves df untouched.
num_cols = df.select_dtypes(include="number").columns

# Trim outliers one column at a time, feeding each filtered frame into the next pass
for col in num_cols:
    df = remove_outliers(df, col)


In [12]:
# Re-derive the index of numeric columns and display it
num_cols = df.select_dtypes(include="number").columns
print("Numerical columns:", num_cols)


Numerical columns: Index([], dtype='object')


In [None]:
###