In [None]:
import pandas as pd
import numpy as np

# --- 1. Data Setup ---
### Create the initial DataFrame based on the provided table

In [None]:
# Column-oriented sample data: one list per column, with np.nan marking each
# missing entry. The last record (ID 1006) repeats the first record's customer
# fields under a different Transaction_ID.
data = dict(
    Transaction_ID=[1001, 1002, 1003, 1004, 1005, 1006],
    Customer_Name=['Ahmed Ali', 'Sara Omar', 'Ali Saleh', 'Nada Hassan', 'Omar Khalid', 'Ahmed Ali'],
    Age=[28, np.nan, 35, 42, np.nan, 28],
    Email=['ahmed@mail.com', 'sara@mail.com', np.nan, 'nada@mail.com', 'omar@mail.com', 'ahmed@mail.com'],
    Join_Date=['2025-01-10', '2025-02-15', '2025-03-20', np.nan, '2025-05-05', '2025-01-10'],
    Total_Purchase=[250, 300, 150, 400, np.nan, 250],
)
df = pd.DataFrame(data=data)
df  # last expression of the cell -> rendered as its output

# --- 2. Initial Exploration & Checks ---

## --- Data Exploration ---

### Get data types, number of rows, and columns

In [None]:
# Print a concise summary of df: index range, column names, non-null count per
# column, dtypes and memory usage.
df.info()

In [None]:
# (number of rows, number of columns) of df, shown as the cell output.
df.shape

### Check the number of null values in each column

In [None]:
# Count the missing entries in every column (isna flags NaN/None cell by cell,
# and summing down the rows gives one total per column).
df.isna().sum(axis=0)

### Identify duplicate rows before deleting them

In [None]:
# Preview the rows that drop_duplicates() would remove: every row that is an
# exact repeat (across all columns) of an earlier row.
# With the sample data defined in this notebook the result is empty, because
# Transaction_ID differs on every row.
df.loc[df.duplicated(keep='first')]

# --- 3. Filtering and Analysis ---

### Identify rows where Age is greater than 30

In [None]:
# Rows whose Age is strictly greater than 30; a missing Age compares as False
# and is therefore excluded.
df.loc[df['Age'].gt(30)]

### Identify rows with more than one null value

In [None]:
# Count the missing cells across the columns of each row and keep the rows
# that have more than one.
df.loc[df.isna().sum(axis='columns').gt(1)]

# --- 4. Data Cleaning ---

### Remove duplicate rows

In [None]:
# Rebind df to the de-duplicated frame instead of mutating it with
# inplace=True, so the cell reads as a plain transformation of its input.
# NOTE(review): drop_duplicates() compares every column, and Transaction_ID is
# different on every row of the sample data, so no row is removed here. If the
# repeated 'Ahmed Ali' record (ID 1006) is meant to be dropped, compare on the
# customer columns only, e.g.
#   df.drop_duplicates(subset=['Customer_Name', 'Age', 'Email', 'Join_Date', 'Total_Purchase'])
# -- confirm the intent before changing it, since later results depend on it.
df = df.drop_duplicates()
df

### Convert the Join_Date column to datetime

In [None]:
# Parse the Join_Date strings into datetime64 values; the missing entry
# becomes NaT.
df['Join_Date'] = df['Join_Date'].pipe(pd.to_datetime)
# Show the per-column dtypes so the conversion can be verified.
df.dtypes

### Replace null values in the Age column with the mean

In [None]:
# Fill missing ages with the mean of the observed ages (Series.mean skips NaN
# by default).
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Age']: that chained in-place form acts on an
# intermediate object, triggers a FutureWarning in pandas 2.x, and leaves df
# unchanged once Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].mean())

### Replace null values in the Total_Purchase column with 0

In [None]:
# Treat a missing Total_Purchase as "no purchase" by filling it with 0.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Total_Purchase']: that chained in-place form
# acts on an intermediate object, triggers a FutureWarning in pandas 2.x, and
# leaves df unchanged once Copy-on-Write is active.
df['Total_Purchase'] = df['Total_Purchase'].fillna(0)

## Let's see the cleaned DataFrame
### Note: The NaN values in Email and Join_Date still exist

In [None]:
# Display the current state of df. No cell above fills Email or Join_Date, so
# their missing values are still present at this point.
df

### Drop all remaining rows that contain any null value (the result is stored in `df_no_missing`; `df` itself is unchanged)

In [None]:
# Keep only the rows that have no missing value in any column. dropna returns
# a new frame, so df itself is left untouched.
df_no_missing = df.dropna(axis=0, how='any')
# Number of fully populated rows, shown as the cell output.
df_no_missing.shape[0]

### Data after cleaning

In [None]:
# Display df once more.
# NOTE(review): df still includes the rows with a missing Email / Join_Date;
# the rows without any nulls are held in df_no_missing -- confirm which frame
# is meant to be shown as the final result.
df