# 02_data_cleaning.ipynb

## **Objective:**
Perform data cleaning and preprocessing to ensure data quality for analysis.

---

## **1️⃣ Import Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import logging
import os

## **2️⃣ Set Up Logging**

In [None]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## **3️⃣ Define File Paths**
- Defines the path of the raw input CSV and the path where the cleaned CSV will be written.

In [None]:
RAW_DATA_PATH = "../data/customer_data.csv"
CLEANED_DATA_PATH = "../data/customer_data_clean.csv"

## **4️⃣ Load Raw Data**

In [None]:
def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` when the path does not exist or
        the file cannot be read. An error is logged in both failure cases.
    """
    if os.path.exists(file_path):
        try:
            frame = pd.read_csv(file_path)
            logging.info("Data successfully loaded.")
        except Exception as err:
            # Any read/parse failure is reported and turned into None.
            logging.error(f"Error loading data: {err}")
            frame = None
        return frame

    logging.error(f"File not found: {file_path}")
    return None

## **5️⃣ Handle Missing Values**
- Removes rows with missing values.

In [None]:
def clean_missing_values(df):
    """Drop every row that contains at least one missing value.

    The DataFrame is modified in place and also returned, so callers may
    use either the original reference or the return value.

    Args:
        df: DataFrame to clean, or ``None``.

    Returns:
        The same DataFrame with incomplete rows removed, or ``None`` when
        ``df`` is ``None`` or dropping fails (an error is logged).
    """
    if df is None:
        logging.error("No data to clean.")
        return None

    try:
        # Capture the size *before* dropping; reading len(df) afterwards
        # would report the post-drop count for both numbers.
        rows_before = len(df)
        df.dropna(inplace=True)
        logging.info(f"Missing values removed. Rows before: {rows_before}, After: {len(df)}")
        return df
    except Exception as e:
        logging.error(f"Error cleaning missing values: {e}")
        return None

## **6️⃣ Convert Data Types**
- Ensures `purchase_amount` is numeric; values that cannot be parsed as numbers become `NaN`.

In [None]:
def convert_data_types(df):
    """Coerce the ``purchase_amount`` column to a numeric dtype.

    Values that cannot be parsed as numbers are replaced with NaN
    (``errors='coerce'``). The column is overwritten on the DataFrame
    that was passed in.

    Args:
        df: DataFrame holding a ``purchase_amount`` column, or ``None``.

    Returns:
        The same DataFrame with the converted column, or ``None`` when
        ``df`` is ``None`` or the conversion fails (an error is logged).
    """
    if df is None:
        logging.error("No data to convert.")
        return None

    try:
        numeric_amounts = pd.to_numeric(df['purchase_amount'], errors='coerce')
        df['purchase_amount'] = numeric_amounts
        logging.info("Data types converted successfully.")
    except Exception as err:
        # Covers, for example, a DataFrame without the expected column.
        logging.error(f"Error converting data types: {err}")
        return None

    return df

## **7️⃣ Feature Engineering**
- Adds a log-transformed column for `purchase_amount`.

In [None]:
def feature_engineering(df):
    """Add ``purchase_amount_log``, the natural log of ``purchase_amount + 1``.

    The new column is written onto the DataFrame that was passed in.

    Args:
        df: DataFrame holding a numeric ``purchase_amount`` column, or
            ``None``.

    Returns:
        The same DataFrame with the extra column, or ``None`` when ``df``
        is ``None`` or the transformation fails (an error is logged), in
        line with the other cleaning steps in this file.
    """
    if df is None:
        logging.error("No data for feature engineering.")
        return None

    try:
        # log1p computes log(1 + x) on the whole column at once and stays
        # accurate for values very close to zero.
        df['purchase_amount_log'] = np.log1p(df['purchase_amount'])
        logging.info("Feature engineering completed successfully.")
        return df
    except Exception as e:
        logging.error(f"Error during feature engineering: {e}")
        return None

## **8️⃣ Sort & Save Cleaned Data**
- Sorts by `purchase_amount` and saves the cleaned dataset.

In [None]:
def save_cleaned_data(df, file_path):
    """Write the dataset to CSV, sorted by ``purchase_amount`` descending.

    The caller's DataFrame is left in its original order; only the copy
    that is written out is sorted. The index is not written to the file.

    Args:
        df: DataFrame holding a ``purchase_amount`` column, or ``None``.
        file_path: Destination path of the CSV file.

    Returns:
        None. When ``df`` is ``None`` an error is logged and nothing is
        written. Errors raised while sorting or writing propagate to the
        caller.
    """
    if df is None:
        # An earlier cleaning step can hand over None after a failure.
        logging.error("No data to save.")
        return

    ordered = df.sort_values('purchase_amount', ascending=False)
    ordered.to_csv(file_path, index=False)
    logging.info("Cleaned data saved successfully.")


## **9️⃣ Execute Data Cleaning Process**

In [None]:
# Run the cleaning pipeline: load -> drop missing -> convert -> engineer -> save.
# load_data, clean_missing_values and convert_data_types return None after
# logging an error, so stop at the first None instead of passing it on to
# the next stage.
# NOTE(review): convert_data_types coerces unparseable purchase_amount values
# to NaN *after* rows with missing values were dropped, so such rows reach the
# saved file with NaN - confirm whether that is intended.
df = load_data(RAW_DATA_PATH)
for step in (clean_missing_values, convert_data_types, feature_engineering):
    if df is None:
        break
    df = step(df)

if df is not None:
    save_cleaned_data(df, CLEANED_DATA_PATH)
else:
    logging.error("Data cleaning pipeline aborted; no cleaned file was written.")

## **Summary & Next Steps**
✅ Data has been cleaned and saved.
✅ Next, move to `03_eda_analysis.ipynb` for exploratory data analysis.