# 02_data_cleaning.ipynb

## **Objective:**
Perform data cleaning and preprocessing to ensure data quality for analysis.

---

## **1️⃣ Import Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np
import logging

# Set up logging: emit INFO and above, each record formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## **2️⃣ Load Raw Data**
- Reads the raw dataset from the CSV file.

In [2]:
# Define file paths.
# Both are relative paths, so they resolve against the current working directory.
# RAW_DATA_PATH is the input CSV passed to load_data();
# CLEANED_DATA_PATH is the output CSV passed to save_cleaned_data().
RAW_DATA_PATH = "data/customer_data.csv"
CLEANED_DATA_PATH = "data/customer_data_clean.csv"

def load_data(file_path):
    """Read a CSV file into a DataFrame, logging the outcome.

    Args:
        file_path: Location of the CSV, handed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` if reading raised any exception
        (the error is logged rather than propagated).
    """
    frame = None  # stays None when the read fails
    try:
        frame = pd.read_csv(file_path)
        logging.info("Data successfully loaded.")
    except Exception as exc:
        logging.error(f"Error loading data: {exc}")
    return frame

## **3️⃣ Handle Missing Values**
- Removes every row that has a missing value in any column.

In [3]:
def clean_missing_values(df):
    """Return a copy of ``df`` without any row that contains a missing value.

    The input DataFrame is left untouched; the row counts before and after
    the drop are logged at INFO level.
    """
    rows_before = len(df)
    df_cleaned = df.dropna()
    rows_after = len(df_cleaned)
    logging.info(f"Missing values removed. Rows before: {rows_before}, After: {rows_after}")
    return df_cleaned

## **4️⃣ Convert Data Types**
- Ensures `purchase_amount` is numeric; values that cannot be parsed as numbers become `NaN`.

In [4]:
def convert_data_types(df):
    """Convert the ``purchase_amount`` column to a numeric dtype.

    Values that cannot be parsed as numbers are replaced with NaN
    (``errors='coerce'``). Because that replacement is otherwise silent,
    the number of values lost this way is logged as a warning.

    Args:
        df: DataFrame containing a ``purchase_amount`` column. It is
            modified in place.

    Returns:
        The same DataFrame, with ``purchase_amount`` converted.
    """
    original = df['purchase_amount']
    converted = pd.to_numeric(original, errors='coerce')

    # Count only values that were present before and are NaN now; values that
    # were already missing are not conversion failures.
    n_coerced = int((converted.isna() & original.notna()).sum())
    if n_coerced:
        logging.warning(f"purchase_amount: {n_coerced} non-numeric value(s) coerced to NaN.")

    df['purchase_amount'] = converted
    return df


## **5️⃣ Feature Engineering**
- Adds a `purchase_amount_log` column holding the natural log of `1 + purchase_amount`.

In [5]:
def feature_engineering(df):
    """Add a log-transformed ``purchase_amount_log`` column.

    The new column is ``log(1 + purchase_amount)``, computed with the
    vectorized ``np.log1p``. Unlike ``np.log(x + 1)``, this stays accurate for
    amounts very close to zero, where ``x + 1`` rounds to exactly 1.

    Args:
        df: DataFrame containing a ``purchase_amount`` column of numeric
            values. It is modified in place.

    Returns:
        The same DataFrame, with the ``purchase_amount_log`` column added.
    """
    # to_numeric lets an object-dtype column of plain Python numbers go through
    # the ufunc; a column that is already numeric passes through unchanged.
    amounts = pd.to_numeric(df['purchase_amount'])
    # NaN amounts stay NaN; amounts <= -1 are outside the log domain and
    # yield -inf / NaN.
    df['purchase_amount_log'] = np.log1p(amounts)
    return df

## **6️⃣ Sort & Save Cleaned Data**
- Sorts by `purchase_amount` in descending order and saves the cleaned dataset as a CSV without the index column.

In [6]:
def save_cleaned_data(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, largest ``purchase_amount`` first.

    The rows are sorted in descending ``purchase_amount`` order on a sorted
    copy (the caller's DataFrame is not reordered), and the index is left out
    of the file. Returns nothing.
    """
    ordered = df.sort_values(by='purchase_amount', ascending=False)
    ordered.to_csv(file_path, index=False)
    logging.info("Cleaned data saved successfully.")


## **7️⃣ Execute Data Cleaning Process**

In [7]:
# Run the cleaning pipeline end to end; load_data() returns None (after
# logging the error) when the raw file cannot be read, in which case the
# remaining steps are skipped.
df = load_data(RAW_DATA_PATH)
if df is not None:
    # Convert before dropping missing values: unparseable amounts are coerced
    # to NaN by convert_data_types(), and must be removed by the dropna step
    # rather than leaking into the saved file.
    df = convert_data_types(df)
    df = clean_missing_values(df)
    df = feature_engineering(df)
    save_cleaned_data(df, CLEANED_DATA_PATH)

2025-02-28 04:06:43,303 - ERROR - Error loading data: [Errno 2] No such file or directory: 'data/customer_data.csv'


## **8️⃣ Summary & Next Steps**
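✅ Data is cleaned and saved to `data/customer_data_clean.csv` once `data/customer_data.csv` is available (the run recorded above failed because that file was not found).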
✅ Next, move to `03_eda_analysis.ipynb` for exploratory data analysis.