# 03_eda_analysis.ipynb

## **Objective:**
Perform Exploratory Data Analysis (EDA) to identify trends and insights.

---

## **1️⃣ Import Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import os

## **2️⃣ Set Up Logging**

In [None]:
# Configure the root logger to emit INFO-and-above records, each formatted as
# "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## **3️⃣ Define File Paths**

In [None]:
# Location of the cleaned CSV passed to load_data. The path is relative, so it
# resolves against the current working directory.
CLEANED_DATA_PATH = "../data/customer_data_clean.csv"

## **4️⃣ Load Cleaned Data**

In [None]:
def load_data(file_path):
    """Read the cleaned CSV at *file_path* into a DataFrame.

    Failures are logged rather than raised.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the path does not exist or
        ``pd.read_csv`` raises.
    """
    if os.path.exists(file_path):
        try:
            frame = pd.read_csv(file_path)
        except Exception as exc:
            # Any read/parse problem is reported and mapped to None.
            logging.error(f"Error loading data: {exc}")
            return None
        logging.info("Cleaned data successfully loaded.")
        return frame

    logging.error(f"File not found: {file_path}")
    return None
    
# Load cleaned data
# load_data returns None on failure, so `df` may be None here; perform_eda checks for that.
df = load_data(CLEANED_DATA_PATH)

## **5️⃣ Basic Data Overview**
- Displays dataset structure, summary, and missing values.

In [None]:
def data_overview(df):
    """Print the dataset's structure, summary statistics and missing-value counts.

    Args:
        df: DataFrame to summarize.

    Returns:
        None. All output goes to stdout.
    """
    logging.info("Generating dataset overview.")

    print("\n--- Data Info ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()
    print("\n--- Data Description ---")
    print(df.describe())
    print("\n--- Missing Values ---")
    print(df.isnull().sum())

## **6️⃣ Visualizing Purchase Amount Distribution**

In [None]:
def plot_purchase_distribution(df):
    """Show a histogram of ``purchase_amount`` stacked above its boxplot.

    Args:
        df: DataFrame with a ``purchase_amount`` column.
    """
    logging.info("Plotting purchase amount distribution.")

    # One figure, two vertically stacked panels.
    _, (hist_ax, box_ax) = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
    amounts = df['purchase_amount']

    # Top panel: 30-bin histogram with a KDE curve overlaid.
    sns.histplot(amounts, bins=30, kde=True, ax=hist_ax)
    hist_ax.set_title("Purchase Amount Distribution")

    # Bottom panel: boxplot of the same values along the x axis.
    sns.boxplot(x=amounts, ax=box_ax)
    box_ax.set_title("Boxplot of Purchase Amount")

    plt.tight_layout()
    plt.show()

## **7️⃣ Log-Transformed Purchase Amount Distribution**
- Helps visualize skewed data better.

In [None]:
def plot_log_purchase_distribution(df):
    """Show a histogram and boxplot of ``log1p(purchase_amount)``.

    The transform is computed on a local Series; the caller's DataFrame is
    left unmodified.

    Args:
        df: DataFrame expected to contain a ``purchase_amount`` column.

    Returns:
        None. When the column is missing, an error is logged and nothing
        is plotted.
    """
    logging.info("Plotting log-transformed purchase amount distribution.")

    if 'purchase_amount' not in df:
        logging.error("purchase_amount column not found.")
        return

    # Keep the transformed values local instead of writing a new column into
    # the caller's frame. The Series keeps the name 'purchase_amount_log' so
    # the axis labels seaborn derives from it stay the same.
    log_amounts = np.log1p(df['purchase_amount']).rename('purchase_amount_log')

    fig, axes = plt.subplots(2, 1, figsize=(10, 8))

    # Histogram with KDE overlay
    sns.histplot(log_amounts, bins=30, kde=True, ax=axes[0])
    axes[0].set_title("Log-Transformed Purchase Amount Distribution")

    # Boxplot
    sns.boxplot(x=log_amounts, ax=axes[1])
    axes[1].set_title("Boxplot of Log-Transformed Purchase Amount")

    plt.tight_layout()
    plt.show()

## **8️⃣ Analyzing Customer Spending by Region**

In [None]:
def plot_spending_by_region(df):
    """Show a bar chart of mean ``purchase_amount`` for each ``region``.

    Args:
        df: DataFrame with ``region`` and ``purchase_amount`` columns.
    """
    logging.info("Plotting customer spending by region.")

    plt.figure(figsize=(10, 5))
    # hue repeats the x variable with the legend suppressed; presumably this is
    # so the palette colours each bar without adding a redundant legend.
    sns.barplot(
        data=df,
        x='region',
        y='purchase_amount',
        hue='region',
        estimator=np.mean,
        errorbar=None,
        palette='viridis',
        legend=False,
    )
    plt.title("Average Customer Spending by Region")
    plt.xticks(rotation=45)
    plt.show()

## **9️⃣ Analyzing Purchase Frequency**

In [None]:
def plot_purchase_frequency(df):
    """Show how many rows fall under each ``purchase_frequency`` value.

    Args:
        df: DataFrame with a ``purchase_frequency`` column.
    """
    logging.info("Plotting purchase frequency distribution.")

    plt.figure(figsize=(10, 8))
    # hue repeats the x variable with the legend suppressed; presumably this is
    # so the palette colours each bar without adding a redundant legend.
    sns.countplot(
        data=df,
        x='purchase_frequency',
        hue='purchase_frequency',
        palette='coolwarm',
        legend=False,
    )
    plt.title("Purchase Frequency Distribution")
    plt.xticks(rotation=45)
    plt.show()

## **🔟 Execute EDA**

In [None]:
def perform_eda(df):
    """Run every EDA step on *df* in sequence, logging failures instead of raising.

    Args:
        df: DataFrame to analyze, or None when loading failed.

    Returns:
        None. A None input is logged and skipped. An exception raised by any
        step is logged and stops the remaining steps.
    """
    if df is None:
        logging.error("No data to analyze.")
        return

    try:
        # Text summary first, then the plots.
        data_overview(df)
        plot_purchase_distribution(df)
        plot_log_purchase_distribution(df)
        plot_spending_by_region(df)
        plot_purchase_frequency(df)
    except Exception as err:
        logging.error(f"Error during EDA: {err}")
    else:
        logging.info("Exploratory data analysis complete.")

# Run the full analysis on the frame loaded earlier; a None frame is logged and skipped.
perform_eda(df)

## **Summary & Next Steps**
- ✅ EDA performed successfully.
- ✅ Next, move to `04_regression_analysis.ipynb` for predictive modeling.