# 05. Business Actionable Insights & Evaluation

**Objective**: Translate technical clustering results into professional business strategies. We evaluate all three candidate models (K-Means, DBSCAN, Hierarchical) to provide a recommendation based on interpretability and actionable segmentation.

**Key Techniques:**
*   **Radar Charts**: Visualize the "personality" of each cluster (e.g., "High Spenders" vs. "Bargain Hunters").
*   **SHAP Values**: Explain *why* a customer falls into a specific cluster by computing feature importance on a surrogate Random Forest trained to predict the cluster labels.
*   **Business Profiling**: Define personas and marketing strategies.

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../src')
from clustering_library import ClusterAnalyzer, DataVisualizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load Data & Clustering Results

We load the scaled features (for technical analysis/SHAP) and the original features (for business profiling), along with the cluster labels saved from the Validation step.

In [None]:
# Load the feature frames through the project's ClusterAnalyzer. Per the
# section intro, df_scaled feeds the technical/SHAP analysis and df_original
# feeds the business profiling.
analyzer = ClusterAnalyzer()
df_scaled, df_original = analyzer.load_data()

# Standardise RFM-style column names in place. DataFrame.rename ignores keys
# that are not present, so this is a no-op for frames without these columns.
rename_map = {
    'recency': 'Recency',
    'frequency': 'Frequency',
    'monetary': 'Monetary',
    'tenure': 'Tenure',
}
df_original.rename(columns=rename_map, inplace=True)
print("Columns in df_original:", df_original.columns.tolist())

# Load the pre-computed cluster labels (one 'Cluster_<model>' column per model).
try:
    results_df = pd.read_csv("../data/processed/clustered_customers.csv", index_col=0)
except FileNotFoundError:
    print("Error: '../data/processed/clustered_customers.csv' not found. Please run 04_Technical_Validation.ipynb first.")
    # Stop here instead of continuing: otherwise `results_df` is undefined (or
    # stale from a previous kernel run) and the following cells either fail
    # with an unrelated NameError or silently analyse outdated labels.
    raise
else:
    print("Loaded clustering results successfully.")
    print(results_df.head())

Số khách hàng: 3920
Số features: 16
Columns in df_original: ['Sum_Quantity', 'Mean_UnitPrice', 'Mean_TotalPrice', 'Sum_TotalPrice', 'Count_Invoice', 'Count_Stock', 'Mean_InvoiceCountPerStock', 'Mean_StockCountPerInvoice', 'Mean_UnitPriceMeanPerInvoice', 'Mean_QuantitySumPerInvoice', 'Mean_TotalPriceMeanPerInvoice', 'Mean_TotalPriceSumPerInvoice', 'Mean_UnitPriceMeanPerStock', 'Mean_QuantitySumPerStock', 'Mean_TotalPriceMeanPerStock', 'Mean_TotalPriceSumPerStock']
Loaded clustering results successfully.
            Cluster_KMeans  Cluster_DBSCAN  Cluster_Hierarchical
CustomerID                                                      
12346                    1              -1                     0
12747                    1               0                     2
12748                    2              -1                     2
12749                    2               0                     2
12820                    2               0                     2


## 2. Comparative Business Evaluation

We will iterate through each model to generate profiles and explainability visuals.

In [None]:
import traceback

# Models whose labels are expected as 'Cluster_<name>' columns in results_df.
models = ['KMeans', 'DBSCAN', 'Hierarchical']

for model_name in models:
    col_name = f'Cluster_{model_name}'
    if col_name not in results_df.columns:
        print(f"\nSkipping {model_name} (Labels not found)")
        continue

    print(f"\n{'='*20} ANALYZING MODEL: {model_name} {'='*20}")

    # Labels are taken positionally; they are assigned row-by-row to
    # df_original below, so both frames must share the same row order.
    labels = results_df[col_name].values

    # -1 is the noise label (produced by DBSCAN). It is not counted as a
    # cluster, but it is kept as its own group in the statistics below.
    noise_mask = labels == -1
    n_noise = int(np.sum(noise_mask))
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
    print(f"Number of Clusters: {n_clusters} (Noise points: {n_noise})\n")

    if n_clusters < 1 and n_noise == 0:
        print("No valid clusters and no noise. Skipping.")
        continue
    elif n_clusters < 2:
        print("Note: Only 1 cluster found (plus noise). Analyzing as Single Group vs Noise.")

    # --- 2.1. Cluster Statistics ---
    # Attach labels to the unscaled features so the means are interpretable.
    df_business = df_original.assign(Cluster=labels)

    cluster_means = df_business.groupby('Cluster').mean()
    cluster_sizes = df_business['Cluster'].value_counts().sort_index()

    print("Cluster Sizes:")
    print(cluster_sizes)
    print("\nCluster Profiles (Key Metrics Mean):")

    # Show only the RFM columns when they exist, otherwise every feature.
    cols_to_show = ['Recency', 'Frequency', 'Monetary']
    available_cols = [c for c in cols_to_show if c in cluster_means.columns]
    if available_cols:
        display(cluster_means[available_cols].round(2))
    else:
        print("RFM columns not found for profile display, showing all:")
        display(cluster_means.round(2))

    # --- 2.2. Radar Chart Visualization ---
    print(f"\nGenerating Radar Chart for {model_name}...")
    # Noise points are excluded from the radar charts for clarity.
    clean_mask = ~noise_mask
    try:
        if clean_mask.any():
            clean_labels = labels[clean_mask]
            clean_df = df_original[clean_mask].copy()

            # The analyzer's helpers may read analyzer.df_original, so point
            # it at the noise-free rows while the radar charts are drawn.
            analyzer.df_original = clean_df
            analyzer.create_radar_chart(labels=clean_labels)

            # --- 2.2b. Individual Cluster Profiles ---
            print(f"\nGenerating Individual Radar Plots for {model_name}...")
            # Per the original author's note, create_individual_radar_plots
            # reads analyzer.cluster_results[key]["means"], so register the
            # noise-free labels and means under an integer key first.
            # n_clusters >= 1 here because at least one non-noise row exists.
            temp_key = n_clusters
            temp_means = clean_df.assign(Cluster=clean_labels).groupby('Cluster').mean()
            analyzer.cluster_results[temp_key] = {
                'labels': clean_labels,
                'means': temp_means
            }

            try:
                analyzer.create_individual_radar_plots(temp_key)
            except Exception as e:
                print(f"Could not generate Individual Radar Plots: {e}")
                traceback.print_exc()
        else:
            print("No non-noise data to plot.")
    except Exception as e:
        print(f"Could not generate Radar Chart: {e}")

    # Undo the noise-filtered view installed for the radar charts. Without
    # this, the SHAP stage (which uses the full-length `labels`) and every
    # later iteration would see a frame with the noise rows missing.
    analyzer.df_original = df_original.copy()

    # --- 2.3. Explainability (SHAP) ---
    print(f"\nCalculating SHAP values for {model_name}...")

    # The analyzer's training/SHAP helpers look results up by integer key, so
    # register the full labels (noise included) under n_clusters. This
    # intentionally replaces the radar-chart entry stored under the same key.
    k_key = n_clusters
    analyzer.cluster_results[k_key] = {
        'labels': labels,
        'means': cluster_means
    }

    try:
        # 1. Train Surrogate Random Forest
        analyzer.train_surrogate_model(k_key)

        # 2. Calculate SHAP
        analyzer.calculate_shap_values(k_key)

        # 3. Plot SHAP Summary
        print(f"SHAP Summary Plot for {model_name}:")
        analyzer.plot_shap_summary(k_key)

    except Exception as e:
        # Best-effort: report the failure and continue with the next model.
        print(f"Error in SHAP analysis for {model_name}: {e}")