In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster 

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Generation

In [2]:
# Synthetic-data configuration. A dedicated, seeded Generator makes both sample
# frames identical on every Restart & Run All, so the printed insights and the
# exported CSVs are reproducible.
RANDOM_SEED = 42
N_CUSTOMERS = 1000
N_PRODUCTS = 500
rng = np.random.default_rng(RANDOM_SEED)

# Create sample customer data (one row per customer; integer upper bounds are exclusive)
customer_data = {
    'customer_id': range(N_CUSTOMERS),
    'purchase_frequency': rng.integers(1, 52, N_CUSTOMERS),  # purchases per year
    # Normal draws are unbounded below; floor them so no customer has a
    # zero or negative average spend.
    'avg_transaction_value': np.clip(rng.normal(150, 50, N_CUSTOMERS), 1.0, None),  # average spend per visit
    'loyalty_points': rng.integers(0, 5000, N_CUSTOMERS),
    'store_visits': rng.integers(0, 24, N_CUSTOMERS),  # visits per year
    'online_visits': rng.integers(0, 100, N_CUSTOMERS),  # online visits per year
    'returns_rate': rng.uniform(0, 0.2, N_CUSTOMERS),  # fraction of purchases returned (0-0.2), not a percentage
    'skincare_purchases': rng.integers(0, 30, N_CUSTOMERS),
    'makeup_purchases': rng.integers(0, 40, N_CUSTOMERS),
    'fragrance_purchases': rng.integers(0, 15, N_CUSTOMERS)
}

customer_df = pd.DataFrame(customer_data)

# Create sample product data (one row per product)
product_data = {
    'product_id': range(N_PRODUCTS),
    # With mean 45 and sd 20 roughly 1% of raw draws are negative; floor the price.
    'price_point': np.clip(rng.normal(45, 20, N_PRODUCTS), 1.0, None),
    'rating': rng.uniform(3.5, 5, N_PRODUCTS),
    'return_rate': rng.uniform(0, 0.15, N_PRODUCTS),  # fraction of units returned
    'reorder_rate': rng.uniform(0.1, 0.5, N_PRODUCTS),  # fraction of buyers who reorder
    'shelf_life_months': rng.integers(12, 36, N_PRODUCTS),
    'margin': rng.uniform(0.3, 0.7, N_PRODUCTS),
    'ingredients_count': rng.integers(5, 50, N_PRODUCTS),
    # Same guard as for prices: a package size must stay positive.
    'packaging_size_ml': np.clip(rng.normal(100, 30, N_PRODUCTS), 1.0, None),
    'fragrance_intensity': rng.uniform(0, 5, N_PRODUCTS),
    'application_ease': rng.uniform(3, 5, N_PRODUCTS)
}

product_df = pd.DataFrame(product_data)

# K-Means

In [3]:
def perform_customer_segmentation(df, n_clusters=4):
    """
    Segment customers with K-means on standardized behavioural features.

    Parameters
    ----------
    df : pandas.DataFrame
        Customer table containing every column listed in ``features`` below.
        It is not modified; the segment labels are written to a copy.
    n_clusters : int, default 4
        Number of segments requested from ``KMeans``.

    Returns
    -------
    tuple
        ``(segmented_df, segment_analysis, cluster_centers)`` where
        ``segmented_df`` is a copy of ``df`` with an added
        ``customer_segment`` label column, ``segment_analysis`` holds the
        per-segment mean of each feature in its original units, and
        ``cluster_centers`` is ``kmeans.cluster_centers_`` (centroids in the
        standardized feature space, not in original units).

    Raises
    ------
    KeyError
        If ``df`` lacks any of the feature columns (raised by the column
        selection).
    """
    # Select features for clustering
    features = [
        'purchase_frequency', 'avg_transaction_value', 'loyalty_points',
        'store_visits', 'online_visits', 'returns_rate',
        'skincare_purchases', 'makeup_purchases', 'fragrance_purchases'
    ]

    # Standardize features so large-scale columns (e.g. loyalty_points) do not
    # dominate the Euclidean distances K-means relies on.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[features])

    # Apply K-means; the fixed random_state keeps the labelling repeatable.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)

    # Work on a copy so the caller's frame is left untouched and re-running the
    # cell always starts from the same input.
    segmented_df = df.copy()
    segmented_df['customer_segment'] = kmeans.fit_predict(X_scaled)

    # Analyze segments
    segment_analysis = segmented_df.groupby('customer_segment')[features].mean()

    return segmented_df, segment_analysis, kmeans.cluster_centers_

# Hierarchical Clustering

In [4]:
def perform_product_clustering(df, n_clusters=5):
    """
    Group products with Ward hierarchical clustering on standardized attributes.

    Parameters
    ----------
    df : pandas.DataFrame
        Product table containing every column listed in ``features`` below.
        It is not modified; the cluster labels are written to a copy.
    n_clusters : int, default 5
        Maximum number of flat clusters cut from the hierarchy (passed to
        ``fcluster`` as ``t`` with ``criterion='maxclust'``).

    Returns
    -------
    tuple
        ``(clustered_df, cluster_analysis, linkage_matrix)`` where
        ``clustered_df`` is a copy of ``df`` with an added
        ``product_cluster`` label column, ``cluster_analysis`` holds the
        per-cluster mean of each feature in its original units, and
        ``linkage_matrix`` is the Ward linkage computed on the standardized
        features (usable with ``dendrogram``).

    Raises
    ------
    KeyError
        If ``df`` lacks any of the feature columns (raised by the column
        selection).
    """
    features = [
        'price_point', 'rating', 'return_rate', 'reorder_rate',
        'shelf_life_months', 'margin', 'ingredients_count',
        'packaging_size_ml', 'fragrance_intensity', 'application_ease'
    ]

    # Standardize features so every attribute contributes on the same scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[features])

    # Create linkage matrix
    linkage_matrix = linkage(X_scaled, method='ward')

    # Create clusters on a copy so the caller's frame is left untouched.
    clustered_df = df.copy()
    clustered_df['product_cluster'] = fcluster(
        linkage_matrix, t=n_clusters, criterion='maxclust'
    )

    # Analyze clusters
    cluster_analysis = clustered_df.groupby('product_cluster')[features].mean()

    return clustered_df, cluster_analysis, linkage_matrix

# PCA

In [5]:
def perform_product_pca(df):
    """
    Run PCA on the standardized product attributes.

    Parameters
    ----------
    df : pandas.DataFrame
        Product table containing every column listed in ``features`` below.
        It is only read, never modified.

    Returns
    -------
    tuple
        ``(pca_df, loadings, explained_variance)`` where ``pca_df`` holds the
        component scores (one ``PCk`` column per component), ``loadings`` is
        the transposed ``pca.components_`` matrix indexed by feature name, and
        ``explained_variance`` lists each component's explained-variance ratio
        together with the running cumulative ratio.

    Raises
    ------
    KeyError
        If ``df`` lacks any of the feature columns (raised by the column
        selection).
    """
    features = [
        'price_point', 'rating', 'return_rate', 'reorder_rate',
        'shelf_life_months', 'margin', 'ingredients_count',
        'packaging_size_ml', 'fragrance_intensity', 'application_ease'
    ]

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[features])

    # Apply PCA
    pca = PCA()
    pca_result = pca.fit_transform(X_scaled)

    # Label components from the width of the actual result rather than from
    # len(features): with no n_components given, PCA keeps
    # min(n_samples, n_features) components, so a frame with fewer rows than
    # features yields fewer columns and fixed-length labels would not fit.
    component_names = [f'PC{i+1}' for i in range(pca_result.shape[1])]

    # Create DataFrame with PCA results
    pca_df = pd.DataFrame(pca_result, columns=component_names)

    # Create loadings matrix (rows = features, columns = components)
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=component_names,
        index=features
    )

    # Calculate explained variance
    explained_variance = pd.DataFrame({
        'Principal Component': component_names,
        'Explained Variance Ratio': pca.explained_variance_ratio_,
        'Cumulative Variance Ratio': np.cumsum(pca.explained_variance_ratio_)
    })

    return pca_df, loadings, explained_variance

# Insights

In [6]:
# Run the three analyses on the sample frames: K-means customer segments,
# hierarchical product clusters, and PCA of the product attributes.
(
    segmented_customers,
    segment_analysis,
    cluster_centers,
) = perform_customer_segmentation(customer_df)

(
    clustered_products,
    product_cluster_analysis,
    product_linkage,
) = perform_product_clustering(product_df)

(
    product_pca,
    pca_loadings,
    pca_variance,
) = perform_product_pca(product_df)

# Example of business insights generation
def generate_customer_segment_insights(segment_analysis):
    """
    Turn a per-segment feature summary into readable insight sentences.

    ``segment_analysis`` is a DataFrame with one row per segment and at least
    the columns ``avg_transaction_value``, ``online_visits``,
    ``store_visits``, ``skincare_purchases``, ``makeup_purchases`` and
    ``fragrance_purchases``.

    For every segment, in index order, the function appends:
      * a "high-value" sentence when the segment's average transaction value
        exceeds the mean of that column across all segments;
      * an "online preference" sentence when online visits are more than
        twice the store visits;
      * always, a sentence naming the category with the most purchases
        (the first of skincare, makeup, fragrance wins a tie).

    Returns the sentences as a list of strings.
    """
    category_columns = (
        ('skincare', 'skincare_purchases'),
        ('makeup', 'makeup_purchases'),
        ('fragrance', 'fragrance_purchases'),
    )
    messages = []

    for label in segment_analysis.index:
        row = segment_analysis.loc[label]

        # High-value check: compare against the unweighted mean of the segment means
        spend = row['avg_transaction_value']
        spend_benchmark = segment_analysis['avg_transaction_value'].mean()
        if spend > spend_benchmark:
            messages.append(f"Segment {label} represents high-value customers with average transaction value of ${spend:.2f}")

        # Channel check: online must outnumber store visits by more than 2x
        online_count = row['online_visits']
        store_count = row['store_visits']
        if online_count > store_count * 2:
            messages.append(f"Segment {label} shows strong preference for online shopping with {online_count:.0f} online visits vs {store_count:.0f} store visits")

        # Category check: pick the category with the largest mean purchase count
        candidates = [(name, row[column]) for name, column in category_columns]
        top_name, top_count = max(candidates, key=lambda pair: pair[1])
        messages.append(f"Segment {label} shows strongest preference for {top_name} with {top_count:.0f} purchases")

    return messages

In [7]:
# Build the insight sentences from the per-segment means and list them
customer_insights = generate_customer_segment_insights(segment_analysis)
print("\nCustomer Segment Insights:")
for insight in customer_insights:
    print(f"- {insight}")

# Summarise every product cluster by its mean price, rating and reorder rate
print("\nProduct Cluster Characteristics:")
for cluster in product_cluster_analysis.index:
    cluster_means = product_cluster_analysis.loc[cluster]
    print(f"\nCluster {cluster}:")
    print(f"Average Price: ${cluster_means['price_point']:.2f}")
    print(f"Average Rating: {cluster_means['rating']:.2f}")
    print(f"Reorder Rate: {cluster_means['reorder_rate']:.2%}")

# Report the share of variance captured by the first three components
print("\nPCA Insights:")
print("Top 3 Principal Components explain:")
for i in range(3):
    variance_ratios = pca_variance['Explained Variance Ratio']
    print(f"PC{i+1}: {variance_ratios[i]:.2%} of variance")

# Write the labelled customer and product tables to CSV in the working directory
for frame, csv_path in (
    (segmented_customers, 'sephora_customer_segments.csv'),
    (clustered_products, 'sephora_product_clusters.csv'),
):
    frame.to_csv(csv_path, index=False)


Customer Segment Insights:
- Segment 0 represents high-value customers with average transaction value of $153.60
- Segment 0 shows strong preference for online shopping with 41 online visits vs 15 store visits
- Segment 0 shows strongest preference for makeup with 19 purchases
- Segment 1 shows strong preference for online shopping with 55 online visits vs 8 store visits
- Segment 1 shows strongest preference for makeup with 24 purchases
- Segment 2 represents high-value customers with average transaction value of $153.18
- Segment 2 shows strong preference for online shopping with 46 online visits vs 7 store visits
- Segment 2 shows strongest preference for makeup with 20 purchases
- Segment 3 represents high-value customers with average transaction value of $152.94
- Segment 3 shows strong preference for online shopping with 50 online visits vs 16 store visits
- Segment 3 shows strongest preference for makeup with 16 purchases

Product Cluster Characteristics:

Cluster 1:
Average Pr