# Clustering Algorithms
- K-means clustering, Vector quantization
- Real examples: Customer segmentation, Image compression

In [None]:
# NumPy generates the synthetic data and does the array maths in every cell.
import numpy as np
# kmeans fits the codebook, vq assigns observations to it, and whiten
# rescales features before clustering.
from scipy.cluster.vq import kmeans, vq, whiten
# NOTE(review): plt is not referenced by any cell visible here; presumably
# kept for plotting the results. Confirm before removing.
import matplotlib.pyplot as plt
# Confirmation message so the reader knows the setup cell ran.
print('Clustering module loaded')

## K-means Clustering
**Goal**: Partition data into K clusters

**Algorithm**:
1. Initialize K centroids randomly
2. Assign each point to nearest centroid
3. Update centroids (mean of assigned points)
4. Repeat until convergence

**Function**: `kmeans(obs, k_or_guess)`

In [None]:
# Build a synthetic 2-D data set: three Gaussian blobs of 100 points each,
# one around each of the centres below.
np.random.seed(42)
blob_centers = ([2, 2], [8, 3], [5, 8])
# The generator is consumed left to right, so the random draws happen in the
# order cluster1, cluster2, cluster3.
cluster1, cluster2, cluster3 = (
    np.random.randn(100, 2) + center for center in blob_centers
)
data = np.vstack((cluster1, cluster2, cluster3))

print(f'Data: {data.shape[0]} points in 2D')

# Rescale every feature to unit variance before clustering.
data_whitened = whiten(data)

# Fit the codebook for K clusters and report how well it fits.
k = 3
centroids, distortion = kmeans(data_whitened, k)
print(
    f'\nK-means with K={k}',
    f'  Distortion: {distortion:.4f}',
    f'  Centroids shape: {centroids.shape}',
    sep='\n',
)

# Map each observation to its nearest centroid; the distances are not needed.
idx, _ = vq(data_whitened, centroids)
cluster_sizes = np.bincount(idx)
print(f'  Cluster assignments: {cluster_sizes}')

## Real Example: Customer Segmentation
- **Problem**: Group customers by behavior
- **Data**: Purchase frequency, average spend
- **Goal**: Targeted marketing strategies

In [None]:
# Simulate customer data. Each row is one customer described by
# [purchases/month, avg spend], drawn uniformly within its segment's ranges.
np.random.seed(42)
n_customers = 500  # 150 + 200 + 150 across the three simulated segments

# Segment 1: High frequency, high spend
seg1 = np.column_stack([
    np.random.uniform(15, 25, 150),  # purchases/month
    np.random.uniform(200, 400, 150)  # avg spend
])

# Segment 2: Medium frequency, medium spend
seg2 = np.column_stack([
    np.random.uniform(5, 12, 200),
    np.random.uniform(80, 180, 200)
])

# Segment 3: Low frequency, low spend
seg3 = np.column_stack([
    np.random.uniform(1, 5, 150),
    np.random.uniform(20, 80, 150)
])

customers = np.vstack([seg1, seg2, seg3])
print('Customer Segmentation')
print(f'  Total customers: {len(customers)}')
print('  Features: [purchases/month, avg_spend]\n')

# Normalize: the two features live on very different scales, so rescale each
# to unit variance before measuring distances.
customers_norm = whiten(customers)

# Cluster in the whitened space, then assign each customer to a centroid.
k = 3
centroids, dist = kmeans(customers_norm, k)
labels, _ = vq(customers_norm, centroids)

print('Segments found:')
# kmeans removes centroids that end up with no observations, so the codebook
# may hold fewer than k rows. Iterate over the rows actually returned.
for i in range(len(centroids)):
    # Profile on the original (un-whitened) values so the numbers are readable.
    segment = customers[labels == i]
    if len(segment) == 0:
        # No customer mapped to this centroid; its mean would be NaN.
        continue

    avg_purchases = segment[:, 0].mean()
    avg_spend = segment[:, 1].mean()
    print(f'  Segment {i+1}: {len(segment)} customers')
    print(f'    Avg purchases: {avg_purchases:.1f}/month')
    print(f'    Avg spend: ${avg_spend:.0f}')

    # Profile by purchase frequency alone.
    if avg_purchases > 15:
        profile = 'Premium (high value)'
    elif avg_purchases > 7:
        profile = 'Regular (medium value)'
    else:
        profile = 'Occasional (low value)'
    print(f'    Profile: {profile}\n')

## Elbow Method: Choosing K
Plot (or tabulate) distortion against K and look for the 'elbow':
the point where adding more clusters gives diminishing returns.

In [None]:
# Elbow scan: fit k-means on the whitened customer data for each candidate
# cluster count and record the distortion that kmeans reports.
k_range = range(2, 10)
distortions = []

# Use a dedicated loop variable and discard the codebook so this scan does not
# overwrite the notebook-level `k` and `centroids`; `centroids` must keep
# matching the `labels` computed by the segmentation cell.
for n_clusters in k_range:
    _, dist = kmeans(customers_norm, n_clusters)
    distortions.append(dist)
    print(f'K={n_clusters}: distortion={dist:.4f}')

print('\nOptimal K is where curve bends (elbow)')
print('For this data, K=3 shows clear elbow')

## Real Example: Image Color Quantization
Reduce the number of colors in an image using K-means.
Compression comes from replacing millions of possible colors with K representative colors.

In [None]:
# Simulate RGB image (100x100 pixels) with channel values drawn from [0, 1).
np.random.seed(42)
img_rgb = np.random.rand(100, 100, 3)
print('Image compression via color quantization')
print(f'  Original: {img_rgb.shape[0]}x{img_rgb.shape[1]} pixels')
print('  Channels: RGB (3)\n')

# Reshape to (n_pixels, 3) so every pixel is one observation.
pixels = img_rgb.reshape(-1, 3)
# Count the distinct colours once; the value is reused for the ratio below.
n_original_colors = len(np.unique(pixels, axis=0))
print(f'Total pixels: {len(pixels)}')
print(f'Original colors: {n_original_colors}\n')

# Quantize to K colors. No whitening here: the three channels already share
# one scale, and the centroids must stay in the original colour space.
k_colors = 16
centroids, _ = kmeans(pixels, k_colors)
labels, _ = vq(pixels, centroids)

# Reconstruct: replace every pixel with the centroid it was assigned to.
quantized_pixels = centroids[labels]
img_quantized = quantized_pixels.reshape(img_rgb.shape)

# Report the colours actually present after quantization. kmeans can return
# fewer than k_colors centroids, so the requested count is only an upper bound.
n_quantized_colors = len(np.unique(labels))

print(f'After quantization to {k_colors} colors:')
print(f'  Unique colors: {n_quantized_colors}')
print(f'  Compression ratio: {n_original_colors / n_quantized_colors:.1f}:1')

# Calculate error. The PSNR formula uses a peak signal value of 1, which
# matches the [0, 1) range of the simulated image.
mse = np.mean((img_rgb - img_quantized)**2)
psnr = 10 * np.log10(1 / mse)
print(f'  MSE: {mse:.6f}')
print(f'  PSNR: {psnr:.2f} dB')

## Summary

### K-means Functions:
```python
from scipy.cluster.vq import kmeans, vq, whiten

# 1. Normalize data (important!)
data_norm = whiten(data)

# 2. Compute centroids
centroids, distortion = kmeans(data_norm, k)

# 3. Assign to clusters
labels, distances = vq(data_norm, centroids)
```

### Key Parameters:
- **K**: Number of clusters (use elbow method)
- **whiten**: Normalize features (critical for scale differences)
- **iter**: Number of times k-means is run; the codebook with the lowest distortion is returned (default: 20)

### Applications:
✓ **Marketing**: Customer segmentation  
✓ **Image**: Color quantization, compression  
✓ **Biology**: Gene expression clustering  
✓ **Anomaly**: Outlier detection  
✓ **Recommendation**: User grouping  

### Best Practices:
✓ **Whiten first**: Normalize feature scales (skip only when features already share a scale, e.g. RGB channels)  
✓ **Choose K carefully**: Use elbow method or domain knowledge  
✓ **Multiple runs**: K-means can find local minima  
✓ **Check results**: Validate cluster quality  