# Spatial Geometry and Distances
- Distance metrics, pdist, cdist, Distance matrices
- Real examples: Genomic distance, Market correlation

In [None]:
# NumPy supplies the arrays and the random-number generation used in the cells below.
import numpy as np
# pdist / cdist compute within-set and between-set distances, squareform expands
# a condensed distance vector into a square matrix, and euclidean / cosine are
# single-pair distance functions.
from scipy.spatial.distance import pdist, cdist, squareform, euclidean, cosine
# NOTE(review): plt is not referenced in the cells shown here; confirm it is still needed.
import matplotlib.pyplot as plt
# Printed once the imports above have completed.
print('Distance computation module loaded')

## Distance Metrics
**Common metrics**:
- **Euclidean**: √(Σ(xi-yi)²) - geometric distance
- **Manhattan**: Σ|xi-yi| - city block
- **Cosine**: 1 - cos(θ) - angle between vectors
- **Correlation**: 1 - Pearson correlation
- **Hamming**: Proportion of differing elements

**Choose based on**: Data type and problem domain

In [None]:
# Single-pair comparison: evaluate several metrics on the same two 3-D points.
from scipy.spatial import distance

point1 = np.array([1, 2, 3])
point2 = np.array([4, 5, 6])

print('Distance between two points:')
print(f'  Point 1: {point1}')
print(f'  Point 2: {point2}\n')

metrics = ['euclidean', 'cityblock', 'cosine', 'correlation']
for metric in metrics:
    # 'correlation' goes through the dedicated two-vector function; every
    # other metric is evaluated with cdist on one-row inputs, whose 1x1
    # result holds the distance at [0, 0].
    dist = (
        distance.correlation(point1, point2)
        if metric == 'correlation'
        else distance.cdist([point1], [point2], metric=metric)[0, 0]
    )
    print(f'  {metric.capitalize()}: {dist:.4f}')

## Pairwise Distances: pdist
Compute distances between all pairs in ONE set

**Function**: `pdist(X, metric='euclidean')`
- Input: (n, d) array
- Output: Condensed distance vector (n*(n-1)/2 elements)
- Use `squareform()` to get full matrix

In [None]:
# Pairwise Euclidean distances within one reproducibly generated point set.
np.random.seed(42)  # fixed seed so the printed distances are repeatable
points = np.random.rand(5, 3)
n_points = len(points)

print(f'Pairwise distances for {n_points} points:\n')

# Condensed form: one entry per unordered pair of rows.
dist_condensed = pdist(points, metric='euclidean')
expected_pairs = n_points * (n_points - 1) // 2
print(f'Condensed vector length: {len(dist_condensed)}')
print(f'  Expected: {expected_pairs}\n')

# Expand the condensed vector into the full square matrix.
dist_matrix = squareform(dist_condensed)
print(f'Distance matrix shape: {dist_matrix.shape}')
print(f'Distance matrix:\n{dist_matrix}\n')
print('Note: Diagonal is 0 (distance to self)', '      Symmetric matrix', sep='\n')

## Cross Distances: cdist
Compute distances between TWO different sets

**Function**: `cdist(XA, XB, metric='euclidean')`
- Input: XA (m, d), XB (n, d)
- Output: (m, n) distance matrix

In [None]:
# Distances from every point of one 2-D set to every point of another.
set_A = np.random.rand(3, 2)
set_B = np.random.rand(4, 2)
n_a, n_b = len(set_A), len(set_B)

print('Cross distances:')
print(f'  Set A: {n_a} points')
print(f'  Set B: {n_b} points\n')

# One row per point of set_A, one column per point of set_B.
cross_dist = cdist(set_A, set_B, metric='euclidean')
print(f'Cross distance matrix shape: {cross_dist.shape}')
print(f'Matrix:\n{cross_dist}\n')
print('Each row: distances from one A point to all B points')

## Real Example: Stock Correlation Analysis
Measure the similarity between stock price movements.
Correlation distance is used here to assess portfolio diversification.

In [None]:
# Simulate daily returns (10 stocks, 252 days) that share a common market
# factor, then compare the stocks using the correlation distance.
np.random.seed(42)  # fixed seed so the printed figures are repeatable
n_stocks = 10
n_days = 252

# Each stock's return = beta * market factor + its own idiosyncratic noise.
market_factor = np.random.randn(n_days) * 0.02
stock_returns = np.zeros((n_stocks, n_days))
for i in range(n_stocks):
    beta = np.random.uniform(0.5, 1.5)
    idiosyncratic = np.random.randn(n_days) * 0.01
    stock_returns[i] = beta * market_factor + idiosyncratic

# chr(65) is 'A', so the names run Stock_A, Stock_B, ...
stock_names = [f'Stock_{chr(65 + i)}' for i in range(n_stocks)]

print('Stock Correlation Analysis')
print(f'  Stocks: {n_stocks}')
print(f'  Trading days: {n_days}\n')

# Correlation distance (1 - correlation), one row of returns per stock.
dist_corr = pdist(stock_returns, metric='correlation')
dist_matrix = squareform(dist_corr)

# Convert back to correlation; the zero diagonal of the distance matrix
# becomes a diagonal of ones.
corr_matrix = 1 - dist_matrix

print('Correlation matrix (excerpt):')
n_show = min(5, n_stocks)  # show at most the top-left 5x5 corner
for name, row in zip(stock_names[:n_show], corr_matrix[:n_show, :n_show]):
    print(f'{name}: ' + ''.join(f'{value:.3f} ' for value in row))

# Find most and least correlated pairs. Only the strict upper triangle (k=1)
# is examined so that self-pairs and mirrored duplicates are excluded.
triu_idx = np.triu_indices(n_stocks, k=1)
correlations = corr_matrix[triu_idx]
stock_pairs = list(zip(*triu_idx))

max_idx = np.argmax(correlations)
min_idx = np.argmin(correlations)

most_a, most_b = stock_pairs[max_idx]
least_a, least_b = stock_pairs[min_idx]
print(f'\nMost correlated: {stock_names[most_a]} - {stock_names[most_b]} ({correlations[max_idx]:.3f})')
print(f'Least correlated: {stock_names[least_a]} - {stock_names[least_b]} ({correlations[min_idx]:.3f})')
print('\nLow correlation → good diversification!')

## Real Example: DNA Sequence Comparison
Use the Hamming distance to measure genetic similarity.
Applications: evolutionary biology, forensics.

In [None]:
# Toy DNA comparison: count the positions at which equal-length sequences differ.
sequences = [
    'ATCGATCGATCG',
    'ATCGATCGTTCG',  # 1 mutation
    'ATCGAACGATCG',  # 1 mutation
    'TTCGATCGATCG',  # 1 mutation
    'GCGCGCGCGCGC'   # Very different
]
seq_length = len(sequences[0])

print('DNA Sequence Analysis')
print(f'  Sequences: {len(sequences)}')
print(f'  Length: {seq_length} base pairs\n')

# Encode each base as a small integer so the sequences form a numeric array.
base_map = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
seq_numeric = np.array([list(map(base_map.__getitem__, seq)) for seq in sequences])

# Pairwise Hamming distance between the encoded rows, expanded to a square matrix.
dist_hamming = pdist(seq_numeric, metric='hamming')
dist_matrix = squareform(dist_hamming)

print('Hamming distance matrix:')
for i, row in enumerate(dist_matrix):
    # Hamming gives a proportion; scaling by the sequence length turns it
    # into a count of differing positions.
    cells = ''.join(f'{fraction * seq_length:4.0f} ' for fraction in row)
    print(f'Seq{i}: {cells}')

print('\nInterpretation: Number of differing positions')
print('Seq0-Seq1: 1 mutation (closely related)')
print('Seq0-Seq4: Many mutations (distant)')

## Summary

### Distance Functions:
```python
from scipy.spatial.distance import pdist, cdist, squareform

# Pairwise distances (within one set)
dist_vector = pdist(X, metric='euclidean')
dist_matrix = squareform(dist_vector)

# Cross distances (between two sets)
cross_dist = cdist(XA, XB, metric='euclidean')

# Single pair
from scipy.spatial.distance import euclidean, cosine
d = euclidean(point1, point2)
```

### Common Metrics:
- **euclidean**: Geometric distance, general purpose
- **cityblock** (Manhattan): L1 norm, less sensitive to outliers than Euclidean
- **cosine**: Text, high-dim sparse data
- **correlation**: Time series; compares vectors after removing each one's mean
- **hamming**: Categorical/binary data
- **jaccard**: Sets, binary features

### Applications:
✓ **Finance**: Portfolio correlation, risk analysis  
✓ **Biology**: Genetic similarity, protein alignment  
✓ **NLP**: Document similarity, semantic distance  
✓ **Clustering**: Input to hierarchical clustering  
✓ **Anomaly**: Outlier detection via distance  