# Correlation and Covariance
- Pearson, Spearman, Kendall correlation
- Covariance matrix, Partial correlation
- Real examples: Portfolio analysis, Feature selection

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# Printed once the imports above have completed.
print('Correlation and covariance module loaded')

## Pearson Correlation

**Definition**: Linear relationship measure
**Range**: -1 to +1
**Formula**: r = Cov(X,Y) / (σₓ·σᵧ)

**Interpretation**:
- r = +1: Perfect positive
- r = 0: No linear relationship
- r = -1: Perfect negative

In [None]:
# Pearson correlation on synthetic data with a known linear relationship
np.random.seed(42)  # fixed seed so the printed numbers are reproducible
n = 100
x = np.random.randn(n)
y = 2 * x + np.random.randn(n) * 0.5  # Strong positive correlation

# pearsonr returns the correlation coefficient and its p-value
r, p_value = stats.pearsonr(x, y)

print('Pearson Correlation\n')
print(f'Sample size: {n}')
print(f'Correlation coefficient: r = {r:.4f}')
# Scientific notation: a fixed-point format such as .6f would display a very
# small p-value as 0.000000 and hide its magnitude.
print(f'p-value: {p_value:.3e}')
print(f'R² (variance explained): {r**2:.4f}\n')

# Report the strictest significance level that the p-value satisfies
if p_value < 0.001:
    print('Highly significant correlation (p < 0.001)')
elif p_value < 0.05:
    print('Significant correlation (p < 0.05)')
else:
    print('Not significant (p ≥ 0.05)')

## Spearman Rank Correlation

**Definition**: Monotonic relationship (rank-based)
**Advantage**: Robust to outliers; also captures non-linear relationships as long as they are monotonic
**Use**: When the data are not normally distributed, or are ordinal

In [None]:
# Spearman correlation
# Non-linear but monotonic relationship
# Seed here so this cell prints the same numbers no matter which cells ran
# before it (the other data-generating cells seed the RNG the same way).
np.random.seed(42)
x = np.linspace(0, 10, 100)
y = x**2 + np.random.randn(x.size) * 5  # quadratic trend plus Gaussian noise

# Same data, two measures: Pearson (linear) vs Spearman (rank-based)
rho_pearson, p_pearson = stats.pearsonr(x, y)
rho_spearman, p_spearman = stats.spearmanr(x, y)

print('Comparison: Pearson vs Spearman\n')
print('For non-linear (quadratic) relationship:')
print(f'  Pearson r: {rho_pearson:.4f}')
print(f'  Spearman ρ: {rho_spearman:.4f}')
print('\nSpearman better captures monotonic relationship!')

## Kendall's Tau

**Definition**: Concordance measure
**Property**: More robust to outliers than Pearson's r; often preferred for small samples
**Range**: -1 to +1

In [None]:
# Kendall's tau
# NOTE: x and y are not defined in this cell; it reuses whatever x and y
# currently hold in the notebook namespace.
tau, p_tau = stats.kendalltau(x, y)

print('Kendall Tau\n')
print(f'Tau: {tau:.4f}')
# Scientific notation: a fixed-point format such as .6f would display a very
# small p-value as 0.000000 and hide its magnitude.
print(f'p-value: {p_tau:.3e}')
print('\nAll three measures (Pearson, Spearman, Kendall) available!')

## Covariance Matrix

**Definition**: Pairwise covariances
**Use**: Multivariate analysis, PCA

In [None]:
# Covariance and correlation matrices of three jointly normal variables
np.random.seed(42)
true_mean = [0, 0, 0]
true_cov = [
    [1, 0.5, 0.3],
    [0.5, 1, 0.4],
    [0.3, 0.4, 1],
]
data = np.random.multivariate_normal(true_mean, true_cov, size=100)

# rowvar=False: each column of `data` is a variable, each row an observation
cov_matrix = np.cov(data, rowvar=False)
corr_matrix = np.corrcoef(data, rowvar=False)

print('Covariance Matrix:')
print(cov_matrix)
print('\nCorrelation Matrix:')
print(corr_matrix)

## Real Example: Stock Portfolio Correlation

**Problem**: Analyze correlations between stocks
**Use**: Diversification, risk management

In [None]:
# Stock returns simulation
def describe_strength(r):
    """Return a rule-of-thumb label for the magnitude of correlation *r*.

    The cut-offs (|r| < 0.3 weak, |r| < 0.7 moderate, otherwise strong) are a
    common convention used only for the printed interpretation; they are not
    a statistical test.
    """
    magnitude = abs(r)
    if magnitude < 0.3:
        return 'Weak'
    if magnitude < 0.7:
        return 'Moderate'
    return 'Strong'


def format_corr_table(labels, matrix):
    """Return *matrix* as an aligned text table with *labels* on both axes.

    Widths are derived from the longest label so that every row lines up
    with the header and adjacent header labels never run together.
    """
    label_width = max((len(label) for label in labels), default=0)
    col_width = max(10, label_width + 2)
    header = ' ' * label_width + ''.join(
        f'{label:>{col_width}s}' for label in labels
    )
    rows = [
        f'{label:>{label_width}s}'
        + ''.join(f'{value:>{col_width}.3f}' for value in row)
        for label, row in zip(labels, matrix)
    ]
    return '\n'.join([header, *rows])


print('Stock Portfolio Correlation Analysis\n')

np.random.seed(42)
days = 252  # Trading days

# Generate correlated returns
mean_returns = [0.0005, 0.0008, 0.0006, 0.0004]
cov_returns = [[0.0004, 0.0002, 0.0001, 0.00005],
               [0.0002, 0.0005, 0.00015, 0.0001],
               [0.0001, 0.00015, 0.0003, 0.00008],
               [0.00005, 0.0001, 0.00008, 0.0002]]

returns = np.random.multivariate_normal(mean_returns, cov_returns, days)
stocks = ['TECH', 'FINANCE', 'ENERGY', 'HEALTHCARE']

# Correlation matrix (transpose so that each stock is one row/variable)
corr_matrix = np.corrcoef(returns.T)

print('Stock Correlation Matrix:')
print()
print(format_corr_table(stocks, corr_matrix))

# Describe the TECH-FINANCE pair from the computed value rather than a
# hard-coded range, so the text always matches the table above.
tech_finance = corr_matrix[stocks.index('TECH'), stocks.index('FINANCE')]

print('\nInterpretation:')
print(f'- TECH-FINANCE: {describe_strength(tech_finance)} correlation '
      f'(r = {tech_finance:.3f})')
print('- Low correlation → Good diversification')
print('- Negative correlation → Hedge opportunity')

## Partial Correlation

**Definition**: Correlation controlling for other variables
**Use**: Identify spurious correlations

In [None]:
# Partial correlation (simplified): correlate what is left of X and Y after
# regressing each of them on Z.
from scipy import linalg

print('Partial Correlation Example\n')

# Z is a shared driver; X and Y are each Z plus their own noise
np.random.seed(42)
z = np.random.randn(100)
x = z + np.random.randn(100) * 0.5
y = z + np.random.randn(100) * 0.5

# Raw X-Y association (only the coefficient is needed, not the p-value)
r_xy = stats.pearsonr(x, y)[0]
print(f'Correlation X-Y: {r_xy:.4f}')
print('  (Both driven by Z)\n')

# Design matrix [1, z]: intercept column plus the control variable
X_matrix = np.column_stack([np.ones(len(z)), z])

# Least-squares fit of each variable on Z; element 0 holds the coefficients
beta_x = linalg.lstsq(X_matrix, x)[0]
beta_y = linalg.lstsq(X_matrix, y)[0]

# Residuals: the part of each variable that Z does not explain
fitted_x = X_matrix @ beta_x
fitted_y = X_matrix @ beta_y
resid_x = x - fitted_x
resid_y = y - fitted_y

r_partial = stats.pearsonr(resid_x, resid_y)[0]
print(f'Partial correlation X-Y (controlling Z): {r_partial:.4f}')
print('  (Much weaker after removing Z effect)')

## Summary

### Correlation Methods:
```python
# Pearson (linear)
r, p = stats.pearsonr(x, y)

# Spearman (rank)
rho, p = stats.spearmanr(x, y)

# Kendall (concordance)
tau, p = stats.kendalltau(x, y)
```

### Covariance:
```python
# Covariance matrix
cov_matrix = np.cov(data.T)

# Correlation matrix
corr_matrix = np.corrcoef(data.T)
```

### Selection Guide:
- **Pearson**: Linear, normal data
- **Spearman**: Monotonic, outliers, ordinal
- **Kendall**: Small samples, robust
- **Partial**: Control confounding variables