# Statistical Distribution Infrastructure
- Custom distributions, Distribution fitting, Method of moments
- Real examples: Custom probability models, Parameter estimation

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
print('Distribution infrastructure module loaded')

## Distribution Class Structure

**Base classes**:
- `rv_continuous`: Continuous distributions
- `rv_discrete`: Discrete distributions

**Key methods**: pdf (pmf for discrete distributions), cdf, ppf, rvs, stats, and fit (continuous distributions only)

In [None]:
# Tour of the methods a frozen scipy.stats distribution exposes,
# illustrated with a normal distribution frozen at loc=10, scale=2.
print('Built-in Distribution Methods\n')

# Normal distribution
norm = stats.norm(loc=10, scale=2)

print('scipy.stats.norm methods:')
# One entry per printed line; the last one carries the trailing blank line.
method_summary = (
    '  pdf(x): Probability density',
    '  cdf(x): Cumulative distribution',
    '  ppf(q): Quantile (inverse CDF)',
    '  rvs(n): Random samples',
    '  fit(data): Parameter estimation',
    '  stats(): Mean, variance, skew, kurtosis\n',
)
print('\n'.join(method_summary))

# Demonstrate: evaluate density and cumulative probability at one point
x = 12
density_at_x = norm.pdf(x)
prob_up_to_x = norm.cdf(x)
print(f'At x={x}:')
print(f'  pdf({x}) = {density_at_x:.6f}')
print(f'  cdf({x}) = {prob_up_to_x:.6f}')
print(f'  P(X ≤ {x}) = {prob_up_to_x*100:.2f}%\n')

# moments='mv' requests the mean and the variance only
mean, var = norm.stats(moments='mv')
print('Distribution statistics:')
print(f'  Mean: {mean:.2f}')
print(f'  Variance: {var:.2f}')

## Creating Custom Distributions

**Method**: Subclass `rv_continuous` or `rv_discrete`
**Override**: `_pdf`, `_cdf`, `_ppf`, etc.

In [None]:
print('\nCustom Distribution Example\n')

class triangular_gen(stats.rv_continuous):
    """Symmetric triangular distribution with its peak at x = 0.5.

    The formulas below describe the distribution on [0, 1]. The support is
    not set inside the class, so pass ``a=0, b=1`` when instantiating.
    """

    def _pdf(self, x):
        # Density rises linearly as 4x up to the peak, then falls as 4(1 - x).
        rising = 4*x
        falling = 4*(1-x)
        return np.where(x < 0.5, rising, falling)

    def _cdf(self, x):
        # Integral of the density: 2x^2 left of the peak,
        # 1 - 2(1 - x)^2 from the peak onwards.
        left_of_peak = 2*x**2
        right_of_peak = 1 - 2*(1-x)**2
        return np.where(x < 0.5, left_of_peak, right_of_peak)

    def _ppf(self, q):
        # Closed-form inverse of _cdf, branch chosen by the quantile level.
        lower_half = np.sqrt(q/2)
        upper_half = 1 - np.sqrt((1-q)/2)
        return np.where(q < 0.5, lower_half, upper_half)

# Create instance; a and b set the support to [0, 1]
triangular = triangular_gen(a=0, b=1, name='triangular')

print('Custom triangular distribution [0, 1]')
print('  Peak at x = 0.5\n')

# Use like any scipy distribution.
# random_state pins the draw so the printed statistics are reproducible
# from run to run; it seeds this call only and leaves NumPy's global RNG
# state untouched.
samples = triangular.rvs(size=1000, random_state=42)
print(f'Generated {len(samples)} samples')
print(f'  Mean: {samples.mean():.4f} (expected: 0.5)')
print(f'  Std: {samples.std():.4f}')

print('\n✓ Custom distribution works like built-in!')

## Distribution Fitting

**Methods**:
1. Maximum Likelihood Estimation (MLE)
2. Method of Moments (MOM)
3. Least Squares

In [None]:
# Fit a gamma distribution to synthetic data by maximum likelihood and
# check the fit with a Kolmogorov-Smirnov test.
print('\nDistribution Fitting\n')

np.random.seed(42)
# Generate data from gamma(2, scale=2)
true_a, true_scale = 2.0, 2.0
data = np.random.gamma(true_a, true_scale, 1000)

print(f'Data: {len(data)} samples from Gamma({true_a}, {true_scale})\n')

# Fit gamma distribution (MLE); floc=0 holds the location fixed at 0 so
# only shape and scale are estimated.
fit_a, fit_loc, fit_scale = stats.gamma.fit(data, floc=0)

print('Maximum Likelihood Fit:')
print(f'  Shape (a): {fit_a:.4f} (true: {true_a})')
print(f'  Scale: {fit_scale:.4f} (true: {true_scale})')
print(f'  Location: {fit_loc:.4f} (fixed at 0)\n')

# Goodness of fit
# NOTE: the parameters were estimated from the same sample being tested,
# so this p-value is optimistic; treat it as a rough check only.
ks_stat, p_value = stats.kstest(data, 'gamma', args=(fit_a, fit_loc, fit_scale))
print('Kolmogorov-Smirnov test:')
print(f'  Statistic: {ks_stat:.4f}')
print(f'  p-value: {p_value:.4f}')
if p_value > 0.05:
    print('  Good fit! (p > 0.05)')
else:
    # Report the negative outcome too instead of printing nothing.
    print('  Poor fit (p <= 0.05)')

## Real Example: Lifetime Data Analysis

**Problem**: Fit component lifetime distribution
**Data**: Time to failure
**Models**: Weibull, Exponential, Lognormal

In [None]:
# Fit several candidate lifetime models to simulated failure times and
# keep the one with the smallest Kolmogorov-Smirnov statistic.
print('\nComponent Lifetime Analysis\n')

np.random.seed(42)
# Simulate lifetime data (Weibull)
lifetime = stats.weibull_min.rvs(c=2.5, scale=1000, size=100)

shortest, longest = lifetime.min(), lifetime.max()
print(f'Lifetime data: {len(lifetime)} components')
print(f'  Range: {shortest:.0f} to {longest:.0f} hours')
print(f'  Mean: {lifetime.mean():.0f} hours\n')

# Try multiple distributions
distributions = [
    ('Weibull', stats.weibull_min),
    ('Exponential', stats.expon),
    ('Lognormal', stats.lognorm)
]

print('Fitting multiple distributions:\n')
best_dist = None
best_ks = np.inf

for name, dist in distributions:
    # No parameter is held fixed: shape(s), loc and scale are all estimated.
    params = dist.fit(lifetime)
    ks_stat, p_value = stats.kstest(lifetime, dist.cdf, args=params)

    print(
        f'{name}:\n'
        f'  KS statistic: {ks_stat:.4f}\n'
        f'  p-value: {p_value:.4f}'
    )

    # Only a strictly smaller statistic replaces the current best, so the
    # earliest candidate wins ties.
    if not ks_stat < best_ks:
        continue
    best_ks, best_dist = ks_stat, name

print(f'\nBest fit: {best_dist}')
print('Use for reliability prediction!')

## Method of Moments

**Concept**: Match sample moments to theoretical moments
**Advantage**: Fast, simple
**Use**: Initial estimates for MLE

In [None]:
# Method-of-moments estimate of the gamma parameters for `data`, printed
# next to the maximum-likelihood estimates computed earlier.
print('\nMethod of Moments Estimation\n')

# Gamma distribution: E[X] = a*θ, Var[X] = a*θ²
# Dividing the two moments gives θ = Var[X] / E[X], then a = E[X] / θ.
data_mean = np.mean(data)
data_var = np.var(data)  # default ddof=0, i.e. the plain second central moment

# Solve for a and θ
theta_mom = data_var / data_mean
a_mom = data_mean / theta_mom

print('Method of Moments (Gamma):')
print(f'  Shape (a): {a_mom:.4f}')
print(f'  Scale (θ): {theta_mom:.4f}\n')

print('Compare to MLE:')
print(f'  MLE shape: {fit_a:.4f}')
print(f'  MLE scale: {fit_scale:.4f}\n')

for closing_note in ('MoM: Fast but less efficient than MLE',
                     'Good for initial guesses'):
    print(closing_note)

## Mixture Distributions

**Concept**: The density is a weighted sum of component densities (weights sum to 1)
**Use**: Multi-modal data, population subgroups

In [None]:
# Build a two-component Gaussian mixture sample and print its summary
# statistics.
print('\nMixture Distribution\n')

np.random.seed(42)
# Two-component Gaussian mixture
n1, n2 = 300, 200
comp1 = np.random.normal(loc=10, scale=2, size=n1)
comp2 = np.random.normal(loc=20, scale=3, size=n2)
mixture_data = np.concatenate((comp1, comp2))

print('Gaussian mixture:')
print(f'  Component 1: N(10, 2²), n={n1}')
print(f'  Component 2: N(20, 3²), n={n2}')
print(f'  Total samples: {len(mixture_data)}\n')

# A single distribution cannot be fit to a mixture directly.
# Estimating the components needs the EM algorithm (not in scipy, use sklearn).
overall_mean = mixture_data.mean()
overall_std = mixture_data.std()
print('Mixture characteristics:')
print(f'  Mean: {overall_mean:.2f}')
print(f'  Std: {overall_std:.2f}')
print('  Bimodal distribution\n')

print('For EM algorithm: use sklearn.mixture.GaussianMixture')

## Summary

### Using Distributions:
```python
# Create distribution
dist = stats.norm(loc=mu, scale=sigma)

# Methods
pdf_val = dist.pdf(x)
cdf_val = dist.cdf(x)
quantile = dist.ppf(q)
samples = dist.rvs(size=n)
```

### Fitting:
```python
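# Maximum likelihood (call fit on the distribution family, e.g. stats.gamma,
# not on a frozen instance such as stats.norm(loc=mu, scale=sigma))
params = stats.gamma.fit(data)

# With fixed parameters
params = stats.gamma.fit(data, floc=0)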
```

### Custom Distributions:
```python
class my_dist_gen(stats.rv_continuous):
    def _pdf(self, x):
        return pdf_formula
    
    def _cdf(self, x):
        return cdf_formula

my_dist = my_dist_gen(name='my_dist')
```

### Applications:
- **Reliability engineering**: Lifetime analysis
- **Finance**: Return distributions
- **Quality control**: Process capability
- **Environmental**: Extreme value analysis
- **Insurance**: Claim size modeling