# Summary Statistics
- Descriptive statistics, Central tendency, Dispersion, Shape
- Real examples: Data analysis, Exploratory data analysis

In [1]:
# Shared imports for every cell below: NumPy supplies arrays, random sampling
# and basic estimators; scipy.stats supplies the remaining statistics.
import numpy as np
from scipy import stats
# NOTE(review): plt is not referenced by any cell visible in this notebook.
import matplotlib.pyplot as plt
# Confirmation message so a fresh-kernel run shows that the imports succeeded.
print('Summary statistics module loaded')

Summary statistics module loaded


## Measures of Central Tendency

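- **Mean**: Arithmetic average of all values
- **Median**: Middle value (50th percentile)
- **Mode**: Most frequent value (mainly informative for discrete or categorical data)
- **Trimmed mean**: Mean after discarding a fixed fraction of the smallest and largest values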

In [2]:
# Reproducible sample: 1000 draws from a gamma distribution (shape=2, scale=2).
np.random.seed(42)
data = np.random.gamma(2, 2, 1000)

# Location estimates, all computed up front so the report below is one block.
mean_val = data.mean()
median_val = np.median(data)
# proportiontocut=0.1 trims 10% of the observations from each tail.
trimmed_mean = stats.trim_mean(data, proportiontocut=0.1)
# Geometric and harmonic means.
gmean = stats.gmean(data)
hmean = stats.hmean(data)

# NOTE(review): mode_result is neither printed here nor used by any visible
# cell; it is kept so that the notebook namespace stays unchanged.
mode_result = stats.mode(data, keepdims=False)

# A single print call; sep='\n' reproduces the original line-by-line output.
print(
    'Central Tendency Measures\n',
    f'Mean: {mean_val:.4f}',
    f'Median: {median_val:.4f}',
    f'Trimmed mean (10%): {trimmed_mean:.4f}\n',
    f'Geometric mean: {gmean:.4f}',
    f'Harmonic mean: {hmean:.4f}',
    '\nRelation: HM ≤ GM ≤ AM',
    sep='\n',
)

Central Tendency Measures

Mean: 4.1199
Median: 3.4581
Trimmed mean (10%): 3.7705

Geometric mean: 3.2025
Harmonic mean: 2.2250

Relation: HM ≤ GM ≤ AM


## Measures of Dispersion

- **Variance**: Average squared deviation from the mean
- **Standard deviation**: √Variance (same units as the data)
- **Range**: Max - Min
- **IQR**: Interquartile range (Q3 - Q1)
- **MAD**: Median absolute deviation from the median

In [3]:
# Spread of the sample `data` created in the central-tendency cell.
# ddof=1 requests the sample (n - 1) form of variance and standard deviation.
var_val = data.var(ddof=1)
std_val = data.std(ddof=1)
range_val = data.max() - data.min()
iqr_val = stats.iqr(data)
mad_val = stats.median_abs_deviation(data)

# Coefficient of variation: standard deviation as a percentage of the mean
# (mean_val also comes from the central-tendency cell).
cv = std_val / mean_val * 100

# A single print call; sep='\n' reproduces the original line-by-line output.
print(
    'Dispersion Measures\n',
    f'Standard deviation: {std_val:.4f}',
    f'Variance: {var_val:.4f}',
    f'Range: {range_val:.4f}',
    f'IQR (Q3-Q1): {iqr_val:.4f}',
    f'MAD: {mad_val:.4f}\n',
    f'Coefficient of variation: {cv:.2f}%',
    sep='\n',
)

Dispersion Measures

Standard deviation: 2.8057
Variance: 7.8720
Range: 15.4819
IQR (Q3-Q1): 3.4319
MAD: 1.5951

Coefficient of variation: 68.10%


## Measures of Shape

**Skewness**: Asymmetry
- Negative: Left tail
- Zero: Symmetric
- Positive: Right tail

**Kurtosis**: Tailedness
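- Excess kurtosis: Kurtosis - 3 (zero for a normal distribution)
- Positive excess: Heavier tails than a normal distribution
- Negative excess: Lighter tails than a normal distribution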

In [4]:
# Shape statistics for the sample `data` created in the central-tendency cell.
# The kurtosis value is reported as excess kurtosis, as the label below states.
skew_val = stats.skew(data)
kurt_val = stats.kurtosis(data)

print('Shape Measures\n')

# Direction of the asymmetry, chosen by the sign of the skewness.
print(f'Skewness: {skew_val:.4f}')
print(
    '  Right-skewed (positive skew)' if skew_val > 0
    else '  Left-skewed (negative skew)' if skew_val < 0
    else '  Symmetric'
)

# Tail weight, chosen by the sign of the excess kurtosis.
print(f'\nKurtosis (excess): {kurt_val:.4f}')
print(
    '  Heavy tails (leptokurtic)' if kurt_val > 0
    else '  Light tails (platykurtic)' if kurt_val < 0
    else '  Normal tails (mesokurtic)'
)

Shape Measures

Skewness: 1.1963
  Right-skewed (positive skew)

Kurtosis (excess): 1.4302
  Heavy tails (leptokurtic)


## Percentiles and Quantiles

- **Quartiles**: 25th, 50th, 75th percentiles
- **Deciles**: 10th, 20th, ..., 90th percentiles
- **Percentiles**: Value below which a given percentage of the data falls

In [5]:
# Percentile report for the sample `data` created in the central-tendency cell.
print('Percentiles\n')

# Quartiles: one call returns Q1, Q2 (the median) and Q3 together.
q1, q2, q3 = np.percentile(data, [25, 50, 75])
print('Quartiles:')
print(f'  Q1 (25th): {q1:.4f}')
print(f'  Q2 (50th): {q2:.4f}')
print(f'  Q3 (75th): {q3:.4f}\n')

# Deciles: request all nine cut points in a single vectorized call, matching
# the quartile computation above, instead of calling np.percentile once per
# loop iteration.
decile_points = list(range(10, 100, 10))
decile_values = np.percentile(data, decile_points)
print('Deciles:')
for p, val in zip(decile_points, decile_values):
    print(f'  {p}th: {val:.4f}')

Percentiles

Quartiles:
  Q1 (25th): 2.0524
  Q2 (50th): 3.4581
  Q3 (75th): 5.4843

Deciles:
  10th: 1.1670
  20th: 1.8055
  30th: 2.2766
  40th: 2.8820
  50th: 3.4581
  60th: 4.1227
  70th: 4.9247
  80th: 6.1684
  90th: 8.2082


## Real Example: Employee Salary Analysis

**Dataset**: Company salaries
**Task**: Comprehensive statistical summary

In [6]:
# Synthetic salary data (in thousands): a reproducible mix of three normally
# distributed groups, 80 + 15 + 5 = 100 values in total.
np.random.seed(42)
salaries = np.concatenate([
    np.random.normal(50, 10, 80),  # Junior
    np.random.normal(80, 15, 15),  # Mid
    np.random.normal(120, 25, 5)   # Senior
])

print('Employee Salary Analysis\n')
print(f'Sample size: {len(salaries)} employees\n')

# Central tendency; the trimmed mean cuts 10% of the values from each tail.
print('Central Tendency:')
print(f'  Mean salary: ${np.mean(salaries):.2f}k')
print(f'  Median salary: ${np.median(salaries):.2f}k')
print(f'  Trimmed mean (10%): ${stats.trim_mean(salaries, 0.1):.2f}k\n')

# Dispersion; ddof=1 requests the sample (n - 1) standard deviation.
print('Dispersion:')
print(f'  Std Dev: ${np.std(salaries, ddof=1):.2f}k')
print(f'  Range: ${np.ptp(salaries):.2f}k')
print(f'  IQR: ${stats.iqr(salaries):.2f}k\n')

# Shape; stats.kurtosis is called with its defaults, the same call that the
# shape cell labels as excess kurtosis.
print('Distribution Shape:')
print(f'  Skewness: {stats.skew(salaries):.4f}')
print(f'  Kurtosis: {stats.kurtosis(salaries):.4f}\n')

# Percentiles: fetch every requested cut point with one vectorized call
# instead of calling np.percentile once per loop iteration.
salary_percentiles = [10, 25, 50, 75, 90, 95]
print('Salary Percentiles:')
for p, val in zip(salary_percentiles, np.percentile(salaries, salary_percentiles)):
    print(f'  {p}th: ${val:.2f}k')

# NOTE: this interpretation is a fixed message; it is not derived from the
# skewness value computed above.
print('\nInterpretation: Positive skew indicates few high earners')

Employee Salary Analysis

Sample size: 100 employees

Central Tendency:
  Mean salary: $56.82k
  Median salary: $52.03k
  Trimmed mean (10%): $54.02k

Dispersion:
  Std Dev: $20.18k
  Range: $103.60k
  IQR: $19.75k

Distribution Shape:
  Skewness: 1.5349
  Kurtosis: 2.6811

Salary Percentiles:
  10th: $37.68k
  25th: $45.05k
  50th: $52.03k
  75th: $64.80k
  90th: $83.56k
  95th: $94.91k

Interpretation: Positive skew indicates few high earners


## Describe Function

**scipy.stats.describe()**: All-in-one summary returning the count, min/max, mean, variance, skewness, and kurtosis

In [7]:
# stats.describe returns one result object; the fields read below are nobs,
# minmax (a (min, max) pair), mean, variance, skewness and kurtosis.
summary = stats.describe(salaries)

# A single print call; sep='\n' reproduces the original line-by-line output.
print(
    'Comprehensive Summary (describe)\n',
    f'Count: {summary.nobs}',
    f'Min, Max: ({summary.minmax[0]:.2f}, {summary.minmax[1]:.2f})',
    f'Mean: {summary.mean:.2f}',
    f'Variance: {summary.variance:.2f}',
    f'Skewness: {summary.skewness:.4f}',
    f'Kurtosis: {summary.kurtosis:.4f}',
    sep='\n',
)

Comprehensive Summary (describe)

Count: 100
Min, Max: (23.80, 127.40)
Mean: 56.82
Variance: 407.17
Skewness: 1.5349
Kurtosis: 2.6811


## Summary

### Central Tendency:
```python
mean = np.mean(data)
median = np.median(data)
trimmed_mean = stats.trim_mean(data, 0.1)
gmean = stats.gmean(data)
hmean = stats.hmean(data)
```

### Dispersion:
```python
std = np.std(data, ddof=1)
var = np.var(data, ddof=1)
range_val = np.ptp(data)
iqr = stats.iqr(data)
mad = stats.median_abs_deviation(data)
```

### Shape:
```python
skewness = stats.skew(data)
kurtosis = stats.kurtosis(data)
```

### Comprehensive:
```python
summary = stats.describe(data)
percentiles = np.percentile(data, [25, 50, 75])
```