In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

##### Measures of Central Tendency

**Mean(Average)**

In [2]:
data = np.array([10, 20, 30, 40, 50])

mean = np.mean(data)
print(f"Mean : {mean}")

# Formula: μ = (1/n) * Σx_i
manual_mean = np.sum(data) / len(data)
print(f"Manual calculation: {manual_mean}")

# Weighted mean
# Formula: μ = Σ(x_i*w_i) / Σ(w_i)
weights = np.array([0.1, 0.2, 0.3, 0.2, 0.2])
weighted_mean = np.average(data, weights=weights)
print(f"Weighted mean: {weighted_mean}")

Mean : 30.0
Manual calculation: 30.0
Weighted mean: 32.0


**Median**

In [3]:
median = np.median(data)
print(f"Median : {median}")

# For even numbers of values
data_even = np.array([10, 20, 30, 40])
median_even = np.median(data_even)
print(f"Median (even): {median_even}") # (average of 20 and 30)

# Robust to outliers
data_with_outlier = np.array([10, 20, 30, 40, 50, 1000])
print(f"Mean with outlier: {np.mean(data_with_outlier)}")
print(f"Median with outlier: {np.median(data_with_outlier)}")

Median : 30.0
Median (even): 25.0
Mean with outlier: 191.66666666666666
Median with outlier: 35.0


**Mode**

In [17]:
# Mode (most frequent value)
data = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4])
mode_result = stats.mode(data)
print(f"Mode: {mode_result.mode}")
print(f"Count: {mode_result.count}")

# For continuous data, use histogram
continuous_data = np.random.normal(0, 1, 1000)
hist, bins = np.histogram(continuous_data, bins=50)
mode_bin = bins[np.argmax(hist)]
print(f"Mode (approximate): {mode_bin}")

Mode: 4
Count: 5
Mode (approximate): -0.36017798744868035


##### Measures of Dispersion

**Variance and Standard Deviation**

In [29]:
data = np.array([10, 20, 30, 40, 50])

variance = np.var(data)
print(f"Variance: {variance}")

# Formula: σ² = (1/n) * Σ(x_i - μ)²
mean = np.mean(data)
manual_variance = np.mean((data - mean) ** 2)
print(f"Manual variance: {manual_variance}")

# Sample variance (Bessel's correction)
sample_variance = np.var(data, ddof=1) # # Divide by (n-1) instead of n
print(f"Sample variance: {sample_variance}")


# Standard deviation
std_dev = np.std(data)
print(f"Standard deviation: {std_dev}")

# Standard deviation = √variance
print(f"√variance = {np.sqrt(variance)}")

Variance: 200.0
Manual variance: 200.0
Sample variance: 250.0
Standard deviation: 14.142135623730951
√variance = 14.142135623730951


**Range and Quartiles**

In [None]:
data = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 1000])

# Range
data_range = np.max(data) - np.min(data)
print(f"Range: {data_range}")

# Quartiles
q1 = np.percentile(data, 25)
q2 = np.percentile(data, 50)  # Median
q3 = np.percentile(data, 75) 
print(f"Q1: {q1}, Q2: {q2}, Q3: {q3}")

# Interquartile Range (IQR)
iqr = q3 - q1
print(f"IQR: {iqr}")


# Outlier detection using IQR
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = data[(data < lower_bound) | (data > upper_bound)]
print(f"Outliers: {outliers}")

Range: 990
Q1: 32.5, Q2: 55.0, Q3: 77.5
IQR: 45.0
Outliers: [1000]


##### Correlation and Covariance

**Covariance**

In [None]:
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 6, 8, 10])

# Covariance
covariance = np.cov(x, y)[0, 1]
print(f"Covariance : {covariance}")

# Manual Covariance
mean_x = np.mean(x)
mean_y = np.mean(y)
manual_cov = np.mean((x - mean_x) * (y - mean_y))
print(f"Manual covariance: {manual_cov}") # 4.0 (population), 5.0 (sample)

Covariance : 5.0
Manual covariance: 4.0


**Correlation**

In [38]:
# Pearson correlation coefficient
correlation = np.corrcoef(x, y)[0, 1]
print(f"Correlation : {correlation}")


# Formula: r = Cov(X,Y) / (σ_X * σ_Y)
std_x = np.std(x, ddof=1)
std_y = np.std(y, ddof=1)
manual_corr = covariance / (std_x * std_y)
print(f"Manual correlation: {manual_corr}")


# Correlation matrix
data = np.array([x, y])
corr_matrix = np.corrcoef(data)
print(f"Correlation matrix:\n{corr_matrix}")

Correlation : 0.9999999999999999
Manual correlation: 0.9999999999999998
Correlation matrix:
[[1. 1.]
 [1. 1.]]
