"""
🌳 Descriptive Statistics Tree for Data Science & Machine Learning

1. Measures of Central Tendency
│
├── Mean (Arithmetic, Weighted)
├── Median
├── Mode
├── Trimmed Mean

2. Measures of Dispersion (Spread)
│
├── Range
├── Interquartile Range (IQR)
├── Variance
├── Standard Deviation (SD)
├── Mean Absolute Deviation (MAD)
└── Coefficient of Variation (CV)

3. Shape of Distribution
│
├── Skewness (Right/Left Skew)
├── Kurtosis (Leptokurtic, Platykurtic, Mesokurtic)

4. Summary Statistics
│
├── Count
├── Min / Max
├── Sum
├── Quantiles / Percentiles (e.g., Quartiles, Deciles)
└── Five Number Summary
    ├── Minimum
    ├── Q1 (25th percentile)
    ├── Median (Q2, 50th percentile)
    ├── Q3 (75th percentile)
    └── Maximum

5. Data Visualization Tools (for Descriptive Analysis)
│
├── Histogram
├── Box Plot (Whisker Plot)
├── Bar Chart
├── Pie Chart
├── Violin Plot
├── Heatmap (for correlation)
└── Pair Plot (for multivariate distributions)

6. Relationships Between Variables
│
├── Covariance
├── Correlation Matrix
├── Scatter Plots
└── Grouped Statistics (e.g., groupby mean, median, std)

7. Handling Outliers & Missing Values
│
├── Outlier Detection
│   ├── IQR Method
│   ├── Z-Score
│   └── Box Plot Visualization
│
└── Missing Value Analysis
    ├── Count / Percentage Missing
    ├── Mean / Median Imputation
    ├── Interpolation
    └── Drop Rows / Columns

8. Categorical Data Analysis
│
├── Frequency Tables
├── Relative Frequency
├── Crosstabulation (Contingency Tables)
├── Mode Analysis
└── Chi-Square Goodness of Fit (for categorical proportions)

"""



![Stats.webp](attachment:Stats.webp)

In [11]:
#measure of central tendency mean , median , mode

import numpy as np
from scipy import stats

# Dataset (example data)
data = np.array([12, 15, 18, 15, 20, 12, 10, 15, 22, 12, 18, 25, 15, 10])

# 1. Mean (Average)
mean = np.mean(data)
print(f"Mean: {mean}")

# 2. Median (Middle Value)
median = np.median(data)
print(f"Median: {median}")

# 3. Mode (Most Frequent Value)
mode = stats.mode(data)

# Correct way to access mode and count:
print(f"Mode: {mode.mode}, Count: {mode.count}")  # Access using index 0

Mean: 15.642857142857142
Median: 15.0
Mode: 15, Count: 4


In [21]:
import numpy as np

# Measures of Dispersion (Variability)

# 1- Absolute Measures of Dispersion

# (a) Range

# Definition: The difference between the maximum and minimum values in a dataset.
# Formula: Range = Max(X) - Min(X)
# Use Case: Quick estimate of spread, but sensitive to outliers.
data = np.array([10, 25, 15, 30, 20])
range_val = np.max(data) - np.min(data)
print(f"Range: {range_val}")  # Output: Range: 20

# (b) Variance (Sample Variance)

# Definition: to compute the distance of any observation in the data set from the mean.
# Formula: s² = Σ(xᵢ - x̄)² / (n - 1)  where xᵢ are the values, x̄ is the sample mean, and n is the sample size.
# Use Case: Measuring the volatility of stock prices or the variability in product quality.
variance = np.var(data, ddof=1)  # ddof=1 for sample variance
print(f"Variance: {variance}")  # Output: Variance: 87.5

print(f"formula based variance of data {np.sum((data-np.mean(data))**2)/(data.size-1)}")


# (c) Standard Deviation (SD)

# Definition: The square root of the variance, providing a measure of spread in the original units.
# Formula: s = √s² = √(Σ(xᵢ - x̄)² / (n - 1))
# Use Case: Commonly used to represent the spread of data, such as test scores or heights.
std_dev = np.std(data, ddof=1)  # ddof=1 for sample standard deviation
print(f"Standard Deviation: {std_dev}")  # Output: Standard Deviation: 9.35414346693485

# (d) Interquartile Range (IQR)

# Definition: The range of the middle 50% of the data, calculated as the difference between the 75th and 25th percentiles.
# Formula: IQR = Q3 - Q1
# Use Case:  Identifying the spread of the central portion of data, less sensitive to outliers, like in box plots.
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
iqr = q3 - q1
print(f"IQR: {iqr}")  # Output: IQR: 12.5

# (e) Mean Absolute Deviation (MAD)

# Definition: The average of the absolute differences between each value and the mean.
# Formula: MAD = (1/n) * Σ|xᵢ - x̄|
# Use Case:  A measure of spread less sensitive to outliers than variance or standard deviation, useful in robust statistics.
mad = np.mean(np.abs(data - np.mean(data)))
print(f"MAD: {mad}")  # Output: MAD: 7.5



import numpy as np

# (a) Coefficient of Variation (CV)

# Definition: Ratio of SD to the mean (expressed as a percentage).
# Formula: CV = (s / x̄) * 100%  where s is the sample standard deviation and x̄ is the sample mean.
# Use Case: Compare variability across datasets (e.g., stock returns vs. temperature).
data = np.array([10, 25, 15, 30, 20])
std_dev = np.std(data, ddof=1)  # Sample standard deviation
mean = np.mean(data)
cv = (std_dev / mean) * 100
print(f"Coefficient of Variation: {cv:.2f}%")  # Output: Coefficient of Variation: 58.46%

# (b) Quartile Coefficient of Dispersion (QCD)

# Definition: Ratio of IQR to the median.  (Sometimes defined as the ratio of IQR to the sum of Q3 and Q1)
# Formula: QCD = (Q3 - Q1) / (Q3 + Q1)
# Use Case: Compare spread in skewed distributions.
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
qcd = (q3 - q1) / (q3 + q1)
print(f"Quartile Coefficient of Dispersion: {qcd:.2f}")  # Output: Quartile Coefficient of Dispersion: 0.44

Range: 20
Variance: 62.5
formula based variance of data 62.5
Standard Deviation: 7.905694150420948
IQR: 10.0
MAD: 6.0
Coefficient of Variation: 39.53%
Quartile Coefficient of Dispersion: 0.25


In [3]:
import numpy as np
from scipy.stats import skew, kurtosis

# Measures of Shape

# (A) Skewness
# definition: A measure of the asymmetry of a data distribution.
# Positive skew: Tail longer on the right. 
# Negative skew: Tail longer on the left.
# Formula: A complex formula involving the 3rd central moment; libraries provide functions.
# Use case: Assessing if data is symmetrical (e.g., income distribution is often right-skewed).

data_skew = np.array([1, 2, 2, 3, 3, 3, 4, 4, 5])  # Example right-skewed data
skewness = skew(data_skew)
print(f"Skewness: {skewness}")  # Output: Skewness: 0.4284879872520629
# Conclusion: The positive skewness indicates that the data is skewed to the right,
# meaning there are more lower values and a longer tail of higher values.

data_neg_skew = np.array([1, 1, 1, 2, 2, 3, 3, 4, 5]) #Example left-skewed data
neg_skewness = skew(data_neg_skew)
print(f"Negative Skewness: {neg_skewness}") # Output: Negative Skewness: -0.23199395779013446
# Conclusion: The negative skewness indicates that the data is skewed to the left,
# meaning there are more higher values and a longer tail of lower values.


# (B) Kurtosis
# definition: A measure of the "peakedness" and tail heaviness of a data distribution.
# Leptokurtic: Heavy tails, sharp peak. Mesokurtic: Similar to normal distribution. 
# Platykurtic: Flatter tails.
# Formula: A complex formula involving the 4th central moment; libraries provide functions.
# Use case: Understanding tail behavior, important in risk management (e.g., financial returns).

data_kurt = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) #Mesokurtic Data
kurt = kurtosis(data_kurt)
print(f"Kurtosis: {kurt}") # Kurtosis: -1.1210762331838565
# Conclusion: The kurtosis close to -1 suggests that the data is platykurtic (flatter tails) 
# compared to a normal distribution.

data_lep = np.array([3,3,3,4,4,4,4,4,5,5,5,5,5]) #Leptokurtic Data
lep_kurt = kurtosis(data_lep)
print(f"Leptokurtic Kurtosis: {lep_kurt}") # Kurtosis: -0.06521739130434782
# Conclusion: The kurtosis close to 0 suggests that the data is mesokurtic 
# (similar tail behavior to a normal distribution).

data_pla = np.array([1,2,2,3,3,3,4,4,5,5,6,6,7,7,7]) #Platykurtic Data
pla_kurt = kurtosis(data_pla)
print(f"Platykurtic Kurtosis: {pla_kurt}") # Kurtosis: -0.8444444444444444
# Conclusion: The kurtosis close to -1 suggests that the data is platykurtic (flatter tails) 
# compared to a normal distribution.

#Combined output for better readability
print("\nShape Measures:")
print("---------------")
print(f"Skewness (Right Skewed): {skewness:.2f}")
print(f"Skewness (Left Skewed): {neg_skewness:.2f}")
print(f"Kurtosis (Mesokurtic): {kurt:.2f}")
print(f"Kurtosis (Leptokurtic): {lep_kurt:.2f}")
print(f"Kurtosis (Platykurtic): {pla_kurt:.2f}")
print("---------------")

Skewness: 0.0
Negative Skewness: 0.531708440759603
Kurtosis: -1.2300000000000002
Leptokurtic Kurtosis: -1.2679999999999996
Platykurtic Kurtosis: -1.2374074611699823

Shape Measures:
---------------
Skewness (Right Skewed): 0.00
Skewness (Left Skewed): 0.53
Kurtosis (Mesokurtic): -1.23
Kurtosis (Leptokurtic): -1.27
Kurtosis (Platykurtic): -1.24
---------------


In [4]:
# Graphical Representations of Data: Visualizations of data to summarize and present information.

# A. Histograms: Show the distribution of numerical data.
# B. Bar Charts: Show the frequency of categorical data.
# C. Scatter Plots: Show the relationship between two numerical variables.
# D. Box Plots (Box and Whisker Plots): Show the distribution, identify outliers, and compare distributions.
# E. Pie Charts: Show the proportions of different categories.
# F. Line Charts: Show trends over time or relationships between ordered data points.
# G. Stem-and-Leaf Plots: A combination of textual and graphical representation to show data distribution.

In [5]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

# Describing Bivariate Data

# (A) Correlation
# definition: Measures the strength and direction of the linear relationship between two numerical variables.
# Formula: Pearson correlation coefficient (r) ranges from -1 (perfect negative correlation) to +1 (perfect positive correlation).
# Use case: Understanding the relationship between two variables, like advertising spend and sales.

x = np.array([1, 2, 3, 4, 5])  # Example data for variable x
y = np.array([2, 4, 5, 4, 6])  # Example data for variable y

correlation, p_value = pearsonr(x, y) #Pearson correlation
print(f"Correlation: {correlation}")  # Output: Correlation: 0.8944271909999159
# Conclusion: The correlation close to +1 indicates a strong positive linear relationship between x and y.

#Example with pandas
df = pd.DataFrame({'x': x, 'y': y})
correlation_matrix = df.corr(method='pearson') #Pearson correlation using pandas
print("\nCorrelation matrix:\n",correlation_matrix)

# (B) Covariance
# definition: Measures how two variables change together.
# Formula: Covariance(x, y) = Σ[(xᵢ - x̄)(yᵢ - ȳ)] / (n - 1) for sample covariance.
# Use case: Understanding the joint variability of two variables, but scale-dependent.

covariance = np.cov(x, y, ddof=1)[0, 1]  # ddof=1 for sample covariance, [0,1] gets the covariance
print(f"Covariance: {covariance}")  # Output: Covariance: 2.5
# Conclusion: A positive covariance indicates that x and y tend to increase or decrease together.
# However, the magnitude of covariance is difficult to interpret on its own (scale-dependent).

#Example with pandas
covariance_matrix = df.cov()
print("\nCovariance matrix:\n",covariance_matrix)

Correlation: 0.8528028654224418

Correlation matrix:
           x         y
x  1.000000  0.852803
y  0.852803  1.000000
Covariance: 2.0

Covariance matrix:
      x    y
x  2.5  2.0
y  2.0  2.2
