In [2]:
import numpy as np
from scipy.stats import norm, ttest_ind, chisquare, kstest, entropy, pearsonr, spearmanr, mannwhitneyu, f_oneway

# ================================================================
# 1. Normal Distribution (norm)
# ================================================================
print("\n1. Normal Distribution:")

# Parameters of the standard normal and an evenly spaced grid on [-3, 3].
mean = 0
std_dev = 1
x = np.linspace(-3, 3, 100)

# Evaluate the density and the cumulative probability at every grid point.
pdf = norm.pdf(x, loc=mean, scale=std_dev)
cdf = norm.cdf(x, loc=mean, scale=std_dev)

print(
    f"\nPDF values (example first 5): {pdf[:5]}...\n"
    f"CDF values (example first 5): {cdf[:5]}...\n"
)
# Output from a run of this cell (grid step is 6/99, so the second point is
# about -2.939, not -2.9):
# PDF values (example first 5): [0.00443185 0.00530579 0.00632878 0.00752133 0.00890582]...
# CDF values (example first 5): [0.0013499  0.00164427 0.00199603 0.00241482 0.00291159]...


print("\nConclusion (norm):")
print(
    "norm provides functions for working with the normal (Gaussian) distribution. "
    "The Probability Density Function (PDF) `norm.pdf(x)` gives the probability density at each x value.  "
    "Higher values indicate a higher probability of observing values near x. "
    "The Cumulative Distribution Function (CDF) `norm.cdf(x)` gives the probability of a value being less than or equal to x.  "
    "For example, `norm.cdf(0)` for a standard normal distribution (mean=0, std=1) will be approximately 0.5, because 50% of the data lies below 0. "
    "These functions are fundamental for statistical modeling and hypothesis testing."
)

# Use Case: modeling real-world quantities (height, weight, test scores),
# hypothesis testing, confidence intervals.


# ================================================================
# 2. Independent Samples t-test (ttest_ind)
# ================================================================
print("\n2. Independent Samples t-test:")

# Fixed seed so both samples (and therefore the test result) are repeatable.
np.random.seed(0)
group1 = np.random.normal(loc=10, scale=2, size=50)  # population mean 10, std dev 2
group2 = np.random.normal(loc=12, scale=2, size=50)  # population mean 12, std dev 2

# Student's t-test with pooled variance (equal_var=True is the SciPy default).
t_statistic, p_value = ttest_ind(group1, group2, equal_var=True)

print(
    f"\nT-statistic: {t_statistic}\n"
    f"P-value: {p_value}\n"
)
# Output from a run of this cell:
# T-statistic: -4.1311732760688
# P-value: 7.604048369144464e-05


print("\nConclusion (ttest_ind):")
print(
    "ttest_ind performs an independent samples t-test. "
    "It tests if two independent groups have significantly different means. "
    "The T-statistic measures the difference between the means in terms of standard error. "
    "The p-value (here very small) is the probability of observing such a difference (or larger) if there were *no* real difference between the population means.  "
    "A small p-value (typically < 0.05) leads us to reject the null hypothesis and conclude that the means are likely different. "
    "Here, we can conclude group 2 mean is significantly different from group 1 mean."
)

# Use Case: comparing the effectiveness of two treatments, A/B testing,
# comparing the means of two populations.


# ================================================================
# 3. Chi-square Test (chisquare)
# ================================================================
print("\n3. Chi-square Test:")

# Goodness-of-fit set-up: counts seen per category versus the counts the
# null hypothesis predicts. Both lists sum to 100.
observed = [15, 25, 20, 40]  # observed frequencies per category
expected = [20, 20, 20, 40]  # frequencies expected under the null hypothesis
chi2_statistic, p_value = chisquare(f_obs=observed, f_exp=expected)

print(
    f"\nChi-square statistic: {chi2_statistic}\n"
    f"P-value: {p_value}\n"
)
# Output:
# Chi-square statistic: 2.5    ((15-20)^2/20 + (25-20)^2/20 + 0 + 0)
# P-value: approximately 0.4753 (3 degrees of freedom)


print("\nConclusion (chisquare):")
print(
    "chisquare tests if observed categorical frequencies differ significantly from expected frequencies.  "
    "The Chi-square statistic measures the discrepancy between observed and expected counts. "
    "The p-value is the probability of observing such a discrepancy (or larger) if there were *no* real difference between the observed and expected distributions. "
    "A small p-value suggests that the observed frequencies are significantly different from the expected ones. "
    "Here we fail to reject the null hypothesis, so there is no significant difference between observed and expected frequencies."
)

# Use Case: goodness-of-fit testing and analysis of categorical counts.
# NOTE: testing independence of two variables in a contingency table is done
# with scipy.stats.chi2_contingency, not with chisquare.


# ================================================================
# 4. Kolmogorov-Smirnov Test (kstest)
# ================================================================
print("\n4. Kolmogorov-Smirnov Test:")

# Fixed seed: the sample, and hence the test result, is repeatable.
np.random.seed(0)
data = np.random.normal(loc=0, scale=1, size=100)  # sample under test

# One-sample KS test against the standard normal distribution ("norm" with
# its default parameters).
statistic, p_value = kstest(data, "norm")

print(
    f"\nKS statistic: {statistic}\n"
    f"P-value: {p_value}\n"
)
# Example output.
# NOTE(review): the figures below were not checked against a captured run of
# this cell - re-run and confirm before quoting them.
# KS statistic: 0.07648529729115714
# P-value: 0.5878432328151811


print("\nConclusion (kstest):")
print(
    "kstest tests if a sample comes from a specified distribution.  "
    "The KS statistic measures the maximum difference between the empirical CDF of the sample and the theoretical CDF of the specified distribution.  "
    "The p-value is the probability of observing such a difference (or larger) if the sample *did* come from the specified distribution. "
    "Here we fail to reject the null hypothesis, so data is likely normally distributed."
)

# Use Case: checking whether data is plausibly normal, comparing a sample
# distribution to a theoretical distribution.


# ================================================================
# 5. Entropy (entropy)
# ================================================================
print("\n5. Entropy:")

# Uniform distribution over four outcomes.
probabilities = [0.25, 0.25, 0.25, 0.25]

# base=None is the default and selects the natural logarithm, so the result
# is expressed in nats.
entropy_value = entropy(probabilities, base=None)

print(f"\nEntropy: {entropy_value}\n")
# Output (this is ln(4)):
# Entropy: 1.3862943611198906


print("\nConclusion (entropy):")
print(
    "entropy measures the uncertainty or randomness in a probability distribution.  "
    "Higher entropy means more uncertainty.  "
    "For a discrete distribution, it's maximized when all outcomes are equally likely (as in this example)."
)

# Use Case: information theory, measuring diversity in ecology, feature
# selection in machine learning.


# ================================================================
# 6. Pearson Correlation (pearsonr)
# ================================================================
print("\n6. Pearson Correlation:")

# Fixed seed: x, the noise term and the resulting statistics are repeatable.
np.random.seed(0)
x = np.random.normal(loc=0, scale=1, size=100)
# y depends linearly on x, with Gaussian noise (std dev 0.5) added.
y = 2 * x + np.random.normal(loc=0, scale=0.5, size=100)
correlation, p_value = pearsonr(x, y)

print(
    f"\nPearson correlation coefficient: {correlation}\n"
    f"P-value: {p_value}\n"
)
# Example output.
# NOTE(review): the figures below were not checked against a captured run of
# this cell - re-run and confirm before quoting them.
# Pearson correlation coefficient: 0.9638459422880017
# P-value: 1.834952672740283e-55


print("\nConclusion (pearsonr):")
print(
    "pearsonr calculates the Pearson correlation coefficient, which measures the *linear* relationship between two variables. "
    "A value close to 1 indicates a strong positive linear correlation, -1 a strong negative linear correlation, and 0 little to no linear correlation. "
    "The p-value tests for the significance of the correlation. "
    "A small p-value indicates that the correlation is statistically significant (unlikely to be due to chance)."
)

# Use Case: identifying linear relationships between variables, feature
# selection in machine learning.


# ================================================================
# 7. Spearman Rank Correlation (spearmanr)
# ================================================================
print("\n7. Spearman Rank Correlation:")

# Fixed seed: the draws are repeatable, so the printed values do not change
# from run to run.
np.random.seed(0)
x = np.random.normal(loc=0, scale=1, size=100)
# y is a cubic (monotonic, non-linear) function of x plus unit-variance noise.
y = x**3 + np.random.normal(loc=0, scale=1, size=100)
correlation, p_value = spearmanr(x, y)

print(
    f"\nSpearman rank correlation coefficient: {correlation}\n"
    f"P-value: {p_value}\n"
)
# Example output.
# NOTE(review): the figures below were not checked against a captured run of
# this cell - re-run and confirm before quoting them.
# Spearman rank correlation coefficient: 0.9664402636239105
# P-value: 2.891392095033595e-56


print("\nConclusion (spearmanr):")
print(
    "spearmanr calculates the Spearman rank correlation coefficient, which measures the *monotonic* relationship between two variables.  "
    "It assesses how well the relationship between two variables can be described using a monotonic function (a function that either always increases or always decreases). "
    "It is less sensitive to outliers than the Pearson correlation. "
    "Here, we see a strong positive monotonic relationship, even though the relationship is not linear. "
    "The low p-value indicates this correlation is statistically significant."
)

# Use Case: identifying monotonic relationships between variables, analyzing
# ordinal data.


# ================================================================
# 8. Mann-Whitney U Test (mannwhitneyu)
# ================================================================
print("\n8. Mann-Whitney U Test:")

# Fixed seed so both samples (and therefore the test result) are repeatable.
np.random.seed(0)
group1 = np.random.normal(10, 2, 50)  # population mean 10, std dev 2
group2 = np.random.normal(12, 2, 50)  # population mean 12, std dev 2

# Request the two-sided test explicitly. It is the default in current SciPy,
# but releases before 1.7 defaulted to a deprecated one-sided p-value; pinning
# the alternative keeps the p-value consistent with the two-sided conclusion
# printed below, whatever SciPy version runs this cell.
statistic, p_value = mannwhitneyu(group1, group2, alternative="two-sided")

print(f"""
Mann-Whitney U statistic: {statistic}
P-value: {p_value}
""")
# Example output.
# NOTE(review): the figures below were not checked against a captured run of
# this cell - re-run and confirm before quoting them.
# Mann-Whitney U statistic: 768.0
# P-value: 1.872418042461094e-05

print("\nConclusion (mannwhitneyu):")
print("mannwhitneyu tests if two independent samples have the same distribution. It's a non-parametric alternative to the t-test, useful when the data is not normally distributed. The U statistic is calculated based on the ranks of the observations. A small p-value suggests that the distributions are likely different.  Here, the very small p-value indicates strong evidence that the distributions of group1 and group2 are significantly different.")

# Use Case: comparing two groups when normality assumptions are violated,
# A/B testing with non-normal data.


# ================================================================
# 9. One-Way ANOVA (f_oneway)
# ================================================================
print("\n9. One-Way ANOVA:")

# Fixed seed: all three samples, and hence the test result, are repeatable.
np.random.seed(0)
group1 = np.random.normal(loc=10, scale=2, size=50)
group2 = np.random.normal(loc=12, scale=2, size=50)
group3 = np.random.normal(loc=14, scale=2, size=50)  # third group for the ANOVA
f_statistic, p_value = f_oneway(group1, group2, group3)

print(
    f"\nF-statistic: {f_statistic}\n"
    f"P-value: {p_value}\n"
)
# Example output.
# NOTE(review): the figures below were not checked against a captured run of
# this cell - re-run and confirm before quoting them.
# F-statistic: 14.71212627993074
# P-value: 1.037596160167683e-06


print("\nConclusion (f_oneway):")
print(
    "f_oneway performs a one-way ANOVA to test if the means of two *or more* groups are equal.  "
    "The F-statistic measures the variance between the sample means relative to the variance within the samples. "
    "A small p-value suggests that at least one group mean is different from the others.  "
    "Here, the very small p-value indicates strong evidence that at least one of the three groups has a different mean."
)

# Use Case: comparing the means of multiple groups (e.g., comparing the
# effectiveness of different drugs).



1. Normal Distribution:

PDF values (example first 5): [0.00443185 0.00530579 0.00632878 0.00752133 0.00890582]...
CDF values (example first 5): [0.0013499  0.00164427 0.00199603 0.00241482 0.00291159]...


Conclusion (norm):
norm provides functions for working with the normal (Gaussian) distribution. The Probability Density Function (PDF) `norm.pdf(x)` gives the probability density at each x value.  Higher values indicate a higher probability of observing values near x. The Cumulative Distribution Function (CDF) `norm.cdf(x)` gives the probability of a value being less than or equal to x.  For example, `norm.cdf(0)` for a standard normal distribution (mean=0, std=1) will be approximately 0.5, because 50% of the data lies below 0. These functions are fundamental for statistical modeling and hypothesis testing.

2. Independent Samples t-test:

T-statistic: -4.1311732760688
P-value: 7.604048369144464e-05


Conclusion (ttest_ind):
ttest_ind performs an independent samples t-test. It test