# Q1: What is hypothesis testing in statistics?
Hypothesis testing is a way to make decisions using data, where we test an assumption about a population parameter.

# Q2: What is the null hypothesis, and how does it differ from the alternative hypothesis?
The null hypothesis assumes no effect or difference, while the alternative hypothesis claims there is an effect or difference.

# Q3: What is the significance level in hypothesis testing, and why is it important?
The significance level (alpha) is the probability of rejecting the null hypothesis when it's actually true, commonly set at 0.05.

# Q4: What does a P-value represent in hypothesis testing?
The P-value is the probability of observing data at least as extreme as what was actually observed, assuming the null hypothesis is true.

# Q5: How do you interpret the P-value in hypothesis testing?
A small P-value (typically < 0.05) suggests strong evidence against the null hypothesis.

# Q6: What are Type 1 and Type 2 errors in hypothesis testing?
Type 1 error is rejecting a true null hypothesis; Type 2 error is failing to reject a false null hypothesis.

# Q7: What is the difference between a one-tailed and a two-tailed test in hypothesis testing?
One-tailed tests check for a deviation in one direction, while two-tailed tests check for deviations in both directions.

# Q8: What is the Z-test, and when is it used in hypothesis testing?
A Z-test is used when the population standard deviation is known and either the sample size is large (commonly n >= 30) or the population is normally distributed.

# Q9: How do you calculate the Z-score, and what does it represent in hypothesis testing?
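Z = (sample_mean - population_mean) / (std_dev / sqrt(n)), where std_dev is the population standard deviation and n is the sample size; it shows how many standard errors the sample mean lies from the hypothesized population mean.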

# Q10: What is the T-distribution, and when should it be used instead of the normal distribution?
The T-distribution is used when the sample size is small and population standard deviation is unknown.

# Q11: What is the difference between a Z-test and a T-test?
Z-test is for known population standard deviation, T-test is for unknown standard deviation.

# Q12: What is the T-test, and how is it used in hypothesis testing?
The T-test compares sample means when population standard deviation is unknown to determine if there’s a significant difference.

# Q13: What is the relationship between Z-test and T-test in hypothesis testing?
Both test means, but the T-test handles more uncertainty and is used with smaller samples.

# Q14: What is a confidence interval, and how is it used to interpret statistical results?
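A confidence interval is a range of values, computed from the sample, that is expected to contain the true parameter at a stated confidence level (e.g. 95%); a narrower interval indicates a more precise estimate.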

# Q15: What is the margin of error, and how does it affect the confidence interval?
The margin of error is half the width of the confidence interval (the interval is the estimate ± the margin of error); a larger margin means a wider, less precise interval.

# Q16: How is Bayes' Theorem used in statistics, and what is its significance?
Bayes’ Theorem updates the probability of a hypothesis as new data is observed, useful in decision-making.

# Q17: What is the Chi-square distribution, and when is it used?
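The Chi-square distribution is the distribution of a sum of squared independent standard normal variables; it is used in tests for independence or goodness-of-fit involving categorical data.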

# Q18: What is the Chi-square goodness of fit test, and how is it applied?
It tests how well observed data match expected data under a specific distribution.

# Q19: What is the F-distribution, and when is it used in hypothesis testing?
The F-distribution is used to compare variances, commonly in ANOVA and F-tests.

# Q20: What is an ANOVA test, and what are its assumptions?
ANOVA checks if there are differences among group means; assumes normality, independence, and equal variances.

# Q21: What are the different types of ANOVA tests?
Types include one-way ANOVA (one factor) and two-way ANOVA (two factors, interaction effect).

# Q22: What is the F-test, and how does it relate to hypothesis testing?
The F-test compares the variance between groups to the variance within groups; in ANOVA, a large ratio is evidence against the null hypothesis that all group means are equal.


In [None]:
from scipy import stats
import numpy as np

# One-sample test of H0: the population mean equals pop_mean.
# No population standard deviation is available here, so the sample standard
# deviation is used and stats.ttest_1samp returns a t statistic (two-sided
# p-value). It is therefore reported as a T-statistic, not a Z-statistic.
data = np.array([68, 70, 72, 65, 74, 69, 71, 73])
pop_mean = 70
t_stat, p_value = stats.ttest_1samp(data, pop_mean)
z_stat = t_stat  # legacy alias: the statistic used to be stored under this name
print("T-statistic:", t_stat)
print("P-value:", p_value)
# Interpret the result at the 5% significance level.
if p_value < 0.05:
    print("Reject the null hypothesis.")
else:
    print("Fail to reject the null hypothesis.")


In [None]:
# Reproducible one-sample t-test: 100 draws from a normal distribution
# (mean 52, standard deviation 5) tested against a hypothesized mean of 50.
np.random.seed(42)
sample = np.random.normal(52, 5, 100)
ttest_result = stats.ttest_1samp(sample, popmean=50)
t_stat, p_val = ttest_result
print("T-statistic:", t_stat)
print("P-value:", p_val)


In [None]:
def one_sample_z_test(sample, pop_mean, pop_std):
    """Run a two-sided one-sample Z-test for a mean.

    Parameters
    ----------
    sample : array-like
        Observations; must contain at least one value.
    pop_mean : float
        Population mean under the null hypothesis.
    pop_std : float
        Known population standard deviation; must be positive.

    Returns
    -------
    tuple
        ``(z, p)``: the Z statistic and its two-sided p-value.

    Raises
    ------
    ValueError
        If ``sample`` is empty or ``pop_std`` is not positive.
    """
    n = len(sample)
    if n == 0:
        raise ValueError("sample must contain at least one observation")
    if pop_std <= 0:
        raise ValueError("pop_std must be positive")
    standard_error = pop_std / np.sqrt(n)
    z = (np.mean(sample) - pop_mean) / standard_error
    # The survival function keeps precision in the far tail, where
    # 1 - cdf(|z|) would round to exactly 0.
    p = 2 * stats.norm.sf(abs(z))
    return z, p

# Demo: 50 draws from a normal distribution (mean 100, std 15) tested
# against that same mean and std.
sample_data = np.random.normal(100, 15, 50)
z, p = one_sample_z_test(sample_data, 100, 15)
print("Z-score:", z, "P-value:", p)


In [None]:
import matplotlib.pyplot as plt

# Draw the standard normal density with the two-tailed critical values
# (red, dashed) and the observed Z-score (blue).
alpha = 0.05
z_score = 2.2
critical = stats.norm.ppf(1 - alpha/2)

x = np.linspace(-4, 4, 1000)
y = stats.norm.pdf(x)
plt.plot(x, y)
for boundary in (critical, -critical):
    plt.axvline(x=boundary, color='red', linestyle='--')
plt.axvline(x=z_score, color='blue', label='Z-score')
plt.title("Two-tailed Z-test")
plt.legend()
plt.show()


In [None]:
def visualize_errors(alpha=0.05, effect_size=1.0):
    """Plot the Type 1 and Type 2 error regions of a one-sided Z-test.

    Two unit-variance normal densities are drawn on [-4, 4]: H0 centred at 0
    and H1 centred at ``effect_size``. The area under H0 to the right of the
    critical value is shaded as the Type 1 error, and the area under H1 to
    the left of it as the Type 2 error.

    Parameters
    ----------
    alpha : float, optional
        Significance level of the upper-tailed test. The critical value is
        the ``1 - alpha`` quantile of the standard normal (about 1.645 for
        the default 0.05), replacing the previously hard-coded 1.64.
    effect_size : float, optional
        Mean of the alternative distribution (default 1.0).
    """
    critical = stats.norm.ppf(1 - alpha)
    x = np.linspace(-4, 4, 1000)
    y1 = stats.norm.pdf(x, 0, 1)  # Null
    y2 = stats.norm.pdf(x, effect_size, 1)  # Alternative
    plt.plot(x, y1, label='H0', color='blue')
    plt.plot(x, y2, label='H1', color='green')
    # Rejecting H0 although it is true: H0 mass beyond the critical value.
    plt.fill_between(x, 0, y1, where=(x > critical), color='red', alpha=0.5, label='Type 1 Error')
    # Failing to reject H0 although H1 is true: H1 mass below the critical value.
    plt.fill_between(x, 0, y2, where=(x < critical), color='orange', alpha=0.5, label='Type 2 Error')
    plt.legend()
    plt.title("Type 1 and Type 2 Errors")
    plt.show()

visualize_errors()


In [None]:
# Independent two-sample t-test on two simulated groups of 30 observations
# (means 50 and 52, standard deviation 5 for both).
group1, group2 = (np.random.normal(mu, 5, 30) for mu in (50, 52))
t_stat, p_val = stats.ttest_ind(a=group1, b=group2)
print("T-statistic:", t_stat, "P-value:", p_val)


In [None]:
# Paired t-test: every "after" value is its "before" value plus a simulated
# change drawn from a normal distribution (mean 2, standard deviation 2).
before = np.random.normal(60, 5, 30)
change = np.random.normal(2, 2, 30)
after = before + change
t_stat, p_val = stats.ttest_rel(a=before, b=after)
print("T-statistic:", t_stat, "P-value:", p_val)


In [None]:
# Run a Z-test and a t-test on the same simulated sample, both against a
# hypothesized mean of 105, and print the two results for comparison.
sample = np.random.normal(100, 10, 30)
pop_mean = 105

# Z-test: treats the standard deviation used to generate the sample (10) as known.
standard_error = 10 / np.sqrt(len(sample))
z = (np.mean(sample) - pop_mean) / standard_error
z_p = 2 * (1 - stats.norm.cdf(abs(z)))

# T-test: estimates the standard deviation from the sample itself.
t_stat, t_p = stats.ttest_1samp(sample, popmean=pop_mean)

print("Z-test:", z, z_p)
print("T-test:", t_stat, t_p)


In [None]:
def confidence_interval(data, confidence=0.95):
    """Return a two-sided t-based confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : array-like
        Sample observations; at least two are required so that the standard
        error can be estimated.
    confidence : float, optional
        Confidence level, strictly between 0 and 1 (default 0.95).

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds: mean minus/plus the margin.

    Raises
    ------
    ValueError
        If ``data`` has fewer than two observations or ``confidence`` is not
        strictly between 0 and 1.
    """
    n = len(data)
    if n < 2:
        raise ValueError("data must contain at least two observations")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")
    mean = np.mean(data)
    sem = stats.sem(data)
    # Two-sided interval: split the remaining probability between both tails.
    margin = sem * stats.t.ppf((1 + confidence) / 2., n - 1)
    return mean - margin, mean + margin

# Demo: 95% interval for 50 simulated draws (mean 100, standard deviation 10).
data = np.random.normal(100, 10, 50)
ci = confidence_interval(data)
print("95% Confidence Interval:", ci)


In [None]:
def margin_of_error(data, confidence=0.95):
    """Return the margin of error of a two-sided t-based interval for the mean.

    The margin is the critical t value (``len(data) - 1`` degrees of freedom)
    multiplied by the standard error of the mean.

    Parameters
    ----------
    data : array-like
        Sample observations; at least two are required so that the standard
        error can be estimated.
    confidence : float, optional
        Confidence level, strictly between 0 and 1 (default 0.95).

    Returns
    -------
    float
        The margin of error.

    Raises
    ------
    ValueError
        If ``data`` has fewer than two observations or ``confidence`` is not
        strictly between 0 and 1.
    """
    n = len(data)
    if n < 2:
        raise ValueError("data must contain at least two observations")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")
    se = stats.sem(data)
    # Two-sided interval: split the remaining probability between both tails.
    t_critical = stats.t.ppf((1 + confidence) / 2., n - 1)
    return t_critical * se

# Demo: margin of error for 100 simulated draws (mean 50, standard deviation 10).
sample_data = np.random.normal(50, 10, 100)
moe = margin_of_error(sample_data)
print("Margin of Error:", moe)


In [None]:
def bayes(prior_A, prior_B, likelihood_A, likelihood_B):
    """Return the posterior probability of A given the evidence.

    Applies Bayes' theorem to two hypotheses A and B: each prior is weighted
    by the likelihood of the evidence under that hypothesis, and A's share of
    the total is returned.
    """
    joint_A = likelihood_A * prior_A
    joint_B = likelihood_B * prior_B
    return joint_A / (joint_A + joint_B)

# Example
P_A, P_B = 0.01, 0.99
P_E_given_A, P_E_given_B = 0.9, 0.2
posterior = bayes(P_A, P_B, P_E_given_A, P_E_given_B)
print("Posterior Probability of A:", posterior)


In [None]:
import pandas as pd
# Chi-square test of independence on a 2x2 contingency table held in a DataFrame.
counts = [[30, 10], [20, 40]]
table = pd.DataFrame(counts)
chi2, p, dof, expected = stats.chi2_contingency(observed=table)
print("Chi-square statistic:", chi2)
print("P-value:", p)


In [None]:
# Expected cell counts for a 2x2 table of observed counts, as returned by
# the chi-square test of independence.
observed = np.array([
    [20, 30],
    [50, 100],
])
contingency_result = stats.chi2_contingency(observed)
chi2, p, dof, expected = contingency_result
print("Expected Frequencies:\n", expected)


In [None]:
# Chi-square goodness-of-fit test: three observed category counts compared
# with their expected counts (both total 100).
observed, expected = [20, 30, 50], [25, 25, 50]
chi2_stat, p_val = stats.chisquare(observed, expected)
print("Chi-square:", chi2_stat, "P-value:", p_val)


In [None]:
# Plot the chi-square density with 3 degrees of freedom over [0, 20].
df = 3
x = np.linspace(0, 20, 500)
chi2_dist = stats.chi2(df)  # frozen distribution with df degrees of freedom
y = chi2_dist.pdf(x)
plt.plot(x, y, label=f'df={df}')
plt.xlabel("Value")
plt.ylabel("Density")
plt.title("Chi-square Distribution")
plt.legend()
plt.show()


In [None]:
# F-test for equality of two variances on simulated groups
# (standard deviations 5 and 8).
group1 = np.random.normal(20, 5, 30)
group2 = np.random.normal(22, 8, 30)
f_stat = np.var(group1, ddof=1) / np.var(group2, ddof=1)
df1, df2 = len(group1)-1, len(group2)-1
# Two-sided p-value: the ratio can fall in either tail depending on which
# group has the larger variance. An upper-tail-only p-value is close to 1
# whenever the numerator variance is the smaller one, hiding the difference.
lower_tail = stats.f.cdf(f_stat, df1, df2)
upper_tail = stats.f.sf(f_stat, df1, df2)
p_val = 2 * min(lower_tail, upper_tail)
print("F-statistic:", f_stat, "P-value:", p_val)


In [None]:
# One-way ANOVA on three simulated groups of 30 observations
# (means 10, 15 and 20, standard deviation 2 for all).
group1, group2, group3 = (
    np.random.normal(center, 2, 30) for center in (10, 15, 20)
)
anova_result = stats.f_oneway(group1, group2, group3)
f_stat, p_val = anova_result
print("F-statistic:", f_stat, "P-value:", p_val)


In [None]:
import seaborn as sns
import pandas as pd

# Box plot of group1, group2 and group3, labelled A, B and C.
# NOTE(review): the groups come from an earlier cell and are assumed to hold
# 30 values each, matching the 30 labels generated per group - confirm.
scores = np.concatenate([group1, group2, group3])
labels = [name for name in ('A', 'B', 'C') for _ in range(30)]
df = pd.DataFrame({'score': scores, 'group': labels})
sns.boxplot(x='group', y='score', data=df)
plt.title("One-Way ANOVA Boxplot")
plt.show()


In [None]:
from scipy.stats import shapiro, levene

# Normality: Shapiro-Wilk p-value for each group.
print("Shapiro Test p-values:")
for label, values in (("Group1:", group1), ("Group2:", group2), ("Group3:", group3)):
    print(label, shapiro(values).pvalue)

# Equal variances: Levene's test across the three groups.
levene_result = levene(group1, group2, group3)
print("Levene's Test p-value:", levene_result.pvalue)


In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Two-way ANOVA with interaction on 60 simulated scores:
# factor A has two levels (30 rows each), factor B cycles through three levels.
factor_a = np.repeat(['Low', 'High'], 30)
factor_b = np.tile(['X', 'Y', 'Z'], 20)
scores = np.random.normal(10, 2, 60)
df = pd.DataFrame({'A': factor_a, 'B': factor_b, 'score': scores})

# Main effects of A and B plus their interaction term.
model = ols('score ~ C(A) + C(B) + C(A):C(B)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)


In [None]:
# Plot the F-distribution density with (5, 10) degrees of freedom over [0, 5].
df1, df2 = 5, 10
x = np.linspace(0, 5, 500)
f_dist = stats.f(df1, df2)  # frozen distribution with the chosen degrees of freedom
y = f_dist.pdf(x)
plt.plot(x, y)
plt.xlabel("F-value")
plt.ylabel("Density")
plt.title("F-distribution")
plt.grid()
plt.show()


In [None]:
# Simulate three groups of 20 scores (means 50, 55 and 60, standard
# deviation 5) and compare them in a box plot.
group1, group2, group3 = (np.random.normal(mu, 5, 20) for mu in (50, 55, 60))
labels = [tag for tag in ('G1', 'G2', 'G3') for _ in range(20)]
df = pd.DataFrame({
    'score': np.concatenate([group1, group2, group3]),
    'group': labels,
})
sns.boxplot(x='group', y='score', data=df)
plt.title("Group Comparison - One-Way ANOVA")
plt.show()


In [None]:
# One-sample t-test: 100 simulated draws (mean 100, standard deviation 15)
# tested against a hypothesized mean of 100.
data = np.random.normal(loc=100, scale=15, size=100)
t_stat, p_val = stats.ttest_1samp(data, popmean=100)
print("T-statistic:", t_stat, "P-value:", p_val)


In [None]:
# Chi-square test for a single variance: is the sample variance consistent
# with a hypothesized population variance of 25?
data = np.random.normal(50, 5, 30)
n = data.size
hypo_var = 25  # hypothesized variance
sample_var = data.var(ddof=1)
chi2_stat = (n - 1) * sample_var / hypo_var
# Upper-tail p-value only (probability of a statistic at least this large).
null_dist = stats.chi2(n - 1)
p_val = 1 - null_dist.cdf(chi2_stat)
print("Chi2 statistic:", chi2_stat, "P-value:", p_val)


In [None]:
from statsmodels.stats.proportion import proportions_ztest

# Two-proportion Z-test: 60 successes out of 100 versus 50 out of 100.
nobs = [100, 100]
success = [60, 50]
z_stat, p_val = proportions_ztest(count=success, nobs=nobs)
print("Z-statistic:", z_stat, "P-value:", p_val)


In [None]:
# F-test for equality of two variances on simulated samples
# (standard deviations 3 and 5).
sample1 = np.random.normal(10, 3, 30)
sample2 = np.random.normal(10, 5, 30)
f = np.var(sample1, ddof=1) / np.var(sample2, ddof=1)
df1, df2 = len(sample1)-1, len(sample2)-1
# Two-sided p-value: the ratio can fall in either tail depending on which
# sample has the larger variance. An upper-tail-only p-value is close to 1
# whenever the numerator variance is the smaller one, hiding the difference.
lower_tail = stats.f.cdf(f, df1, df2)
upper_tail = stats.f.sf(f, df1, df2)
p = 2 * min(lower_tail, upper_tail)
print("F-statistic:", f, "P-value:", p)


In [None]:
# Goodness-of-fit test of five simulated category counts against a uniform
# expectation.
observed = np.random.randint(15, 25, 5)
# stats.chisquare requires the observed and expected totals to agree. The
# random total is rarely exactly 100, so a fixed [20]*5 usually fails with
# ValueError. Spread the actual observed total evenly over the categories.
expected = np.full(observed.size, observed.sum() / observed.size)
chi2, p = stats.chisquare(observed, f_exp=expected)
print("Chi-square:", chi2, "P-value:", p)
