In [9]:
import numpy as np
import pandas as pd
from scipy.stats import chi2, expon
# First question: the χ² goodness-of-fit test for an exponential model.


def _load_sample(path):
    """Read a whitespace-delimited numeric table and flatten it to one Series.

    The file is parsed as a multi-column table (no header) and then stacked
    row by row into a single column of floats.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape.
    table = pd.read_csv(path, header=None, delimiter=r"\s+", dtype=float)
    return table.stack().reset_index(drop=True)


def _merge_sparse_bins(edges, observed, expected, min_expected):
    """Pool adjacent bins left to right until each expected count is large enough.

    A group is closed as soon as its accumulated expected count reaches
    ``min_expected``; whatever is left over at the right end is folded into
    the last closed group. Returns ``(edges, observed, expected)`` as arrays
    describing the pooled bins.
    """
    pooled_edges = [edges[0]]
    pooled_obs = []
    pooled_exp = []
    acc_obs, acc_exp, pending = 0, 0.0, 0

    for right_edge, obs, exp in zip(edges[1:], observed, expected):
        acc_obs += obs
        acc_exp += exp
        pending += 1
        if acc_exp >= min_expected:
            pooled_edges.append(right_edge)
            pooled_obs.append(acc_obs)
            pooled_exp.append(acc_exp)
            acc_obs, acc_exp, pending = 0, 0.0, 0

    if pending:
        if pooled_exp:
            # Fold the sparse right-hand remainder into the last full group.
            pooled_obs[-1] += acc_obs
            pooled_exp[-1] += acc_exp
            pooled_edges[-1] = edges[-1]
        else:
            # Nothing ever reached the threshold: everything is one bin.
            pooled_edges.append(edges[-1])
            pooled_obs.append(acc_obs)
            pooled_exp.append(acc_exp)

    return np.array(pooled_edges), np.array(pooled_obs), np.array(pooled_exp)


def exponential_gof_test(sample, alpha: float = 0.01, min_expected: float = 5.0) -> dict:
    """Run a χ² goodness-of-fit test of ``sample`` against an exponential law.

    The exponential scale is estimated by the sample mean. Bin edges come from
    ``np.histogram_bin_edges(..., bins='auto')``; adjacent bins are pooled
    until every expected count is at least ``min_expected``.

    Args:
        sample: One-dimensional (or flattenable) collection of observations.
        alpha: Significance level of the test (0.01 means a 1% test).
        min_expected: Smallest expected count allowed in a bin after pooling.

    Returns:
        A dict with keys ``"mean"``, ``"edges"``, ``"observed"``,
        ``"expected"``, ``"statistic"``, ``"df"`` and ``"critical_value"``.

    Raises:
        ValueError: If the data is empty, contains NaN, has a non-positive
            mean, or leaves too few bins for a test with one fitted parameter.
    """
    values = np.asarray(sample, dtype=float).ravel()
    if values.size == 0:
        raise ValueError("Data is empty.")
    if np.isnan(values).any():
        raise ValueError("Data contains NaN values.")

    mean = values.mean()
    if not mean > 0:
        raise ValueError("Sample mean must be positive to fit an exponential distribution.")

    edges = np.histogram_bin_edges(values, bins='auto')
    observed, _ = np.histogram(values, bins=edges)

    # The outer bins are treated as open-ended so the bin probabilities cover
    # the whole support [0, inf) and the expected counts add up to n.
    cdf_at_edges = expon.cdf(edges, scale=mean)
    cdf_at_edges[0] = 0.0
    cdf_at_edges[-1] = 1.0
    expected = np.diff(cdf_at_edges) * values.size

    edges, observed, expected = _merge_sparse_bins(edges, observed, expected, min_expected)

    # One degree of freedom is lost to the total count and one more to the
    # scale parameter that was estimated from the same data.
    df = len(observed) - 2
    if df < 1:
        raise ValueError("Not enough bins with adequate expected counts for the test.")

    statistic = np.sum((observed - expected) ** 2 / expected)
    critical_value = chi2.ppf(1 - alpha, df)

    return {
        "mean": mean,
        "edges": edges,
        "observed": observed,
        "expected": expected,
        "statistic": statistic,
        "df": df,
        "critical_value": critical_value,
    }


if __name__ == "__main__":
    data = _load_sample('spam.txt')

    result = exponential_gof_test(data, alpha=0.01)

    # Keep the intermediate results available under their original names.
    mean_spam = result["mean"]
    rate = 1 / mean_spam  # Lambda for exponential distribution
    intervals = result["edges"]
    observed_counts = result["observed"]
    expected_counts = result["expected"]
    chi_square_stat = result["statistic"]
    df = result["df"]  # Degrees of freedom (bins - 1 - one estimated parameter)
    chi_critical = result["critical_value"]  # Critical value at 1% significance

    # Conclusion
    if chi_square_stat > chi_critical:
        print("Reject the null hypothesis: The data does not follow an exponential distribution.")
    else:
        print("Fail to reject the null hypothesis: The data may follow an exponential distribution.")

Fail to reject the null hypothesis: The data may follow an exponential distribution.
