In [42]:
#The Kolmogorov–Smirnov (KS) test is a nonparametric goodness-of-fit test. It is used to determine
#whether two distributions differ, or whether an underlying probability distribution differs from a hypothesized distribution.
#The two-sample form is used when we have two samples coming from two populations that can be different;
#the one-sample form compares a single sample against a hypothesized distribution (for example, a normal distribution).

#When the Kolmogorov-Smirnov test is used to check normality, the hypotheses are:
#H0 (null hypothesis): The data follow a normal distribution
#Ha (alternative hypothesis): The data do not follow a normal distribution

#Note: CDF stands for Cumulative Distribution Function.

In [170]:
#import libraries
import numpy as np
import pandas as pd
from scipy import stats

In [174]:
# Load the dataset from 'IRIS.csv' (a relative path, resolved against the current working directory)
df = pd.read_csv('IRIS.csv')

# Standardize the sample you will use
# This transforms the sample data to have a mean of 0 and a standard deviation of 1.
def standardize(sample):
    """Return the z-scores of ``sample``.

    Each value has the sample mean subtracted and is then divided by the
    sample standard deviation as computed by ``np.std`` (NumPy's default,
    i.e. the population form with ``ddof=0``).

    Parameters
    ----------
    sample : array-like
        Numeric observations to rescale.

    Returns
    -------
    Same container type as ``sample - scalar`` produces
        The centred and scaled values.
    """
    center = np.mean(sample)
    spread = np.std(sample)
    centred = sample - center
    return centred / spread

In [176]:
# Kolmogorov-Smirnov test function
def ks_test(sample):
    """One-sample Kolmogorov-Smirnov test of ``sample`` against N(0, 1).

    The statistic is the largest vertical distance between the empirical CDF
    (ECDF) of the sample and the standard normal CDF. Because the ECDF is a
    step function, the distance must be checked on both sides of every jump:

    * ``D+ = max(i/n - F(x_i))``      (ECDF just after the jump, above the CDF)
    * ``D- = max(F(x_i) - (i-1)/n)``  (ECDF just before the jump, below the CDF)

    and the statistic is ``max(D+, D-)``. Using only ``|i/n - F(x_i)|`` can
    under-report the distance and disagree with the statistic that
    ``scipy.stats.kstest`` uses to derive the p-value.

    Parameters
    ----------
    sample : array-like
        One-dimensional numeric observations. They are compared with the
        standard normal distribution as-is, so callers that want a general
        normality check should standardize the data first.

    Returns
    -------
    ks_stat : float
        The two-sided KS statistic ``max(D+, D-)``.
    p_value : float
        The p-value reported by ``scipy.stats.kstest(sample, 'norm')``.

    Raises
    ------
    ValueError
        If ``sample`` contains no observations.
    """
    # Sort the sample so the i-th value corresponds to ECDF steps (i-1)/n -> i/n
    sample_sorted = np.sort(sample)
    n = len(sample_sorted)
    if n == 0:
        raise ValueError("ks_test requires at least one observation")

    # Theoretical CDF of the standard normal at each observation
    cdf = stats.norm.cdf(sample_sorted)

    # ECDF value just after (upper) and just before (lower) each observation
    ranks = np.arange(1, n + 1)
    ecdf_upper = ranks / n
    ecdf_lower = (ranks - 1) / n

    # Largest gap on either side of the steps
    d_plus = np.max(ecdf_upper - cdf)
    d_minus = np.max(cdf - ecdf_lower)
    ks_stat = max(d_plus, d_minus)

    # The p-value comes from SciPy's implementation of the same two-sided test
    p_value = stats.kstest(sample_sorted, 'norm').pvalue
    return ks_stat, p_value

# Select one feature from the dataset by position (Example: assuming the first column is sepal_length).
# The reporting cell labels its output with df.columns[0], so keep the two in sync if this index changes.
sample = df.iloc[:, 0]  # Change the column index as needed

# Standardize the sample (zero mean, unit standard deviation) so it can be compared with N(0, 1)
standardized_sample = standardize(sample)

# Perform the KS test on the standardized sample
# NOTE(review): the mean and standard deviation are estimated from the same data being tested,
# so the plain KS p-value is presumably only approximate (the Lilliefors correction exists for
# this situation) — confirm whether that matters for this analysis.
ks_stat, p_value = ks_test(standardized_sample)


In [182]:
# Report the statistic, the p-value and the 5%-level decision for the tested column
print(
    f"KS Test Result for {df.columns[0]}: \n"
    f"ks_stat = {ks_stat:.4f}, \n"
    f"p-value = {p_value:.3e}, \n"
    f"Normal distribution = {p_value > 0.05}"
)

# Decision rule at the 0.05 significance level:
#   p-value >  0.05 ---> fail to reject H0 (no evidence against normality)
#   p-value <= 0.05 ---> reject H0 (sample does not look normal)
print(
    "Fail to reject H0. Sample comes from the specified distribution"
    if p_value > 0.05
    else "Reject H0. Sample DOES NOT come from the specified distribution"
)


KS Test Result for sepal_length: 
ks_stat = 0.0895, 
p-value = 1.706e-01, 
Normal distribution = True
Fail to reject H0. Sample comes from the specified distribution


In [190]:
# Perform the KS test on some random samples for comparison.
# A seeded generator makes the printed results reproducible on every re-run.
ks_demo_rng = np.random.default_rng(42)

print("\nRandom samples to verify KS test works: ")
random_samples = {
    'norm_a': ks_demo_rng.normal(loc=0, scale=1, size=500), #random normal sample with mean 0 and std 1
    'norm_b': ks_demo_rng.normal(loc=0.1, scale=1, size=500), #normal, slightly shifted mean
    'norm_c': ks_demo_rng.normal(loc=3, scale=1, size=500), #normal, strongly shifted mean
    'f_a': ks_demo_rng.f(dfnum=5, dfden=10, size=500) #random F-distribution sample with dfnum=5 and dfden=10
}

# Standardize and test the random samples.
# Loop-local names are used on purpose: reusing sample / standardized_sample /
# ks_stat / p_value here would overwrite the dataset results computed earlier,
# and re-running the reporting cell would then show numbers from the last
# random sample under the dataset column's name.
for name, draws in random_samples.items(): # Iterate through each generated sample
    draws_standardized = standardize(draws) # Standardize the samples
    draws_ks_stat, draws_p_value = ks_test(draws_standardized)
    print(f"{name}: ks_stat = {draws_ks_stat:.4f}, p-value = {draws_p_value:.3e}, Normal distribution = {draws_p_value > 0.05}")



Random samples to verify KS test works: 
norm_a: ks_stat = 0.0242, p-value = 9.245e-01, Normal distribution = True
norm_b: ks_stat = 0.0214, p-value = 9.728e-01, Normal distribution = True
norm_c: ks_stat = 0.0211, p-value = 9.460e-01, Normal distribution = True
f_a: ks_stat = 0.1426, p-value = 2.497e-09, Normal distribution = False
