In [48]:
#The Kolmogorov–Smirnov test is a nonparametric goodness-of-fit test and is used to determine 
#whether two distributions differ, or whether an underlying probability distribution differs from a hypothesized distribution
#It is used when we have two samples coming from two populations that can be different

#The Kolmogorov-Smirnov test is defined by:
#H0 (null hypothesis):	The data follow a normal distribution
#Ha (alternative hypothesis):	The data do not follow a normal distribution

#Note: CDF stands for Cumulative Distribution Function.

In [2]:
#import libraries
import numpy as np
import pandas as pd
from scipy import stats

In [3]:
#load dataset
data = np.loadtxt("uniform_large_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df_table = pd.DataFrame(array)

# Displaying the table
print(df_table)

          0         1         2         3         4         5         6    \
0    2.014037  2.842330  2.093059  2.314322  2.550290  2.556514  2.063987   
1    2.655125  2.439494  2.387897  2.414520  2.677007  2.066587  2.221681   
2    2.397686  2.129261  2.228847  2.574741  2.672454  2.330393  2.379493   
3    0.021023  0.884131  0.570157  0.950007  0.570792  0.741419  0.251829   
4    0.087550  0.596086  0.355909  0.447322  0.680048  0.198563  0.192330   
..        ...       ...       ...       ...       ...       ...       ...   
495  0.611156  0.236036  0.896368  0.773777  0.538057  0.402998  0.090796   
496  2.761173  2.080949  2.939479  2.325925  2.977614  2.109083  2.517269   
497  0.401104  0.340544  0.555580  0.230778  0.600226  0.992868  0.274078   
498  0.248207  0.096274  0.516660  0.946114  0.271408  0.845261  0.546188   
499  2.647101  2.363681  2.077603  2.632778  2.676110  2.920187  2.866320   

          7         8         9    ...       141       142       143  \
0  

In [6]:
# Subsetting to last 15 columns which include the label column
df_table = df_table.iloc[:, 135:151]

#Loop, converts floats to ints and then those ints to category
for i in range(0,5):
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)
    df_table.iloc[:,i] = df_table.iloc[:,i].astype("category")

In [8]:
df_table.head()

Unnamed: 0,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150
0,2.0,2.0,2.0,2.0,2.0,2.580566,2.646452,2.997843,2.54626,2.423437,2.825879,2.566278,2.403595,2.396183,2.684211,1.0
1,2.0,2.0,2.0,2.0,2.0,2.744746,2.024768,2.424598,2.349128,2.325874,2.524994,2.862275,2.060383,2.505475,2.334364,1.0
2,2.0,2.0,2.0,2.0,2.0,2.809408,2.625961,2.962051,2.420763,2.411974,2.361735,2.667143,2.073825,2.388143,2.831569,1.0
3,0.0,0.0,0.0,0.0,0.0,0.608604,0.864361,0.083862,0.616211,0.898456,0.117597,0.664931,0.813385,0.573604,0.117329,0.0
4,0.0,0.0,0.0,0.0,0.0,0.898631,0.455617,0.51657,0.956458,0.97284,0.550108,0.503513,0.337278,0.735706,0.284006,0.0


In [10]:
# Subset to select only numerical variables columns --> KS Test only works with numerical
df_KS = df_table.select_dtypes(include = ["float64"])
df_KS.head()

Unnamed: 0,140,141,142,143,144,145,146,147,148,149,150
0,2.580566,2.646452,2.997843,2.54626,2.423437,2.825879,2.566278,2.403595,2.396183,2.684211,1.0
1,2.744746,2.024768,2.424598,2.349128,2.325874,2.524994,2.862275,2.060383,2.505475,2.334364,1.0
2,2.809408,2.625961,2.962051,2.420763,2.411974,2.361735,2.667143,2.073825,2.388143,2.831569,1.0
3,0.608604,0.864361,0.083862,0.616211,0.898456,0.117597,0.664931,0.813385,0.573604,0.117329,0.0
4,0.898631,0.455617,0.51657,0.956458,0.97284,0.550108,0.503513,0.337278,0.735706,0.284006,0.0


In [12]:
# Standardize the sample you will use
# This transforms the sample data to have a mean of 0 and a standard deviation of 1.
def standardize(sample):
    return (sample - np.mean(sample)) / np.std(sample)

In [21]:
# Kolmogorov-Smirnov test function
def ks_test(sample):
    # Sort the sample
    sample_sorted = np.sort(sample)
    # Evaluate the empirical CDF (ECDF)
    ecdf = np.arange(1, len(sample_sorted)+1) / len(sample_sorted)
    # Evaluate the theoretical CDF
    cdf = stats.norm.cdf(sample_sorted)
    # Calculate the KS statistic
    ks_stat = np.max(np.abs(ecdf - cdf))
    # Calculate the p-value
    p_value = stats.kstest(sample_sorted, 'norm').pvalue
    return ks_stat, p_value

# Select one feature from the dataset (Ex: assuming the first column is 0)---> this is where you include df and column index
sample = df_KS.iloc[:, 0] 

# Standardize the sample
standardized_sample = standardize(sample)

# Perform the KS test on standardize sample
ks_stat, p_value = ks_test(standardized_sample)

# Print the result. This prints only one specific column as a string
print(f"KS Test Result for {df_table.columns[0]}: \nks_stat = {ks_stat:.4f}, \np-value = {p_value:.3e}, \nNormal distribution = {p_value > 0.05}")

# This supports or rejects the null hypothesis H0 
# If sample does not come from a normal distribution ---> reject H0
# If sample comes from a normal distribution ---> fail to reject H0
if p_value > 0.05:
    print("Fail to reject H0. Sample comes from the specified distribution")
else:
     print("Reject H0. Sample DOES NOT come from the specified distribution")


KS Test Result for 135: 
ks_stat = 0.1911, 
p-value = 1.844e-16, 
Normal distribution = False
Reject H0. Sample DOES NOT come from the specified distribution


In [18]:
#Temporary KS Test
#This will print a table of all the numerical columns outputs for KS test, p-value, distribution, and hypothesis result
var_count = len(df_KS.columns)-1

#creates an empty array to print values in a table
results = [] 

for i in range(0, var_count):

    # Select one feature from the dataset (Example: assuming the first column is numeric)
    sample = df_KS.iloc[:, i]  # Change the column index as needed

    # Standardize the sample
    standardized_sample = standardize(sample)

    # Perform the KS test on standardize sample
    ks_stat, p_value = ks_test(standardized_sample)

    # Determine if we reject or fail to reject the null hypothesis
    # If sample does not come from a normal distribution ---> reject H0
    # If sample comes from a normal distribution ---> fail to reject H0
    normal_dist = p_value > 0.05
    hypothesis_result = "Fail to reject H0" if normal_dist else "Reject H0"

    # Append results to the list
    results.append({
        "Feature": df_KS.columns[i],
        "KS Statistic": f"{ks_stat:.4f}",
        "P-Value": f"{p_value:.3e}",
        "Normal Distribution": normal_dist,
        "Hypothesis Result": hypothesis_result})
    
# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df.to_string(index=False))


 Feature KS Statistic   P-Value  Normal Distribution Hypothesis Result
     140       0.1911 1.844e-16                False         Reject H0
     141       0.1897 1.446e-16                False         Reject H0
     142       0.1918 1.373e-16                False         Reject H0
     143       0.1918 1.369e-16                False         Reject H0
     144       0.1938 2.806e-17                False         Reject H0
     145       0.1975 1.437e-17                False         Reject H0
     146       0.1900 1.283e-16                False         Reject H0
     147       0.1865 1.079e-15                False         Reject H0
     148       0.1899 2.920e-16                False         Reject H0
     149       0.1851 8.492e-16                False         Reject H0
