In [48]:
#The Kolmogorov–Smirnov test is a nonparametric goodness-of-fit test and is used to determine 
#whether two distributions differ, or whether an underlying probability distribution differs from a hypothesized distribution
#It is used when we have two samples coming from two populations that can be different

#The Kolmogorov-Smirnov test is defined by:
#H0 (null hypothesis):	The data follow a normal distribution
#Ha (alternative hypothesis):	The data do not follow a normal distribution

#Note: CDF stands for Cumulative Distribution Function.

In [201]:
#import libraries
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split

In [203]:
#load dataset
data = np.loadtxt("uniform_large_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df_table = pd.DataFrame(array)

# Displaying the table
#print(df_table)

In [205]:
# From the dataset, change 25 columns to 'categorical'
#Loop, converts floats to ints and then those ints to category
for i in range(26):
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)
    df_table.iloc[:,i] = df_table.iloc[:,i].astype("category")

df_table.iloc[:, 150] = df_table.iloc[:, 150].astype("category")

df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.646452,2.997843,2.54626,2.423437,2.825879,2.566278,2.403595,2.396183,2.684211,1.0
1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.024768,2.424598,2.349128,2.325874,2.524994,2.862275,2.060383,2.505475,2.334364,1.0
2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.625961,2.962051,2.420763,2.411974,2.361735,2.667143,2.073825,2.388143,2.831569,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.864361,0.083862,0.616211,0.898456,0.117597,0.664931,0.813385,0.573604,0.117329,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.455617,0.51657,0.956458,0.97284,0.550108,0.503513,0.337278,0.735706,0.284006,0.0


In [207]:
# Split dataset into X_train and y_train
X_train, X_test, y_train, y_test = train_test_split(df_table.iloc[:,1:150], df_table.iloc[:,-1], test_size=0.2, random_state=52)

In [209]:
# Splitting X_train into numerical subset 
numerical_df = X_train.select_dtypes(include=['number'])
numerical_df


Unnamed: 0,26,27,28,29,30,31,32,33,34,35,...,140,141,142,143,144,145,146,147,148,149
159,2.737286,2.564694,2.078000,2.701250,2.172147,2.306469,2.833275,2.121672,2.214934,2.381538,...,2.653517,2.825271,2.758778,2.136084,2.767652,2.302499,2.246025,2.683174,2.566359,2.813462
198,0.449442,0.769937,0.043365,0.899528,0.743107,0.055306,0.189964,0.881079,0.855876,0.177939,...,0.238594,0.022022,0.508925,0.028390,0.660544,0.118136,0.086415,0.946498,0.558883,0.899343
259,2.730227,2.558789,2.801426,2.055368,2.278217,2.426925,2.431457,2.903312,2.106255,2.858540,...,2.887773,2.945521,2.368866,2.799701,2.487134,2.698392,2.588976,2.320205,2.480301,2.136642
301,2.233624,2.514446,2.930156,2.916944,2.435773,2.614566,2.833691,2.577013,2.563006,2.058731,...,2.395805,2.372597,2.512598,2.065079,2.459612,2.946646,2.619441,2.235146,2.659467,2.811038
220,0.687277,0.112971,0.248352,0.483834,0.189635,0.360817,0.808924,0.975149,0.609001,0.142674,...,0.319568,0.561645,0.396156,0.907804,0.942291,0.830752,0.881505,0.321756,0.910684,0.283537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,2.465128,2.600074,2.676940,2.477864,2.639337,2.902997,2.084804,2.669015,2.656391,2.421545,...,2.604090,2.501031,2.667957,2.456940,2.130424,2.018010,2.816774,2.800348,2.036940,2.408504
151,0.688109,0.625346,0.802530,0.602819,0.343456,0.936555,0.263959,0.375823,0.553923,0.799001,...,0.228123,0.544925,0.596277,0.942976,0.213394,0.542929,0.105415,0.244743,0.052109,0.638471
13,0.839154,0.999741,0.972734,0.602158,0.763564,0.515336,0.740383,0.434245,0.317270,0.297843,...,0.710380,0.815848,0.727969,0.898495,0.241817,0.637411,0.740484,0.615302,0.018263,0.876028
267,0.927585,0.265717,0.361995,0.526540,0.688182,0.129511,0.233144,0.966997,0.239566,0.344881,...,0.850673,0.191450,0.968260,0.085316,0.577563,0.512694,0.746791,0.422051,0.036703,0.870898


In [221]:
# Splitting X_train into categorical subset 
categorical_df = X_train.select_dtypes(include=['object', 'category'])
categorical_df['label'] = df_table.iloc[:,-1]
categorical_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,16,17,18,19,20,21,22,23,24,25
159,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
259,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
301,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [223]:
# Subset to select only numerical variables columns --> KS Test only works with numerical
df_KS = df_table.select_dtypes(include = ["float64"])
# Add label column to new KS dataset to compare Feature to Label
label_column = df_table.iloc[:, -1]
df_KS['label_column'] = label_column
df_KS.head()

Unnamed: 0,26,27,28,29,30,31,32,33,34,35,...,141,142,143,144,145,146,147,148,149,label_column
0,2.943264,2.523688,2.221715,2.60152,2.46718,2.623657,2.471477,2.215336,2.869438,2.895407,...,2.646452,2.997843,2.54626,2.423437,2.825879,2.566278,2.403595,2.396183,2.684211,1.0
1,2.710397,2.550716,2.275302,2.27752,2.014946,2.128915,2.118399,2.488584,2.308198,2.425818,...,2.024768,2.424598,2.349128,2.325874,2.524994,2.862275,2.060383,2.505475,2.334364,1.0
2,2.899312,2.628782,2.597816,2.901625,2.763677,2.822468,2.819629,2.999716,2.389219,2.339056,...,2.625961,2.962051,2.420763,2.411974,2.361735,2.667143,2.073825,2.388143,2.831569,1.0
3,0.547968,0.847056,0.249636,0.347463,0.004766,0.542968,0.855453,0.641879,0.489122,0.034674,...,0.864361,0.083862,0.616211,0.898456,0.117597,0.664931,0.813385,0.573604,0.117329,0.0
4,0.142479,0.312244,0.035217,0.38451,0.067104,0.683855,0.055373,0.380732,0.534018,0.505809,...,0.455617,0.51657,0.956458,0.97284,0.550108,0.503513,0.337278,0.735706,0.284006,0.0


In [225]:
# Standardize the sample you will use
# This transforms the sample data to have a mean of 0 and a standard deviation of 1.
def standardize(sample):
    return (sample - np.mean(sample)) / np.std(sample)

In [227]:
# Kolmogorov-Smirnov test function
def ks_test(sample):
    # Sort the sample
    sample_sorted = np.sort(sample)
    # Evaluate the empirical CDF (ECDF)
    ecdf = np.arange(1, len(sample_sorted)+1) / len(sample_sorted)
    # Evaluate the theoretical CDF
    cdf = stats.norm.cdf(sample_sorted)
    # Calculate the KS statistic
    ks_stat = np.max(np.abs(ecdf - cdf))
    # Calculate the p-value
    p_value = stats.kstest(sample_sorted, 'norm').pvalue
    return ks_stat, p_value

# Select one feature from the dataset (Ex: assuming the first column is 0)---> this is where you include df and column index
sample = df_KS.iloc[:, 0] 

# Standardize the sample
standardized_sample = standardize(sample)

# Perform the KS test on standardize sample
ks_stat, p_value = ks_test(standardized_sample)

# Print the result. This prints only one specific column as a string
print(f"KS Test Result for {df_table.columns[0]}: \nks_stat = {ks_stat:.4f}, \np-value = {p_value:.3e}, \nNormal distribution = {p_value > 0.05}")

# This supports or rejects the null hypothesis H0 
# If sample does not come from a normal distribution ---> reject H0
# If sample comes from a normal distribution ---> fail to reject H0
if p_value > 0.05:
    print("Fail to reject H0. Sample comes from the specified distribution")
else:
     print("Reject H0. Sample DOES NOT come from the specified distribution")


KS Test Result for 0: 
ks_stat = 0.1894, 
p-value = 1.647e-16, 
Normal distribution = False
Reject H0. Sample DOES NOT come from the specified distribution


In [229]:
#Temporary KS Test
#This will print a table of all the numerical columns outputs for KS test, p-value, distribution, and hypothesis result
var_count = len(df_KS.columns)-1

#creates an empty array to print values in a table
results = [] 

for i in range(0, var_count):

    # Select one feature from the dataset (Example: assuming the first column is numeric)
    sample = df_KS.iloc[:, i]  # Change the column index as needed

    # Standardize the sample
    standardized_sample = standardize(sample)

    # Perform the KS test on standardize sample
    ks_stat, p_value = ks_test(standardized_sample)

    # Determine if we reject or fail to reject the null hypothesis
    # If sample does not come from a normal distribution ---> reject H0
    # If sample comes from a normal distribution ---> fail to reject H0
    normal_dist = p_value > 0.05
    hypothesis_result = "Fail to reject H0" if normal_dist else "Reject H0"

    # Append results to the list
    results.append({
        "Feature": df_KS.columns[i],
        "KS Statistic": f"{ks_stat:.4f}",
        "P-Value": f"{p_value:.3e}",
        "Normal Distribution": normal_dist,
        "Hypothesis Result": hypothesis_result})
    
# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df.to_string(index=False))


 Feature KS Statistic   P-Value  Normal Distribution Hypothesis Result
      26       0.1894 1.647e-16                False         Reject H0
      27       0.1862 1.180e-15                False         Reject H0
      28       0.1883 5.476e-16                False         Reject H0
      29       0.1954 3.392e-17                False         Reject H0
      30       0.1916 1.516e-16                False         Reject H0
      31       0.1893 1.693e-16                False         Reject H0
      32       0.1828 2.046e-15                False         Reject H0
      33       0.1958 2.897e-17                False         Reject H0
      34       0.1887 2.156e-16                False         Reject H0
      35       0.1892 1.773e-16                False         Reject H0
      36       0.1878 3.060e-16                False         Reject H0
      37       0.1879 2.872e-16                False         Reject H0
      38       0.1908 2.077e-16                False         Reject H0
      