# Import Data

In [28]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from itertools import combinations

# Location of the raw numeric matrix on disk.
# NOTE(review): this is an absolute, user-specific path; the notebook only runs
# on a machine that has this exact directory layout.
DATA_PATH = "/Users/elleemortensen/Documents/GitHub/BP24/Ellee/gaussian_large_d_1.tex"

# Parse the text file into a NumPy array
data = np.loadtxt(DATA_PATH)

# Independent NumPy copy of the loaded values
array = np.array(data)

# Wrap the array in a DataFrame (columns get default integer labels)
df = pd.DataFrame(array)

# Preview the first rows
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,8.544316,9.996471,10.321597,8.227151,8.896668,9.257801,9.370844,8.794788,8.780338,7.98623,...,7.400718,9.613205,7.522342,8.62344,8.639828,8.749055,9.208908,9.060524,8.933239,0.0
1,4.418327,5.97046,3.906164,6.58564,5.378763,4.871757,7.19241,4.951841,5.367239,4.786619,...,4.874917,4.508863,5.760392,5.427105,5.100361,6.743385,4.463974,5.194206,3.70531,1.0
2,10.169981,9.15091,8.771904,8.826778,10.082657,12.241785,10.174273,9.254439,9.688548,9.808325,...,7.719226,8.556954,7.247095,8.142212,8.558377,8.417857,8.760038,8.464984,9.188946,0.0
3,5.115883,5.740511,5.501883,5.298259,4.698823,6.016974,6.04285,6.639564,4.409195,4.885787,...,5.039694,4.1939,6.02352,6.168796,4.758035,5.899085,5.06944,6.234275,3.800722,1.0
4,8.101622,9.751511,8.432083,9.260282,9.174719,10.507783,8.809413,9.013208,8.034601,8.611893,...,6.61791,7.79099,10.612359,9.874875,8.924992,9.743395,8.162828,8.706789,9.728883,0.0


# Convert 25/150 Columns into Categorical Variables

In [30]:
# Convert the first 26 columns (positions 0-25) from floats to integer-valued
# categories: floats are truncated to int, then cast to the 'category' dtype.
# NOTE(review): the section heading mentions 25 columns, but range(26) covers
# 26 of them -- confirm the intended count.
#
# The columns are replaced by label (df[col] = ...) instead of being written
# through df.iloc[:, i] = ...: positional whole-column assignment tries to
# store the new values inside the existing float64 column, so the int/category
# dtype can be silently discarded (this is the behavior in pandas >= 2.0).
for i in range(26):
    col = df.columns[i]
    df[col] = df[col].astype(int).astype('category')

# Turn label (column at position 150) into a categorical label
label_name = df.columns[150]
df[label_name] = df[label_name].astype('category')

In [31]:
# Preview the first rows of df after the categorical conversion above
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,8.0,9.0,10.0,8.0,8.0,9.0,9.0,8.0,8.0,7.0,...,7.400718,9.613205,7.522342,8.62344,8.639828,8.749055,9.208908,9.060524,8.933239,0.0
1,4.0,5.0,3.0,6.0,5.0,4.0,7.0,4.0,5.0,4.0,...,4.874917,4.508863,5.760392,5.427105,5.100361,6.743385,4.463974,5.194206,3.70531,1.0
2,10.0,9.0,8.0,8.0,10.0,12.0,10.0,9.0,9.0,9.0,...,7.719226,8.556954,7.247095,8.142212,8.558377,8.417857,8.760038,8.464984,9.188946,0.0
3,5.0,5.0,5.0,5.0,4.0,6.0,6.0,6.0,4.0,4.0,...,5.039694,4.1939,6.02352,6.168796,4.758035,5.899085,5.06944,6.234275,3.800722,1.0
4,8.0,9.0,8.0,9.0,9.0,10.0,8.0,9.0,8.0,8.0,...,6.61791,7.79099,10.612359,9.874875,8.924992,9.743395,8.162828,8.706789,9.728883,0.0


# Train Test Split

In [33]:
from sklearn.model_selection import train_test_split

# Features are the columns at positions 1-149; the label is the last column.
# NOTE(review): the column at position 0 is not part of the feature slice --
# confirm that leaving it out is intended.
features_all = df.iloc[:, 1:150]
labels_all = df.iloc[:, -1]

# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    labels_all,
    test_size=0.2,
    random_state=52,
)

# Subset into Categorical

In [35]:
# Keep only the category-typed training columns and attach the training labels
# (also categorical) as an extra 'y_train' column.
categorical_df = X_train.select_dtypes(include=['category']).assign(y_train=y_train)
categorical_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,17,18,19,20,21,22,23,24,25,y_train
159,4.0,3.0,5.0,5.0,4.0,6.0,5.0,4.0,4.0,5.0,...,3.0,5.0,5.0,4.0,4.0,6.0,3.0,4.0,5.0,1.0
198,7.0,5.0,9.0,6.0,8.0,8.0,10.0,8.0,8.0,8.0,...,8.0,9.0,8.0,9.0,9.0,8.0,9.0,10.0,8.0,0.0
259,5.0,4.0,4.0,8.0,5.0,5.0,4.0,4.0,5.0,5.0,...,6.0,2.0,2.0,5.0,3.0,5.0,3.0,4.0,3.0,1.0
301,5.0,4.0,4.0,3.0,4.0,6.0,6.0,3.0,4.0,5.0,...,5.0,4.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,1.0
220,6.0,5.0,4.0,4.0,5.0,4.0,3.0,4.0,2.0,4.0,...,5.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,6.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,8.0,9.0,8.0,9.0,8.0,9.0,8.0,7.0,8.0,7.0,...,9.0,9.0,10.0,10.0,9.0,7.0,9.0,6.0,9.0,0.0
151,10.0,8.0,9.0,7.0,8.0,8.0,10.0,8.0,6.0,9.0,...,8.0,8.0,10.0,9.0,10.0,8.0,10.0,8.0,8.0,0.0
13,8.0,10.0,9.0,9.0,9.0,9.0,9.0,9.0,8.0,9.0,...,7.0,8.0,8.0,7.0,9.0,8.0,7.0,9.0,8.0,0.0
267,9.0,8.0,9.0,8.0,9.0,9.0,8.0,8.0,9.0,9.0,...,8.0,8.0,9.0,9.0,8.0,10.0,11.0,7.0,9.0,0.0


# Subset into Numerical

In [37]:
# Keep only the float64-typed (numerical) columns of X_train
numerical_df = X_train.select_dtypes(include='float64')
numerical_df

Unnamed: 0,26,27,28,29,30,31,32,33,34,35,...,140,141,142,143,144,145,146,147,148,149
159,7.099923,4.447043,5.142696,4.684034,3.140266,5.970080,4.458111,6.821168,4.611144,5.468471,...,4.931065,4.580602,5.598586,5.066765,6.980444,4.241523,5.533661,4.617902,6.839565,4.687857
198,6.964896,8.728623,10.292001,8.961336,10.257361,8.226621,10.116211,7.788209,10.030794,8.980125,...,9.262447,7.500312,9.892198,9.084443,9.162072,9.110184,7.417345,8.248274,9.210940,7.818947
259,4.336846,6.766429,5.563906,4.775484,4.545495,4.727920,4.613554,4.983634,5.316882,6.082599,...,5.433964,3.753876,4.715492,4.118385,5.010725,6.065753,4.078235,5.857303,6.684675,5.537256
301,4.647010,4.119629,4.914772,4.795004,4.159470,4.580232,3.186331,4.648345,5.739554,4.830495,...,3.476744,5.033208,5.109841,6.516130,6.214287,4.364014,4.898977,5.577521,4.561627,5.769038
220,3.999263,4.421098,5.491960,5.492742,5.391402,6.186725,4.034506,4.416704,4.927945,5.888376,...,4.790503,4.329447,5.558376,5.244435,2.823911,4.978521,3.848766,5.362332,5.531697,5.073712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,9.918155,9.413481,8.969330,7.254288,8.311788,11.207003,9.965159,8.829878,9.348247,9.154468,...,8.466994,9.657615,8.086505,8.934959,8.561990,9.941661,10.463104,9.791351,9.770465,9.905881
151,7.877472,8.079200,9.203012,7.456393,8.202167,10.394020,8.982082,8.865251,6.836831,10.060603,...,7.869146,9.968630,9.671245,9.019262,7.229522,8.654626,10.726847,8.956065,8.072554,9.507621
13,10.267841,9.388828,8.966013,9.310424,8.265621,9.876220,9.183961,9.475048,9.882469,9.257195,...,9.556609,9.017620,7.403879,8.547498,9.116467,9.950959,10.130605,9.899564,8.646721,10.176696
267,7.560877,9.625373,8.045073,9.367074,8.695015,9.131595,10.416486,8.182030,6.975683,9.829547,...,8.915012,8.333260,10.066706,9.760086,8.763877,9.163495,8.420628,11.016699,9.471193,7.833822


# Histogram: Feature Distributions

In [39]:
########################## Histogram/Graphing ###############################
# Prints a 10-bin histogram summary (edges, heights, probabilities) for every
# column of df. A DataFrame is always two-dimensional, so no reshaping is
# needed before iterating over its columns.

# Number of columns in the dataset (each one is summarised below)
num_features = df.shape[1]

# Loop through each column
for feature_idx in range(num_features):
    # Extract the current column
    feature_df = df.iloc[:, feature_idx]

    # Compute histogram with 10 equal-width bins
    hist, bin_edges = np.histogram(feature_df, bins=10)

    # Total number of observations; computed once and reused for every bin
    total = hist.sum()

    # Print feature number (1-based)
    print(f"Feature {feature_idx + 1}:")

    # Print bin edges
    print("Bin Edges:", bin_edges)

    # Bin heights as a plain list
    bin_heights = list(hist)
    print("Array with bin heights:", bin_heights)

    # Bin probabilities: heights normalised so they sum to 1
    bin_probs = hist / total
    print("Array with bin probabilities:", bin_probs)

    # Per-bin report: range, height and probability
    for i, height in enumerate(hist):
        bin_range = f"{bin_edges[i]:.2f} to {bin_edges[i+1]:.2f}"  # Bin range
        bin_probability = height / total  # Bin probability
        print(f"Bin {i + 1} ({bin_range}): Height = {height}, Probability = {bin_probability:.2f}")

    # Separator between features for clarity
    print("\n" + "="*50 + "\n")

Feature 1:
Bin Edges: [ 2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]
Array with bin heights: [7, 40, 84, 82, 35, 41, 92, 87, 28, 4]
Array with bin probabilities: [0.014 0.08  0.168 0.164 0.07  0.082 0.184 0.174 0.056 0.008]
Bin 1 (2.00 to 3.00): Height = 7, Probability = 0.01
Bin 2 (3.00 to 4.00): Height = 40, Probability = 0.08
Bin 3 (4.00 to 5.00): Height = 84, Probability = 0.17
Bin 4 (5.00 to 6.00): Height = 82, Probability = 0.16
Bin 5 (6.00 to 7.00): Height = 35, Probability = 0.07
Bin 6 (7.00 to 8.00): Height = 41, Probability = 0.08
Bin 7 (8.00 to 9.00): Height = 92, Probability = 0.18
Bin 8 (9.00 to 10.00): Height = 87, Probability = 0.17
Bin 9 (10.00 to 11.00): Height = 28, Probability = 0.06
Bin 10 (11.00 to 12.00): Height = 4, Probability = 0.01


Feature 2:
Bin Edges: [ 2.   2.9  3.8  4.7  5.6  6.5  7.4  8.3  9.2 10.1 11. ]
Array with bin heights: [8, 37, 82, 84, 40, 39, 91, 87, 26, 6]
Array with bin probabilities: [0.016 0.074 0.164 0.168 0.08  0.078 0.182 0.174 0.052 0.01

# Correlation between NUMERICAL columns

In [41]:
##################### Correlation between columns (numerical) Code ############################
# takes the X_train data to find correlation between numerical columns
def num_corr(X_train):
    """Print the Pearson correlation matrix of the columns of ``X_train``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose pairwise column correlations are computed.

    Returns
    -------
    None
        The matrix is only printed, preceded by a "Correlation Matrix:" header.
    """
    print("Correlation Matrix: \n", X_train.corr(method='pearson'))
     
# Print the Pearson correlation matrix of the numerical training features
num_corr(numerical_df)

Correlation Matrix: 
           26        27        28        29        30        31        32   \
26   1.000000  0.813871  0.808895  0.813182  0.795954  0.824211  0.800541   
27   0.813871  1.000000  0.807611  0.822460  0.804108  0.825418  0.810229   
28   0.808895  0.807611  1.000000  0.800935  0.803326  0.807871  0.782464   
29   0.813182  0.822460  0.800935  1.000000  0.799280  0.825252  0.811518   
30   0.795954  0.804108  0.803326  0.799280  1.000000  0.815924  0.794541   
..        ...       ...       ...       ...       ...       ...       ...   
145  0.798140  0.809563  0.823700  0.810280  0.784386  0.815382  0.797333   
146  0.824845  0.826522  0.816540  0.809691  0.776747  0.813120  0.794000   
147  0.800442  0.817071  0.803210  0.801450  0.812326  0.794766  0.803663   
148  0.808984  0.829856  0.797791  0.822089  0.793106  0.804635  0.808377   
149  0.781559  0.783944  0.784041  0.791923  0.782808  0.780687  0.779287   

          33        34        35   ...       140     

In [42]:
# Print the Pearson correlation matrix of the CATEGORICAL dataframe
# (the category-typed training columns plus the 'y_train' label column).
# NOTE(review): Pearson correlation is defined for numeric data -- confirm it
# is the intended measure for these category-typed columns.
print(categorical_df.corr(method='pearson'))

                1         2         3         4         5         6         7  \
1        1.000000  0.774806  0.791700  0.788169  0.771693  0.773023  0.793592   
2        0.774806  1.000000  0.763108  0.790478  0.790220  0.787186  0.774365   
3        0.791700  0.763108  1.000000  0.768370  0.750188  0.759413  0.768104   
4        0.788169  0.790478  0.768370  1.000000  0.778953  0.772494  0.781701   
5        0.771693  0.790220  0.750188  0.778953  1.000000  0.794333  0.764393   
6        0.773023  0.787186  0.759413  0.772494  0.794333  1.000000  0.802827   
7        0.793592  0.774365  0.768104  0.781701  0.764393  0.802827  1.000000   
8        0.800071  0.787455  0.767740  0.795044  0.781017  0.805516  0.806637   
9        0.776526  0.786717  0.765803  0.749415  0.770198  0.773713  0.795664   
10       0.791246  0.776783  0.765903  0.781974  0.790010  0.791566  0.804127   
11       0.778201  0.770649  0.760309  0.779252  0.757417  0.750704  0.780424   
12       0.789501  0.788835 

In [43]:
# Print the Pearson correlation matrix of the NUMERICAL dataframe
# (the same matrix that num_corr(numerical_df) prints)
print(numerical_df.corr(method='pearson'))

          26        27        28        29        30        31        32   \
26   1.000000  0.813871  0.808895  0.813182  0.795954  0.824211  0.800541   
27   0.813871  1.000000  0.807611  0.822460  0.804108  0.825418  0.810229   
28   0.808895  0.807611  1.000000  0.800935  0.803326  0.807871  0.782464   
29   0.813182  0.822460  0.800935  1.000000  0.799280  0.825252  0.811518   
30   0.795954  0.804108  0.803326  0.799280  1.000000  0.815924  0.794541   
..        ...       ...       ...       ...       ...       ...       ...   
145  0.798140  0.809563  0.823700  0.810280  0.784386  0.815382  0.797333   
146  0.824845  0.826522  0.816540  0.809691  0.776747  0.813120  0.794000   
147  0.800442  0.817071  0.803210  0.801450  0.812326  0.794766  0.803663   
148  0.808984  0.829856  0.797791  0.822089  0.793106  0.804635  0.808377   
149  0.781559  0.783944  0.784041  0.791923  0.782808  0.780687  0.779287   

          33        34        35   ...       140       141       142  \
26 

# Chi-squared test

In [45]:
############### CHI-SQUARE TEST FOR LABEL V. ALL FEATURES ########################
    # Finds dependency between all features in X_train & the label in y_train
def chi_squared_fvl(X_train, y_train):
    """Chi-squared independence test of every column of ``X_train`` against the label.

    For each column a contingency table against ``y_train`` is built and passed
    to ``scipy.stats.chi2_contingency``. The statistic and p-value per column
    are printed as a table.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Columns to test. The frame is NOT modified; the label is attached to an
        internal copy only.
    y_train : pandas.Series
        Label values; aligned to ``X_train`` on the index when attached.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Work on a copy: assigning the label column on the caller's frame would
    # leave an extra 'label' column behind in the caller's data.
    df = X_train.copy()
    df['label'] = y_train

    # Number of features, excluding label (the last column)
    var_count = len(df.columns) - 1

    # One record per feature; turned into a table at the end
    results = []

    for i in range(var_count):

        # Create contingency table of the current feature v. label
        crosstab = pd.crosstab(df.iloc[:, i], df.iloc[:, -1])

        # Run the test once and keep the statistic and the p-value
        chi2, p, _dof, _expected = stats.chi2_contingency(crosstab)

        # Append results to the list
        results.append({
            "Feature": df.columns[i],
            "Chi Squared Statistic": chi2,
            "P-Value": p})

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)

    # Print the label name followed by the results table
    print("Label:", df.columns.values[-1])
    print(results_df.to_string(index=False))
    
# categorical_df also carries the labels as a 'y_train' column; drop it for the
# call so the label is not tested against itself as if it were a feature.
chi_squared_fvl(categorical_df.drop(columns=['y_train'], errors='ignore'), y_train)

Label: label
Feature  Chi Squared Statistic      P-Value
      1             369.206561 5.038195e-74
      2             352.418053 1.222384e-69
      3             361.951650 7.153309e-71
      4             373.555487 5.965145e-75
      5             359.565887 3.713288e-71
      6             371.776606 9.460805e-74
      7             381.646863 1.124606e-76
      8             371.830599 9.214134e-74
      9             354.285714 7.584728e-71
     10             389.234651 1.837621e-77
     11             359.955925 4.706175e-72
     12             371.723707 9.708882e-74
     13             369.281960 3.206282e-73
     14             358.649882 8.928478e-72
     15             359.742781 5.224641e-72
     16             372.279818 1.115469e-74
     17             360.746466 3.193893e-72
     18             375.292158 1.066593e-73
     19             371.832212 1.389425e-74
     20             378.170180 6.196060e-76
     21             384.728139 1.669968e-76
     22            

In [51]:
############### CHI-SQUARE TEST FOR ALL FEATURES V. ALL FEATURES ########################

# Feature columns only. Label columns are removed by name rather than by
# assuming the label is the single last column: categorical_df holds the labels
# as 'y_train' and may also hold a 'label' column.
chi_features_df = categorical_df.drop(columns=['y_train', 'label'], errors='ignore')

# Number of features, excluding label
var_count = len(chi_features_df.columns)

# Every ordered pair (i, j) is tested, including a feature against itself
for j in range(var_count):

    for i in range(var_count):

        # Create contingency table
        crosstab = pd.crosstab(chi_features_df.iloc[:, i], chi_features_df.iloc[:, j])

        # Passing contingency table into chi-squared test
        chi, p, dof, exp = stats.chi2_contingency(crosstab)
        print("V", i, "V", j)
        print("Chi-squared:", chi)
        print("p-value:", p)
        print(" ")
    

V 0 V 0
Chi-squared: 3600.0
p-value: 0.0
 
V 1 V 0
Chi-squared: 388.27089679454537
p-value: 1.1242129876257257e-38
 
V 2 V 0
Chi-squared: 402.14537419478506
p-value: 3.7064195816422433e-38
 
V 3 V 0
Chi-squared: 430.66762012456167
p-value: 4.148129899659923e-49
 
V 4 V 0
Chi-squared: 421.1897289484738
p-value: 2.805586233495269e-44
 
V 5 V 0
Chi-squared: 398.749370129151
p-value: 1.9101494672232882e-40
 
V 6 V 0
Chi-squared: 387.9958135752825
p-value: 1.2725433619214173e-41
 
V 7 V 0
Chi-squared: 411.80840082228485
p-value: 1.1410417559421522e-42
 
V 8 V 0
Chi-squared: 384.5489230936647
p-value: 5.024073574527681e-41
 
V 9 V 0
Chi-squared: 434.7085566933299
p-value: 1.2961182864888597e-46
 
V 10 V 0
Chi-squared: 383.735690193342
p-value: 6.943284295511902e-41
 
V 11 V 0
Chi-squared: 394.8890760575076
p-value: 8.602186253636879e-40
 
V 12 V 0
Chi-squared: 444.84075968381103
p-value: 2.2401489991716125e-48
 
V 13 V 0
Chi-squared: 401.9015011985033
p-value: 4.845987168850375e-44
 
V 14 V 

In [None]:
# Pairwise chi-squared statistics and p-values for every pair of columns in the
# categorical frame, stored as symmetric matrices.
# (`combinations` is imported in the first cell of the notebook.)
data = categorical_df

# Extract variable names
variable_names = list(data.columns)

# Initialize matrices to store chi-squared and p-values.
# The diagonal is never written and therefore stays at 0.
num_variables = len(variable_names)
chi_squared = np.zeros((num_variables, num_variables))
p_values = np.zeros((num_variables, num_variables))

# Compute chi-squared and p-values for each unordered pair of variables
for i, j in combinations(range(num_variables), 2):
    contingency_table = pd.crosstab(data.iloc[:, i], data.iloc[:, j])

    # Run the test once and keep the statistic and the p-value
    chi2, p, _dof, _expected = stats.chi2_contingency(contingency_table)

    # The test is symmetric in its two variables, so fill both positions
    chi_squared[i, j] = chi_squared[j, i] = chi2
    p_values[i, j] = p_values[j, i] = p

# Create a DataFrame with variable names as index and columns
chi_squared_df = pd.DataFrame(chi_squared, index=variable_names, columns=variable_names)
p_values_df = pd.DataFrame(p_values, index=variable_names, columns=variable_names)

# Printing the matrix-like output with variable names
print("Chi-Squared Values:")
print(chi_squared_df)
print("\nP-Values:")
print(p_values_df)

# Kolmogorov-Smirnov Test

In [None]:
# Columns of the full dataset (df, not the training split) that are still
# float64; these are the inputs to the Kolmogorov-Smirnov tests below
KS_df = df.select_dtypes(include=['float64'])
KS_df

In [None]:
# Kolmogorov-Smirnov test function
def ks_test(sample):
    """One-sample Kolmogorov-Smirnov test of ``sample`` against the standard normal.

    The reference distribution is ``'norm'`` with its default parameters, so
    callers should standardize the sample first. Both the statistic and the
    p-value come from the same ``scipy.stats.kstest`` call, which guarantees
    they are consistent: the statistic is the two-sided maximum distance
    between the empirical and theoretical CDFs (checked on both sides of every
    jump of the empirical CDF).

    Parameters
    ----------
    sample : array-like of float
        Observations to test.

    Returns
    -------
    tuple
        ``(ks_stat, p_value)``.

    Raises
    ------
    ValueError
        If ``sample`` is empty.
    """
    # Sort the sample (order does not affect the test; kept for a stable input)
    sample_sorted = np.sort(np.asarray(sample))
    if sample_sorted.size == 0:
        raise ValueError("ks_test requires at least one observation")

    # Two-sided KS test against N(0, 1)
    result = stats.kstest(sample_sorted, 'norm')
    return result.statistic, result.pvalue

# Select one feature from the dataset (here: the first column of KS_df)
sample = KS_df.iloc[:, 0]  # Change the column index as needed

# Standardize the sample to zero mean and unit variance (z-score, population
# standard deviation), since ks_test compares against the standard normal.
# Done inline: no `standardize` helper is defined in this notebook.
standardized_sample = (sample - sample.mean()) / sample.std(ddof=0)

# Perform the KS test on the standardized sample
ks_stat, p_value = ks_test(standardized_sample)


In [None]:
# Report the statistic, the p-value and the 5%-level decision for the first column
print(f"KS Test Result for {KS_df.columns[0]}: \nks_stat = {ks_stat:.4f}, \np-value = {p_value:.3e}, \nNormal distribution = {p_value > 0.05}")

# Decision on the null hypothesis H0 (sample follows the reference distribution):
#   p-value >  0.05 ---> fail to reject H0
#   p-value <= 0.05 ---> reject H0
verdict = (
    "Fail to reject H0. Sample comes from the specified distribution"
    if p_value > 0.05
    else "Reject H0. Sample DOES NOT come from the specified distribution"
)
print(verdict)


In [None]:
# Data: the float64 columns of the full dataset
KS_df = df.select_dtypes(include=['float64'])

# Number of columns to test. Taken from KS_df itself so that every numerical
# column is covered (the categorical frame has a different number of columns).
var_count = KS_df.shape[1]


for i in range(var_count):

    # Select the i-th numerical column
    sample = KS_df.iloc[:, i]

    # Standardize the sample to zero mean and unit variance (z-score, population
    # standard deviation), since ks_test compares against the standard normal.
    # Done inline: no `standardize` helper is defined in this notebook.
    standardized_sample = (sample - sample.mean()) / sample.std(ddof=0)

    # Perform the KS test on the standardized sample
    ks_stat, p_value = ks_test(standardized_sample)

    # Print the result
    print(f"KS Test Result for {KS_df.columns[i]}: \nks_stat = {ks_stat:.4f}, \np-value = {p_value:.3e}, \nNormal distribution = {p_value > 0.05}")

    # Decision on the null hypothesis H0 (sample follows the reference distribution):
    #   p-value >  0.05 ---> fail to reject H0
    #   p-value <= 0.05 ---> reject H0
    if p_value > 0.05:
        print("Fail to reject H0. Sample comes from the specified distribution")
    else:
        print("Reject H0. Sample DOES NOT come from the specified distribution")
    print(" ")
