# Import Data

In [47]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from itertools import combinations    


data = np.loadtxt("C:/Users/aceme/OneDrive/Documents/GitHub/BP24/Fabiana/Demos Fabi/uniform_large_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df = pd.DataFrame(array)

# Look at data
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,0.604431,0.724054,1.295053,0.495865,0.607451,0.547615,0.56415,0.46888,0.420394,0.910129,...,0.660629,1.325968,1.282151,0.60055,0.592177,0.776711,1.085891,1.153748,1.352572,1.0
1,1.180423,1.391002,1.184481,0.583052,1.21049,0.923676,1.185203,1.369972,1.201448,0.614857,...,0.892705,0.848612,1.298801,1.250497,0.547771,1.215082,0.940952,1.109552,1.181372,1.0
2,1.067779,0.718696,0.798901,1.369462,0.470935,0.566282,1.398846,1.015372,0.801271,1.33027,...,1.339399,0.417466,0.496915,0.661756,0.875185,1.293924,0.750581,0.742218,0.993983,1.0
3,0.368247,0.730771,0.134119,0.984532,0.397524,0.470181,0.025061,0.648142,0.016333,0.973801,...,0.086188,0.394613,0.252668,0.808593,0.587922,0.827502,0.862651,0.684517,0.149873,0.0
4,0.91976,0.577797,0.441661,0.862139,0.263016,0.393494,0.635624,0.657747,0.78192,0.56691,...,0.816635,0.31988,0.770176,0.919029,0.265299,0.983398,0.956898,0.175083,0.170124,0.0


# Convert 25/150 Columns into NON-CONSECUTIVE Categorical Variables

In [50]:
# Converting 25 columns from numerical floats -> categorical integers
for i in range(25):
    
    df.iloc[:,i] = df.iloc[:,i].round() # Rounding
    df.iloc[:,i] = df.iloc[:,i].astype(int) # Integer
    df.iloc[:,i] = df.iloc[:,i].astype('category') # Categories
    
# Turn label into categorical label
df.iloc[:,150] = df.iloc[:,150].astype('category')

In [52]:
df.head()
df[0].dtype

CategoricalDtype(categories=[0.0, 1.0], ordered=False, categories_dtype=float64)

# Train Test Split

In [55]:
from sklearn.model_selection import train_test_split
# Split dataset into X_train and y_train
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:150], df.iloc[:,-1], test_size=0.2, random_state=52)

In [57]:
y_train

159    0.0
198    1.0
259    1.0
301    1.0
220    0.0
      ... 
86     0.0
151    1.0
13     1.0
267    0.0
156    0.0
Name: 150, Length: 400, dtype: category
Categories (2, float64): [0.0, 1.0]

# Subset into Categorical

In [60]:
# Creating subset of only CATEGORICAL variables + LABEL
categorical_df = X_train.select_dtypes(exclude=['float64'])
categorical_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,19,20,21,22,23,24
159,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
198,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
259,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
301,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
220,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
151,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
13,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
267,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [62]:
# Check if all your rows are unique
print(len(categorical_df))
categorical_df.drop_duplicates()
print(len(categorical_df))

400
400


# Subset into Numerical

In [65]:
# Creating subset of only NUMERICAL variables from X_train
numerical_df = X_train.select_dtypes(include=['float64'])
numerical_df

Unnamed: 0,25,26,27,28,29,30,31,32,33,34,...,140,141,142,143,144,145,146,147,148,149
159,0.690486,0.724728,0.031008,0.085695,0.708439,0.174898,0.955817,0.894438,0.133235,0.574983,...,0.128504,0.585810,0.766640,0.340473,0.350218,0.274646,0.093124,0.001300,0.899399,0.368022
198,0.458346,0.899253,0.541733,0.912462,0.749055,0.802133,1.241716,1.130841,0.433225,0.673238,...,1.385530,0.600071,1.139251,0.826940,1.272623,1.018323,0.698481,1.365619,0.460799,1.089924
259,0.830562,0.744662,0.893746,0.687039,0.811907,0.724473,1.271633,0.419871,0.617389,0.678783,...,0.658827,0.488724,1.184376,1.056230,0.478523,0.910375,0.718602,0.753928,0.936283,0.423786
301,1.362552,0.925683,0.497489,0.700696,0.880002,0.470012,0.814202,0.843126,1.291244,1.235374,...,1.329065,0.528018,0.747393,1.137783,0.521072,1.136169,1.091905,0.533929,1.024574,0.561742
220,0.916990,0.722635,0.228897,0.223320,0.305287,0.875514,0.747599,0.673355,0.622995,0.832419,...,0.301774,0.271489,0.905137,0.619907,0.835196,0.589189,0.873329,0.891913,0.651581,0.984017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,0.707802,0.929381,0.149925,0.218498,0.094172,0.789945,0.474387,0.438386,0.701406,0.383875,...,0.261558,0.709312,0.034474,0.437128,0.989087,0.780784,0.299435,0.101811,0.088816,0.493946
151,1.369765,0.783977,0.707489,0.862420,0.802352,0.632215,1.116079,1.288400,1.362606,0.975385,...,0.767802,1.292120,0.790276,0.629266,1.389115,0.819113,1.372995,0.636902,1.295628,0.835012
13,0.797505,0.942278,1.229178,0.719263,1.188005,1.258559,1.003102,1.297534,1.376138,0.522838,...,1.128536,1.195708,0.590900,0.480853,1.021578,0.949964,1.075881,0.682236,0.734495,1.123123
267,0.364381,0.006219,0.229648,0.949955,0.361580,0.109717,0.557569,0.344360,0.228150,0.067130,...,0.191653,0.682845,0.228399,0.585276,0.089750,0.549333,0.279715,0.346601,0.601132,0.893580


# Fixing Chi-Squared FvL

In [68]:
##################### Chi-Square (F vs label column) Code ####################################
print("\n-----------------------Chi-squared test (Feature v. Label)-------------------------------")
    
# Finds dependency between all features in X_train & the label in y_train
def chi_squared_fvl(X_train, y_train):
        
    # Combining X_train and y_train
    df = X_train
    df['label'] = y_train

    # Number of features, excluding label
    var_count = len(df.columns)-1

    # Creates an empty array for Chi2 and P-values
    results = []

    # Creates an empty boolean array for contingency table cells <5
    below_5 = []

    for i in range(0, var_count):

        # Create contigency table of all features v. label
        contingency_table = pd.crosstab(df.iloc[:, i], df.iloc[:,-1])
        
        # Check if any cell in the contingency table is below 5 & appends
        below_5.append((contingency_table < 5).any().any())
            
        # Compute chi-squared and p-values
        chi2 = stats.chi2_contingency(contingency_table)[0]
        p = stats.chi2_contingency(contingency_table)[1]
            
        # Append results to the list
        results.append({
            "Feature": df.columns[i],
            "Chi Squared Statistic": chi2,
            "P-Value": p})

    # Create a dataFrame from the results
    results_df = pd.DataFrame(results)

    # Print warning if any cells are below 5
    if any(below_5):
        print("WARNING: The validity of this chi-squared test may be violated as there are \n         cells below 5 in your contingency table of observed values.") 
    
    # Print the dataFrame
    print("Label:", df.columns.values[-1])
    print(results_df.to_string(index=False))
    
chi_squared_fvl(categorical_df, y_train)


-----------------------Chi-squared test (Feature v. Label)-------------------------------
Label: label
 Feature  Chi Squared Statistic      P-Value
       1              76.935839 1.766031e-18
       2              93.634102 3.795908e-22
       3              84.404438 4.032416e-20
       4              58.614193 1.918277e-14
       5              73.605378 9.540439e-18
       6              69.925699 6.158091e-17
       7              59.315198 1.343350e-14
       8              90.107143 2.256057e-21
       9              72.777296 1.451369e-17
      10              52.978056 3.372956e-13
      11              73.498233 1.007263e-17
      12              66.083829 4.321467e-16
      13              93.419035 4.231614e-22
      14              78.507123 7.971032e-19
      15              93.419035 4.231614e-22
      16              78.242909 9.111759e-19
      17              75.777076 3.175665e-18
      18              72.777296 1.451369e-17
      19              72.839623 1.406251e