In [4]:
import numpy as np
import pandas as pd
from numpy import linalg
from sklearn.model_selection import train_test_split

# Upload Dataset

In [6]:
data1 = pd.read_csv("/Users/elleemortensen/Documents/GitHub/BP24/Ellee/Data/Gaussian/gaussian_orig.csv", header=None)
data2 = pd.read_csv("/Users/elleemortensen/Documents/GitHub/BP24/Ellee/Data/Uniform/uniform_orig.csv", header=None)
data3 = pd.read_csv("/Users/elleemortensen/Documents/GitHub/BP24/Ellee/Data/Uniform/uniform_new.csv", header=None)

In [8]:
subset_data2 = data2.iloc[:, -13:]
subset_data3 = data3.iloc[:, -13:]

In [32]:
# Print shapes for debugging
print(f"Shape of data1: {data1.shape}")
print(f"Shape of data2: {subset_data2.shape}")
print(f"Shape of data3: {subset_data3.shape}")

Shape of data1: (172, 13)
Shape of data2: (343, 13)
Shape of data3: (342, 13)


# Concat Synthetic + Augmented dataset

In [14]:
for i in range(-5,0):
    data1.iloc[:,i] = data1.iloc[:,i].astype(int) # Integer
    data1.iloc[:,i] = data1.iloc[:,i].astype('category') # Categories

In [16]:
for i in range(-5,0):
    subset_data2.iloc[:,i] = subset_data2.iloc[:,i].astype(int) # Integer
    subset_data2.iloc[:,i] = subset_data2.iloc[:,i].astype('category') # Categories

1      0.0
2      1.0
3      0.0
4      0.0
      ... 
338    0.0
339    0.0
340    0.0
341    0.0
342    0.0
Name: 20, Length: 343, dtype: category
Categories (2, float64): [0.0, 1.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  subset_data2.iloc[:,i] = subset_data2.iloc[:,i].astype('category') # Categories
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
338    0.0
339    0.0
340    1.0
341    0.0
342    1.0
Name: 21, Length: 343, dtype: category
Categories (2, float64): [0.0, 1.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  subset_data2.iloc[:,i] = subset_data2.iloc[:,i].astype('category') # Categories
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
338    0.0
339    0.0
340    0.0
341    0.0
342    0.0
Name: 22, Length: 343, dtype: category
Categories (2, float64): [0.0, 1.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  subset_da

In [18]:
for i in range(-5,0):
    subset_data3.iloc[:,i] = subset_data3.iloc[:,i].astype(int) # Integer
    subset_data3.iloc[:,i] = subset_data3.iloc[:,i].astype('category') # Categories

1      0.0
2      1.0
3      0.0
4      0.0
      ... 
337    0.0
338    0.0
339    0.0
340    0.0
341    0.0
Name: 20, Length: 342, dtype: category
Categories (2, float64): [0.0, 1.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  subset_data3.iloc[:,i] = subset_data3.iloc[:,i].astype('category') # Categories
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
337    0.0
338    0.0
339    0.0
340    0.0
341    0.0
Name: 21, Length: 342, dtype: category
Categories (2, float64): [0.0, 1.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  subset_data3.iloc[:,i] = subset_data3.iloc[:,i].astype('category') # Categories
1      0.0
2      1.0
3      0.0
4      0.0
      ... 
337    1.0
338    0.0
339    0.0
340    0.0
341    0.0
Name: 22, Length: 342, dtype: category
Categories (2, float64): [0.0, 1.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  subset_da

In [34]:
# Concat (adding rows)
combined_df = np.vstack((subset_data2, subset_data3))

# Reset the index to ensure a clean, continuous index
#combined_df.reset_index(drop=True, inplace=True)
combined_df.shape
combined_df
# change numpy array into a dataframe
combined_data = pd.DataFrame(combined_df, columns=subset_data2.columns)
combined_data
# Convert the last 4 columns to categorical
for column in combined_data.columns[-5:]:
    combined_data[column] = combined_data[column].astype('category')

# Verify the changes
print(combined_data.dtypes)

12     float64
13     float64
14     float64
15     float64
16     float64
17     float64
18     float64
19     float64
20    category
21    category
22    category
23    category
24    category
dtype: object


# Split Dataset

In [20]:
# Split dataset into X and y
X = data1.iloc[:, :-1]  # All columns except the last one
y = data1.iloc[:, -1]   # Only the last column

# Split dataset into X_train and y_train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Split dataset into X and y
X = combined_data.iloc[:, :-1]  # All columns except the last one
y = combined_data.iloc[:, -1]   # Only the last column

# Split dataset into X_train and y_train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Splitting X_train into numerical subset 
numerical_df = X_train.select_dtypes(include = ['float', 'float64'])

# Splitting X_train into categorical subset 
categorical_df = X_train.select_dtypes(exclude=['float', 'float64'])

# Correlation between columns test (Matrix)

In [40]:
##################### Correlation between columns (numerical) Code ############################
# Takes the X_train data to find correlation between NUMERICAL features
def num_corr(X_train_numerical):
    matrix = X_train_numerical.corr(method='pearson')
    print("---------------------------Correlation Matrix------------------------- \n", matrix)
     
#Calls the function so the matrix prints out    
correlation_matrix = num_corr(numerical_df)

# Print the dataframe to verify
print(correlation_matrix)

#np.savetxt('correlation_matrix.csv', correlation_matrix, delimiter=',')

---------------------------Correlation Matrix------------------------- 
           12        13        14        15        16        17        18  \
12  1.000000  0.157319  0.134368  0.111432  0.134634  0.183681  0.116678   
13  0.157319  1.000000  0.099604  0.117605  0.111486  0.095595  0.089879   
14  0.134368  0.099604  1.000000  0.106509  0.034420  0.131049  0.060834   
15  0.111432  0.117605  0.106509  1.000000  0.118005  0.100133  0.011884   
16  0.134634  0.111486  0.034420  0.118005  1.000000  0.163528  0.152705   
17  0.183681  0.095595  0.131049  0.100133  0.163528  1.000000  0.079526   
18  0.116678  0.089879  0.060834  0.011884  0.152705  0.079526  1.000000   
19  0.145837  0.126652  0.121208  0.085087  0.076828  0.137600  0.069584   

          19  
12  0.145837  
13  0.126652  
14  0.121208  
15  0.085087  
16  0.076828  
17  0.137600  
18  0.069584  
19  1.000000  
None


In [26]:
correlation_matrix = numerical_df.corr()
correlation_df1 = pd.DataFrame(correlation_matrix)
print(correlation_df1)

correlation_df1.shape

          0         1         2         3         4         5         6  \
0  1.000000  0.190323  0.213723  0.161948  0.210775  0.124851  0.215383   
1  0.190323  1.000000  0.093142  0.150070  0.141232  0.149117  0.136150   
2  0.213723  0.093142  1.000000  0.174529  0.183100  0.236549  0.053457   
3  0.161948  0.150070  0.174529  1.000000  0.203168  0.285518  0.200924   
4  0.210775  0.141232  0.183100  0.203168  1.000000  0.080916  0.269330   
5  0.124851  0.149117  0.236549  0.285518  0.080916  1.000000  0.283308   
6  0.215383  0.136150  0.053457  0.200924  0.269330  0.283308  1.000000   
7  0.278531  0.106609  0.154279  0.227292  0.187540  0.244414  0.239216   

          7  
0  0.278531  
1  0.106609  
2  0.154279  
3  0.227292  
4  0.187540  
5  0.244414  
6  0.239216  
7  1.000000  


(8, 8)

In [42]:
correlation_matrix = numerical_df.corr()
correlation_df2 = pd.DataFrame(correlation_matrix)
print(correlation_df2)

correlation_df2.shape

          12        13        14        15        16        17        18  \
12  1.000000  0.157319  0.134368  0.111432  0.134634  0.183681  0.116678   
13  0.157319  1.000000  0.099604  0.117605  0.111486  0.095595  0.089879   
14  0.134368  0.099604  1.000000  0.106509  0.034420  0.131049  0.060834   
15  0.111432  0.117605  0.106509  1.000000  0.118005  0.100133  0.011884   
16  0.134634  0.111486  0.034420  0.118005  1.000000  0.163528  0.152705   
17  0.183681  0.095595  0.131049  0.100133  0.163528  1.000000  0.079526   
18  0.116678  0.089879  0.060834  0.011884  0.152705  0.079526  1.000000   
19  0.145837  0.126652  0.121208  0.085087  0.076828  0.137600  0.069584   

          19  
12  0.145837  
13  0.126652  
14  0.121208  
15  0.085087  
16  0.076828  
17  0.137600  
18  0.069584  
19  1.000000  


(8, 8)

# Differences in Matrices (Frobenius Norm)

In [54]:
# Convert the dataframes to numpy arrays
matrix1 = correlation_df1.to_numpy()
matrix2 = correlation_df2.to_numpy()

# Compute the Frobenius norm of the difference between the matrices. 
frobenius_abs = np.linalg.norm(matrix1 - matrix2, ord='fro')
frobenius_rel = frobenius_abs/np.linalg.norm(matrix1, ord='fro')

print(f"Frobenius abs difference: {frobenius_abs: .3f}")
print(f"Frobenius rel difference: {frobenius_rel: .3f}")

Frobenius norm difference:  0.781
Frobenius abs difference:  0.245


# Differences in Arrays (L2 Norm)

In [None]:
# Compute the L2 norm (Euclidean norm) of the difference between the arrays
# L2_norm = np.linalg.norm(array1 - array2, ord=2)

# print(f"L2 norm difference: {L2_norm: .3f}")