This project focuses on data transformation, dimensionality reduction, and calculating distance matrices using different transformation techniques. The dataset is normalized and the Euclidean distance is calculated between all pairs of rows. Similarly, the Haar wavelet transformation is applied to the row-normalized data and its Euclidean distance matrix is calculated. In addition, we apply a PCA transformation to the column-normalized data and calculate its Euclidean distance matrix as well. At the end, we compare the three Euclidean distance matrices.

In [15]:
# Importing necessary libraries for data manipulation, numerical calculation, and random operations. 
import pandas as pd
import numpy as np
import math
import random

In [16]:
# Read the dataset. header=None tells pandas the CSV has no header row, so
# the first line is treated as data and the columns get integer labels.
df = pd.read_csv('../data/Asgmnt1_data.csv',header=None)

In [17]:
# Normalizing the data by row (z-score): every value has its row mean
# subtracted and is divided by its row's sample standard deviation (ddof=1).
# Done with vectorized pandas operations instead of a per-value Python loop;
# axis=0 in sub/div aligns the per-row statistics with the row index, so the
# result keeps the index and columns of df.
row_means = df.mean(axis=1)
row_stds = df.std(axis=1, ddof=1)
normalized_data = df.sub(row_means, axis=0).div(row_stds, axis=0)

In [18]:
# Function to calculate the Euclidean distance between two rows
def calculate_euclidean_distance(row1, row2):
    """Return the Euclidean distance between two sequences of numbers.

    The values are paired positionally with ``zip``, so if the inputs have
    different lengths the extra trailing values of the longer one are ignored.

    Parameters
    ----------
    row1, row2 : iterable of numbers
        The two points to compare, coordinate by coordinate.

    Returns
    -------
    float
        Square root of the sum of squared coordinate differences
        (``0.0`` for empty inputs).
    """
    squared_differences = [(a - b) ** 2 for a, b in zip(row1, row2)]

    # Accumulate left to right, starting from integer zero.
    total = 0
    for term in squared_differences:
        total += term

    return math.sqrt(total)

In [19]:
# Function to construct a Euclidean distance matrix from a DataFrame
def construct_euclidean_distance_matrix(dataframe):
    """Build the symmetric matrix of pairwise Euclidean distances between rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame or 2-D array-like of numbers
        One observation per row, one feature per column.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` float array where entry ``[i, j]`` is the Euclidean
        distance between row ``i`` and row ``j``. The diagonal is zero.
    """
    # Convert once to a plain float array: extracting rows with .iloc for
    # every pair is by far the most expensive part of a naive double loop.
    values = np.asarray(dataframe, dtype=float)
    n = values.shape[0]
    distance_matrix = np.zeros((n, n))

    for i in range(n - 1):
        # Distances from row i to all later rows in one vectorized step.
        # The diagonal is skipped because a row's distance to itself is 0.
        differences = values[i + 1:] - values[i]
        distances = np.sqrt((differences ** 2).sum(axis=1))

        # Fill both triangles so the matrix is symmetric.
        distance_matrix[i, i + 1:] = distances
        distance_matrix[i + 1:, i] = distances

    return distance_matrix


In [8]:
# Constructing a Euclidean distance matrix from the row-normalized data.
# It is the first of the three matrices compared at the end of the notebook.
normalized_distance_matrix = construct_euclidean_distance_matrix(normalized_data)

In [10]:
# Function to create a Haar non-square matrix for transformation (used in wavelet transformations)
def haar_non_square_matrix(m, n):
    """Create an ``m x n`` Haar-style matrix of alternating signed blocks.

    Row 0 is constant ``1 / sqrt(n)``. For each following row ``level`` the
    block width is ``step = n // 2**level``; the row is filled with
    alternating blocks of ``+1 / sqrt(step)`` and ``-1 / sqrt(step)``, each
    ``step`` columns wide. Once ``step`` reaches zero the remaining rows are
    left as zeros.

    Parameters
    ----------
    m : int
        Number of rows.
    n : int
        Number of columns; must be at least ``m``.

    Returns
    -------
    numpy.ndarray
        The ``(m, n)`` matrix described above.

    Raises
    ------
    ValueError
        If ``n`` is smaller than ``m``.
    """
    if m > n:
        raise ValueError("Number of columns must be greater than or equal to the number of rows")

    H = np.zeros((m, n))

    # Constant first row.
    H[0, :] = 1 / np.sqrt(n)

    level = 1
    while level < m:
        step = n // (2 ** level)
        if step == 0:
            # Blocks can no longer be halved; later rows stay all-zero.
            break

        amplitude = 1 / np.sqrt(step)
        for start in range(0, n, 2 * step):
            middle = start + step
            H[level, start:middle] = amplitude
            H[level, middle:middle + step] = -amplitude

        level += 1

    return H

In [11]:
# Generate the Haar wavelet matrix for the wavelet transformation.
# The function returns a 128 x 16000 array, which is transposed to 16000 x 128.
h = haar_non_square_matrix(128, 16000).T

In [12]:
# Multiply the data with the Haar wavelet matrix.
# NOTE(review): `*` between a DataFrame and an array is an element-wise
# product, not a matrix product (`@`), so `h` has to match the shape of
# `normalized_data` -- confirm that element-wise scaling is really the
# intended wavelet transformation.
wavelet_transformed_normalized_data = normalized_data * h

In [14]:
# Build the Euclidean distance matrix from only the first four columns of the
# wavelet-transformed data.
wavelet_transformed_distance_matrix = construct_euclidean_distance_matrix(wavelet_transformed_normalized_data.iloc[:, :4])

In [15]:
# Save the wavelet distance matrix as comma-separated text, six decimals per value.
np.savetxt('wavelet_transformed_distance_matrix.csv', wavelet_transformed_distance_matrix, delimiter=',', fmt='%.6f')

In [3]:
# Normalizing the data by column (z-score): every value has its column mean
# subtracted and is divided by its column's sample standard deviation (ddof=1).
# Done with vectorized pandas operations instead of a per-value Python loop;
# the per-column Series align with df's columns, so the result keeps the
# index and columns of df.
column_means = df.mean(axis=0)
column_stds = df.std(axis=0, ddof=1)
column_normalized_data = (df - column_means) / column_stds

In [11]:
# Perform Principal Component Analysis (PCA) on the normalized data
def PCA(data_before_pca, n_principal_components):
    """Project the data onto its leading principal directions.

    The directions are the eigenvectors of ``X.T @ X`` with the largest
    eigenvalues. No centering or scaling is done here, so the caller is
    expected to pass data that is already centered (for example
    column-normalized data).

    Parameters
    ----------
    data_before_pca : pandas.DataFrame or 2-D array-like of numbers
        One observation per row, one feature per column.
    n_principal_components : int
        Number of leading directions to project onto.

    Returns
    -------
    data_after_pca : numpy.ndarray
        ``(n_samples, n_principal_components)`` array of projected data,
        ordered from the largest eigenvalue downwards.
    numpy.ndarray
        The first ``n_principal_components`` rows of the eigenvector matrix
        (see the review note at the return statement).
    """
    # Work on a plain float array so the linear algebra below does not depend
    # on the container type of the input.
    data = np.asarray(data_before_pca, dtype=float)

    # Unnormalized scatter matrix X^T X. For centered data it has the same
    # eigenvectors as np.cov(data, rowvar=False); only the eigenvalues are
    # scaled by a constant factor.
    covariance_matrix = data.T @ data

    # eigh is meant for symmetric matrices and returns eigenvalues in
    # ascending order, with the matching eigenvectors as columns.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # Reverse the column order so the largest eigenvalue comes first, then
    # keep the requested number of directions. Hard-coding the column
    # indices would only work for exactly 128 features and 4 components.
    components = eigenvectors[:, ::-1][:, :n_principal_components]

    data_after_pca = np.dot(data, components)

    # NOTE(review): this second value is the first rows of the eigenvector
    # matrix, i.e. neither the top eigenvalues nor the selected components.
    # It is kept unchanged so existing callers see the same shape; confirm
    # what callers actually expect before changing it.
    return data_after_pca, eigenvectors[:n_principal_components]

In [12]:
# Project the column-normalized data onto 4 principal components.
# (An earlier variant ran PCA on the row-normalized `normalized_data` instead.)
# NOTE(review): despite its name, `top_eigenvalues` receives whatever PCA
# returns as its second value, which is a slice of the eigenvector matrix.
pca_transformed_data, top_eigenvalues = PCA(column_normalized_data, 4)

In [14]:
# Sanity check: shape of the PCA projection (rows, components).
pca_transformed_data.shape

(16000, 4)

In [None]:
# Construct a Euclidean distance matrix from the PCA-transformed data.
# The projection is wrapped in a DataFrame, the input type the
# distance-matrix helper is written for.
pca_transformed_distance_matrix = construct_euclidean_distance_matrix(pd.DataFrame(pca_transformed_data))

In [95]:
# Function to generate two pairs of random indices from the matrix
def generate_random_pairs(matrix_size):
    """Draw two random ``(row, col)`` cell positions of a square matrix.

    Each coordinate is drawn uniformly from ``0 .. matrix_size - 1``. A draw
    is rejected and repeated when both positions are the same cell, or when
    the first position lies on the diagonal and the second position is in
    that same row.

    Parameters
    ----------
    matrix_size : int
        Side length of the matrix. With ``matrix_size == 1`` every draw is
        the same cell, so the function never returns.

    Returns
    -------
    tuple
        ``((row1, col1), (row2, col2))``.
    """
    upper = matrix_size - 1

    while True:
        first = (random.randint(0, upper), random.randint(0, upper))
        second = (random.randint(0, upper), random.randint(0, upper))

        same_cell = first == second
        # NOTE(review): this condition looks like it may have been meant to
        # reject the mirrored cell (row1 == col2 and row2 == col1) or
        # diagonal cells; it is reproduced as written -- confirm the intent.
        first_on_diagonal_same_row = first[0] == first[1] and second[0] == first[1]

        if not (same_cell or first_on_diagonal_same_row):
            return first, second

def compare_cells(cell1, cell2):
    """Describe how ``cell1`` relates to ``cell2``.

    Returns ``'greater'`` when ``cell1 > cell2``, ``'equal'`` when the two
    compare equal, and ``'smaller'`` in every other case.
    """
    if cell1 > cell2:
        return 'greater'
    return 'equal' if cell1 == cell2 else 'smaller'
    
# Function to compare random cells across three matrices
def compare_random_cells(matrix_list1, matrix_list2, matrix_list3, n):
    """Count how often three matrices agree on the ordering of two random cells.

    For each of ``n`` trials two cell positions are drawn with
    ``generate_random_pairs``. In every matrix the value at the first
    position is compared with the value at the second position via
    ``compare_cells``; the trial counts when all three matrices give the
    same verdict.

    Parameters
    ----------
    matrix_list1, matrix_list2, matrix_list3 : 2-D indexable (``m[row][col]``)
        The matrices to compare. The size is taken from the length of the
        first row of ``matrix_list1``; all matrices are assumed to share it.
    n : int
        Number of random trials.

    Returns
    -------
    int
        Number of trials in which all three comparisons agree.
    """
    matrix_size = len(matrix_list1[0])  # Assuming all matrices are the same size
    matrices = (matrix_list1, matrix_list2, matrix_list3)
    agreements = 0

    for _ in range(n):
        (row_a, col_a), (row_b, col_b) = generate_random_pairs(matrix_size)

        # One verdict per matrix for the same two cell positions.
        verdicts = [
            compare_cells(matrix[row_a][col_a], matrix[row_b][col_b])
            for matrix in matrices
        ]

        if verdicts[0] == verdicts[1] == verdicts[2]:
            agreements += 1

    return agreements

In [96]:
# Sample n random pairs of cells and count how often the three distance
# matrices (row-normalized, wavelet, PCA) agree on which cell is larger.
n= 100
count = compare_random_cells(normalized_distance_matrix, wavelet_transformed_distance_matrix, pca_transformed_distance_matrix, n)

In [None]:
# Number of sampled pairs (out of n) on which all three matrices agree.
count