In [1]:
# Importing neccessary libraries
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Imoprting TrainData.csv and TestData.csv
# TrainData.csv and TestData.csv are required to be in the same
# folder where this current file is present
train_data = pd.read_csv('./TrainData.csv')
test_data = pd.read_csv('./TestData.csv')

train_labels = train_data.iloc[:, -1]
train_data = train_data.iloc[:, :-1]

test_labels = test_data.iloc[:, -1]
test_data = test_data.iloc[:, :-1]

In [2]:

import numpy as np

# Function to calculate Euclidean distance between two points
def dis(x1, x2):
    """
    Calculate the Euclidean distance between two points.

    Parameters:
    x1 (numpy.ndarray): The first point.
    x2 (numpy.ndarray): The second point.

    Returns:
    float: The Euclidean distance between x1 and x2.
    """
    return np.linalg.norm(x1 - x2)

# Function to perform classification using the K - Nearest Neighbor approach
def myclassifier(Train, Trainlabel, Test, k=3):
    """
    Train is the training data
    Trainlabel is the training labels
    Test is the testing data
    k is the number of neighbors to consider (default is 3)
    The k-NN classifier assigns a test point to the class label
    that is most common among its k nearest neighbors in the training data.
    You can specify the value of k as a parameter (default is 3)
    """
    # Create a k-NN classifier
    classifier = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the training data
    classifier.fit(Train, Trainlabel)

    # Perform classification on the test data
    predicted_labels = classifier.predict(Test)
    return predicted_labels


def calculate_accuracy(true_labels, predicted_labels):
    """
    Calculate the accuracy of predicted labels compared to true labels.

    Parameters:
    true_labels (list or numpy.ndarray): The true labels.
    predicted_labels (list): The predicted labels to be evaluated.

    Returns:
    float: The accuracy of the predicted labels.
    """
    # Ensure that the true labels and predicted labels have the same length
    if len(true_labels) != len(predicted_labels):
        raise ValueError("Length of true_labels and predicted_labels must be the same.")

    # Count the number of correct predictions
    correct_predictions = sum(1 for true, predicted in zip(true_labels, predicted_labels) if true == predicted)

    # Calculate accuracy as the ratio of correct predictions to total predictions
    accuracy = correct_predictions / len(true_labels)

    return accuracy


**PART 1 - Principal Component Analysis**

**C**

In [3]:
import numpy as np

def my_pca(data, k):
    """
    Perform Principal Component Analysis (PCA) on the given data.

    Parameters:
    data (numpy.ndarray): The input data for PCA.
    k (int): The number of principal components to retain.

    Returns:
    numpy.ndarray: Transformed data after PCA.
    numpy.ndarray: Top k eigenvectors.
    """

    # Step 1: Calculate the mean of the data
    mean = np.mean(data, axis=0)

    # Step 2: Center the data by subtracting the mean
    centered_data = data - mean

    # Step 3: Calculate the covariance matrix
    cov_matrix = np.cov(centered_data, rowvar=False)

    # Step 4: Calculate eigenvalues and eigenvectors of the covariance matrix
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

    # Step 5: Sort eigenvalues and eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Step 6: Choose the top k eigenvectors to retain variance
    top_eigenvectors = eigenvectors[:, :k]

    # Step 7: Project the data onto the top k eigenvectors
    reduced_data = np.dot(centered_data, top_eigenvectors)

    return reduced_data, top_eigenvectors

# Apply your PCA implementation to the Train dataset
k = 200  # Adjust the number of principal components as needed
reduced_train_data, top_eigenvectors = my_pca(train_data, k)

# Calculate the total variance
total_variance = np.sum(np.var(reduced_train_data, axis=0))

# Choose an appropriate number of principal components to retain a significant amount of variance
threshold_variance = 0.95  # Adjust as needed
cumulative_variance = np.cumsum(np.var(reduced_train_data, axis=0)) / total_variance
selected_components = np.argmax(cumulative_variance >= threshold_variance) + 1

print(f"Number of components to retain {threshold_variance:.0%} of variance:", selected_components)


Number of components to retain 95% of variance: 110


**1.1 - a, b :**

In [4]:
# From above code we got that with k = 110 we can retain 95% of the variance
# so, we will be using  k = 120 (10 value extra, to be on safe side)

def my_pca(data, k=None):
    """
    Perform Principal Component Analysis (PCA) on the given data.

    Parameters:
    data (numpy.ndarray): The input data for PCA.
    k (int, optional): The number of principal components to retain. If None, retains all components.

    Returns:
    numpy.ndarray: Transformed data after PCA.
    numpy.ndarray: Eigenvalues corresponding to principal components.
    numpy.ndarray: Eigenvectors representing the principal components.
    numpy.ndarray: Mean of the input data.
    numpy.ndarray: Standard deviation of the input data.
    """
    # Start the timer
    start_time = time.time()

    # Standardize the data (center and scale)
    mean = np.mean(data, axis=0)
    std_dev = np.std(data, axis=0)
    standardized_data = (data - mean) / std_dev

    # Compute the covariance matrix
    covariance_matrix = np.cov(standardized_data, rowvar=False)

    # Calculate eigenvalues and eigenvectors
    eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Select the top k eigenvectors if specified
    if k is not None:
        eigenvalues = eigenvalues[:k]
        eigenvectors = eigenvectors[:, :k]

    # Project the data onto the selected eigenvectors
    transformed_data = np.dot(standardized_data, eigenvectors)

    # End the timer
    end_time = time.time()

    # Calculate and print the execution time
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time:.2f} seconds")

    return transformed_data, eigenvalues, eigenvectors, mean, std_dev

# Perform PCA on the training data with a specified number of components (e.g., k=120)
transformed_pca_train, pca_eigenvalues, pca_eigenvectors, pca_mean, pca_std_dev = my_pca(train_data.values, k=120)
real_transformed_pca_train = np.real(transformed_pca_train)

Execution time: 957.51 seconds


**1.2**

In [5]:
# Start the timer
start_time = time.time()

# Step 1: Create a PCA instance with the desired number of components
n_components = 120
pca = PCA(n_components=n_components)

# Step 2: Fit PCA on the training data to compute principal components
pca.fit(train_data)

# Step 3: Transform the training data into the new feature space defined by PCs
transformed_train_data_pca_sk = pca.transform(train_data)

# End the timer
end_time = time.time()

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

Execution time: 1.19 seconds


# Key findings :
### Sklearn PCA vs my_pca

- Although both the PCA from sklearn library and my_pca does the same transformation, the time taken to transform the data varies highly
- From the output of above code block we can see that
    - Execution time for my_pca is 611.3 seconds,
    - Whereas Exectution time for PCA function from sklearn library is 1.16 seconds whcih is 900 times faster

**Conlusion : Using sklearn's inbuilt functions are way faster than using functions that we were written from scratch**

**PART 2 - Kernal PCA (KPCA)**

**2.1 KPCA with rbf**

In [6]:
def rbf_kernel(X1, X2, gamma=1.0):

    # Calculate the pairwise squared Euclidean distances
    pairwise_distances = np.sum(X1**2, axis=1, keepdims=True) + np.sum(X2**2, axis=1) - 2 * np.dot(X1, X2.T)

    # Calculate the kernel matrix using the RBF kernel formula
    kernel_matrix = np.exp(-gamma * pairwise_distances)

    return kernel_matrix

# Define the Kernel PCA function with RBF kernel
def kpca_rbf(X, gamma=1.0, n_components=None):

    n_samples, _ = X.shape

    # Initialize the scaler
    scaler = StandardScaler()

    # Standardize the data and fit the scaler
    X = scaler.fit_transform(X)

    # Calculate the kernel matrix using the RBF kernel defined above
    kernel_matrix = rbf_kernel(X, X, gamma=gamma)

    # Center the kernel matrix
    one_n = np.ones((n_samples, n_samples)) / n_samples
    kernel_matrix_centered = kernel_matrix - np.dot(one_n, kernel_matrix) - np.dot(kernel_matrix, one_n) + np.dot(np.dot(one_n, kernel_matrix), one_n)


    # Compute eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = np.linalg.eigh(kernel_matrix_centered)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Select the top n_components eigenvectors if specified
    if n_components is not None:
        eigenvalues = eigenvalues[:n_components]
        eigenvectors = eigenvectors[:, :n_components]

    # Project the data onto the selected eigenvectors
    transformed_data = np.dot(kernel_matrix_centered, eigenvectors)

    return transformed_data, eigenvalues, eigenvectors, scaler

# Perform Kernel PCA transformation on the training data
transformed_rbf_train, rbf_eigenvalues, rbf_eigenvectors, rbf_scaler = kpca_rbf(train_data.values,0, n_components=120)

**2.2 KPCA with Polynomial Kernel**

In [7]:
# Define the Polynomial kernel function
def polynomial_kernel(X1, X2, degree=3):
    kernel_matrix = (1 + np.dot(X1, X2.T)) ** degree
    return kernel_matrix

# Define the Kernel PCA function with Polynomial kernel
def kpca_poly(X, degree=2, n_components=None):
    n_samples, _ = X.shape

    # Initialize the scaler
    scaler = StandardScaler()

    # Standardize the data and fit the scaler
    X = scaler.fit_transform(X)

    # Calculate the kernel matrix using the Polynomial kernel

    kernel_matrix = polynomial_kernel(X, X, degree=degree)

    # Center the kernel matrix
    one_n = np.ones((n_samples, n_samples)) / n_samples
    kernel_matrix_centered = kernel_matrix - np.dot(one_n, kernel_matrix) - np.dot(kernel_matrix, one_n) + np.dot(np.dot(one_n, kernel_matrix), one_n)

    # Compute eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = np.linalg.eigh(kernel_matrix_centered)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Select the top n_components eigenvectors if specified
    if n_components is not None:
        eigenvalues = eigenvalues[:n_components]
        eigenvectors = eigenvectors[:, :n_components]

    # Project the data onto the selected eigenvectors
    transformed_data = np.dot(kernel_matrix_centered, eigenvectors)

    return transformed_data, eigenvalues, eigenvectors, scaler

n_components = 120  # Adjust the number of components as needed
degree_poly = 3  # Adjust the degree of the polynomial kernel

# Perform Polynomial Kernel PCA transformation on the training data
transformed_poly_train, poly_eigenvalues, poly_eigenvectors, poly_scaler = kpca_poly(train_data.values, degree=degree_poly, n_components=n_components)

**2.3 KPCA with Linear Kernel**

In [8]:
# Define the Linear kernel function
def linear_kernel(X1, X2):
    return np.dot(X1, X2.T)

# Define the Kernel PCA function with Linear kernel
def kpca_linear(X, n_components=None):
    n_samples, _ = X.shape

    # Initialize the scaler
    scaler = StandardScaler()

    # Standardize the data and fit the scaler
    X = scaler.fit_transform(X)

    # Calculate the kernel matrix using the Linear kernel
    kernel_matrix = linear_kernel(X, X)

    # Center the kernel matrix
    one_n = np.ones((n_samples, n_samples)) / n_samples
    kernel_matrix_centered = kernel_matrix - np.dot(one_n, kernel_matrix) - np.dot(kernel_matrix, one_n) + np.dot(np.dot(one_n, kernel_matrix), one_n)

    # Compute eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = np.linalg.eigh(kernel_matrix_centered)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Select the top n_components eigenvectors if specified
    if n_components is not None:
        eigenvalues = eigenvalues[:n_components]
        eigenvectors = eigenvectors[:, :n_components]

    # Project the data onto the selected eigenvectors
    transformed_data = np.dot(kernel_matrix_centered, eigenvectors)

    return transformed_data, eigenvalues, eigenvectors, scaler


# Set the number of components
n_components = 120

# Perform Kernel PCA transformation on the training data
transformed_linear_train, linear_eigenvalues, linear_eigenvectors, linear_scaler = kpca_linear(train_data.values, n_components=n_components)

**PART 3**

In [9]:
# Ensure that the test data is standardized and centered with respect to the training data using the scaler
# Use the same transformation on the test data
# Ensure that the test data is standardized with respect to the training data (use mean and std_dev)
transformed_pca_test = (test_data - pca_mean) / pca_std_dev
transformed_pca_test = np.dot(transformed_pca_test, pca_eigenvectors)

In [10]:
# Ensure that the test data is standardized and centered with respect to the training data using the scaler
# Use the same transformation on the test data
# This way we can make sure that the transformed test data is consistent with transformed train data
centered_rbf_test_data = rbf_scaler.transform(test_data.values)
transformed_rbf_test = np.dot(rbf_kernel(centered_rbf_test_data, centered_rbf_test_data), rbf_eigenvectors)

In [18]:
# Ensure that the test data is standardized and centered with respect to the training data using the scaler
# Use the same transformation on the test data
# Ensure that the test data is standardized and centered with respect to the training data using the scaler
centered_poly_test_data = poly_scaler.transform(test_data.values)
transformed_poly_test = np.dot(polynomial_kernel(centered_poly_test_data, centered_poly_test_data, degree=degree_poly), poly_eigenvectors)

In [19]:
# Ensure that the test data is standardized and centered with respect to the training data using the scaler
# Use the same transformation on the test data
# Ensure that the test data is standardized and centered with respect to the training data using the scaler
centered_test_data = linear_scaler.transform(test_data.values)
transformed_linear_test = np.dot(linear_kernel(centered_test_data, centered_test_data), linear_eigenvectors)
real_transformed_pca_test = np.real(transformed_pca_test)

**PART 4**

In [21]:
real_transformed_pca_test = np.real(transformed_pca_test)
predicted_labels_pca = myclassifier(real_transformed_pca_train, train_labels.values, real_transformed_pca_test)

# Calculate accuracy using sklearn's accuracy_score function
accuracy_pca = calculate_accuracy(test_labels.values, predicted_labels_pca)
print(f'Accuracy using pca :', accuracy_pca)

Accuracy using pca : 0.83


In [27]:
# Now you can use transformed_rbf_train and transformed_rbf_test for classification
rbf_predicted_labels = myclassifier(transformed_rbf_train, train_labels.values, transformed_rbf_test)

# Calculate accuracy using sklearn's accuracy_score function
accuracy_rbf_kpca = calculate_accuracy(test_labels.values, rbf_predicted_labels)
print(f'Accuracy using rbf_kcpa :', accuracy_rbf_kpca)

Accuracy using rbf_kcpa : 0.025


In [24]:
# Now you can use transformed_poly_train and transformed_poly_test for classification
predicted_labels_poly = myclassifier(transformed_poly_train, train_labels.values, transformed_poly_test)

# Calculate accuracy using sklearn's accuracy_score function
accuracy_poly_kpca = calculate_accuracy(test_labels.values, predicted_labels_poly)
print(f'Accuracy using polynomial_kcpa of degree {degree_poly}:', accuracy_poly_kpca)

Accuracy using polynomial_kcpa of degree 3: 0.605


In [26]:
# Now you can use transformed_linear_train and transformed_test_kpca_linear for classification
predicted_labels_linear = myclassifier(transformed_linear_train, train_labels.values, transformed_linear_test)

# Calculate accuracy using sklearn's accuracy_score function
accuracy_linear_kpca = calculate_accuracy(test_labels.values, predicted_labels_linear)
print(f'Accuracy using linear_kcpa for :', accuracy_linear_kpca)

Accuracy using linear_kcpa for : 0.76


# Key Findings :

## Comparing accuracies :

- Accuracy using pca : 0.83
- Accuracy using KPCA with rbf kernel : 0.025
- Accuracy using KPCA with poly kernel : 0.605
- Accuracy using KPCA with linear kernel : 0.76


Accuracy obtained by using the reduced data with pca (without any kernel) produced better results than compared to other methods. simple PCA achieved an accuracy of 0.88. While rbf kernel performed poorly with 0.025 accuracy

Given train data has around 10300 columns (features), but we have reduced the data to 120 principal components and yet we can achieve accuracies as high as 0.88.

That means we are not loosing much of the information from the data, as we were able to predict 88% of the data correctly, even though we have reduced them.

## Impact on classification performance :
- Although we have reduced train data using differnet kernels or without kernel into same number of principal components, there is high difference in the accuracy (classification accuracy)
 - The data whcih is reduced using PCA performed better than rest of all and achieved an accuracy of 88%
 - While the data whcih is reduced using KPCA with rbf_kernel performed poorly and achieved an accuracy of 2.5%
 - So, the reduction technique that we used to reduce the data, greatly impacts the classification performance


## Advantages and Disadvantages of using codes written from scratch over sklearn's functions :

- Advantage is that ff we implement codes form scractch, we can customize the code for our requirements. We can learn and apply these learnings in other areas as well, whcih we cannot do if we have directly used sklearns functions

- Disadvantage is that, as we have seen the PCA module from sklearn performed the operation 500 times faster than our my_pca code, so codes written from scratch are not efficient in terms of time taken. And also built-in functions are easy to use



In [28]:
# Finding the covariance matrix of the train_labels from scratch

# Step 1: Compute the mean of the attribute
mean = np.mean(train_labels)

# Step 2: Calculate squared differences
squared_diff = (train_labels - mean) ** 2

# Step 3: Sum up the squared differences
sum_squared_diff = np.sum(squared_diff)

# Step 4: Calculate the variance
variance = sum_squared_diff / len(train_labels)

# Step 5: Create a 1x1 covariance matrix with the variance value
covariance_matrix = np.array([[variance]])

# Print the covariance matrix
print("Covariance Matrix of Label Attribute:")
print(covariance_matrix)


Covariance Matrix of Label Attribute:
[[133.25]]
