In [1]:
# Importing neccessary libraries
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

telescope_data = pd.read_csv('./telescope_data.csv')

shuffled_indices = np.random.permutation(len(telescope_data))
telescope_data = telescope_data.iloc[shuffled_indices]

# Let's use 80% of the data for training, and 20% for testing
train_size = int(0.8 * len(telescope_data))

# Select the first 80% of shuffled data for training
x_train = telescope_data.iloc[:train_size, :-1]
y_train = telescope_data.iloc[:train_size, -1]

# The remaining data will be for testing
x_test = telescope_data.iloc[train_size:, :-1]
y_test = telescope_data.iloc[train_size:, -1]

C:\Users\bhanu teja\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\bhanu teja\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


In [2]:
# Function to calculate distance between two points
def dis(x1, x2):
    return np.linalg.norm(x1 - x2)

# Function to perform classification 
def myclassifier(Train, Trainlabel, Test):
    " Train is the training data"
    " Trainlabel is the training labels"
    " Test is the testing data"
    pred = []

    for testpoint in Test:
        pred_dis = []
        for trainpoint in Train:
            pred_dis.append(dis(testpoint, trainpoint))

        pred.append(Trainlabel[np.argmin(pred_dis)])

    return np.array(pred)

def calculate_accuracy(true_labels, predicted_labels):
    # Ensure that the true labels and predicted labels have the same length
    if len(true_labels) != len(predicted_labels):
        raise ValueError("Length of true_labels and predicted_labels must be the same.")

    # Count the number of correct predictions
    correct_predictions = sum(1 for true, predicted in zip(true_labels, predicted_labels) if true == predicted)

    # Calculate accuracy as the ratio of correct predictions to total predictions
    accuracy = correct_predictions / len(true_labels)

    return accuracy

**PART 1 - Principal Component Analysis**

In [3]:
def covariance_matrix(X):
    """
    The covariance between two features X_i and X_j is calculated using the formula:
    
    cov(X_i, X_j) = sum[(X_i - mean_i) * (X_j - mean_j)] / (n_samples - 1)
    
    where:
    - X_i and X_j are the features (columns) of the data matrix X.
    - X_ik and X_jk are the values of features X_i and X_j at sample k, respectively.
    - mean_i and mean_j are the means of features X_i and X_j, respectively.
    - n_samples is the number of samples (rows) in the data matrix X.
    """
    n_samples, n_features = X.shape
    covariance_matrix = np.zeros((n_features, n_features))
    
    # Calculate covariance matrix
    for i in range(n_features):
        for j in range(n_features):
            # Calculate mean of feature i and feature j
            mean_i = np.mean(X[:, i])
            mean_j = np.mean(X[:, j])
            
            # Calculate covariance between feature i and feature j
            cov_ij = np.sum((X[:, i] - mean_i) * (X[:, j] - mean_j)) / (n_samples - 1)
            covariance_matrix[i, j] = cov_ij
    
    return covariance_matrix

In [4]:
def compute_eigen(matrix):
    """
    Computes the eigenvalues and eigenvectors of the input matrix and returns them.
    """
    eigenvalues, eigenvectors = np.linalg.eigh(matrix)
    return eigenvalues, eigenvectors

**a**

In [5]:
def my_pca(data, k):
    """
    Perform Principal Component Analysis (PCA) on the given data.

    Parameters:
    data (numpy.ndarray): The input data for PCA.
    k (int): The number of principal components to retain.

    Returns:
    numpy.ndarray: Transformed data after PCA.
    numpy.ndarray: Top k eigenvectors.
    """

    cov_matrix = covariance_matrix(x_train.values)

    eigenvalues, eigenvectors = compute_eigen(cov_matrix)

    # Sorting eigenvalues and eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Choosing the top k eigenvectors to retain variance
    top_eigenvectors = eigenvectors[:, :k]

    # Projecting the data onto the top k eigenvectors
    reduced_data = np.dot(x_train, top_eigenvectors)

    return reduced_data, top_eigenvectors

k = 10 # we can vary this based on our need
# first i will get the all 10 components, then i will find the total variance
# then evaluate how many components should i consider to retain 95% of variance
reduced_train_data, top_eigenvectors = my_pca(x_train, k)

# We are calculating total variance first
total_variance = np.sum(np.var(reduced_train_data, axis=0))

# Choose an appropriate number of principal components to retain a significant amount of variance
threshold_variance = 0.95 # we can vary, but i have taken 95%
cumulative_variance = np.cumsum(np.var(reduced_train_data, axis=0)) / total_variance
selected_components = np.argmax(cumulative_variance >= threshold_variance) + 1

print(f"Number of components to retain {threshold_variance:.0%} of variance:", selected_components)

Number of components to retain 95% of variance: 5


**1.1 - b, c :**

In [6]:
# From above code we got that with k = 5 with which we can retain 95% of the variance
# so, we will be using  k = 5
start_time = time.time()
transformed_pca_train, pca_eigenvectors = my_pca(x_train.values, k=120)
end_time = time.time()

execution_time = end_time - start_time
print(f"Execution time: {execution_time:.4f} seconds")


real_transformed_pca_train = np.real(transformed_pca_train)

Execution time: 0.0045 seconds


**1.2 a**

In [7]:
start_time = time.time()

# Creating a PCA instance with the desired number of components
n_components = 5
pca = PCA(n_components=n_components)

# Fitting PCA on the training data to compute principal components
pca.fit(x_train)

# Transforming the training data into the new feature space defined by PCs
transformed_train_data_pca_sk = pca.transform(x_train)

# End the timer
end_time = time.time()

execution_time = end_time - start_time
print(f"Execution time: {execution_time:.4f} seconds")

# Print the total variance retained
total_variance_retained = np.sum(pca.explained_variance_ratio_)
print(f"Total variance retained: {total_variance_retained:.4f}")

Execution time: 0.0095 seconds
Total variance retained: 0.9697


# 1.2 c Key findings :
### Sklearn PCA vs my_pca

- Although both the PCA from sklearn library and my_pca used same number of components to retain 95% of variance, the time taken to transform the data varies a bit
- From the output of above code block we can see that
    - Execution time for my_pca is 0.0045 seconds,
    - Whereas Exectution time for PCA function from sklearn library is 0.0095 seconds whcih is a little slower
    - But if the dimensionality of the data increases, there might be a chance that our own my_pca migh take very high time compared to sklearn's pca

**Conlusion : Using sklearn's inbuilt functions are a little slower than using functions that we were written from scratch, but if the dimensionality may increase, then sklearn's inbuild functions might perform faster**

**PART 2 - Kernal PCA (KPCA)**

**2.1 KPCA with rbf**

In [8]:
def pairwise_distances_manual(X1, X2):
    # Compute squared Euclidean distances between each pair of samples
    n_samples_X1 = X1.shape[0]
    n_samples_X2 = X2.shape[0]
    pairwise_distances = np.zeros((n_samples_X1, n_samples_X2))
    for i in range(n_samples_X1):
        for j in range(n_samples_X2):
            pairwise_distances[i, j] = np.sum((X1[i] - X2[j])**2)
    return pairwise_distances

def rbf_kernel(X1, X2, gamma=1.0):
    # Compute pairwise distances manually
    pairwise_distances = pairwise_distances_manual(X1, X2)
    
    # Compute kernel matrix using the RBF kernel formula
    kernel_matrix = np.exp(-gamma * pairwise_distances)

    return kernel_matrix


def kpca_rbf(X, gamma=1.0, n_components=None):
    n_samples, _ = X.shape

    # Calculate the kernel matrix using the RBF kernel defined above
    kernel_matrix = rbf_kernel(X, X, gamma=gamma)
    
    # Centering the kernel matrix
    one_n = np.ones((n_samples, n_samples)) / n_samples
    kernel_matrix_centered = kernel_matrix - np.dot(one_n, kernel_matrix) - np.dot(kernel_matrix, one_n) + np.dot(np.dot(one_n, kernel_matrix), one_n)

    # Computing eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = compute_eigen(kernel_matrix_centered)

    # Sorting eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Selecting the top n_components eigenvectors if specified
    if n_components is not None:
        eigenvalues = eigenvalues[:n_components]
        eigenvectors = eigenvectors[:, :n_components]

    # Projecting the data onto the selected eigenvectors
    transformed_data = np.dot(kernel_matrix_centered, eigenvectors)

    return transformed_data, eigenvalues, eigenvectors


transformed_rbf_train, rbf_eigenvalues, rbf_eigenvectors= kpca_rbf(x_train.values,1.0, n_components=5)


**2.2 KPCA with Polynomial Kernel**

In [9]:
# Define the Polynomial kernel function
def polynomial_kernel(X1, X2, degree=3):
    kernel_matrix = (1 + np.dot(X1, X2.T)) ** degree
    return kernel_matrix

# Define the Kernel PCA function with Polynomial kernel
def kpca_poly(X, degree=2, n_components=None):
    n_samples, _ = X.shape

    kernel_matrix = polynomial_kernel(X, X, degree=degree)

    # Center the kernel matrix
    one_n = np.ones((n_samples, n_samples)) / n_samples
    kernel_matrix_centered = kernel_matrix - np.dot(one_n, kernel_matrix) - np.dot(kernel_matrix, one_n) + np.dot(np.dot(one_n, kernel_matrix), one_n)

    # Compute eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = compute_eigen(kernel_matrix_centered)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Select the top n_components eigenvectors if specified
    if n_components is not None:
        eigenvalues = eigenvalues[:n_components]
        eigenvectors = eigenvectors[:, :n_components]

    # Project the data onto the selected eigenvectors
    transformed_data = np.dot(kernel_matrix_centered, eigenvectors)

    return transformed_data, eigenvalues, eigenvectors

n_components = 5  # Adjust the number of components as needed
degree_poly = 3  # Adjust the degree of the polynomial kernel

# Perform Polynomial Kernel PCA transformation on the training data
transformed_poly_train, poly_eigenvalues, poly_eigenvectors = kpca_poly(x_train.values, degree=degree_poly, n_components=n_components)

**2.3 KPCA with Linear Kernel**

In [10]:
# Define the Linear kernel function
def linear_kernel(X1, X2):
    return np.dot(X1, X2.T)

# Define the Kernel PCA function with Linear kernel
def kpca_linear(X, n_components=None):
    n_samples, _ = X.shape

    # Calculate the kernel matrix using the Linear kernel
    kernel_matrix = linear_kernel(X, X)

    # Center the kernel matrix
    one_n = np.ones((n_samples, n_samples)) / n_samples
    kernel_matrix_centered = kernel_matrix - np.dot(one_n, kernel_matrix) - np.dot(kernel_matrix, one_n) + np.dot(np.dot(one_n, kernel_matrix), one_n)

    # Compute eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = compute_eigen(kernel_matrix_centered)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Select the top n_components eigenvectors if specified
    if n_components is not None:
        eigenvalues = eigenvalues[:n_components]
        eigenvectors = eigenvectors[:, :n_components]

    # Project the data onto the selected eigenvectors
    transformed_data = np.dot(kernel_matrix_centered, eigenvectors)

    return transformed_data, eigenvalues, eigenvectors


# Set the number of components
n_components = 5

# Perform Kernel PCA transformation on the training data
transformed_linear_train, linear_eigenvalues, linear_eigenvectors = kpca_linear(x_train.values, n_components=n_components)

# Part 3

**PART 3.1 a,b**

In [11]:
# Use the same transformation on the test data

transformed_pca_test = np.dot(x_test, pca_eigenvectors)

In [12]:
# Use the same transformation on the test data
transformed_rbf_test = np.dot(rbf_kernel(x_test.values, x_train.values), rbf_eigenvectors)

In [13]:
# Use the same transformation on the test data
transformed_poly_test = np.dot(polynomial_kernel(x_test.values, x_train.values), poly_eigenvectors)

In [14]:
# Use the same transformation on the test data
transformed_linear_test = np.dot(linear_kernel(x_test.values, x_train.values), linear_eigenvectors)

**PART 3.2 a, b**

In [15]:
predicted_labels_pca = myclassifier(transformed_pca_train, y_train.values, transformed_pca_test)

# Calculate accuracy using calculate_accuracy function
accuracy_pca = calculate_accuracy(y_test.values, predicted_labels_pca)
print(f'Accuracy using pca :', accuracy_pca)

Accuracy using pca : 0.6097560975609756


In [16]:
# Now you can use transformed_rbf_train and transformed_rbf_test for classification
rbf_predicted_labels = myclassifier(transformed_rbf_train, y_train.values, transformed_rbf_test)

# Calculate accuracy using calculate_accuracy function
accuracy_rbf_kpca = calculate_accuracy(y_test.values, rbf_predicted_labels)
print(f'Accuracy using rbf_kcpa :', accuracy_rbf_kpca)

Accuracy using rbf_kcpa : 0.5609756097560976


In [17]:
# Now you can use transformed_poly_train and transformed_poly_test for classification
predicted_labels_poly = myclassifier(transformed_poly_train, y_train.values, transformed_poly_test)

# Calculate accuracy using calculate_accuracy function
accuracy_poly_kpca = calculate_accuracy(y_test.values, predicted_labels_poly)
print(f'Accuracy using polynomial_kcpa of degree {degree_poly}:', accuracy_poly_kpca)

Accuracy using polynomial_kcpa of degree 3: 0.6097560975609756


In [18]:
# Now you can use transformed_linear_train and transformed_test_kpca_linear for classification
predicted_labels_linear = myclassifier(transformed_linear_train, y_train.values, transformed_linear_test)

# Calculate accuracy using calculate_accuracy function
accuracy_linear_kpca = calculate_accuracy(y_test.values, predicted_labels_linear)
print(f'Accuracy using linear_kcpa for :', accuracy_linear_kpca)

Accuracy using linear_kcpa for : 0.6341463414634146


# Key Findings :

## Comparing accuracies :

- Accuracy using pca : 0.6097
- Accuracy using KPCA with rbf kernel : 0.5365
- Accuracy using KPCA with poly kernel : 0.4634
- Accuracy using KPCA with linear kernel : 0.6097


Accuracy obtained by using the reduced data with pca (without any kernel) and using linear kernel produced better results than compared to other methods. simple PCA and linear_kpca achieved an accuracy of 0.6097 While poly kernel performed poorly with 0.4634 accuracy

Given train data has around 10 columns (features), but we have reduced the data to 5 principal components and yet we can achieve accuracies as high as 0.6097.

## Impact on classification performance :
- Although we have reduced train data using differnet kernels or without kernel into same number of principal components, there is high difference in the accuracy (classification accuracy)
 - The data whcih is reduced using PCA, and using KPCA with linear kernel performed better than rest of all and achieved an accuracy of 60.97%
 - While the data whcih is reduced using KPCA with rbf_kernel performed poorly and achieved an accuracy of 46.34%
 - So, the reduction technique that we used to reduce the data, greatly impacts the classification performance


## Advantages and Disadvantages of using codes written from scratch over sklearn's functions :

- Advantage is that if we implement codes form scractch, we can customize the code for our requirements. We can learn and apply these learnings in other areas as well, whcih we cannot do if we have directly used sklearns functions. And also in this case, which is a lower dimensional data, functions implemented from scratch performed faster than those of sklearn's functions.

- Disadvantage is that, as the size and shape (dimensions) of the data increases, then sklearn's functions might perform well, and faster.
- And also sklearn's inbuilt functions, comes with many optimisations for calculations whcih we are difficult to implement from scratch. Which will play crucial role in faster calculations.

