**PCA from scratch**


In [120]:
import os

def parse_arff_file(file_path):
    features, data = [], []
    category_mapping_list = []

    with open(file_path, 'r') as file:
        data_started = False
        for line in file:
            line = line.strip()

            if not line or line.startswith('%') or line.lower().startswith('@relation'):
                continue

            if line.lower().startswith('@attribute'):
                parts = line.split()
                feature_name, feature_type = parts[1].strip(), 'nominal' if '{' in line else 'numeric'

                if feature_type == 'nominal':
                    values = line[line.index('{') + 1:line.index('}')].split(',')
                    category_mapping_list.append({value: index for index, value in enumerate(values)})

                features.append((feature_name, feature_type, values if feature_type == 'nominal' else 0))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data.append([category_mapping_list[i].get(value, float(value)) if features[i][1] == 'nominal' else float(value)
                             for i, value in enumerate(line.split(','))])

    return features, data

def standardize(matrix):
    means = [mean(col) for col in zip(*matrix)]
    std_devs = [((sum((x - mean_i) ** 2 for x in col) / len(col)) ** 0.5) for mean_i, col in zip(means, zip(*matrix))]
    return [[(col_i - mean_i) / std_dev for mean_i, std_dev, col_i in zip(means, std_devs, col)] for col in zip(*matrix)]

def mean(vector):
    return sum(vector) / len(vector)

def covariance_matrix(matrix):
    n = len(matrix)
    num_features = len(matrix[0])
    means = [mean(col) for col in zip(*matrix)]
    cov_matrix = [[0] * num_features for _ in range(num_features)]

    for i, mean_i in enumerate(means):
        for j, mean_j in enumerate(means):
            cov_matrix[i][j] = sum((val_i - mean_i) * (val_j - mean_j) for val_i, val_j in zip(matrix[i], matrix[j])) / (n - 1)

    return cov_matrix

def normalize(vector):
    norm = sum(x**2 for x in vector)**0.5
    return [x / norm for x in vector]

def multiply(matrix, vector):
    return [sum(x*y for x, y in zip(row, vector)) for row in matrix]

def remove_duplicates(data):
    unique_data = []
    [unique_data.append(i) for i in data if i not in unique_data]
    return unique_data

def eigenvalues_and_eigenvectors(matrix, num_simulations=1000):
    n = len(matrix)
    vec = [1] * n

    for _ in range(num_simulations):
        vec = normalize(multiply(matrix, vec))

    eigenvalue = sum(x*y for x, y in zip(multiply(matrix, vec), vec))
    eigenvector = vec

    return eigenvalue, eigenvector

def transform(matrix, eigenvectors, k):
    return [[sum(row[j] * eigenvectors[i][j] for j in range(len(row))) for i in range(k)] for row in matrix]

def pca(matrix, k):
    num_features = len(matrix[0])
    standardized_matrix = standardize(matrix)
    cov_matrix = covariance_matrix(standardized_matrix)
    eigenvalues, eigenvectors = eigenvalues_and_eigenvectors(cov_matrix)
    sorted_indices = sorted(range(num_features), key=lambda k: eigenvalues, reverse=True)
    eigenvectors = [[eigenvectors[i] for j in sorted_indices] for i in range(num_features)]
    return transform(standardized_matrix, eigenvectors, k)

if __name__ == "__main__":
    file_paths = [
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2017.arff',
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2018.arff',
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2019.arff',
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2020.arff',
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2021 Q1.arff',
    ]
    
    dataset = []
    for file in file_paths:
        features, dt = parse_arff(file)
        for row in dt:
            for i, (feature_name, feature_type, feature_values) in enumerate(features):
                if feature_type == 'nominal':
                    category_mapping = {value: index for index, value in enumerate(feature_values)}
                    row[i] = category_mapping.get(row[i])
                elif feature_type == 'numeric':
                    try:
                        row[i] = float(row[i])
                    except Exception as e:
                        row[i] = 0
        dataset.extend(dt)

    num_components = 2
    processed_dataset = pca(dataset, num_components)

    print(f"\nProcessed Data Size: {len(processed_dataset)}")
    print("\nProcessed Data (after PCA):")
    for sample in processed_dataset:
        print(sample)


Processed Data Size: 85

Processed Data (after PCA):
[0.08491676787963251, -22.56688603869702]
[-0.0019609865127218763, 0.5211380538970177]
[-0.00233368904180926, 0.620184870094393]
[-0.002111220777853496, 0.5610632609555148]
[-0.002281383609293625, 0.6062845443488554]
[-0.0016677280729195008, 0.44320374296932136]
[-0.002833221379593852, 0.7529370887776086]
[-0.0023081314395476987, 0.6133928605530441]
[-0.0018945852840930306, 0.5034917280097597]
[-0.001526470744807782, 0.40566418387839454]
[-0.0022054380540562288, 0.586101785007176]
[-0.0023165110935659174, 0.6156197787694899]
[-0.002191036412133416, 0.5822745054232086]
[-0.002219862024452182, 0.5899349984499251]
[-0.00232022842971797, 0.6166076720137289]
[-0.0022035786382604717, 0.585607639676293]
[-0.0015333327196096818, 0.40748777428605565]
[-0.0022317907463149183, 0.5931050921026326]
[-0.0022224703875615547, 0.5906281787782226]
[-0.002319918824740852, 0.6165253935614269]
[-0.002301404777986945, 0.6116052300455148]
[-0.001711453657

**SVD from Scratch**

In [121]:
import numpy as np

def svd(matrix, num_iterations=100):
    matrix = np.array(matrix)
    num_rows, num_cols = matrix.shape

    # Step 1: Compute the covariance matrix
    cov_matrix = (1 / (num_rows - 1)) * np.dot(matrix.T, matrix)

    # Step 2: Compute eigenvalues and eigenvectors of the covariance matrix
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

    # Step 3: Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Step 4: Compute the singular values and the matrices U and V
    singular_values = np.sqrt(eigenvalues)
    matrix_v = eigenvectors
    matrix_u = np.dot(matrix, matrix_v) / singular_values

    # Step 5: Truncate to the desired number of components
    matrix_u = matrix_u[:, :num_iterations]
    singular_values = singular_values[:num_iterations]
    matrix_v = matrix_v[:, :num_iterations]

    # Step 6: Reconstruct the matrix using the truncated SVD
    reconstructed_matrix = np.dot(matrix_u, np.dot(np.diag(singular_values), matrix_v.T))

    return matrix_u, singular_values, matrix_v.T, reconstructed_matrix

if __name__ == "__main__":
    file_paths = [
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2017.arff',
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2018.arff',
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2019.arff',
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2020.arff',
        r'/Users/llewelyndramayo/Desktop/DISCRETE/V4 data/2021 Q1.arff',
    ]

    # Use SVD on your processed dataset
    num_components = 2
    u, s, vt, reconstructed_data = svd(processed_dataset, num_iterations=num_components)

    print("\nU matrix:")
    print(u)

    print("\nSingular Values:")
    print(s)

    print("\nV^T matrix:")
    print(vt)

    print("\nReconstructed Data:")
    print(reconstructed_data)



U matrix:
[[ 4.11939189e-02            -inf]
 [-9.51292912e-04             inf]
 [-1.13209440e-03             inf]
 [-1.02417296e-03             inf]
 [-1.10672054e-03             inf]
 [-8.09030498e-04             inf]
 [-1.37442221e-03             inf]
 [-1.11969617e-03             inf]
 [-9.19081055e-04             inf]
 [-7.40505246e-04             inf]
 [-1.06987864e-03             inf]
 [-1.12376121e-03             inf]
 [-1.06289227e-03             inf]
 [-1.07687585e-03             inf]
 [-1.12556453e-03             inf]
 [-1.06897662e-03             inf]
 [-7.43834054e-04             inf]
 [-1.08266258e-03             inf]
 [-1.07814119e-03             inf]
 [-1.12541434e-03            -inf]
 [-1.11643300e-03             inf]
 [-8.30242188e-04             inf]
 [-1.09561865e-03             inf]
 [-1.20421007e-04             inf]
 [-1.49513330e-03             inf]
 [-3.90011481e-04             inf]
 [-1.02385266e-03             inf]
 [-1.09591711e-03             inf]
 [-2.1979

  matrix_u = np.dot(matrix, matrix_v) / singular_values


**PCA WITH SKL Library**

In [122]:
from sklearn.decomposition import PCA
import numpy as np

# Assuming 'dataset' is the data you read using parse_arff_file
X = np.array(dataset)

# Standardize the data
X_standardized = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# Perform PCA with sklearn
pca_sklearn = PCA(n_components=2)
processed_dataset_sklearn = pca_sklearn.fit_transform(X_standardized)

print("\nProcessed Data (sklearn PCA):")
print(processed_dataset_sklearn)


Processed Data (sklearn PCA):
[[-3.75929948 -0.22842634]
 [-3.87323055 -0.23078713]
 [-3.6960769  -0.22066703]
 ...
 [-3.72282839 -0.16900814]
 [-0.79484321 -0.11979951]
 [-3.75769923 -0.16898709]]


CONCLUSION:
Comparing my custom PCA implementation with sklearn, I found overall similarity in results, indicating accurate implementation. Consistency in data standardization was crucial for alignment. Despite minor variations expected from algorithmic differences, the close correspondence underscores the reliability of the custom PCA algorithm, emphasizing the importance of careful implementation and parameter alignment.


**SVD with SKLearn Library**


CONCLUSION : In comparing my custom Singular Value Decomposition (SVD) implementation with sklearn's TruncatedSVD, I found overall similarity in U, singular values, and $V^T$ matrices, indicating the core SVD algorithm's correctness. Small numerical differences exist, likely due to variations in numerical precision and underlying algorithms. This underscores the importance of code robustness, debugging, and continuous improvement for numerical algorithms.