# CS 3101 - Prefinals
### Submitted by Maria Eloisa H. Garcia 
#### (20102861 - BSCS III) <br>

### I. PCA & SVD Program From Scratch

In [1]:
def parse_arff(file_path):
    data_started = False
    attrs = []
    data = []
    nom_mapping = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue

            if line.lower().startswith('@relation'):
                continue

            if line.lower().startswith('@attribute'):
                parts = line.split()
                attr_name = parts[1].strip()

                if '{' in line:
                    values = line[line.index('{') + 1:line.index('}')].split(',')
                    attrs.append((attr_name, 'nominal', values))
                    attr_info = ((attr_name, 'nominal', values))

                    attr_name, attr_type, attr_values = attr_info

                    nom_mapping.append({value: index for index, value in enumerate(attr_values)})
                else:
                    attrs.append((attr_name, 'numeric', 0))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data_line = line.split(',')

                data.append(line.split(','))

    return attrs, data


def dot_prod(vec1, vec2):
    return sum(x * y for x, y in zip(vec1, vec2))

def scalar_mult(scalar, vec):
    return [scalar * x for x in vec]

def subtract_vecs(vec1, vec2):
    return [x - y for x, y in zip(vec1, vec2)]

def transpose(matrix):
    return [[row[i] for row in matrix] for i in range(len(matrix[0]))]

def mat_mult(mat1, mat2):
    return [[dot_prod(row, col) for col in transpose(mat2)] for row in mat1]

def pca(data, num_components):
    mean_vec = [sum(feature) / len(data) for feature in transpose(data)]
    centered_data = [subtract_vecs(row, mean_vec) for row in data]

    cov_matrix = mat_mult(transpose(centered_data), centered_data)
    u_mat, _, v_t = svd(cov_matrix)

    principal_comp = [row[:num_components] for row in transpose(v_t)]
    projected_data = mat_mult(centered_data, principal_comp)

    return projected_data

def svd(data):
    mean_vec = [sum(feature) / len(data) for feature in transpose(data)]
    centered_data = [subtract_vecs(row, mean_vec) for row in data]

    cov_matrix = mat_mult(transpose(centered_data), centered_data)

    singular_vec = [1.0] * len(cov_matrix[0])
    for _ in range(50):
        singular_vec = mat_mult([singular_vec], cov_matrix)[0]
        magnitude = sum(x ** 2 for x in singular_vec) ** 0.5
        singular_vec = scalar_mult(1.0 / magnitude, singular_vec)

    u_mat = [singular_vec]
    v_mat = [singular_vec]

    singular_val = dot_prod(centered_data[0], singular_vec)
    s_mat = [[singular_val if i == j else 0.0 for j in range(len(u_mat))] for i in range(len(v_mat))]

    return u_mat, s_mat, transpose(v_mat)

def main():
    dataset_files = ['./v4-data/2017.arff', './v4-data/2018.arff', './v4-data/2019.arff', './v4-data/2020.arff', './v4-data/2021 Q1.arff']
    # dataset_files = ['./v4-data/2017.arff']

    for file_path in dataset_files:
        dataset_label = file_path.split('/')[-1].split('.')[0]

        print(f"\nProcessing dataset: {dataset_label}")
        attrs, data = parse_arff(file_path)

        for row in data:
            for i in range(len(attrs)):
                attr_name, attr_type, attr_values = attrs[i]
                if attr_type == 'nominal':
                    nom_mapping = {value: index for index, value in enumerate(attr_values)}
                    row[i] = nom_mapping.get(row[i])
                elif attr_type == 'numeric':
                    try:
                        row[i] = float(row[i])
                    except Exception as e:
                        row[i] = -1

        data_as_list = [list(map(float, row)) for row in data]

        num_comp_pca = 3
        projected_data_pca = pca(data_as_list, num_comp_pca)

        print("\nProjected Data (PCA):")
        for row in projected_data_pca:
            print(row)

        u_svd, s_svd, v_t_svd = svd(data_as_list)

        print("\nMatrix U (SVD):")
        for row in u_svd:
            print(row)

        print("\nMatrix S (SVD):")
        for row in s_svd:
            print(row)

        print("\nMatrix V^T (SVD):")
        for row in v_t_svd:
            print(row)

if __name__ == "__main__":
    main()


Processing dataset: 2017

Projected Data (PCA):
[1.7734338245703392e-05, 7.549815375103083e-09, 5.0037381289998464e-08]
[1.6746811290026172e-05, 7.12940802242913e-09, 4.725107699542779e-08]
[1.6335341725160662e-05, 6.954238292148316e-09, 4.609011687269e-08]
[1.2549821728397991e-05, 5.342676773564827e-09, 3.54092837435024e-08]
[1.2467527815424889e-05, 5.3076428275086644e-09, 3.517709171895484e-08]
[1.0327886078124249e-05, 4.396760230048432e-09, 2.9140099080718365e-08]
[7.118423472173287e-06, 3.030436333858082e-09, 2.0084610123363654e-08]
[6.871541733253982e-06, 2.9253344956895937e-09, 1.9388034049720982e-08]
[4.155842605141629e-06, 1.7692142758362215e-09, 1.1725697239651612e-08]
[6.994982602713634e-07, 2.9778854147738384e-10, 1.973632208654232e-09]
[-2.5922582586526996e-06, -1.1035693007691282e-09, -7.314048773248035e-09]
[-3.6620791273030204e-06, -1.5590105994992447e-09, -1.0332545092366272e-08]
[-4.155842605141629e-06, -1.7692142758362215e-09, -1.1725697239651612e-08]
[-5.30795738676

### II. scikit-learn PCA & SVD Program

In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

def parse_arff(file_path):
    data_started = False
    attrs = []
    data = []

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue

            if line.lower().startswith('@relation'):
                continue

            if line.lower().startswith('@attribute'):
                parts = line.split()
                attr_name = parts[1].strip()
                attr_type = parts[2].strip()

                if attr_type.startswith('{'):
                    values = [v.strip('\'') for v in parts[3:]]
                    attrs.append((attr_name, 'nominal', values))
                else:
                    attrs.append((attr_name, 'numeric'))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data_line = line.split(',')
                data.append(data_line)

    return attrs, data

def preprocess_data(attrs, data):
    label_encoders = []
    for i in range(len(attrs)):
        attr_info = attrs[i]
        
        if len(attr_info) == 3 and attr_info[1] == 'nominal':
            attr_name, attr_type, attr_values = attr_info
            label_encoder = LabelEncoder()
            data[:, i] = label_encoder.fit_transform(data[:, i])
            label_encoders.append((i, label_encoder))
        elif len(attr_info) == 2 and attr_info[1] == 'numeric':
            attr_name, attr_type = attr_info
            data[:, i] = np.where(np.char.isnumeric(data[:, i]), data[:, i], np.nan)
            data[:, i] = data[:, i].astype(float)

    return data, label_encoders
    
def process_dataset(file_path):
    dataset_label = file_path.split('/')[-1].split('.')[0]
    print(f"\nProcessing dataset: {dataset_label}")

    attrs, data = parse_arff(file_path)

    data = np.array(data)
    data, label_encoders = preprocess_data(attrs, data)

    df = pd.DataFrame(data, columns=[attr[0] for attr in attrs])

    imputer = SimpleImputer(strategy='mean')
    df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

    num_components = 3
    pca = PCA(n_components=num_components)
    projected_data = pca.fit_transform(df_imputed.drop(columns=['S']))

    print("\nProjected Data in scikit-learn:")
    print(pd.DataFrame(projected_data, columns=[f'PC{i+1}' for i in range(num_components)]))

    svd = TruncatedSVD(n_components=num_components)
    svd_result = svd.fit_transform(df_imputed.drop(columns=['S']))

    print("\nSVD Result in scikit-learn:")
    print(pd.DataFrame(svd_result, columns=[f'Component{i+1}' for i in range(num_components)]))

def main():
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)

    file_paths = ['./v4-data/2017.arff', './v4-data/2018.arff', './v4-data/2019.arff', './v4-data/2020.arff', './v4-data/2021 Q1.arff']
    # file_paths = ['./v4-data/2017.arff']
    for file_path in file_paths:
        process_dataset(file_path)

if __name__ == "__main__":
    main()


Processing dataset: 2017

Projected Data in scikit-learn:
              PC1            PC2            PC3
0   -3.961987e+07  306196.159897   -5530.566238
1    2.234390e-05       0.148551      -0.109245
2    2.099156e-05       0.148199      -0.065342
3    1.943731e-05       0.093844      -0.141812
4   -3.961987e+07  -21648.460224   -4644.361342
5   -3.961987e+07  -21648.478090   -4644.345845
6   -1.123634e-01  -19756.683976      53.400055
7    2.863033e-02    5032.215207     -13.592332
8   -1.172537e-01  -20616.710246      55.673561
9    3.892360e-06       0.013237       0.002930
10  -3.961987e+07     210.177458   -4703.236608
11  -1.226013e-01  -21554.776938      58.315914
12  -3.961987e+07  -21648.599025   -4644.240946
13  -3.961987e+07  -21648.608645   -4644.232601
14  -1.217361e-01  -21401.845360      57.899802
15  -1.241380e-01  -21822.855383      59.172711
16  -2.081955e-05      -0.143660       0.091772
17   9.013685e-01  158479.606029    -428.579911
18   2.327041e-05       0.145

### III. Discussion
The implementation of PCA and SVD uses two different approaches. The first approach involves a custom script, while the second approach employs scikit-learn's TruncatedSVD and PCA modules. 

There are several notable differences between the two approaches. For instance, the first approach uses a custom mapping to convert nominal attributes to numeric, while the second uses scikit-learn's LabelEncoder. Similarly, the first approach sets missing numeric values to -1, while the second approach imputes missing numeric values using scikit-learn's SimpleImputer. Furthermore, while the first approach implements a simplified version of SVD using a fixed number of iterations, the second approach uses scikit-learn's TruncatedSVD, which may employ more sophisticated algorithms. The first approach computes PCA manually using the SVD result, while the second approach uses scikit-learn's PCA module. Both approaches project onto three components. However, the first approach handles data more manually and explicitly, while the second approach utilizes Pandas DataFrames and scikit-learn utilities to handle and process data. 

The SVD results of both approaches show differences in scales and values. These differences can be attributed to several factors. Firstly, the first approach uses a custom implementation of SVD, and the results may vary due to the specific algorithm used and the number of iterations. Secondly, differences in handling nominal attributes, missing values, and data types can affect the results. Finally, the first approach calculates PCA directly from SVD results, while scikit-learn's PCA module may use a different algorithm. In addition, differences in numerical precision during calculations may also contribute to variations.