In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
from mpl_toolkits import mplot3d
from scipy.linalg import svd

In [None]:
# Load the preprocessed dataset; every later cell derives its inputs from this frame.
df = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')

In [None]:
# Names of all float64 columns; these are the variables that go into the PCA.
continuous_vars = list(df.select_dtypes(include='float64').columns)
continuous_vars

In [None]:
# Keep only the continuous (float64) columns, in the order listed in continuous_vars.
pca_variable_subset = df.loc[:, continuous_vars]

In [None]:
# Summarise the selected columns (dtype and non-null count per column)
# before they are converted to the PCA input matrix.
pca_variable_subset.info()

### PCA

In [None]:
# Data matrix for the PCA: one row per observation, one column per continuous variable.
X = pca_variable_subset.to_numpy()
# Number of observations (rows of X).
N = len(X)

In [None]:
# Step 1: Standardise every column to zero mean and unit standard deviation.
# NOTE: despite its name, X_centered is centred *and* scaled, so the matrix
# decomposed below is (up to a constant factor) the correlation matrix of X.
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
X_centered = (X - mean) / std

# Step 2: Compute covariance matrix (columns of X_centered are the variables).
# np.std above uses ddof=0 while np.cov divides by N-1, so the eigenvalues are
# scaled by N/(N-1); the ratios between eigenvalues are unaffected.
# atleast_2d keeps the single-variable case a 1x1 matrix instead of a scalar.
cov_matrix = np.atleast_2d(np.cov(X_centered, rowvar=False))

# Step 3: Compute eigenvalues and eigenvectors.
# The covariance matrix is symmetric, so use eigh: it guarantees real
# eigenvalues and orthonormal eigenvectors, whereas the general-purpose eig
# may return complex values caused by round-off.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Step 4: Sort eigenvalues (and the matching eigenvector columns) in
# descending order, so that column 0 is the first principal direction.
idx = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]

# Step 5: Project data onto the first two principal components
X_pca = np.dot(X_centered, eigenvectors[:, :2])

# Results
print("Eigenvalues:", eigenvalues)
print("Eigenvectors:\n", eigenvectors)
print("Projected Data (PCA):\n", X_pca)

In [None]:
# Fraction of the total variance carried by each principal component.
rho = eigenvalues / eigenvalues.sum()
threshold = 0.9

# Accumulate once; the loop and the plot below both reuse this array instead of
# recomputing the cumulative sum on every iteration.
cumulative_rho = np.cumsum(rho)

# Report the smallest number of leading components whose cumulative share
# exceeds the threshold. The percentage in the message follows `threshold`.
for i, explained in enumerate(cumulative_rho):
    if explained > threshold:
        print(f'We need {i+1} components to explain at least {threshold:.0%} of the variance of the data')
        break

# Plot variance explained
component_numbers = range(1, len(rho) + 1)
plt.figure()
plt.plot(component_numbers, rho, "x-")
plt.plot(component_numbers, cumulative_rho, "o-")
plt.plot([1, len(rho)], [threshold, threshold], "k--")
plt.title("Variance explained by principal components")
plt.xlabel("Number of principal components")
plt.ylabel("Variance explained")
plt.legend(["Individual", "Cumulative", "Threshold"])
plt.grid()
# Make sure the output folder exists so a fresh checkout can run this cell.
Path("./plots").mkdir(parents=True, exist_ok=True)
# Explicit extension: same file matplotlib would write by default (png).
plt.savefig("./plots/pca_variance_threshold.png")
plt.show()

### SVD

In [None]:
# Standardise the data: subtract each column's mean and divide by its
# standard deviation (broadcasting applies the per-column values to every row).
Y = (X - X.mean(axis=0)) / X.std(axis=0)

# PCA by computing SVD of Y.
# scipy's svd returns (U, s, Vh); the third output is stored here as V, so V is
# the *transposed* right-singular-vector matrix and its ROWS are the principal
# directions.
U, S, V = svd(Y, full_matrices=False)

# Variance explained by each component is proportional to its squared singular value
rho = S**2 / np.sum(S**2)

threshold = 0.90

# Position of the first component at which the cumulative share passes the
# threshold; the slice keeps at most one hit, so the loop body runs at most once.
cumulative_rho = np.cumsum(rho)
for i in np.flatnonzero(cumulative_rho > threshold)[:1]:
    print(f'{i+1} components/variables needed to surpass the threshold={threshold}')


# Plot individual and cumulative variance explained, with the threshold as a dashed line
component_numbers = range(1, len(rho) + 1)
fig, ax = plt.subplots()
ax.plot(component_numbers, rho, "x-")
ax.plot(component_numbers, cumulative_rho, "o-")
ax.plot([1, len(rho)], [threshold, threshold], "k--")
ax.set_title("Variance explained by principal components")
ax.set_xlabel("Principal component")
ax.set_ylabel("Variance explained")
ax.legend(["Individual", "Cumulative", "Threshold"])
ax.grid()
plt.show()

In [None]:
# V is the third output of scipy's svd (i.e. Vh), so row k holds the
# coefficients (loadings) of principal component k+1.
loadings_pc1, loadings_pc2 = V[0], V[1]

In [None]:
# Get the indices of the 5 variables with the largest loading MAGNITUDE.
# The sign of a singular vector is arbitrary (it can flip between SVD
# implementations), so a strongly negative loading is just as influential as a
# strongly positive one and must not be ranked last.
top_indices1 = np.argsort(np.abs(loadings_pc1))[-5:]  # 5 largest |loading|, ascending
top_indices_sorted1 = top_indices1[::-1]              # same indices, largest first

top_indices2 = np.argsort(np.abs(loadings_pc2))[-5:]  # 5 largest |loading|, ascending
top_indices_sorted2 = top_indices2[::-1]              # same indices, largest first

In [None]:
# The loadings are indexed like the columns of the matrix that was decomposed,
# which was built from pca_variable_subset, so variable names are looked up there.
feature_names = pca_variable_subset.columns

print('First Principal Component:\n')

for col in top_indices_sorted1:
    print(f'{feature_names[col]} with coefficient: {loadings_pc1[col]}')

print('\n#############################################################')
print('\nSecond Principal Component:\n')

for col in top_indices_sorted2:
    print(f'{feature_names[col]} with coefficient: {loadings_pc2[col]}')


In [None]:
# Re-read the preprocessed dataset (same file as `df`); df1 supplies the class labels below.
df1 = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')

In [None]:
# Show column dtypes and non-null counts of the reloaded frame.
df1.info()

In [None]:
# Class label of every row, in row order.
classLabels = df1['student_affordable'].tolist()
# Distinct labels in order of first appearance. A plain set would iterate in an
# order that can change between kernel restarts for string labels, which would
# silently change the class codes and the plot colours from run to run.
classNames = list(dict.fromkeys(classLabels))
# Map each distinct label to an integer code 0..C-1.
classDict = {name: code for code, name in enumerate(classNames)}

# Extract vector y (integer class code per row), convert to NumPy array
y = np.asarray([classDict[value] for value in classLabels])

In [None]:
# Scores of the standardised data in principal component space.
# V is the transposed factor returned by svd, so V.T has the principal
# directions as columns.
Z = Y @ V.T

# Zero-based indices of the two principal components shown on the axes
i, j = 0, 1

# Scatter plot of the projected data, one series per class
f = figure()
title("Copenhagen Apartments/Rooms data: PCA")
for c in range(len(classNames)):
    # rows of Z whose class code equals c
    members = Z[y == c]
    plot(members[:, i], members[:, j], "o", alpha=0.5)
legend(classNames)
xlabel(f"PC{i + 1}")
ylabel(f"PC{j + 1}")

In [None]:
# Project the standardised data onto principal component space.
# V is the transposed factor returned by svd, so V.T has the principal
# directions as columns.
Z = Y @ V.T

# Zero-based indices of the three principal components to be plotted
i, j, k = 0, 1, 2

# Create the figure and a 3D axes for the plot
fig = plt.figure(figsize=(10, 7))
ax = plt.axes(projection='3d')

for c in range(len(classNames)):
    # select indices belonging to class c:
    class_mask = y == c
    # The marker must be passed by keyword: the 4th positional parameter of
    # Axes3D.scatter is `zdir`, not the marker style.
    ax.scatter3D(Z[class_mask, i], Z[class_mask, j], Z[class_mask, k], marker="o", alpha=0.5)

ax.set_title("Copenhagen Apartments/Rooms data: PCA")
# One legend entry per scatter call, in the same order as classNames
ax.legend(classNames, title="Student affordable")
ax.set_xlabel("PC{0}".format(i + 1))
ax.set_ylabel("PC{0}".format(j + 1))
ax.set_zlabel("PC{0}".format(k + 1))

# Change plot angle (elevation, azimuth in degrees)
ax.view_init(10, -140)

# Save plot; create the output folder first so a fresh checkout can run this cell
Path("./plots").mkdir(parents=True, exist_ok=True)
plt.savefig("./plots/pca_projection.png")

# display the plot
plt.show()