### Question 6(a) Run the next few cells (before part b) to normalize "AgeHeight", do regression, do PCA on the input variables, and do regression using the components from PCA (dimension reduction on the input variables).

In [152]:
# Importing libraries needed
import numpy as np  # for matrices, array, linear algebra
import pandas as pd  # open source data analysis and manipulation
import sklearn # for PCA and other statistical packages

In [153]:
# Code for normalizing/standardizing data
def Normdata(X):
    """Return a standardized copy of ``X``.

    Every column is centred on its own mean and divided by its own standard
    deviation as returned by pandas' ``std()`` (sample standard deviation by
    default). The input ``X`` is not modified.
    """
    Norm_X = X.copy()
    for column in X:  # iterating a DataFrame yields its column labels
        centred = X[column] - X[column].mean()
        Norm_X[column] = centred / X[column].std()
    return Norm_X

In [154]:
# Code for regression using the Normal Equation (X.T*X)*theta = (X.T*y)
def LinearReg(Xdata,Y):
    """Fit least squares with an intercept and print the mean RSS.

    The coefficients are obtained by solving the normal equation
    (X.T X) theta = X.T Y, where X is ``Xdata`` with a column of ones
    prepended. The residual sum of squares divided by the number of rows is
    printed; nothing is returned.
    """
    intercept = np.ones(Xdata.shape[0])
    X = np.vstack([intercept, Xdata.T]).T  # design matrix: ones first, then the inputs
    gram = np.matmul(X.T, X)
    moment = np.matmul(X.T, Y)
    theta = np.linalg.solve(gram, moment)
    residual = np.matmul(X, theta) - Y
    print("Mean RSS = ", np.linalg.norm(residual)**2/X.shape[0])

In [155]:
AGH = pd.read_csv("AgeHeight.csv")
names = AGH.columns.tolist()  # column names, in file order

# Standardize every column, then regress the output on the two inputs.
print("Regression on normalized data")
Norm_AGH = Normdata(AGH)
Xd = Norm_AGH[names[:2]]  # input/feature variables: the first two columns
Y = Norm_AGH["y"]  # output variable
LinearReg(Xd, Y)

Regression on normalized data
Mean RSS =  0.21541950113378674


In [156]:
# PCA on the two standardized input columns of AgeHeight
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(Norm_AGH[names[0:2]])

# Report each principal direction together with its share of the variance.
# The heading previously read "ormalized"; the missing "n" is restored.
print("PCA on AgeHeight normalized data")
print("First component: ", pca.components_[0], ", variance explained: ", 100*pca.explained_variance_ratio_[0], "%")
print("Second component: ", pca.components_[1], ", variance explained: ", 100*pca.explained_variance_ratio_[1], "%")

PCA on AgeHeight ormalized data
First component:  [0.70710678 0.70710678] , variance explained:  94.19417382415922 %
Second component:  [ 0.70710678 -0.70710678] , variance explained:  5.805826175840775 %


In [157]:
# Reduce the input dimension with PCA, then regress the output on the
# retained components: first one component, then both.
for n_kept, heading in ((1, "Regression on the first component:"),
                        (2, "Regression on both components:")):
    print(heading)
    pca2 = PCA(n_components=n_kept)
    Transformed_AGH = pca2.fit_transform(Norm_AGH[names[0:2]])
    Y = Norm_AGH.y
    LinearReg(Transformed_AGH, Y)

Regression on the first component:
Mean RSS =  0.21872510166964712
Regression on both components:
Mean RSS =  0.21541950113378686


### Question 6(b) Write code to repeat part (a) for "Houses". Note that there are four input variables in Houses, and we should include 1, 2, 3, and 4 components in turn.

In [159]:
# Load the Houses dataset; the path is relative to the current working directory.
houses_path = 'Houses.csv'
houses_data = pd.read_csv(houses_path)

In [160]:
def Normdata(X):
    """Standardize each column of ``X`` and return the result as a new frame.

    Works on a copy, so the caller's DataFrame is left untouched. Each column
    has its mean subtracted and is then divided by its pandas ``std()``.

    Note: this cell redefines the ``Normdata`` from part (a); it computes the
    same standardization.
    """
    Norm_X = X.copy()
    for label in Norm_X.columns:
        values = Norm_X[label]
        spread = values.std()
        Norm_X[label] = (values - values.mean()) / spread
    return Norm_X

In [161]:
def LinearReg(Xdata, Y):
    """Least-squares fit with an intercept via the normal equation.

    Builds the design matrix by prepending a column of ones to ``Xdata``,
    solves (X.T X) theta = X.T Y, then prints and returns the residual sum of
    squares divided by the number of rows.
    """
    ones_column = np.ones(Xdata.shape[0])
    design = np.vstack([ones_column, Xdata.T]).T  # intercept column first
    normal_matrix = np.matmul(design.T, design)
    normal_rhs = np.matmul(design.T, Y)
    theta = np.linalg.solve(normal_matrix, normal_rhs)
    residuals = np.matmul(design, theta) - Y
    mean_rss = np.linalg.norm(residuals)**2 / design.shape[0]
    print("Mean RSS = ", mean_rss)
    return mean_rss

In [162]:
# Standardize every column of the Houses data; the result is used for both the inputs and PRICE below.
normalized_houses = Normdata(houses_data)

In [163]:
# The four input variables, in the order they are fed to PCA.
input_columns = [
    'LIVING_AREA',
    'LOT_SIZE',
    'BEDROOMS',
    'YEAR_BUILT',
]
X_houses = normalized_houses.loc[:, input_columns]  # standardized inputs
Y_houses = normalized_houses.loc[:, 'PRICE']  # standardized output

In [164]:
# Fit a 4-component PCA on the standardized house inputs.
pca_houses_4 = PCA(n_components=4).fit(X_houses)

# Keep the fitted directions and the fraction of variance each one explains.
components, explained_variance_ratio_4 = (
    pca_houses_4.components_,
    pca_houses_4.explained_variance_ratio_,
)

In [165]:
# Results: the component vectors first, then each one's explained variance in percent.
pca_results_4 = {
    **{f"Component {k}": components[k - 1] for k in range(1, 5)},
    **{
        f"Variance Explained by Component {k} (%)": explained_variance_ratio_4[k - 1] * 100
        for k in range(1, 5)
    },
}

pca_results_4

{'Component 1': array([0.61939756, 0.53196549, 0.57253022, 0.07462259]),
 'Component 2': array([ 0.1819595 , -0.32897207, -0.01195791,  0.92656631]),
 'Component 3': array([-0.03467355, -0.67904879,  0.69781125, -0.2252775 ]),
 'Component 4': array([ 0.76291228, -0.38429525, -0.43026227, -0.29181568]),
 'Variance Explained by Component 1 (%)': 56.98292415246886,
 'Variance Explained by Component 2 (%)': 27.692204690772265,
 'Variance Explained by Component 3 (%)': 11.532010647059119,
 'Variance Explained by Component 4 (%)': 3.7928605096997496}

In [166]:
# Regress the standardized PRICE on the leading k principal components,
# for k = 1 up to the number of input columns. The upper bound is taken
# from X_houses instead of a hard-coded 5, so it follows input_columns
# if that list ever changes; with the four current inputs the output is
# identical.
for n_components in range(1, X_houses.shape[1] + 1):
    print(f"Regression with {n_components} principal component(s):")
    pca = PCA(n_components=n_components)
    X_transformed = pca.fit_transform(X_houses)
    LinearReg(X_transformed, Y_houses)

Regression with 1 principal component(s):
Mean RSS =  0.20885638848844335
Regression with 2 principal component(s):
Mean RSS =  0.2057702643614693
Regression with 3 principal component(s):
Mean RSS =  0.19379852660635097
Regression with 4 principal component(s):
Mean RSS =  0.16118244781630114
