In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [138]:
from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split

# house_prices = load_boston()
# X, y = house_prices.data, house_prices.target

diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

n_train = 10
n_test = X.shape[0] - n_train

# normalize X
#X = (X - np.mean(X, axis=0, keepdims=True)) / np.std(X, axis=0, keepdims=True) 

# low rank approximation of X
# rank = np.linalg.matrix_rank(X)
# u, s, vh = np.linalg.svd(X)
# X = u[:,:rank].dot(np.diag(s)[:rank,:rank]).dot(vh[:rank,:])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=n_train, test_size=n_test, random_state=4803)

# train theta
lambda_ = 0
theta = np.linalg.inv(X_train.T.dot(X_train)+lambda_*np.eye(X_train.shape[1])).dot(X_train.T.dot(y_train))

# test
y_pred = X_test.dot(theta)

# evaluate performance
print('MSE on Training Data: {:0.3f}'.format(np.sum((X_train.dot(theta) - y_train)**2)/y_train.size))
print('MSE on Test Data: {:0.3f}'.format(np.sum((y_pred - y_test)**2)/y_test.size))



MSE on Training Data: 0.000
MSE on Test Data: 99036.313


In [156]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

class MyLeastSquares:
    def __init__(self, X_train, y_train):
        """Function stores feature matrix and corresponding target data.
        
        Parameters:
        -----------
        X_train: array_like, shape(N,D)
            ndarray containing N training examples, each with D feature values.
            
        y_train: array_like, shape(N,1)
            ndarray containing target values for each of N examples in X_train."""
        
        self.X_train = X_train
        self.y_train = y_train
        
    def fit(self):
        """Function computes the weight vector of shape (D+1, 1) for regression"""
        # append ones for bias
        X = np.concatenate((self.X_train, np.ones((self.X_train.shape[0],1))), axis=1)
        self.theta = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(self.y_train))
     
    def predict(self, X_test):
        """Function predicts targets for given X_test.
        
        Parameters:
        -----------
        X_test: array_like, shape(N,D)
            ndarray containing N test examples with D features each.
            
        Returns: array_like, shape(N,1).
            ndarray containing predicted targets of shape (N,1).
        """
        
        X = np.concatenate((X_test, np.ones((X_test.shape[0],1))), axis=1)
        y_pred = X.dot(self.theta)
        
        return y_pred
    
# load dataset
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# train-test split
n_train = 40
n_test = X.shape[0] - n_train

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=n_train, test_size=n_test, random_state=4803)

# train theta
LS = MyLeastSquares(X_train, y_train)
LS.fit()

# test
y_pred = LS.predict(X_test)

# evaluate performance
print('MSE on Training Data: {:0.3f}'.format(np.sum((LS.predict(X_train) - y_train)**2)/y_train.size))
print('MSE on Test Data: {:0.3f}'.format(np.sum((y_pred - y_test)**2)/y_pred.size))



MSE on Training Data: 3203.340
MSE on Test Data: 3282.631


In [168]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

def rank_truncate(X, rank):
    """Function perfroms rank truncation for given feature matrix X.
    
    Parameters:
    -----------
    X: array_like, shape(N,D).
        ndarray containing N training examples with D features each.
    
    rank: int
        integer specifying the number of singular values to truncate with.
        
    Returns:
    --------
    X_trunc: array_like, shape(N,D)
        ndarray containing rank-approximation of X.
    """
    
    u, s, vh = np.linalg.svd(X)
    X_trunc = u[:,:rank].dot(np.diag(s)[:rank,:rank]).dot(vh[:rank,:])
    
    return X_trunc
    
    
# load dataset
house_prices = load_boston()
X, y = house_prices.data, house_prices.target

# train-test split
n_train = 13
n_test = X.shape[0] - n_train

# perform rank truncation
rank = np.linalg.matrix_rank(X)
X = rank_truncate(X, rank)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=n_train, test_size=n_test, random_state=4803)

# train theta
LS = MyLeastSquares(X_train, y_train)
LS.fit()

# test
y_pred = LS.predict(X_test)

# evaluate performance
print('MSE on Training Data: {:0.3f}'.format(np.sum((LS.predict(X_train) - y_train)**2)/y_train.size))
print('MSE on Test Data: {:0.3f}'.format(np.sum((y_pred - y_test)**2)/y_pred.size))



MSE on Training Data: 7199.569
MSE on Test Data: 1763911382873375450975169673166848.000
