# Part 1

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
# Load the Part 1 data set; the response is stored in column 'Y'.
myData = pd.read_csv("Coding2_Data.csv")
predictors = myData.drop(['Y'], axis = 1)
# Predictor names only, in the same order as the columns of X. Taking the
# names from the predictor frame (not from myData) keeps var_names[i] aligned
# with column i of X wherever 'Y' sits in the file.
var_names = predictors.columns
y = myData[['Y']].to_numpy()   # double brackets keep y two-dimensional: (n, 1)
X = predictors.to_numpy()

In [9]:
# Sanity check: design-matrix dimensions and number of responses.
X.shape, y.shape[0]

((506, 13), 506)

## CD for Lasso

In [19]:
def one_var_lasso(r, x, lam):
    """Solve the one-predictor lasso update by soft-thresholding.

    Computes the least-squares slope of ``r`` on ``x`` and shrinks it toward
    zero by ``r.size * lam / ||x||^2``.

    Parameters
    ----------
    r : ndarray
        Residual vector.
    x : ndarray
        Single predictor, same shape as ``r``.
    lam : float
        Penalty level.

    Returns
    -------
    The shrunken slope (same array shape as ``r.T @ x``), or the integer 0
    when the unpenalized slope lies inside the threshold band.
    """
    sq_norm = np.square(np.linalg.norm(x))
    ols_slope = (r.T @ x) / sq_norm
    band = 2 * r.size * lam / sq_norm
    half_band = band / 2

    if ols_slope > half_band:
        return ols_slope - half_band
    if abs(ols_slope) <= half_band:
        return 0
    return ols_slope + half_band

In [None]:
def MyLasso(X, y, lam_seq, maxit = 100):
    """Fit a lasso coefficient path by cyclic coordinate descent.

    The predictors are centered and scaled to unit (population) standard
    deviation, the path is fitted on that scale with warm starts from one
    lambda to the next, and the coefficients are mapped back to the original
    scale of X before returning. Each coordinate update is delegated to
    ``one_var_lasso(r, x, lam)``.

    Parameters
    ----------
    X : array-like, shape (n, p)
        Design matrix without an intercept column.
    y : array-like, shape (n,) or (n, 1)
        Response vector.
    lam_seq : sequence of float
        Penalty values, arranged from large to small so warm starts help.
    maxit : int, default 100
        Number of full sweeps over the coordinates for each lambda.

    Returns
    -------
    B : ndarray, shape (p + 1, len(lam_seq))
        Column m holds the fit for ``lam_seq[m]`` on the original scale of X:
        row 0 is the intercept, rows 1..p are the slopes. Predictors with
        zero variance keep a slope of 0.
    """
    X = np.asarray(X, dtype=float)
    # Force a column vector: a 1-D y would broadcast against the (n, 1)
    # predictor columns below and silently turn the residual into (n, n).
    y = np.asarray(y, dtype=float).reshape(-1, 1)

    n, p = X.shape
    nlam = len(lam_seq)
    B = np.zeros((p+1, nlam))

    # Standardize X and remember the centers/scales for the back-transform.
    centers = X.mean(0)
    scales = X.std(0)
    usable = scales > 0                # constant columns carry no signal
    safe_scales = scales.copy()
    safe_scales[~usable] = 1.0         # avoid 0/0 when standardizing them
    newX = (X - centers) / safe_scales
    y_mean = y.mean()

    # Initialize the coefficient vector b and the residual vector r.
    # The standardized columns are centered, so the intercept on that scale
    # is mean(y) and is removed from the residual up front.
    b = np.zeros(p)
    r = y - y_mean

    for m in range(nlam):
        for _ in range(maxit):
            for j in range(p):
                if not usable[j]:
                    continue
                X_j = newX[:, j].reshape(-1,1)
                # Partial residual: add back predictor j's current contribution.
                r = r + X_j * b[j]
                # one_var_lasso may return a (1, 1) array or the int 0;
                # .item() stores a plain scalar either way.
                b[j] = np.asarray(one_var_lasso(r, X_j, lam_seq[m])).item()
                r = r - X_j * b[j]
        B[1:, m] = b

    # Map back to the original scale: slope_j = b_j / scale_j and
    # intercept = mean(y) - sum_j center_j * slope_j, stored in row 0.
    B[1:, :] = B[1:, :] / safe_scales[:, None]
    B[0, :] = y_mean - centers @ B[1:, :]

    return B

In [None]:
# Penalty grid: 80 values of log(lambda) evenly spaced from -1 down to -8,
# i.e. lambda runs from large to small as MyLasso expects.
log_lam_seq = np.linspace(start=-1, stop=-8, num=80)
lam_seq = np.exp(log_lam_seq)

# Coefficient path: one column per lambda, intercept in the first row.
myout = MyLasso(X=X, y=y, lam_seq=lam_seq, maxit=100)

In [None]:
# Number of rows of the coefficient matrix (intercept row + one per predictor).
# Read the shape by index: unpacking into `_` would shadow IPython's
# last-output variable for the rest of the session.
p = myout.shape[0]

fig, ax = plt.subplots(figsize=(12, 8))

# Row 0 is the intercept, so predictor i lives in row i + 1.
for i in range(p - 1):
    ax.plot(log_lam_seq, myout[i + 1, :], label=var_names[i])

ax.set_xlabel('Log Lambda')
ax.set_ylabel('Coefficients')
ax.set_title('Lasso Paths - Numpy implementation')
ax.legend()
# Trailing semicolon keeps the axis-limits tuple out of the cell output.
ax.axis('tight');

In [None]:
# Reference coefficient matrix to compare `myout` against.
lasso_coef_frame = pd.read_csv("Coding2_lasso_coefs.csv")
lasso_coef = lasso_coef_frame.to_numpy()
lasso_coef.shape

In [1]:
# Largest absolute entry-wise gap between our path and the reference path.
np.abs(myout - lasso_coef).max()

NameError: name 'myout' is not defined

# Part 2

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as lm
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [21]:
# Part 2 data set, fetched from the course repository; the response is column 'Y'.
url = "https://raw.githubusercontent.com/liangfgithub/liangfgithub.github.io/master/Data/Coding2_Data2.csv"
myData = pd.read_csv(url)

# Split into response (Series) and predictors (DataFrame).
Y = myData['Y']
X = myData.drop(columns=['Y'])

In [None]:
# Sanity check: predictor dimensions and number of responses.
X.shape, Y.shape[0]

In [None]:
# Seed the global NumPy RNG so the split (and anything else drawing from
# np.random afterwards) is reproducible under Restart & Run All.
# The value itself is arbitrary.
np.random.seed(542)

n = len(Y)
indices = np.arange(0, n)
np.random.shuffle(indices)

# First 25% of the shuffled positions form the test set, the rest the training set.
test_ind = indices[:int(np.floor(0.25*n))]
train_ind = indices[len(test_ind):]

# Splitting the data into training and testing sets.
# Both X and Y are indexed by position (.iloc): plain Y[...] is label-based
# and only agrees with X.iloc[...] while Y has the default 0..n-1 index.
X_train = X.iloc[train_ind]
Y_train = Y.iloc[train_ind]
X_test = X.iloc[test_ind]
Y_test = Y.iloc[test_ind]

In [None]:
# Baseline: ordinary least squares on all predictors, scored by test-set MSE.
full = lm()
full.fit(X_train, Y_train)
full_test_pred = full.predict(X_test)
mean_squared_error(Y_test, full_test_pred)

In [None]:
# Candidate ridge penalties: 100 values log-spaced between 1e-10 and 1e1.
ridge_alphas = np.logspace(-10, 1, 100)

# 10-fold cross-validation over the grid, scored by (negative) MSE.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so this cell needs an older scikit-learn as written.
ridgecv = RidgeCV(
    alphas=ridge_alphas,
    cv=10,
    scoring='neg_mean_squared_error',
    normalize=True,
)
ridgecv.fit(X_train, Y_train)

# Penalty selected by cross-validation.
ridgecv.alpha_

In [None]:
# Refit ridge on the full training set at the CV-selected penalty and report test MSE.
# NOTE(review): `normalize` was removed in scikit-learn 1.2; this cell needs
# an older scikit-learn as written.
ridge_model = Ridge(alpha=ridgecv.alpha_, normalize=True).fit(X_train, Y_train)
ridge_test_pred = ridge_model.predict(X_test)
mean_squared_error(Y_test, ridge_test_pred)

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Installs a process-wide "ignore" filter for ConvergenceWarning: from this
# point on those warnings are not shown by any cell.
# NOTE(review): presumably aimed at the lasso fits below; a fit that fails to
# converge will now do so silently, so confirm this is intended.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
# Candidate lasso penalties: 100 values log-spaced between 1e-10 and 1e1.
lasso_alphas = np.logspace(-10, 1, 100)

# 10-fold cross-validation over the grid.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so this cell needs an older scikit-learn as written.
lassocv = LassoCV(
    alphas=lasso_alphas,
    cv=10,
    normalize=True,
)
lassocv.fit(X_train, Y_train)

# Penalty selected by cross-validation.
lassocv.alpha_

In [None]:
# Per-alpha cross-validation error: rows of mse_path_ are alphas, columns are folds.
mse_path = lassocv.mse_path_
# Take the fold count from the data, not a hard-coded 10, so the standard
# error stays right if the `cv` setting of the LassoCV fit changes.
n_folds = mse_path.shape[1]

mean_mse = np.mean(mse_path, axis=1)
std_mse = np.std(mse_path, axis=1) / np.sqrt(n_folds)   # standard error of the mean

cv_alphas = lassocv.alphas_
min_idx = np.argmin(mean_mse)

# Alpha with the smallest mean CV error.
alpha_min = cv_alphas[min_idx]

# One-standard-error rule: the largest alpha whose mean CV error is within
# one standard error of the minimum.
threshold = mean_mse[min_idx] + std_mse[min_idx]
alpha_1se = max(cv_alphas[np.where(mean_mse <= threshold)])

alpha_min, alpha_1se  # alpha_min is expected to match lassocv.alpha_

In [None]:
# Lasso at the minimum-CV-error penalty, scored by test-set MSE.
# NOTE(review): `normalize` was removed in scikit-learn 1.2; this cell needs
# an older scikit-learn as written.
lasso_model_min = Lasso(
    alpha=alpha_min,
    normalize=True,
    max_iter=10000,
)
lasso_model_min.fit(X_train, Y_train)
lasso_min_test_pred = lasso_model_min.predict(X_test)
mean_squared_error(Y_test, lasso_min_test_pred)

In [None]:
# Lasso at the one-standard-error penalty, scored by test-set MSE.
# NOTE(review): `normalize` was removed in scikit-learn 1.2; this cell needs
# an older scikit-learn as written.
lasso_model_1se = Lasso(
    alpha=alpha_1se,
    normalize=True,
    max_iter=10000,
)
lasso_model_1se.fit(X_train, Y_train)
lasso_1se_test_pred = lasso_model_1se.predict(X_test)
mean_squared_error(Y_test, lasso_1se_test_pred)

In [None]:
# Positions of the predictors the 1se lasso kept (non-zero coefficients).
nonzero_indices = np.nonzero(lasso_model_1se.coef_)[0]

# Refit plain least squares on just those predictors and score on the test set.
X_train_selected = X_train.iloc[:, nonzero_indices]
X_test_selected = X_test.iloc[:, nonzero_indices]
lm_refit = lm()
lm_refit.fit(X_train_selected, Y_train)
mean_squared_error(Y_test, lm_refit.predict(X_test_selected))

In [None]:
class PCR(object):
    """Principal component regression with the number of components chosen by K-fold CV.

    Attributes set by ``fit``:
      ncomp     -- number of principal components selected by cross-validation
      transform -- fitted Pipeline (StandardScaler -> PCA with ``ncomp`` components)
      model     -- linear regression fitted on the transformed training data
    """

    def __init__(self, num_folds=10):
        # Number of cross-validation folds used by fit().
        self.folds = num_folds

    def fit(self, X, Y):
        """Select the number of components by cross-validation, then refit on all data.

        Parameters
        ----------
        X : array-like, shape (n, p)
            Predictor matrix.
        Y : array-like, shape (n,)
            Response vector.

        Raises
        ------
        ValueError
            If the data are too small to evaluate even one component.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        n, p = X.shape

        # Random partition of the row positions into folds (sizes may differ by one).
        indices = np.arange(n)
        np.random.shuffle(indices)
        index_sets = np.array_split(indices, self.folds)

        # Largest number of components evaluated in every fold; bounded by p
        # and by the smallest training-set size minus one.
        ncomp = min(p, n - 1 - max(len(fold) for fold in index_sets))
        if ncomp < 1:
            raise ValueError("not enough observations to cross-validate a single component")
        cv_err = np.zeros(ncomp)   # cv_err[k]: total squared error using k + 1 components

        for ifold, test_inds in enumerate(index_sets):
            # Concatenate the other folds; this also works when folds have
            # unequal sizes, where np.delete on the list of arrays does not.
            train_inds = np.concatenate(
                [fold for i, fold in enumerate(index_sets) if i != ifold]
            )

            pipeline = Pipeline([('scaling', StandardScaler()), ('pca', PCA())])
            pipeline.fit(X[train_inds, :])
            Z_train = pipeline.transform(X[train_inds, :])
            # Score columns are centered and mutually orthogonal, so each
            # slope is a separate one-variable regression and the intercept
            # is the training mean of Y.
            coefs = Y[train_inds].T @ Z_train / np.sum(Z_train**2, axis=0)
            b0 = np.mean(Y[train_inds])

            Z_test = pipeline.transform(X[test_inds, :])

            for k in np.arange(ncomp):
                # Prediction with the first k + 1 components.
                preds = Z_test[:, :k + 1] @ coefs.T[:k + 1] + b0
                # Plain accumulation across folds (no doubling of the running sum).
                cv_err[k] += np.sum((Y[test_inds] - preds)**2)

        # cv_err[k] belongs to k + 1 components, hence the + 1.
        min_ind = np.argmin(cv_err)
        self.ncomp = min_ind+1

        # Refit the scaler, the PCA and the regression on the full data.
        pipeline = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=self.ncomp))])
        self.transform = pipeline.fit(X)
        self.model = lm().fit(self.transform.transform(X), Y)

    def predict(self, X):
        """Predict responses for new rows of X using the fitted transform and model."""
        X_ = self.transform.transform(X)
        return self.model.predict(X_)

In [None]:
# Principal component regression with CV-selected components, scored by test-set MSE.
# PCR works on NumPy arrays, so the pandas objects are converted first.
pcr = PCR()
X_train_arr, Y_train_arr = X_train.to_numpy(), Y_train.to_numpy()
pcr.fit(X_train_arr, Y_train_arr)
pcr_test_pred = pcr.predict(X_test.to_numpy())
mean_squared_error(Y_test, pcr_test_pred)