# HW 3
- Author: Marc Brooks
- NetID: mgb45

## 1.) (Hard-thresholding)

## 2. (Great British Bake-off)

In [68]:
# importing required packages
import numpy as np
import pandas as pd
import itertools

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoLarsIC, LassoLarsCV, RidgeCV, LinearRegression
from sklearn.model_selection import cross_val_score, LeaveOneOut

import warnings
warnings.filterwarnings("ignore")

## (a)

First, since $\epsilon$ and $\textbf{X}$ are independent we can see that: 
$$
\begin{align}
\text{Var}(Y) &= \text{Var}(\textbf{X}^{T}\beta^{*} + \epsilon) \\
&= \text{Var}(\textbf{X}^{T}\beta^{*}) + \text{Var}(\epsilon) \\
&= \beta^{*T}\Sigma(\rho)\beta^{*} + \sigma^2 
\end{align}
$$

Furthmore, we recognize $P(Y - \textbf{X}^{T}\beta^{*})^{2} = P\epsilon^{2}$ as the expected squared residuals.
Since $P\epsilon = 0$ then $P\epsilon^2 = \sigma^2$ and $P(Y - \textbf{X}^{T}\beta^{*})^{2} = \sigma^2$.


Thus,
$$
\begin{align}
R^{2} &= 1 - \frac{P(Y - \textbf{X}^{T}\beta^{*})^{2}}{\text{Var}(Y)} \\
&= 1 - \frac{\sigma^2}{\beta^{*T}\Sigma(\rho)\beta^{*} + \sigma^2}
\end{align}
$$

## (b)

Setting $R^2 = .8$ we can solve for $\sigma^2$ as follows.

$$
\begin{align}
.8 &= 1 - \frac{\sigma^2}{\beta^{*T}\Sigma(\rho)\beta^{*} + \sigma^2} \\
.2 &= \frac{\sigma^2}{\beta^{*T}\Sigma(\rho)\beta^{*} + \sigma^2} \\
\sigma^2 &= .2(\beta^{*T}\Sigma(\rho)\beta^{*} + \sigma^2)  \\
.8\sigma^2 &= .2(\beta^{*T}\Sigma(\rho)\beta^{*})  \\
\sigma^2 &= .25(\beta^{*T}\Sigma(\rho)\beta^{*}) 
\end{align}
$$

In [2]:
n = 100
p = [10, 25, 50]
rho = [0, .25, .5]

Generating data sets with sparse signal

In [5]:
rho = .25
p=10
n = 100

In [6]:
beta_sp = np.fromfunction(lambda j,_: (2/np.sqrt(n))*(j+1 <= np.sqrt(p)), (p,1))

In [7]:
beta_dns =  np.fromfunction(lambda j,_: (5/(j+1)*np.sqrt(n)), (p,1))

In [8]:
Sigma_p = np.fromfunction(lambda i,j: rho**(np.abs(i-j)), (p,p))

In [9]:
sigma2_spr = beta_sp.T @ Sigma_p @ beta_sp
sigma2_dns = beta_dns.T @ Sigma_p @ beta_dns

In [10]:
epsilon_spr = np.random.normal(loc=0, scale=sigma2_spr)
epsilon_dns = np.random.normal(loc=0, scale=sigma2_dns)

In [11]:
X = np.random.multivariate_normal(mean = np.zeros(p), cov = Sigma_p, size = n)

In [12]:
Y_spr =  X @ beta_sp + epsilon_spr
Y_dns =  X @ beta_dns + epsilon_dns

In [13]:
lasso_aic = LassoLarsIC(criterion='aic')
lasso_aic.fit(X,Y_spr.reshape(n,))

LassoLarsIC()

In [14]:
lasso_aic.score(X, Y_spr)

1.0

In [15]:
mean_squared_error(Y_spr, lasso_aic.predict(X))

1.5789929242053192e-31

In [62]:

loo = LeaveOneOut()
loo

LeaveOneOut()

In [58]:
lasso_loo = LassoLarsCV(cv=loo)
lasso_loo.fit(X,Y_spr.reshape(n,))

LassoLarsCV(cv=LeaveOneOut())

In [18]:
lm_spr = LinearRegression().fit(X, Y_spr)

In [19]:
lm_spr.coef_.round(5)

array([[ 0.2,  0.2,  0.2,  0. , -0. ,  0. ,  0. ,  0. , -0. , -0. ]])

In [20]:
1/(lm_spr.coef_**2)**(1/2)

array([[5.00000000e+00, 5.00000000e+00, 5.00000000e+00, 3.29263346e+16,
        3.97496533e+15, 2.51196132e+19, 4.96353618e+16, 1.22316420e+16,
        1.47395290e+16, 9.57335477e+15]])

In [73]:
X.shape

(100, 10)

In [79]:
(X @ np.diagflat(np.abs(lm_spr.coef_))).shape

(100, 10)

In [7]:
labels = ['lasso']*3 + ['adpt lasso']*3 + ['ridge']*3 + ['adpt ridge']*3
tunings = ['AIC', 'BIC', 'LOO-CV']*4
labs = [labels[i] + ' ' + tunings[i] for i in range(len(labels))]

In [15]:
emse_table = pd.DataFrame(itertools.product(['sparse', 'dense'], 
              None                 labs,
                               [10,25,50],
                               [0,.25,.5]), 
             columns=['signal','Method-Tuning', 'p', 'rho'])

emse_table['mse'] = None

In [16]:
emse_table

Unnamed: 0,signal,Method-Tuning,p,rho,mse
0,sparse,lasso AIC,10,0.00,
1,sparse,lasso AIC,10,0.25,
2,sparse,lasso AIC,10,0.50,
3,sparse,lasso AIC,25,0.00,
4,sparse,lasso AIC,25,0.25,
...,...,...,...,...,...
211,dense,adpt ridge LOO-CV,25,0.25,
212,dense,adpt ridge LOO-CV,25,0.50,
213,dense,adpt ridge LOO-CV,50,0.00,
214,dense,adpt ridge LOO-CV,50,0.25,


In [111]:
U, D, V = np.linalg.svd(X, full_matrices=False)

In [112]:
(D**2/(D**2 + .02)).shape

(10,)

In [125]:
(U @ np.diag(D**2/(D**2 + .02)) @ U.T) @ Y_spr

array([[-0.12032482],
       [ 0.65299965],
       [-0.15502049],
       [-0.19917645],
       [-0.05998755],
       [ 0.57136972],
       [ 0.44384808],
       [ 0.54306634],
       [ 0.09496487],
       [-0.85864191],
       [ 0.07780335],
       [-0.52612679],
       [-0.7516899 ],
       [-0.33650755],
       [ 0.38243784],
       [-0.556927  ],
       [-0.24007669],
       [ 1.37058865],
       [-0.10725297],
       [ 0.53195727],
       [ 0.88450368],
       [-0.84105139],
       [ 0.81478276],
       [-0.21927419],
       [ 0.04208565],
       [ 0.67693705],
       [-0.19145329],
       [-0.59865626],
       [ 0.21910518],
       [ 0.73046872],
       [-0.16745336],
       [ 0.13532716],
       [ 0.39359962],
       [ 0.66698155],
       [-0.04222687],
       [ 0.1163018 ],
       [ 0.5509335 ],
       [-0.3991371 ],
       [ 0.3155339 ],
       [ 0.66813726],
       [-0.80002002],
       [-0.47572198],
       [-0.03634204],
       [-0.01877347],
       [-0.62404861],
       [-0

In [127]:
def lambda_bic(lambs, design_mat, Y_mat):
    U, D, V = np.linalg.svd(design_mat, full_matrices=False)
    
    max_i = np.argmax([mean_squared_error(Y_mat, (U @ np.diag(D**2/(D**2 + l)) @ U.T) @ Y_mat) for l in lambs])
    return lambds[max_i]
        
    
    
    

In [None]:
lambda_bic(np.linspace(0,1,.01), X, Y_spr)

In [100]:
? np.linalg.svd

In [None]:
n = 100
for p,rho in itertools.product([10,25,50],[0,.25,.5]):
    print(p,rho)
    # Generate data
    beta_spr = np.fromfunction(lambda j,_: (2/np.sqrt(n))*(j+1 <= np.sqrt(p)), (p,1))
    beta_dns =  np.fromfunction(lambda j,_: (5/(j+1)*np.sqrt(n)), (p,1))

    Sigma_p = np.fromfunction(lambda i,j: rho**(np.abs(i-j)), (p,p))
    sigma2_spr = .25*(beta_spr.T @ Sigma_p @ beta_spr)
    sigma2_dns = .25*(beta_dns.T @ Sigma_p @ beta_dns)
    
    # Sparse Signal
    lasso_aic_mse_spr = []
    lasso_bic_mse_spr = []
    lasso_loocv_mse_spr = []
    adp_lasso_aic_mse_spr = []
    adp_lasso_bic_mse_spr = []
    adp_lasso_loocv_mse_spr = []
    
    ridge_aic_mse_spr = []
    ridge_bic_mse_spr = []
    ridge_loocv_mse_spr = []
    adp_ridge_aic_mse_spr = []
    adp_ridge_bic_mse_spr = []
    adp_ridge_loocv_mse_spr =[]
    
    # Dense Signal 
    lasso_aic_mse_dns = []
    lasso_bic_mse_dns = []
    lasso_loocv_mse_dns = []
    adp_lasso_aic_mse_dns = []
    adp_lasso_bic_mse_dns = []
    adp_lasso_loocv_mse_dns = []
    
    ridge_aic_mse_dns = []
    ridge_bic_mse_dns = []
    ridge_loocv_mse_dns = []
    adp_ridge_aic_mse_dns = []
    adp_ridge_bic_mse_dns = []
    adp_ridge_loocv_mse_dns = []
    
    
    for _ in range (1000):
        # Create Dataset
        epsilon_spr = np.random.normal(loc=0, scale=sigma2_spr)
        epsilon_dns = np.random.normal(loc=0, scale=sigma2_dns)

        X = np.random.multivariate_normal(mean = np.zeros(p), cov = Sigma_p, size = n)

        Y_spr =  X @ beta_sp + epsilon_spr
        Y_dns =  X @ beta_dns + epsilon_dns
        
        #For addpative regressions
        lm_spr = LinearRegression().fit(X, Y_spr)
        lm_dns = LinearRegression().fit(X, Y_dns)
        
        # Adpative Lasso
        # Here we are compute X^T (D^-1)^T
        adX_sprL = X @ np.diagflat(np.abs(lm_spr.coef_))
        adX_dnsL = X @ np.diagflat(np.abs(lm_dns.coef_))
        
        # Adpative Ridge
        adX_spr_Rd = X @ np.diagflat((lm_spr.coef_**2)**(1/2))
        adX_dns_Rd = X @ np.diagflat((lm_dns.coef_**2)**(1/2))
        
        # For LOO-CV
        loo = LeaveOneOut()
    
        # Calculate Lassos
        # AIC
        # Sparse
        lasso_aic_spr = LassoLarsIC(criterion='aic')
        lasso_aic_spr.fit(X, Y_spr)
        lasso_aic_mse_spr.append(mean_squared_error(Y_spr, lasso_aic_spr.predict(X)))
        # Dense
        lasso_aic_dns = LassoLarsIC(criterion='aic')
        lasso_aic_dns.fit(X, Y_dns)
        lasso_aic_mse_dns.append(mean_squared_error(Y_dns, lasso_aic_dns.predict(X)))
        #BIC
        # Sparse
        lasso_bic_spr = LassoLarsIC(criterion='bic')
        lasso_bic_spr.fit(X, Y_spr)
        lasso_bic_mse_spr.append(mean_squared_error(Y_spr, lasso_bic_spr.predict(X)))
        # Dense
        lasso_bic_dns = LassoLarsIC(criterion='bic')
        lasso_bic_dns.fit(X, Y_dns)
        lasso_bic_mse_dns.append(mean_squared_error(Y_dns, lasso_bic_dns.predict(X)))
        #LOO-CV
        # Sparse
        lasso_loo_spr = LassoLarsCV(cv=loo)
        lasso_loo_spr.fit(X, Y_spr)
        lasso_loocv_mse_spr.append(mean_squared_error(Y_spr, lasso_loo_spr.predict(X)))
        # Dense
        lasso_loo_dns = LassoLarsCV(cv=loo)
        lasso_loo_dns.fit(X, Y_dns)
        lasso_loocv_mse_spr.append(mean_squared_error(Y_dns, lasso_loo_dns.predict(X)))

        #Adaptive Lasso
        # AIC
        # Sparse
        adp_lasso_aic_spr = LassoLarsIC(criterion='aic')
        adp_lasso_aic_spr.fit(adX_sprL, Y_spr)
        adp_lasso_aic_mse_spr.append(mean_squared_error(Y_spr, adp_lasso_aic_spr.predict(adX_sprL)))
        # Dense
        adp_lasso_aic_dns = LassoLarsIC(criterion='aic')
        adp_lasso_aic_dns.fit(adX_dnsL, Y_dns)
        adp_lasso_aic_mse_dns.append(mean_squared_error(Y_dns, adp_lasso_aic_dns.predict(adX_dnsL)))
        #BIC
        # Sparse
        adp_lasso_bic_spr = LassoLarsIC(criterion='bic')
        adp_lasso_bic_spr.fit(adX_sprL, Y_spr)
        adp_lasso_bic_mse_spr.append(mean_squared_error(Y_spr, adp_lasso_bic_spr.predict(adX_sprL)))
        # Dense
        adp_lasso_bic_dns = LassoLarsIC(criterion='bic')
        adp_lasso_bic_dns.fit(adX_dnsL, Y_dns)
        adp_lasso_bic_mse_dns.append(mean_squared_error(Y_dns, adp_lasso_bic_dns.predict(adX_dnsL)))
        #LOO-CV
        # Sparse
        adp_lasso_loo_spr = LassoLarsCV(cv=loo)
        adp_lasso_loo_spr.fit(adX_sprL, Y_spr)
        adp_lasso_loocv_mse_spr.append(mean_squared_error(Y_spr, adp_lasso_loo_spr.predict(adX_sprL)))
        # Dense
        adp_lasso_loo_dns = LassoLarsCV(cv=loo)
        adp_lasso_loo_dns.fit(adX_dnsL, Y_dns)
        adp_lasso_loocv_mse_dns.append(mean_squared_error(Y_dns, adp_lasso_loo_dns.predict(adX_dnsL)))


        # Calculate Ridge
        # AIC

        #BIC

        #LOO-CV

        #Adaptive Ridge
        # AIC

        # BIC

        # LOO-CV


10 0


In [37]:
n= 100
p = 10
rho = .25
beta_spr = np.fromfunction(lambda j,_: (2/np.sqrt(n))*(j+1 <= np.sqrt(p)), (p,1))

Sigma_p = np.fromfunction(lambda i,j: rho**(np.abs(i-j)), (p,p))
sigma2_spr = .25*(beta_spr.T @ Sigma_p @ beta_spr)

epsilon_spr = np.random.normal(loc=0, scale=sigma2_spr)

X = np.random.multivariate_normal(mean = np.zeros(p), cov = Sigma_p, size = n)

Y_spr =  X @ beta_sp + epsilon_spr


lasso_aic_spr = LassoLarsIC(criterion='aic')
x = lasso_aic_spr.fit(X, Y_spr.ravel())

In [39]:
 #For addpative regressions
lm_spr = LinearRegression().fit(X, Y_spr)
lm_dns = LinearRegression().fit(X, Y_dns)
        
        # Adpative Lasso
        # Here we are compute X^T (D^-1)^T
adX_sprL = X @ np.diagflat(np.abs(lm_spr.coef_))
adX_dnsL = X @ np.diagflat(np.abs(lm_dns.coef_))
        
        # Adpative Ridge
adX_spr_Rd = X @ np.diagflat((lm_spr.coef_**2)**(1/2))
adX_dns_Rd = X @ np.diagflat((lm_dns.coef_**2)**(1/2)) 

In [40]:
lm_spr.coef_

array([[ 2.00000000e-01,  2.00000000e-01,  2.00000000e-01,
         2.41351772e-17,  1.81752026e-17, -1.92174004e-17,
        -9.49103888e-17, -1.79067230e-16, -1.83295070e-16,
        -8.29886526e-17]])

In [41]:
np.diagflat(np.abs(lm_spr.coef_))

array([[2.00000000e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.00000000e-01, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.00000000e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.41351772e-17,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.81752026e-17, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
   

In [43]:
x = []

In [54]:
x.append(5)
x

[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]