In [1]:
from functools import reduce
from itertools import product

import numpy as np
import pandas as pd

import toolbox as tb

### TODO:
- figure out if rigde/lasso is centering variables already?
- add control over mean/variance of variables?

# Define functions

In [2]:
def generate_XY(N, sigma, num_non_correlated=1, num_spurious = 0, num_correlated = 0):
    """
    Used to generate data with different properties
    """
    
    # create non-correlated gaussian data
    S = np.eye(num_non_correlated)
    X = np.random.multivariate_normal(np.zeros(num_non_correlated), S, N)
    
    # Add correlated indicators
    if num_correlated > 0:
        A = np.random.randint(-5, 5, size=(num_correlated,num_correlated))
        S2 = A.T@A
        X2 = np.random.multivariate_normal(np.zeros(num_correlated),S2,N)
        X = np.hstack([X, X2])
        
    # create a Y 
    e = np.random.normal(0, sigma**2, N)
    betas = np.random.binomial(10, 0.5, num_non_correlated + num_correlated) - 5
    Y = (X@betas) + e
    
    if num_spurious > 0:
        X3 = np.random.multivariate_normal(np.zeros(num_spurious), np.eye(num_spurious), N)
        X = np.hstack([X, X3])
    return X,Y

In [3]:
def fit_models(X, Y, model_parms, num_cv_iterations = 10): 
    """
    Split data, fit several models, rinse and repeat
    """
    in_sample_scores = []
    out_sample_scores = []

    for _ in range(num_cv_iterations):
        X_train, X_test, Y_train, Y_test = tb.train_test_split(X,Y, test_fraction=0.25)
        in_sample = []
        out_sample = []

        for mod_type, l in model_parms:
            mod = tb.regularized_linear_model(X_train,Y_train, mod_type, l)
            in_sample.append(mod.score(X_train,Y_train))
            out_sample.append(mod.score(X_test, Y_test))
            
        in_sample_scores.append(in_sample)
        out_sample_scores.append(out_sample)

    out_sample_scores = pd.DataFrame(out_sample_scores).T
    in_sample_scores = pd.DataFrame(in_sample_scores).T
    idx = pd.MultiIndex.from_tuples(model_parms, names=['model', 'lambda'])
    out_sample_scores.index = idx
    in_sample_scores.index = idx
    return in_sample_scores, out_sample_scores

### Question: is sklearn centering automatically?

In [17]:
np.random.seed(43)
X,Y = generate_XY(N=100, sigma=3, num_non_correlated=5, num_spurious=0, num_correlated=0)

First test closed from OLS regression

In [22]:
mod = tb.linear_model(X,Y, add_constant=False, verbose=False)
mod.params

array([ 2.17419698, -0.14421206, -0.54218838,  1.48810666, -2.78748293])

In [23]:
np.linalg.inv(X.T@X)@(X.T@Y)

array([ 2.17419698, -0.14421206, -0.54218838,  1.48810666, -2.78748293])

Now test closed form ridge regression

In [24]:
lam = 2

In [29]:
res = tb.regularized_linear_model(X,Y,'ridge',alpha=lam)
res.coef_

array([ 2.09534865, -0.18430095, -0.63436477,  1.49045728, -2.659303  ])

In [30]:
np.linalg.inv(X.T@X + lam*np.eye(X.shape[1]))@(X.T@Y)

array([ 2.11824516, -0.14239539, -0.54355627,  1.44481577, -2.69918465])

Now demean

In [33]:
X2 = tb.demean(X)
Y2 = tb.demean(Y)

In [34]:
np.linalg.inv(X2.T@X2 + lam*np.eye(X2.shape[1]))@(X2.T@Y2)

array([ 2.09534865, -0.18430095, -0.63436477,  1.49045728, -2.659303  ])

In [36]:
np.linalg.inv(X2.T@X2)@(X2.T@Y2)

array([ 2.15065263, -0.18671589, -0.63451438,  1.53441799, -2.74635643])

# Generate Data

In [4]:
np.random.seed(43)
X,Y = generate_XY(N=100, sigma=3, num_non_correlated=5, num_spurious=100, num_correlated=10)

In [5]:
l_vals = [0, 1, 5, 10, 50, 100, 500]
model_types = ['ridge', 'lasso']
parms = list(product(model_types, l_vals))

In [6]:
np.random.seed(43)
from warnings import catch_warnings, simplefilter
with catch_warnings():
    simplefilter("ignore")
    iss, oss = fit_models(X,Y, parms)

In [7]:
pd.concat([iss.mean(axis=1), oss.mean(axis=1)], axis=1, keys=['In', 'Out'])

Unnamed: 0_level_0,Unnamed: 1_level_0,In,Out
model,lambda,Unnamed: 2_level_1,Unnamed: 3_level_1
ridge,0,1.0,0.726811
ridge,1,0.999894,0.747686
ridge,5,0.998595,0.789871
ridge,10,0.996565,0.812249
ridge,50,0.983594,0.852228
ridge,100,0.973545,0.862219
ridge,500,0.94159,0.861493
lasso,0,1.0,0.379258
lasso,1,0.959784,0.866054
lasso,5,0.926956,0.878112
