In [29]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, RidgeCV, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
%matplotlib inline

In [5]:
def symmetricize(arr1D):
    ID = np.arange(arr1D.size)
    return arr1D[np.abs(ID - ID[:,None])]

In [72]:
N, K = 100, 750
X = np.random.normal(0, 1, (N, K))
X = np.hstack((np.ones((N, 1)), X))
betas_real = np.random.normal(0, 1, K+1)
cov_betas = symmetricize(betas_real)
betas_real += (np.random.normal(0, 1, K+1) / 10)
y = X.dot(betas_real)

In [73]:
y_hat = X.dot(np.linalg.lstsq(X, y)[0])
r2_score(y, y_hat)

1.0

### Standard linear regression (no regularization)

In [75]:
folds = KFold(n_splits=10)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

scores = np.zeros(10)
for i, (train_idx, test_idx) in enumerate(folds.split(X, y)):
    pipe.fit(X[train_idx], y[train_idx])
    preds = pipe.predict(X[test_idx])
    scores[i] = pearsonr(y[test_idx], preds)[0]
    
print(scores, end='\n\n')
print("R: %3f. (%.3f)" % (scores.mean(), scores.std()))

[-0.36469511 -0.06510429  0.19879247  0.40344633 -0.05125232  0.56635711
  0.45281091  0.2645922   0.55536819  0.33347589]

R: 0.229379. (0.288)


### Ridge

In [76]:
folds = KFold(n_splits=10)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RidgeCV())
])

scores = np.zeros(10)
for i, (train_idx, test_idx) in enumerate(folds.split(X, y)):
    pipe.fit(X[train_idx], y[train_idx])
    preds = pipe.predict(X[test_idx])
    scores[i] = pearsonr(y[test_idx], preds)[0]

print(scores)
print("R: %3f. (%.3f)" % (scores.mean(), scores.std()))

[-0.40153522 -0.06952581  0.3433329   0.44530918 -0.01890273  0.53877646
  0.46723613  0.26109276  0.56704463  0.31522416]
R: 0.244805. (0.297)


### Tikhonov

In [77]:
folds = KFold(n_splits=10)
lambd = 100
regul = lambd * cov_betas.T.dot(cov_betas)
scaler = StandardScaler()

scores = np.zeros(10)
for i, (train_idx, test_idx) in enumerate(folds.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    betas_recon = np.linalg.pinv(X_train.T.dot(X_train) + regul).dot(X_train.T).dot(y_train)
    scores[i] = pearsonr(X_test.dot(betas_recon), y_test)[0]

scores.mean()

-0.21602906682839412