In [16]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from scipy.misc import comb as comb1
from scipy.special import gamma
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import scipy.linalg as la
import time

In [2]:
def ridge_regression(X,y,lmbda):
    '''
    Parameters:
    X - a NxD matrix, where N is number of observations 
        and D is number of features. Do not include a
        column of ones for an intercept in X.
    y - a Nx1 vector of observations to be predicted
    lmbda - a parameter that determines how much to penalize
        large coefficients
    
    '''
    n,d = X.shape
    Xc = X - np.outer(np.ones(n), X.mean(axis=0))
    beta = la.solve(Xc.T.dot(Xc) + lmbda*np.eye(d), Xc.T.dot(y))
    intercept = y.mean()
    return intercept, beta

In [6]:
df = pd.read_csv('wages.csv')
df['female*married'] = df['female']*df['married']
train, test = train_test_split(df, test_size=.3, train_size = .7)
Xtrain = train[['female','educ','exper', 'tenure', 'married', 'female*married', 'numdep', 'nonwhite']]
ytrain = train['wage']
Xtest = test[['female','educ','exper', 'tenure', 'married', 'female*married', 'numdep', 'nonwhite']]
ytest = test['wage']

In [28]:
ols_intercept, ols_beta = ridge_regression(Xtrain,ytrain,0)
ols_mse = la.norm(ytest - ols_intercept - np.dot(Xtest,ols_beta))**2 / len(ytest)
#print('OLS Results:\t{}\t{}\t{}'.format(ols_intercept, ols_beta, ols_mse))

intercepts = [ols_intercept]
betas = [ols_beta]
compare_betas = []
mses = [ols_mse]
compare_time = []

for lmbda in range(-5,6):
    start = time.clock()
    intercept, beta = ridge_regression(Xtrain,ytrain,10**lmbda)
    ridge_time = time.clock() - start
    predict1 = intercept + np.dot(Xtest, beta)
    mse = mean_squared_error(ytest, predict1)
    intercepts.append(intercept)
    betas.append(beta)
    mses.append(mse)
    
    start = time.clock()
    ridge = linear_model.Ridge(alpha = lmbda)
    ridge.fit(Xtrain,ytrain)
    sklearn_time = time.clock() - start
    compare_time.append([ridge_time, sklearn_time])
    compare_betas.append(np.array([beta,ridge.coef_]).T)
    
betas = np.array(betas)

print("First row is OLS results, following results are ridge regression with lambda = 10^k for integer k in [-5,5], starting at k=-5.")
print('Intercepts')
print(np.array(intercepts).reshape(len(intercepts), 1))
print('\n\n')
print('Coefficients')
print(['female','educ','exper', 'tenure'])
print(betas[:,:4])
print('')
print(['married', 'female*married', 'numdep', 'nonwhite'])
print(betas[:,4:])
print('\n\n')
print('Mean Squared Errors')
print(np.array(mses).reshape(len(mses),1))

print("\n\nComparing my ridge regression to scikit-learn's, time listed first, followed by coefficients.\nMy results are on the left and sk-learn's on the right.")
for k in range(len(compare_betas)):
    print(compare_time[k])
    print(compare_betas[k])
    print('\n\n')

First row is OLS results, following results are ridge regression with lambda = 10^k for integer k in [-5,5], starting at k=-5.
Intercepts
[[ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]
 [ 5.85236413]]



Coefficients
['female', 'educ', 'exper', 'tenure']
[[-0.46011316  0.53168456  0.01715607  0.11658823]
 [-0.46011392  0.53168457  0.01715607  0.11658824]
 [-0.46012078  0.53168469  0.0171561   0.1165883 ]
 [-0.46018935  0.5316859   0.01715642  0.11658888]
 [-0.4608744   0.53169795  0.01715962  0.1165947 ]
 [-0.46765832  0.53181759  0.01719132  0.11665263]
 [-0.52936732  0.53293641  0.01748528  0.11720848]
 [-0.81909864  0.53972616  0.01914972  0.12138187]
 [-0.68249108  0.54355845  0.02046853  0.137528  ]
 [-0.14480245  0.41370932  0.01182945  0.1517687 ]
 [-0.01876529  0.11995489  0.00557999  0.10865631]
 [-0.00217126  0.01486199  0.00870167  0.0261425 ]]

['married', '