A very good question is why Ridge requires a much smaller regularization factor when fitted with SGD than when fitted by matrix inversion.
One explanation might be that with matrix inversion you work on all the data in one go, so the regularization term is subtracted only once. With regular SGD, you end up subtracting the regularization term on each and every batch update (where a batch is a single data point in regular SGD). By the time you have gone through the whole data set, you have essentially subtracted the regularization term n_batches times, whereas you would have done so only once with full-batch gradient descent, which uses the average gradient over the whole data set. So perhaps one should divide the regularization term by the number of batches?

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../src/')

In [68]:
import numpy as np
from modelling import ols,ridge
from modelling.sgd import SGD_optimizer
from model_evaluation.param_analysis import evaluate_parameter
from data.create_dataset import *
from visualization.visualize import *
from sklearn.model_selection import  train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Ridge

In [9]:
# Build the design matrix X and target z from the terrain raster
# (degree=4 is presumably the polynomial degree of the features -- confirm in
# data.create_dataset), then hold out 20% of the rows.
X, z = create_dataset('../data/raw/SRTM_data_Norway_1.tif',degree=4)
X_train, X_test, z_train, z_test = train_test_split(X,z, test_size=0.2)

# Fit the scalers on the training split only, so that no statistics from the
# held-out rows are used when standardizing.
Xscaler = StandardScaler().fit(X_train)
zscaler = StandardScaler().fit(z_train)

X_train = Xscaler.transform(X_train)
z_train = zscaler.transform(z_train)

# Apply the same (train-fitted) scaling to the held-out split; otherwise the
# test arrays stay in raw units and cannot be compared with predictions from
# models trained on the standardized data.
X_test = Xscaler.transform(X_test)
z_test = zscaler.transform(z_test)


In [23]:
# Fit coefficients on the standardized training data with the project's
# ols.fit_beta, with fit_intercept=False; the bare name below makes the
# notebook display the resulting coefficient array.
beta_ols = ols.fit_beta(X_train,z_train,fit_intercept=False)
beta_ols

array([[ 0.        ],
       [ 3.04093673],
       [ 1.71139914],
       [-6.54620225],
       [-7.02806498],
       [-3.43016744],
       [ 7.22626538],
       [ 9.36566755],
       [ 2.33435218],
       [ 4.6773026 ],
       [-3.59703173],
       [-1.88177428],
       [-3.84071539],
       [ 1.92373873],
       [-3.07587299]])

In [58]:
# Ridge fit on the same training data with regularization factor lmb, using
# the project's ridge.fit_beta (presumably the matrix-inversion solution the
# introduction refers to -- confirm in modelling.ridge).
lmb = 0.001
beta_rid = ridge.fit_beta(X_train,z_train, lmb,fit_intercept=False)
# Display the coefficients for comparison with the OLS and SGD results.
beta_rid

array([[ 0.        ],
       [ 3.03358673],
       [ 1.70666781],
       [-6.52234693],
       [-7.01822223],
       [-3.41635829],
       [ 7.19838475],
       [ 9.35343981],
       [ 2.32703329],
       [ 4.6609275 ],
       [-3.58594329],
       [-1.87684412],
       [-3.83745181],
       [ 1.92574176],
       [-3.06904065]])

In [63]:
# SGD_optimizer run with regularization='l2', lmb=1e-4, momentum enabled and
# schedule='constant' at lr0=0.1.
# Setting batch = X_train.shape[0] instead would make a single batch as large
# as the whole training set.
epochs = 20000
batch = 32

sgd_rid2 = SGD_optimizer(
    fit_intercept=False,
    batch_size=batch,
    n_epochs=epochs,
    use_momentum=True,
    regularization='l2',
    lmb=0.0001,
    schedule='constant',
    lr0=0.1,
)
sgd_rid2.fit(X_train, z_train)
# Show the fitted coefficients as the cell output.
sgd_rid2.beta

array([[ 0.        ],
       [ 2.85832515],
       [ 1.57707906],
       [-6.00199197],
       [-6.84317518],
       [-3.10879803],
       [ 6.54545255],
       [ 9.06723041],
       [ 2.17566171],
       [ 4.28561985],
       [-3.33597244],
       [-1.79897844],
       [-3.77179958],
       [ 1.9488044 ],
       [-2.8949555 ]])

In [65]:
# Same SGD_optimizer configuration as a plain re-run: regularization='l2',
# lmb=1e-4, momentum enabled, schedule='constant', lr0=0.1, batches of 32.
# Setting batch = X_train.shape[0] instead would make a single batch as large
# as the whole training set.
epochs = 20000
batch = 32

sgd_rid2 = SGD_optimizer(
    fit_intercept=False,
    batch_size=batch,
    n_epochs=epochs,
    use_momentum=True,
    regularization='l2',
    lmb=0.0001,
    schedule='constant',
    lr0=0.1,
)
sgd_rid2.fit(X_train, z_train)
# Show the fitted coefficients as the cell output.
sgd_rid2.beta

array([[ 0.        ],
       [ 1.15620053],
       [ 0.52234237],
       [-1.07951399],
       [-3.85531165],
       [-0.50460038],
       [ 1.5314463 ],
       [ 5.22852995],
       [ 0.31744146],
       [ 1.20164497],
       [-1.55601893],
       [-0.33426202],
       [-2.54453955],
       [ 2.21717679],
       [-1.58873545]])

In [70]:
# SGD_optimizer run with batches of 64, regularization='l2' at lmb=1e-3 (the
# same value passed to ridge.fit_beta), momentum enabled and
# schedule='constant' at lr0=0.13.
# Setting batch = X_train.shape[0] instead would make a single batch as large
# as the whole training set.
epochs = 20000
batch = 64

sgd_rid2 = SGD_optimizer(
    fit_intercept=False,
    batch_size=batch,
    n_epochs=epochs,
    use_momentum=True,
    regularization='l2',
    lmb=0.001,
    schedule='constant',
    lr0=0.13,
)
sgd_rid2.fit(X_train, z_train)
# Show the fitted coefficients as the cell output.
sgd_rid2.beta

array([[ 0.        ],
       [ 2.92053121],
       [ 1.63370313],
       [-6.25706545],
       [-6.89461402],
       [-3.23010034],
       [ 6.918491  ],
       [ 9.22419856],
       [ 2.2494836 ],
       [ 4.50321688],
       [-3.47217256],
       [-1.76788945],
       [-3.8070113 ],
       [ 1.95740532],
       [-2.9836715 ]])

In [71]:
# Reference fit with scikit-learn's SGDRegressor (L2 penalty, alpha=0.001,
# no intercept, fixed random_state, no tolerance-based early stop).
from sklearn.linear_model import SGDRegressor

sgdreg = SGDRegressor(
    penalty='l2',
    alpha=0.001,
    eta0=1,
    max_iter=100000,
    tol=None,
    fit_intercept=False,
    random_state=10,
)
# The first column of X_train is left out of this fit, and the target is
# flattened to 1-D as scikit-learn expects.
sgdreg.fit(X_train[:, 1:], z_train.ravel())
print(sgdreg.coef_)
print(sgdreg.intercept_)

[ 0.24269728  0.10307935  0.74812672 -1.32547564  0.12803054  0.30178118
  1.4714907  -0.29739704  0.1371599  -1.15357854  0.51335772 -0.73360958
  1.1908755  -0.67113991]
[0.]


In [73]:
# SGD_optimizer run with batches of 64, regularization='l2' at lmb=1e-3,
# momentum enabled, and schedule='invscaling' starting from lr0=0.2.
# Setting batch = X_train.shape[0] instead would make a single batch as large
# as the whole training set.
epochs = 20000
batch = 64

sgd_rid2 = SGD_optimizer(
    fit_intercept=False,
    batch_size=batch,
    n_epochs=epochs,
    use_momentum=True,
    regularization='l2',
    lmb=0.001,
    schedule='invscaling',
    lr0=0.2,
)
sgd_rid2.fit(X_train, z_train)
# Show the fitted coefficients as the cell output.
sgd_rid2.beta

array([[ 0.        ],
       [ 2.35451709],
       [ 1.2772085 ],
       [-4.40563112],
       [-6.05057245],
       [-2.21319381],
       [ 4.78984292],
       [ 8.14706156],
       [ 1.6339035 ],
       [ 3.24611884],
       [-2.66370341],
       [-1.39736773],
       [-3.50634364],
       [ 2.10726707],
       [-2.47401644]])

In [74]:
# SGD_optimizer run with batches of 64, regularization='l2' at lmb=1e-3,
# momentum enabled and schedule='constant' at lr0=0.13.
# Setting batch = X_train.shape[0] instead would make a single batch as large
# as the whole training set.
epochs = 20000
batch = 64

sgd_rid2 = SGD_optimizer(
    fit_intercept=False,
    batch_size=batch,
    n_epochs=epochs,
    use_momentum=True,
    regularization='l2',
    lmb=0.001,
    schedule='constant',
    lr0=0.13,
)
sgd_rid2.fit(X_train, z_train)
# Show the fitted coefficients as the cell output.
sgd_rid2.beta

array([[ 0.        ],
       [ 2.37051145],
       [ 1.2755062 ],
       [-4.39729127],
       [-6.02393938],
       [-2.1984276 ],
       [ 4.7951057 ],
       [ 8.16525274],
       [ 1.60700479],
       [ 3.26724819],
       [-2.67555531],
       [-1.37498721],
       [-3.51123549],
       [ 2.12632762],
       [-2.50153532]])

In [75]:
def func(a):
    """Rebind the local name ``a`` to ``a + 1`` and return nothing.

    The assignment only affects the parameter name inside this function;
    the computed value is discarded when the function returns.
    """
    # Deliberately a plain assignment (not ``a += 1``): it creates a new
    # value and rebinds the local name instead of updating ``a`` in place.
    a = a + 1
    return None

In [76]:
# Check whether func's reassignment of its parameter is visible to the
# caller: c is passed in, then displayed afterwards (the cell shows 1, i.e.
# c is unchanged).
c = 1
func(c)
c

1

In [78]:
# Scratch check: a parenthesized, comma-separated literal is a tuple.
type((1,2,3))

tuple

In [80]:
# Scratch check: (1,) with a trailing comma is a tuple of length one.
len((1,))

1

In [5]:
a = [1, 2, 3, 4]

# Walk backwards from the second-to-last element down to the first,
# i.e. every element except the last one, in reverse order.
for num in a[-2::-1]:
    print(num)

3
2
1
