## Coding problem: 

Implement a gradient descent method for Ridge Regression by using the PyTorch library. Your implementation should be a class that has the required methods “.fit” and “.predict”. You should include an application of your code to a data set.

In [183]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [213]:
#Code from class Intro_To_Locally_Weighted_Regression
class CustomMinMaxScaler:
    def __init__(self):
        self.min = None
        self.max = None

    def fit(self, data):
        """
        Compute the minimum and maximum value of the data for scaling.

        Args:
        - data (torch.Tensor): Input data tensor.
        """
        self.min = torch.min(data, dim=0, keepdim=True).values
        self.max = torch.max(data, dim=0, keepdim=True).values

    def transform(self, data):
        """
        Scale the data based on the computed minimum and maximum values.

        Args:
        - data (torch.Tensor): Input data tensor.

        Returns:
        - torch.Tensor: Scaled data tensor.
        """
        if self.min is None or self.max is None:
            raise ValueError("Scaler has not been fitted yet. Please call 'fit' with appropriate data.")
        
        #I discovered that sometimes in this step you might get self.max-self.min = 0, which leads to a division by 0
        #error, leading to NaNs in the output tensor. For instance, if one column of the tensor has a max value of 8
        #and a min value of 8, 8-8 = 0 (this happened in the cylinders column in the cars dataset) which leads to 
        #division by zero. I have added an extra step which replaces any NaNs in the scaled_data with 1.
        scaled_data = (data - self.min) / (self.max - self.min)
        scaled_data = torch.where(torch.isnan(scaled_data), torch.tensor(1.0), scaled_data)
        return scaled_data

    def fit_transform(self, data):
        """
        Fit to data, then transform it.

        Args:
        - data (torch.Tensor): Input data tensor.

        Returns:
        - torch.Tensor: Scaled data tensor.
        """
        self.fit(data)
        return self.transform(data)

#Getting gradient explosion with MSE increasing for some reason. Ask prof for advice.
class CustomRidgeModel:
    
    def __init__(self, alpha=0.00001, max_iter=10000, lr=0.02):
        self.alpha = alpha
        self.max_iter = max_iter
        self.lr = lr
        
    #User specifies if they would like to scale data here, if they would like to add an intercept column.
    def scale(self,X,y,scale=True,intercept=True):
        #X is of shape (n,p) where n is number of samples and p is number of features
        #y is of shape (n,1)
        self.X = torch.tensor(X)
        
        
        #Scaling
        if scale == True:
            scaler = CustomMinMaxScaler()
            self.X = scaler.fit_transform(self.X)
        self.y = torch.tensor(y)
        self.n = self.X.shape[0]
        
        #Add intercept
        if intercept:
            ones_column = torch.ones(self.n, 1)
            self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)
        self.p = self.X.shape[1]
        self.w=torch.tensor(np.ones((1,self.p))*0)
        
    #Fit the data: perform gradient descent until weights are optimized
    def fit(self,X,y,tolerance = 1e-6,scale=True,intercept=True):
        self.scale(X,y,scale,intercept)
        w = self.w
        for i in range(self.max_iter):
            X = self.X
            y = self.y
            yPred = X@w.flatten()
            error = y-yPred
            opt = (1/self.n)*torch.sum(error**2)+self.alpha*torch.sum(w**2)
            grad = -(2.0/self.n) * error@X + self.alpha*2*w
            mse = (1.0/self.n) * torch.sum(error**2)
            newW = w - self.lr*grad
            if torch.sum(abs(newW - w)) < tolerance:
                print("Gradient Descent has converged")
                break
            w = newW
            if i % 100 == 0:
                print("At iteration %s, the gradient is %s \n the MSE is %s, and the loss function is %s" % (str(i),str(grad.tolist()), mse.item(),opt.item()))
        self.w = newW
    #Predict new value of y for a row
    def predict(self,new_x,intercept=True,scale=True):
        w = self.w
        if scale:
            scaler = CustomMinMaxScaler()
            new_x = scaler.fit_transform(new_x)
        
        if intercept:
            ones_column = torch.ones(new_x.shape[0],1)
            new_x = torch.cat((ones_column, torch.tensor(new_x)), dim=1)            
        yPred = new_x@w.flatten()
        return yPred

model = CustomRidgeModel()
data = pd.read_csv('../content/01intro/cars.csv')
y = data['MPG'].values
X = data.drop(['MPG'],axis=1).values
#This will print the gradient descent at 100-iteration intervals
model.fit(X,y)
#This will print expected return mpg for a given set of inputs. This input can be of arbitrary length n as long as
#it has the same number of features as the data the model was fitted with.
new_X = torch.tensor(X)[0:25]
model.predict(new_X)
#We could use this with k-fold for further optimization of model hyperparameters.

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 0, the gradient is [[-46.891836719387754, -19.052142880612237, -11.927325599707315, -15.02158424758712]] 
 the MSE is 610.4738251295921, and the loss function is 610.4738251295921
At iteration 100, the gradient is [[-3.2701493142622247, 2.4959221730476187, 2.2975709819214107, 1.8475508740965552]] 
 the MSE is 63.148331680965676, and the loss function is 63.153079446202824
At iteration 200, the gradient is [[-1.7679976402980992, 1.330005324460747, 1.2709145049220247, 1.1177791322376072]] 
 the MSE is 33.17090350017052, and the loss function is 33.17836665652273
At iteration 300, the gradient is [[-0.9750708985105735, 0.6735153218173656, 0.6945420057932299, 0.7003622420653045]] 
 the MSE is 24.05559335849761, and the loss function is 24.06504525245937
At iteration 400, the gradient is [[-0.5397116029800763, 0.31502192677335267, 0.37788260542397367, 0.4686398536413705]] 
 the MSE is 21.232529732328388, and the loss function is 21.243232037601146
At iteration 500, the gradient

At iteration 6000, the gradient is [[-0.0028516283922191796, -0.01573247356268585, -0.013766407739827619, 0.03905956359602634]] 
 the MSE is 18.36003943777146, and the loss function is 18.37441603406192
At iteration 6100, the gradient is [[-0.002794996239477423, -0.015083515876452713, -0.013692783185567556, 0.03802266312087491]] 
 the MSE is 18.356176146796216, and the loss function is 18.370577180335875
At iteration 6200, the gradient is [[-0.0027397266531762453, -0.014456945884660045, -0.013616908821848064, 0.037015968024157776]] 
 the MSE is 18.35251206813563, and the loss function is 18.366936990252594
At iteration 6300, the gradient is [[-0.0026857820618843777, -0.013852033720446622, -0.013538913497643955, 0.03603855806522732]] 
 the MSE is 18.3490358672296, and the loss function is 18.363484138785402
At iteration 6400, the gradient is [[-0.0026331260096638933, -0.013268072819438774, -0.013458921267559374, 0.03508954159326688]] 
 the MSE is 18.345736886548213, and the loss functio

  new_x = torch.cat((ones_column, torch.tensor(new_x)), dim=1)


tensor([16.4219, 14.2659, 16.7077, 16.9884, 16.9103,  8.1404,  7.5841,  8.1455,
         7.0547, 12.3957, 14.5907, 15.0555, 12.8511, 16.6907, 29.1498, 23.7636,
        24.1697, 25.4969, 31.1875, 33.3104, 27.0464, 28.8434, 29.2948, 29.9948,
        25.0764], dtype=torch.float64)

# Complete the exercise provided in the Application to Locally Weighted Regression notebook and test the method on a data set, for example, the one provided in class.

Adjust the code below and make it work without errors. Compare the results with the previous ones.

In [179]:
# Gaussian Kernel
def Gaussian(x):
  return np.where(np.abs(x)>4,0,1/(np.sqrt(2*np.pi))*np.exp(-1/2*x**2))
# this is the correct vectorized version
def tricubic(x):
  return np.where(np.abs(x)>1,0,(1-np.abs(x)**3)**3)
# Epanechnikov Kernel
def Epanechnikov(x):
  return np.where(np.abs(x)>1,0,3/4*(1-np.abs(x)**2))
# Quartic Kernel
def Quartic(x):
  return np.where(np.abs(x)>1,0,15/16*(1-np.abs(x)**2)**2)

def weight_function(u,v,kern=Gaussian,tau=0.5):
    return kern(dist(u,v)/(2*tau))

def dist(u,v):
  D = []
  if len(v.shape)==1:
    v = v.reshape(1,-1)
  # we would like all the pairwise combinations if u and v are matrices
  # we could avoid two for loops if we consider broadcasting
  for rowj in v:
    D.append(np.sqrt(np.sum((u-rowj)**2,axis=1)))
  return np.array(D).T

def kernel_function(xi,x0,kern, tau):
    return kern((xi - x0)/(2*tau))

In [188]:
#This code was copied and adjusted from Efficeint_Applications_with_Distances_and_Weights
class Lowess:
    def __init__(self, kernel = Gaussian, tau=0.05):
        self.kernel = kernel
        self.tau = tau

    def fit(self, x, y):
        kernel = self.kernel
        tau = self.tau
        self.xtrain_ = x
        self.yhat_ = y

    def predict(self, x_new):
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_
        lm = LinearRegression()
        w = weight_function(x,x_new,self.kernel,self.tau)

        if np.isscalar(x_new):
            lm.fit(np.diag(w)@(x.reshape(-1,1)),np.diag(w)@(y.reshape(-1,1)))
            yest = lm.predict([[x_new]])[0][0]
        else:
          n = len(x_new)
          yest_test = np.zeros(n)
          #Looping through all x-points
          for i in range(n):
            lm.fit(np.diag(w[:,i])@x,np.diag(w[:,i])@y)
            yest_test[i] = lm.predict(x_new[i].reshape(1,-1))
        return yest_test

#We'll use the same X and y as last time but we'll pre-scale them this time.
scaler = MinMaxScaler()
data = pd.read_csv('../content/01intro/cars.csv')
y = data['MPG']
X = data.drop(['MPG'],axis=1)
Xscaled = scaler.fit_transform(torch.tensor(X.values))
model = Lowess()
model.fit(Xscaled,y)
#This returns a tensor with y values for each x in the input
yEst = model.predict(Xscaled)
yEst

array([17.12717127, 16.35627513, 16.6662146 , 17.26997367, 17.40461308,
       14.37842598, 14.09099113, 14.42087629, 13.74169599, 15.41415626,
       15.16359016, 16.47623847, 15.06659002, 13.99996086, 28.04194919,
       21.77241641, 21.78087761, 21.77325133, 31.23932603, 32.71997936,
       25.72652373, 28.06934906, 28.69713971, 28.32976296, 21.86163991,
       12.99761072, 13.59975579, 13.6746693 , 11.25667231, 31.23932603,
       27.08072068, 28.99070598, 19.80860843, 19.12274268, 18.50875233,
       18.67196718, 19.29718566, 14.65380186, 13.96137964, 14.89407973,
       14.92000035, 12.089635  , 12.86433643, 11.97745926, 21.76825052,
       26.68687283, 18.79800082, 19.78969263, 28.34130208, 29.42987767,
       33.30931607, 32.58983282, 34.0829047 , 34.18797713, 32.7226503 ,
       32.88433991, 28.6500694 , 31.21803773, 30.26129802, 26.68687283,
       28.30553224, 14.38580341, 14.31411677, 14.75629636, 15.00197065,
       16.80305546, 13.04183511, 13.4168358 , 13.62232151, 14.14

In [190]:
#Let's see the MSE
from sklearn.metrics import mean_squared_error as mse
mse(yEst,y)

13.791404587907422

In [214]:
#Compared to the ridge-regression class we wrote earlier, we see an improvement!
#Here, we use k-fold validation to compare how this class does compared to the custom ridge regression we wrote
from sklearn.model_selection import KFold
mse_lwr = []
mse_rf = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)
model_rf = CustomRidgeModel()
model_lw = Lowess(kernel= Epanechnikov,tau=0.14)

data = pd.read_csv('../content/01intro/cars.csv')
y = data['MPG'].values
x = data.drop(['MPG'],axis=1).values
scale = MinMaxScaler()

for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = x[idxtest]
  xtrain = scale.fit_transform(xtrain)
  xtest = scale.transform(xtest)

  model_lw.fit(xtrain,ytrain)
  yhat_lw = model_lw.predict(xtest)

  model_rf.fit(xtrain,ytrain)
  yhat_rf = model_rf.predict(xtest,scale=False)

  mse_lwr.append(mse(ytest,yhat_lw))
  mse_rf.append(mse(ytest,yhat_rf))
print('The Cross-validated Mean Squared Error for Locally Weighted Regression is : '+str(np.mean(mse_lwr)))
print('The Cross-validated Mean Squared Error for Ridge-Regression is : '+str(np.mean(mse_rf)))
#As we can see, Lowess beats ridge!

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 0, the gradient is [[-46.83579545454547, -19.079886394318176, -11.986209499853182, -15.802950244726192]] 
 the MSE is 609.1130955517049, and the loss function is 609.1130955517049
At iteration 100, the gradient is [[-3.314563973745384, 2.4989227051603233, 2.2478344983135545, 1.8892354701013572]] 
 the MSE is 63.42874159310418, and the loss function is 63.43345613439054
At iteration 200, the gradient is [[-1.7955507710815957, 1.3291254213002517, 1.2395644600071472, 1.1406659768800644]] 
 the MSE is 33.17812726312421, and the loss function is 33.18558148829518
At iteration 300, the gradient is [[-0.9907510229514866, 0.6723728469878286, 0.6745265550141913, 0.7134507148775177]] 
 the MSE is 23.983167570653166, and the loss function is 23.992628425834184
At iteration 400, the gradient is [[-0.5489485490723223, 0.31398812591421177, 0.3641907785451106, 0.47624660100195426]] 
 the MSE is 21.13600582789536, and the loss function is 21.146728074285953
At iteration 500, the gradient 

At iteration 6100, the gradient is [[-0.0030735431812554765, -0.011911924088155551, -0.017149315313882897, 0.0358695029399056]] 
 the MSE is 18.293203724953244, and the loss function is 18.307564576211917
At iteration 6200, the gradient is [[-0.003012078726801113, -0.0113169925151521, -0.017041294835232076, 0.03490632847412171]] 
 the MSE is 18.289804004363322, and the loss function is 18.30418694535873
At iteration 6300, the gradient is [[-0.0029521944844204603, -0.010744230551895446, -0.016931524166438915, 0.033972654554722065]] 
 the MSE is 18.28657185142196, and the loss function is 18.30097637433661
At iteration 6400, the gradient is [[-0.0028938430627959027, -0.010192870276303896, -0.0168201236604614, 0.03306752542476743]] 
 the MSE is 18.28349752608311, and the loss function is 18.297923133647355
At iteration 6500, the gradient is [[-0.0028369785932033124, -0.009662169763925994, -0.016707208931197263, 0.03219001694872677]] 
 the MSE is 18.2805718996353, and the loss function is 

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 1600, the gradient is [[-0.00819287140436872, -0.07780165441916716, -0.011338618426427872, 0.1292452320652306]] 
 the MSE is 18.136229917048787, and the loss function is 18.148705386277257
At iteration 1700, the gradient is [[-0.007815173911094278, -0.07514446011569988, -0.011712425692960333, 0.12528485925594998]] 
 the MSE is 18.091704635785046, and the loss function is 18.104219320331577
At iteration 1800, the gradient is [[-0.007525210818653482, -0.07251331009291993, -0.012010603575139398, 0.12148805201290026]] 
 the MSE is 18.04991327547739, and the loss function is 18.062467393263425
At iteration 1900, the gradient is [[-0.007285872525606117, -0.0699359815802675, -0.012260722892879715, 0.1178320002998047]] 
 the MSE is 18.010683603197272, and the loss function is 18.023277373358674
At iteration 2000, the gradient is [[-0.0070766663597361685, -0.06742645734210205, -0.012478208199175089, 0.11430252621974768]] 
 the MSE is 17.973854518879847, and the loss function is 17.

At iteration 7200, the gradient is [[-0.0021949892532271997, -0.007754042483182001, -0.012187078985522019, 0.025753263527406024]] 
 the MSE is 17.413490172567624, and the loss function is 17.427627290093806
At iteration 7300, the gradient is [[-0.002151978659656058, -0.007369821038733519, -0.012093234357499594, 0.025075788379856346]] 
 the MSE is 17.411764005709696, and the loss function is 17.42591680265111
At iteration 7400, the gradient is [[-0.0021100057680343513, -0.006999766194823714, -0.01199859805582814, 0.02441830205589773]] 
 the MSE is 17.41012123509301, and the loss function is 17.42428933225251
At iteration 7500, the gradient is [[-0.0020690414332615913, -0.006643396230035326, -0.011903244634327527, 0.023780180762255958]] 
 the MSE is 17.40855720962829, and the loss function is 17.422740236660363
At iteration 7600, the gradient is [[-0.002029057399088383, -0.00630024534835692, -0.011807245603433896, 0.023160820648833996]] 
 the MSE is 17.40706755833784, and the loss functi

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 2100, the gradient is [[-0.0068091311913626865, -0.06500329000225959, -0.017181947305604767, 0.11562008988909508]] 
 the MSE is 18.251746986315926, and the loss function is 18.264757960661306
At iteration 2200, the gradient is [[-0.006652612168041782, -0.06266796013618899, -0.01733770790304888, 0.11235411563909053]] 
 the MSE is 18.216879829471857, and the loss function is 18.229932952399295
At iteration 2300, the gradient is [[-0.0065037486719800525, -0.06040715899604209, -0.017477070337584687, 0.10918807320423508]] 
 the MSE is 18.18402846856826, and the loss function is 18.197123805419462
At iteration 2400, the gradient is [[-0.0063606938819581, -0.058219808490881896, -0.01760183575717855, 0.10611804917430726]] 
 the MSE is 18.153071491970923, and the loss function is 18.166209036276353
At iteration 2500, the gradient is [[-0.006222391407512631, -0.056104265909881434, -0.01771321726410891, 0.10314062566005638]] 
 the MSE is 18.12389493640066, and the loss function is 18

At iteration 8000, the gradient is [[-0.0021408719358279814, -0.00491422124912401, -0.014263155912950628, 0.023960827203324625]] 
 the MSE is 17.64188666709535, and the loss function is 17.656655006110924
At iteration 8100, the gradient is [[-0.0021041965322065996, -0.0046240541412824075, -0.014135675178776874, 0.023385293052500863]] 
 the MSE is 17.64029235478454, and the loss function is 17.655076335307932
At iteration 8200, the gradient is [[-0.00206828163689191, -0.004344782947302384, -0.014008069865873463, 0.02282560295966208]] 
 the MSE is 17.638764276678547, and the loss function is 17.653563559439093
At iteration 8300, the gradient is [[-0.0020331083479946366, -0.004076041384504985, -0.013880397236561634, 0.022281286627039248]] 
 the MSE is 17.6372991529446, and the loss function is 17.65211340629101
At iteration 8400, the gradient is [[-0.001998658294742256, -0.00381747502018964, -0.01375271205998952, 0.021751888219594336]] 
 the MSE is 17.63589388620427, and the loss function

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 2600, the gradient is [[-0.004961886109674414, -0.061827545588348944, -0.008514760881504173, 0.09954598641858058]] 
 the MSE is 18.888332066750817, and the loss function is 18.90154868894217
At iteration 2700, the gradient is [[-0.004849690532793719, -0.05975484561924628, -0.008703123734883226, 0.09674934184328608]] 
 the MSE is 18.861433233753733, and the loss function is 18.874693294113776
At iteration 2800, the gradient is [[-0.00474064237299635, -0.057747498495819616, -0.008880038800591905, 0.09403513782390038]] 
 the MSE is 18.83608487121676, and the loss function is 18.84938811215564
At iteration 2900, the gradient is [[-0.004634554965188606, -0.0558035639646668, -0.009045994433145127, 0.09140084493504196]] 
 the MSE is 18.812195462857392, and the loss function is 18.825541576471455
At iteration 3000, the gradient is [[-0.004531290523241967, -0.053921124799999116, -0.009201431705129087, 0.08884403365209366]] 
 the MSE is 18.78967894944183, and the loss function is 18

At iteration 8100, the gradient is [[-0.0016069288120297755, -0.007924354910995298, -0.00950438915605779, 0.022378075525070616]] 
 the MSE is 18.429244787653925, and the loss function is 18.44405877894303
At iteration 8200, the gradient is [[-0.0015778803127179243, -0.007588211086537314, -0.009440086271797184, 0.021816287496258083]] 
 the MSE is 18.427948422768637, and the loss function is 18.442777285645345
At iteration 8300, the gradient is [[-0.0015494705021852433, -0.007263489068386841, -0.009374950562869874, 0.021270137802891543]] 
 the MSE is 18.426713429498342, and the loss function is 18.441556805939697
At iteration 8400, the gradient is [[-0.0015216829640193686, -0.006949826021713181, -0.009309042035822872, 0.020739168026962972]] 
 the MSE is 18.425536527630534, and the loss function is 18.440394067939547
At iteration 8500, the gradient is [[-0.001494501744662744, -0.006646870356955601, -0.009242418438496212, 0.020222933501826603]] 
 the MSE is 18.424414623940184, and the loss

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 3500, the gradient is [[-0.005372417507699861, -0.04537131437765403, -0.013841180351693814, 0.08335210205057479]] 
 the MSE is 19.69917178402542, and the loss function is 19.712967040118354
At iteration 3600, the gradient is [[-0.0052451736696548295, -0.043787364471693714, -0.013837291075358263, 0.08100260080897158]] 
 the MSE is 19.681204951230928, and the loss function is 19.69504545208606
At iteration 3700, the gradient is [[-0.005121356443590085, -0.04225440597869512, -0.013828198275107556, 0.07872250784456991]] 
 the MSE is 19.664265058420384, and the loss function is 19.67815015420966
At iteration 3800, the gradient is [[-0.005000866488712663, -0.04077084546484875, -0.01381412905201926, 0.07650972876609641]] 
 the MSE is 19.648291577421173, and the loss function is 19.66222059947013
At iteration 3900, the gradient is [[-0.00488360758099668, -0.03933513859660262, -0.013795302586036905, 0.0743622330271208]] 
 the MSE is 19.63322759633912, and the loss function is 19.64

At iteration 9300, the gradient is [[-0.0015345427026487177, -0.004091412766883586, -0.009593232956288316, 0.017273017157980634]] 
 the MSE is 19.385809199513215, and the loss function is 19.40116400095464
At iteration 9400, the gradient is [[-0.001505374486301234, -0.003873625936773387, -0.0094999319171014, 0.01684151316473115]] 
 the MSE is 19.384996189741127, and the loss function is 19.400363433325754
At iteration 9500, the gradient is [[-0.0014768740000733927, -0.003663683493641073, -0.00940694875000177, 0.01642199398919219]] 
 the MSE is 19.38421918497261, and the loss function is 19.3995985667366
At iteration 9600, the gradient is [[-0.0014490236446043443, -0.003461330221595671, -0.009314305896272669, 0.016014108035629807]] 
 the MSE is 19.383476340426988, and the loss function is 19.398867564014587
At iteration 9700, the gradient is [[-0.001421806323653586, -0.0032663189137050884, -0.00922202470191545, 0.015617514284604787]] 
 the MSE is 19.382765914376975, and the loss functio

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 4100, the gradient is [[-0.004871943151993253, -0.030779817362320317, -0.022832530970891204, 0.07105154170025732]] 
 the MSE is 18.402613476021088, and the loss function is 18.41656145284752
At iteration 4200, the gradient is [[-0.004771636606598123, -0.029534280017319302, -0.02269919478070732, 0.06909356980488954]] 
 the MSE is 18.389849485302825, and the loss function is 18.403838282499844
At iteration 4300, the gradient is [[-0.004673822833178106, -0.0283317823703412, -0.022562519644855186, 0.06719403710671093]] 
 the MSE is 18.377778211200223, and the loss function is 18.391807135432323
At iteration 4400, the gradient is [[-0.004578431426251129, -0.02717091118481752, -0.022422708738950057, 0.06535113320820746]] 
 the MSE is 18.366358974258155, and the loss function is 18.380427328402146
At iteration 4500, the gradient is [[-0.004485394111297035, -0.02605029875427144, -0.02227995753053763, 0.06356310470039231]] 
 the MSE is 18.355553574145247, and the loss function is 1

At iteration 9800, the gradient is [[-0.0016970198818001912, -0.0003730581950885878, -0.013648957096888131, 0.016374627731882054]] 
 the MSE is 18.16169622696302, and the loss function is 18.177035904915567
At iteration 9900, the gradient is [[-0.0016693126214652478, -0.0002334377078276261, -0.013499606360033932, 0.015999219912532146]] 
 the MSE is 18.160785499293354, and the loss function is 18.176137448296693
At iteration 0, the gradient is [[-47.443059450424904, -18.898130333144493, -11.752400630286004, -14.984866586558908]] 
 the MSE is 623.3149268538247, and the loss function is 623.3149268538247
At iteration 100, the gradient is [[-3.2309163941772727, 2.490033784690403, 2.3148959645774374, 1.8770188874565084]] 
 the MSE is 63.158119642781344, and the loss function is 63.16298411000525
At iteration 200, the gradient is [[-1.7440598309365716, 1.3252341327841342, 1.2873069747276862, 1.1377550314207312]] 
 the MSE is 33.26401153020861, and the loss function is 33.271582831825285
At i

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 3900, the gradient is [[-0.003969539685490518, -0.03973059980513157, -0.009488787279835726, 0.06868037756476529]] 
 the MSE is 18.518621611686964, and the loss function is 18.53251887735498
At iteration 4000, the gradient is [[-0.0038797738067970506, -0.03829222561564999, -0.009587378218568502, 0.06670403024913664]] 
 the MSE is 18.50616127681258, and the loss function is 18.520096977527277
At iteration 4100, the gradient is [[-0.003792418966034849, -0.036901882168710355, -0.009677869750576131, 0.06478813163633917]] 
 the MSE is 18.494435527848825, and the loss function is 18.50840895969606
At iteration 4200, the gradient is [[-0.003707403738475379, -0.035558009590070126, -0.009760581616355011, 0.06293078388508523]] 
 the MSE is 18.483399386940775, and the loss function is 18.49740984083008
At iteration 4300, the gradient is [[-0.0036246589274119673, -0.03425909813412943, -0.009835822579712305, 0.06113014937626597]] 
 the MSE is 18.473010673966808, and the loss function is

At iteration 8500, the gradient is [[-0.0015346683457220667, -0.005984723906964296, -0.009106518052937993, 0.019203261225244884]] 
 the MSE is 18.314349015773747, and the loss function is 18.329362892379713
At iteration 8600, the gradient is [[-0.0015065803158416952, -0.005697853709558634, -0.009042706200813107, 0.01871278622485735]] 
 the MSE is 18.31337971916231, and the loss function is 18.328406454634685
At iteration 8700, the gradient is [[-0.001479131509569517, -0.005421326140746125, -0.00897813431903523, 0.01823646317364985]] 
 the MSE is 18.312455854979966, and the loss function is 18.327495122231316
At iteration 8800, the gradient is [[-0.0014523046879518094, -0.005154796302432658, -0.008912861169646069, 0.017773858699065174]] 
 the MSE is 18.31157494721861, and the loss function is 18.326626427604793
At iteration 8900, the gradient is [[-0.0014260831234152633, -0.004897930492319582, -0.008846943212719383, 0.017324553041243553]] 
 the MSE is 18.310734666156957, and the loss fu

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 3700, the gradient is [[-0.004372116168467608, -0.0376462664866816, -0.01222022068794148, 0.06981515937846318]] 
 the MSE is 19.06305098362521, and the loss function is 19.07651612917295
At iteration 3800, the gradient is [[-0.004275530187448024, -0.03628488287243459, -0.012268569132247576, 0.06786416788833896]] 
 the MSE is 19.050467429656827, and the loss function is 19.063967282510546
At iteration 3900, the gradient is [[-0.004181434355453002, -0.034968612656833244, -0.01230965108305808, 0.06597120046498285]] 
 the MSE is 19.038600137005933, and the loss function is 19.05213414781238
At iteration 4000, the gradient is [[-0.004089757627523803, -0.033696008534748585, -0.012343771170700926, 0.06413447899403193]] 
 the MSE is 19.02740647957901, and the loss function is 19.04097408927916
At iteration 4100, the gradient is [[-0.0040004311430392285, -0.032465668993039806, -0.012371223410631101, 0.06235228061104559]] 
 the MSE is 19.01684642127507, and the loss function is 19.0

At iteration 8500, the gradient is [[-0.0016410263209213093, -0.005067255714315885, -0.01003246516495413, 0.019212359397039226]] 
 the MSE is 18.84826872136734, and the loss function is 18.862807153450053
At iteration 8600, the gradient is [[-0.001610867862017001, -0.004813567751251124, -0.009944570279639634, 0.0187354039751221]] 
 the MSE is 18.84728290738855, and the loss function is 18.861833431138834
At iteration 8700, the gradient is [[-0.0015813716110615575, -0.004569100364826781, -0.009856412012114091, 0.018271737486013278]] 
 the MSE is 18.846341128398617, and the loss function is 18.8609034557702
At iteration 8800, the gradient is [[-0.001552520573138985, -0.004333547464872452, -0.009768038855383618, 0.017820965299513773]] 
 the MSE is 18.845441109106993, and the loss function is 18.8600149590232
At iteration 8900, the gradient is [[-0.0015242982362658262, -0.004106612788530323, -0.009679497245311813, 0.01738270486987761]] 
 the MSE is 18.844580702988797, and the loss function

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 3900, the gradient is [[-0.004956936341588017, -0.038339598756025536, -0.017555118632304134, 0.0766249818835513]] 
 the MSE is 18.833681341301485, and the loss function is 18.847563923704115
At iteration 4000, the gradient is [[-0.004855212326892633, -0.03692925280975846, -0.017538476997817732, 0.07453962048386417]] 
 the MSE is 18.81871421948176, and the loss function is 18.832640879017596
At iteration 4100, the gradient is [[-0.004755947366844319, -0.03556564421742135, -0.017515269525316957, 0.07251506974606989]] 
 the MSE is 18.80456897869254, and the loss function is 18.818539016268684
At iteration 4200, the gradient is [[-0.004659074455206667, -0.034247282358441974, -0.017485795024688434, 0.07054949433367087]] 
 the MSE is 18.79119785257739, and the loss function is 18.805210559921168
At iteration 4300, the gradient is [[-0.0045645285542364, -0.032972723412356394, -0.017450341745896422, 0.06864111519467805]] 
 the MSE is 18.77855592880511, and the loss function is 18.

At iteration 9500, the gradient is [[-0.0017309159228364834, -0.002830972780029989, -0.011947502523412614, 0.0180733286618455]] 
 the MSE is 18.55985447421408, and the loss function is 18.575234026989076
At iteration 9600, the gradient is [[-0.0017016836372604192, -0.002639761880983685, -0.011824750788427861, 0.017650148116370543]] 
 the MSE is 18.558899577798297, and the loss function is 18.57429247042269
At iteration 9700, the gradient is [[-0.0016730304601678893, -0.002455872540435991, -0.011702483647669118, 0.01723825750476036]] 
 the MSE is 18.557982437758213, and the loss function is 18.573388372661984
At iteration 9800, the gradient is [[-0.0016449428141804795, -0.0022790588743033186, -0.011580730643060227, 0.01683733107543329]] 
 the MSE is 18.557101292228477, and the loss function is 18.572519978964856
At iteration 9900, the gradient is [[-0.001617407484375481, -0.002109082881568362, -0.011459519801634451, 0.016447052858797596]] 
 the MSE is 18.556254472520873, and the loss fu

  self.X = torch.cat((ones_column, torch.tensor(self.X)), dim=1)


At iteration 3700, the gradient is [[-0.002953019003376409, -0.037844948347150686, -0.008123759349421665, 0.06395345602234681]] 
 the MSE is 18.033691016482937, and the loss function is 18.047029071145403
At iteration 3800, the gradient is [[-0.002889246191199088, -0.03648934025630864, -0.008214134423015133, 0.062100074590628905]] 
 the MSE is 18.022801323018214, and the loss function is 18.036170216553963
At iteration 3900, the gradient is [[-0.002827111058182099, -0.03517897102650909, -0.00829703786038292, 0.06030343332896713]] 
 the MSE is 18.012558149414872, and the loss function is 18.025957347842894
At iteration 4000, the gradient is [[-0.0027665664138110235, -0.03391236980949264, -0.00837276853950698, 0.058561751387322114]] 
 the MSE is 18.002921833659887, and the loss function is 18.016350795683195
At iteration 4100, the gradient is [[-0.002707566550801889, -0.0326881130597272, -0.008441614969274334, 0.05687330442856046]] 
 the MSE is 17.99385518218084, and the loss function is

At iteration 7800, the gradient is [[-0.0012963537111144055, -0.007525610650710688, -0.008053763470299726, 0.020102246973347893]] 
 the MSE is 17.86159036374348, and the loss function is 17.875775594047955
At iteration 7900, the gradient is [[-0.00127275167239436, -0.007200011933116821, -0.00800025156400548, 0.01957081297485951]] 
 the MSE is 17.86055086970081, and the loss function is 17.874747961691728
At iteration 8000, the gradient is [[-0.0012496705131687263, -0.006885933624998703, -0.0079457802472019, 0.019054910111045913]] 
 the MSE is 17.85956374773218, and the loss function is 17.873772397237737
At iteration 8100, the gradient is [[-0.0012270966424556625, -0.0065829929354813445, -0.00789041513976498, 0.018554061027774623]] 
 the MSE is 17.858626042247337, and the loss function is 17.872845952612096
At iteration 8200, the gradient is [[-0.0012050168635617444, -0.00629081949909102, -0.007834219306567797, 0.01806780337845452]] 
 the MSE is 17.857734974715694, and the loss functio