<a href="https://colab.research.google.com/github/mounikarevanuru/mlfoundations/blob/main/algorithms/linear_regression/linear_regression_stochastic_gradient_descent_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

In [48]:
class LinearRegressionBatchGD:
  """Linear regression trained by full-batch gradient descent on the mean squared error.

  Each epoch computes the gradient over the whole training set and takes one
  step of size ``lr`` against it. Weights start at one and the intercept at zero.

  Attributes:
    lr: Step size applied to every gradient update.
    epochs: Number of gradient steps performed by ``fit``.
    coef_: 1-D array with one weight per feature; ``None`` until ``fit`` runs.
    intercept_: Bias term; ``None`` until ``fit`` runs.
    loss_: RMSE on the data passed to ``fit``, measured after the last epoch;
      ``None`` until ``fit`` runs.
  """

  def __init__(self, lr=0.01, epochs=1000):
    self.lr = lr
    self.epochs = epochs
    self.coef_ = None
    self.intercept_ = None
    # Declared up front so reading it before fit() yields None, like coef_ and
    # intercept_, instead of raising AttributeError.
    self.loss_ = None

  @staticmethod
  def _as_target(y):
    """Return ``y`` as a float array, flattening an ``(n, 1)`` column to ``(n,)``."""
    y = np.asarray(y, dtype=float)
    # A column vector would broadcast against the (n,) predictions into an
    # (n, n) matrix and silently corrupt every residual-based computation.
    if y.ndim == 2 and y.shape[1] == 1:
      y = y.ravel()
    return y

  def _rmse_loss(self, X, y):
    """Return the root-mean-squared error of the current parameters on ``(X, y)``."""
    residual = self._as_target(y) - self.predict(X)
    return np.sqrt(np.mean(residual ** 2))

  def fit(self, X, y):
    """Fit the weights and intercept to ``(X, y)`` with ``epochs`` gradient steps.

    Args:
      X: Array-like of shape ``(n_samples, n_features)``.
      y: Array-like of shape ``(n_samples,)`` or ``(n_samples, 1)``.

    Raises:
      ValueError: If ``X`` is not 2-D, has no rows, or ``y`` does not hold
        exactly one value per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    y = self._as_target(y)

    if X.ndim != 2:
      raise ValueError(f"X must be 2-D (n_samples, n_features); got shape {X.shape}")
    n_samples, n_features = X.shape
    if n_samples == 0:
      raise ValueError("cannot fit on zero samples")
    if y.ndim != 1 or y.shape[0] != n_samples:
      raise ValueError(
        f"y must hold one target per row of X; got X {X.shape} and y {y.shape}"
      )

    self.coef_ = np.ones(n_features)
    self.intercept_ = 0.0

    for _ in range(self.epochs):
      residual = y - (X.dot(self.coef_) + self.intercept_)

      # Partial derivatives of mean((y - y_pred) ** 2) w.r.t. intercept and weights.
      intercept_der = -2 * np.mean(residual)
      coef_der = -2 / n_samples * X.T.dot(residual)

      self.intercept_ = self.intercept_ - (self.lr * intercept_der)
      self.coef_ = self.coef_ - (self.lr * coef_der)

    self.loss_ = self._rmse_loss(X, y)

  def predict(self, X):
    """Return ``X @ coef_ + intercept_`` for array-like ``X``.

    Raises:
      RuntimeError: If the model has no coefficients yet (``fit`` not called).
    """
    if self.coef_ is None or self.intercept_ is None:
      raise RuntimeError("model is not fitted yet; call fit() before predict()")

    return np.asarray(X, dtype=float).dot(self.coef_) + self.intercept_

  def r2_score(self, X, y):
    """Return the coefficient of determination of the predictions on ``(X, y)``.

    For a constant target the usual ratio is undefined; in that case the
    result is 1.0 when the predictions are exact and 0.0 otherwise, matching
    the default behaviour of scikit-learn's ``r2_score``.
    """
    y = self._as_target(y)
    y_pred = self.predict(X)
    ss_res = np.sum((y - y_pred) ** 2)
    ss_total = np.sum((y - np.mean(y)) ** 2)

    if ss_total == 0:
      # Avoid 0/0 (nan) and x/0 (-inf) on a constant target.
      return np.float64(1.0 if ss_res == 0 else 0.0)

    return 1 - ss_res / ss_total


In [49]:
# Load the diabetes dataset bundled with scikit-learn.
diabetes = load_diabetes()
# Bare expression on the cell's last line: the notebook renders the loaded object below.
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [50]:
# Pull the feature array and the regression target out of the loaded dataset.
X, y = diabetes.data, diabetes.target

In [51]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [52]:
# Override the class's default learning rate; epochs is left at its default value.
model = LinearRegressionBatchGD(lr = 0.6)
# Train on the training split only, so the test split stays unseen for evaluation.
model.fit(X_train, y_train)

In [53]:
# Print the fitted model's predictions for the held-out test features.
print(model.predict(X_test))

[141.51279539 178.62908894 140.77251672 291.07878201 123.27070535
  96.76926386 253.83392118 190.42596669  86.68872562 114.00780666
  95.72492383 156.80878145  65.77063242 208.23064379 103.60355208
 135.0238285  223.10924193 246.50096397 195.1674512  213.56109834
 202.9736918   88.58441925  75.28169534 188.36180652 155.0722236
 163.93939886 188.42652927 176.28135344  51.28704169 114.92518814
 180.35200695  93.66909149 132.51009634 181.45520685 172.9188356
 190.0329094  125.46294593 121.62229767 150.3919376   60.83994722
  79.8015025  110.65116288 159.3917743  151.89244144 174.29691861
  65.55381978  81.90539323 106.22808299  60.9118396  156.58465532
 154.59877856  65.6308711  115.67851496 109.2792167  168.84866359
 156.65804169  97.37745296 204.44398244 115.89530288  68.71755381
 183.38864584 198.14703483 141.74287125 109.69512064 126.07025353
 198.87322979 167.49780706 160.21298932 114.42163994 139.96427958
 178.7631951  195.16637469 238.49806467 142.04836617  82.62626765
 150.0567240

In [54]:
# Show the learned feature weights, then the intercept, one per line.
print(model.coef_, model.intercept_, sep="\n")

[  40.26454631 -216.68668283  525.41595534  331.42896969  -79.36458851
 -125.38903971 -217.50258709  145.0737031   387.84689637  100.45153024]
151.33914129001272


In [55]:
# R^2 of the fitted model on the held-out test split; the notebook displays the returned value.
model.r2_score(X_test, y_test)

np.float64(0.45818074963212363)

In [57]:
# loss_ is computed at the end of fit() on the data passed to it (the training split here),
# so this is the training RMSE, not a test-set metric.
print("RMSE loss:", model.loss_)

RMSE loss: 53.90852302552439
