In [1]:
# Import Libraries

from sklearn.datasets import make_regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.metrics import r2_score

### Stochastic Gradient Descent

`Unlike BGD, which uses the entire dataset for every update, SGD updates the weights after processing just one random sample at a time.` <br>
`This makes SGD much faster but at the same time introduces more noise into updates.`

#### Advantage
> **Faster than BGD:** <br>`Since it updates weights after each sample, it is faster for large datasets.`<br>
> **Works well with Large Dataset:** <br>`No need to load entire dataset into memory.`<br>
> **Can Escape Local Minima:** <br>`The randomness in the updates can help SGD to escape local minima unlike BGD.`

#### Disadvantage
> **Noisy Updates:** <br>`Each sample can lead to different updates, making convergence less stable.`<br>
> **May Fluctuate around Minimum:** <br>`Since the weights change after every sample, it may not settle exactly at the minimum but keep oscillating (hovering back and forth) around it.`<br>
> **Requires Careful Learning Rate Tuning:** <br>`If the learning rate is too high SGD might overshoot the optimal values.`

In [3]:
# Synthetic regression problem: 100 samples, 5 features (3 informative), one noisy target

X, y = make_regression(
    n_samples=100,
    n_features=5,
    n_informative=3,
    n_targets=1,
    noise=50,
    random_state=1,
)

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### Stochastic GD from Scratch

In [5]:
# Write a class for stochastic gradient descent

class SGD:
    '''
    Linear regression fitted with stochastic gradient descent (SGD).

    training: learns the intercept and the coefficients from training data.
    testing: returns predictions for a feature matrix using the learned parameters.
    '''
    def __init__(self, learning_rate=0.005, epochs=100, random_state=None):
        '''
        Input:
            learning_rate: step size applied to every single-sample update
            epochs: number of passes; each pass performs n_samples random updates
            random_state: optional integer seed for the sample draws. None (the
                          default) draws from NumPy's global random state, so
                          np.random.seed() keeps working as before.
        '''
        self.intercept_ = None
        self.coeff_ = None
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.random_state = random_state

    def training(self, X_train, y_train):
        '''
        Input: independent (X_train) and dependent (y_train) variables;
               arrays, lists, DataFrames and Series are all accepted
        Output: tuple (intercept, coefficients) after the last update
        Raises: ValueError if X_train and y_train have different lengths
        '''
        # Work on plain ndarrays: positional indexing is then always correct
        # (a pandas Series with a non-default index would otherwise be looked
        # up by label) and much cheaper than per-row DataFrame access.
        X = np.asarray(X_train, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)  # a 1-D input is treated as a single feature column
        y = np.asarray(y_train, dtype=float).ravel()

        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X_train has {n_samples} rows but y_train has {y.shape[0]} values"
            )

        # Initialize bias and weights
        self.intercept_ = 0.0
        self.coeff_ = np.ones(n_features)

        # Global NumPy state by default, or a private seeded generator
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        for _ in range(self.epochs):
            for _ in range(n_samples):

                # Choose a random sample; the upper bound of randint is
                # exclusive, so passing n_samples makes every row eligible
                index_ = rng.randint(0, n_samples)
                x_i = X[index_]

                # Residual of the current prediction: y - (m*x + b)
                residual = y[index_] - (np.dot(x_i, self.coeff_) + self.intercept_)

                # Gradients of the squared error for this single sample
                derivative_intercept = -2 * residual
                derivative_slope = -2 * residual * x_i

                self.intercept_ = self.intercept_ - (self.learning_rate * derivative_intercept)
                self.coeff_ = self.coeff_ - (self.learning_rate * derivative_slope)

        return self.intercept_, self.coeff_

    def testing(self, X):
        '''
        Input: feature matrix X with one column per learned coefficient
        Output: predictions X . coeff_ + intercept_
        '''
        y_test_pred = np.dot(X, self.coeff_) + self.intercept_

        return y_test_pred

In [6]:
# Create an SGD instance with the default learning_rate (0.005) and epochs (100)

sgd = SGD()
sgd

<__main__.SGD at 0x1f7eea28da0>

In [7]:
# Fit the scratch implementation on the training split; returns (intercept, coefficients)
sgd.training(X_train, y_train)

(np.float64(-0.7664534184622473),
 array([12.30265514, 51.40907628,  4.64526623, -7.55577363, 29.81169035]))

`It picks observations at random, so each run gives a slightly different output unless a random seed is fixed.`

### With SGD Regressor

In [8]:
# Instantiate scikit-learn's SGDRegressor with its default hyperparameters
s = SGDRegressor()
s

In [9]:
# Fit the scikit-learn model on the same training split used for the scratch implementation
s.fit(X_train, y_train)

In [10]:
# Learned coefficients and intercept, to compare with the scratch implementation's result
s.coef_, s.intercept_

(array([16.05309565, 53.82710285,  4.96908387, -4.80787776, 30.73675068]),
 array([0.10413235]))

`SGDRegressor shuffles the training data on every epoch, so without a fixed random_state each run gives a slightly different output.`