In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/salary-data-with-age-and-experience/Salary_Data.csv


In [9]:
import seaborn as sns 
import plotly.express as px 
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import SGDRegressor

In [14]:
class InputNotArrayException(Exception):
    def __init__(self, message = 'Given input is not an array nor dataframe'):
        self.message = message 
        super().__init__(message)

        
class BatchGradientDescent():
    
    def __init__(self, epochs = 10, learning_rate = 0.0001, threshold = 0.0000001):
        self.learning_rate = learning_rate 
        self.epochs = epochs 
        self.threshold = threshold
        self.coef_ = None 
    
    def fit(self, X, y):
        
        expression = not ((isinstance(X, np.ndarray) and isinstance(y, np.ndarray)) or (isinstance(X, pd.DataFrame) and isinstance(y, pd.DataFrame)) or (isinstance(X, pd.Series) and isinstance(y, pd.Series)))
        if expression:
            raise InputNotArrayException()
        elif  (isinstance(X, pd.DataFrame) and isinstance(y, pd.DataFrame)) or (isinstance(X, pd.Series) and isinstance(y, pd.Series)):
            X, y = X.values, y.values 
        rows, cols = X.shape

        X = np.hstack([X, np.ones((rows, 1))])        
        y = y.reshape(rows, 1)
        B = np.zeros((cols + 1, 1))
        
        for i in range(self.epochs):
            # Find the gradient at that point 
            
            slopes = -2*X.T.dot(y - X.dot(B))
            B = B - self.learning_rate*slopes
            
        self.coef_ = B
        
        return self 
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): X = X.values
        elif not isinstance(X, np.ndarray): raise InputNotArrayException()
        rows, cols = X.shape
        X = np.hstack([X, np.ones((rows, 1))])
        return X.dot(self.coef_)
    
    def score(self, X, y):
        
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): X = X.values
        elif not isinstance(X, np.ndarray): raise InputNotArrayException()
        rows, cols = X.shape
        
        y_preds = self.predict(X)
        y_mean = np.mean(y)
        explained_variance = (y_preds - y_mean)*(y_preds - y_mean)
        total_variance = (y - y_mean)*(y - y_mean)
        R2 =np.sum(explained_variance)/np.sum(total_variance)
        return R2 
           
        

In [70]:
class StochasticGradientDescent():
    
    def __init__(self, epochs = 1000, learning_rate = 0.0001, threshold = 0.0000001):
        self.learning_rate = learning_rate 
        self.epochs = epochs 
        self.threshold = threshold
        self.coef_ = None 
    
    def fit(self, X, y):
        
        expression = not ((isinstance(X, np.ndarray) and isinstance(y, np.ndarray)) or (isinstance(X, pd.DataFrame) and isinstance(y, pd.DataFrame)) or (isinstance(X, pd.Series) and isinstance(y, pd.Series)))
        if expression:
            raise InputNotArrayException()
        elif  (isinstance(X, pd.DataFrame) and isinstance(y, pd.DataFrame)) or (isinstance(X, pd.Series) and isinstance(y, pd.Series)):
            X, y = X.values, y.values 
        
        rows, cols = X.shape

        X = np.hstack([X, np.ones((rows, 1))])        
        y = y.reshape(rows, 1)
        B = np.zeros((cols + 1, 1))
        
        for i in range(self.epochs):
            # Find the gradient at that point 
            
            for j in range(rows):
                idx = np.random.randint(0, rows)
                
                slopes = -2*X.T[:, idx]*(y[idx] - X[idx, :].dot(B))
                B = B - self.learning_rate*slopes.reshape(cols+1, 1)
                
        self.coef_ = B
        
        return self 
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): X = X.values
        elif not isinstance(X, np.ndarray): raise InputNotArrayException()
        rows, cols = X.shape
        X = np.hstack([X, np.ones((rows, 1))])
        return X.dot(self.coef_)
    
    def score(self, X, y):
        
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): X = X.values
        elif not isinstance(X, np.ndarray): raise InputNotArrayException()
        rows, cols = X.shape
        
        y_preds = self.predict(X)
        y_mean = np.mean(y)
        explained_variance = (y_preds - y_mean)*(y_preds - y_mean)
        total_variance = (y - y_mean)*(y - y_mean)
        R2 =np.sum(explained_variance)/np.sum(total_variance)
        return R2 
    

In [71]:
df = pd.read_csv('/kaggle/input/salary-data-with-age-and-experience/Salary_Data.csv')
df

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891
5,2.9,23.0,56642
6,3.0,23.0,60150
7,3.2,23.3,54445
8,3.2,23.3,64445
9,3.7,23.6,57189


In [117]:
%%timeit
X, y = df.iloc[:, :-1], df.iloc[:, -1]
gd = BatchGradientDescent(learning_rate = 0.00003, epochs=870)
gd.fit(X.values, y.values)

6.45 ms ± 36.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [116]:
gd.score(X.values, y.values)

0.9513079327269061

In [195]:
best_score  = 0
epoch = {}
for i in range(20, 100):
    sgd = StochasticGradientDescent(epochs = i, learning_rate = 0.0008)
    sgd.fit(X.values, y.values)
    current_score = sgd.score(X.values, y.values)
    if current_score > best_score and current_score < 1:
        best_score = current_score 
        epoch[i] = current_score 
        
        
        
    
    

In [197]:
sgd = StochasticGradientDescent(epochs = 58, learning_rate = 0.0008)
sgd.fit(X.values, y.values)

<__main__.StochasticGradientDescent at 0x7fa118330050>

In [198]:
sgd.score(X.values, y.values)

0.9827107277614602

In [199]:
sgd.coef_

array([[6598.97401695],
       [1625.224904  ],
       [-633.17998499]])

In [156]:
gd.coef_

array([[6707.56267051],
       [1506.17389487],
       [-601.04989628]])