In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/salary-data-with-age-and-experience/Salary_Data.csv


In [2]:
import seaborn as sns 
import plotly.express as px 
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import SGDRegressor

In [3]:
class InputNotArrayException(Exception):
    def __init__(self, message = 'Given input is not an array nor dataframe'):
        self.message = message 
        super().__init__(message)

        
class BatchGradientDescent():
    
    def __init__(self, epochs = 100000, learning_rate = 0.0001, threshold = 0.0000001):
        self.learning_rate = learning_rate 
        self.epochs = epochs 
        self.threshold = threshold
        self.coef_ = None 
    
    def fit(self, X, y):
        
        expression = not ((isinstance(X, np.ndarray) and isinstance(y, np.ndarray)) or (isinstance(X, pd.DataFrame) and isinstance(y, pd.DataFrame)) or (isinstance(X, pd.Series) and isinstance(y, pd.Series)))
        if expression:
            raise InputNotArrayException()
        elif  (isinstance(X, pd.DataFrame) and isinstance(y, pd.DataFrame)) or (isinstance(X, pd.Series) and isinstance(y, pd.Series)):
            X, y = X.values, y.values 
        rows, cols = X.shape

        X = np.hstack([X, np.ones((rows, 1))])        
        y = y.reshape(rows, 1)
        B = np.zeros((cols + 1, 1))
        
        for i in range(self.epochs):
            # Find the gradient at that point 
    
            for j in range(cols+1):
                slope = -2*np.sum(X[:, j].reshape((1, rows)).dot(y - X.dot(B)))
                B[j] = B[j] - self.learning_rate*slope
                
                
        self.coef_ = B
        
        return self 
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): X = X.values
        elif not isinstance(X, np.ndarray): raise InputNotArrayException()
        rows, cols = X.shape
        X = np.hstack([X, np.ones((rows, 1))])
        return X.dot(self.coef_)
    
    def score(self, X, y):
        
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): X = X.values
        elif not isinstance(X, np.ndarray): raise InputNotArrayException()
        rows, cols = X.shape
        
        y_preds = self.predict(X)
        y_mean = np.mean(y)
        explained_variance = (y_preds - y_mean)*(y_preds - y_mean)
        total_variance = (y - y_mean)*(y - y_mean)
        R2 =np.sum(explained_variance)/np.sum(total_variance)
        return R2 
    

        
        

In [4]:
class StochasticGradientDescent():
    
    def __init__(self, epochs = 100000, learning_rate = 0.0001, threshold = 0.0000001):
        self.learning_rate = learning_rate 
        self.epochs = epochs 
        self.threshold = threshold
        self.coef_ = None 
    
    def fit(self, X, y):
        
        expression = not ((isinstance(X, np.ndarray) and isinstance(y, np.ndarray)) or (isinstance(X, pd.DataFrame) and isinstance(y, pd.DataFrame)) or (isinstance(X, pd.Series) and isinstance(y, pd.Series)))
        if expression:
            raise InputNotArrayException()
        elif  (isinstance(X, pd.DataFrame) and isinstance(y, pd.DataFrame)) or (isinstance(X, pd.Series) and isinstance(y, pd.Series)):
            X, y = X.values, y.values 
        
        rows, cols = X.shape

        X = np.hstack([X, np.ones((rows, 1))])        
        y = y.reshape(rows, 1)
        B = np.zeros((cols + 1, 1))
        
        for i in range(self.epochs):
            # Find the gradient at that point 
            
            for k in range(rows):
                idx = np.random.randint(0, rows)
                
                for j in range(cols + 1):
                    X_ = X[idx, :]
                    slope = -2*(X_[j]*(y[idx] - X_.dot(B)))
                    B[j] = B[j] - self.learning_rate*slope
                
        self.coef_ = B
        
        return self 
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): X = X.values
        elif not isinstance(X, np.ndarray): raise InputNotArrayException()
        rows, cols = X.shape
        X = np.hstack([X, np.ones((rows, 1))])
        return X.dot(self.coef_)
    
    def score(self, X, y):
        
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): X = X.values
        elif not isinstance(X, np.ndarray): raise InputNotArrayException()
        rows, cols = X.shape
        
        y_preds = self.predict(X)
        y_mean = np.mean(y)
        explained_variance = (y_preds - y_mean)*(y_preds - y_mean)
        total_variance = (y - y_mean)*(y - y_mean)
        R2 =np.sum(explained_variance)/np.sum(total_variance)
        return R2 
    

In [5]:
df = pd.read_csv('/kaggle/input/salary-data-with-age-and-experience/Salary_Data.csv')
df

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891
5,2.9,23.0,56642
6,3.0,23.0,60150
7,3.2,23.3,54445
8,3.2,23.3,64445
9,3.7,23.6,57189


In [6]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]
gd = BatchGradientDescent(learning_rate = 0.00003, epochs=10000)
gd.fit(X.values, y.values)
gd.score(X.values, y.values)

0.9573873206588385

In [7]:
gd.predict(X.values)

array([[ 38268.11828492],
       [ 40369.18811638],
       [ 42017.42604112],
       [ 45836.13291516],
       [ 47484.3708399 ],
       [ 53404.14754539],
       [ 54077.32253886],
       [ 55876.50443251],
       [ 55876.50443251],
       [ 59695.21130655],
       [ 61494.39320019],
       [ 62318.51216256],
       [ 62318.51216256],
       [ 62991.68715603],
       [ 67193.82681894],
       [ 69886.5267928 ],
       [ 72742.31646877],
       [ 75598.10614475],
       [ 81146.59579459],
       [ 83329.2104771 ],
       [ 90224.05011387],
       [ 92243.57509427],
       [ 99138.41473104],
       [102667.37940048],
       [107542.69405685],
       [111071.6587263 ],
       [115946.97338267],
       [118129.58806518],
       [124351.25270848],
       [127207.04238446]])

In [8]:
sgd = StochasticGradientDescent(epochs = 1000, learning_rate = 0.0001)
sgd.fit(X.values, y.values)

<__main__.StochasticGradientDescent at 0x7f7095992dd0>

In [9]:
sgd.predict(X.values)

array([[ 39641.97894278],
       [ 41766.94659443],
       [ 43420.19164826],
       [ 47238.82255097],
       [ 48892.0676048 ],
       [ 54835.66615916],
       [ 55505.04782014],
       [ 57315.53373991],
       [ 57315.53373991],
       [ 61134.16464262],
       [ 62944.6505624 ],
       [ 63771.27308931],
       [ 63771.27308931],
       [ 64440.65475029],
       [ 68690.59005359],
       [ 71368.1166975 ],
       [ 74279.28867884],
       [ 77190.46066018],
       [ 82779.15928543],
       [ 85020.94960579],
       [ 91948.411553  ],
       [ 93956.55653594],
       [100884.01848315],
       [104464.57212547],
       [109383.88908975],
       [112964.44273206],
       [117883.75969634],
       [120125.5500167 ],
       [126383.63030293],
       [129294.80228427]])

In [10]:
sgd.score(X.values, y.values)

0.9764257426173055

In [11]:
sgd.coef_

array([[6693.81660979],
       [1572.40865938],
       [-741.80117495]])

In [12]:
gd.coef_

array([[6731.74993466],
       [1509.43968904],
       [-835.04011314]])