In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/salary-data-with-age-and-experience/Salary_Data.csv


In [2]:
import seaborn as sns 
import plotly.express as px 
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import SGDRegressor

In [3]:
class InputNotArrayException(Exception):
    def __init__(self, message = 'Given input is not an array nor dataframe'):
        self.message = message 
        super().__init__(message)
        
    
class GradientDescent():
    
    def __init__(self, epochs = None, learning_rate = 0.004578, threshold = 0.0000001):
        self.learning_rate = learning_rate 
        self.epochs = epochs 
        self.threshold = threshold
    
    def fit(self, X, y):
        try:
            rows, cols = X.shape
        except Exception as e:
            if (isinstance(X, pd.DataFrame) or isinstance(X, pd.Series)) and (isinstance(y, pd.Series) or isinstance(y, pd.Series)):
                rows, cols = X.values.shape                
            else: raise InputNotArrayException()
        
        X, y = X.values, y.values 
        X = np.hstack([X, np.ones((rows, 1))])        
        y = y.reshape(rows, 1)
        B = np.zeros((cols + 1, 1))
        
        for col in range(cols+1):
            b = 10
            while b - B[col] > self.threshold:
                b = B[col]
                slope = -2*np.sum(X[:, col].reshape((1, rows)).dot(y - X.dot(B)))
                b = b - self.learning_rate*slope 
                B[col] = b
                
        self.B = B
        
        return self 
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series): X = X.values
        elif not isinstance(X, np.ndarray): raise InputNotArrayException
        rows, cols = X.shape
        X = np.hstack([X, np.ones((rows, 1))])
        return X.dot(self.B)


In [4]:
X = np.hstack([np.ones((10, 1)), np.ones((10, 1))])
X[:, 0].reshape((1, 10))

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [5]:
df = pd.read_csv('/kaggle/input/salary-data-with-age-and-experience/Salary_Data.csv')
df

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891
5,2.9,23.0,56642
6,3.0,23.0,60150
7,3.2,23.3,54445
8,3.2,23.3,64445
9,3.7,23.6,57189


In [6]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]
gd = GradientDescent()
gd.fit(X, y)
gd.predict(X)

array([[-6.91005662e+07],
       [-7.16280836e+07],
       [-7.26233547e+07],
       [-7.40900351e+07],
       [-7.50853062e+07],
       [-7.90795039e+07],
       [-7.90663907e+07],
       [-8.05724106e+07],
       [-8.05724106e+07],
       [-8.20390910e+07],
       [-8.35451108e+07],
       [-8.40427464e+07],
       [-8.40427464e+07],
       [-8.40296332e+07],
       [-8.90846680e+07],
       [-8.90322153e+07],
       [-9.41134765e+07],
       [-9.91947376e+07],
       [-1.04223546e+08],
       [-1.09317920e+08],
       [-1.14320502e+08],
       [-1.14281163e+08],
       [-1.19283745e+08],
       [-1.24351893e+08],
       [-1.29393815e+08],
       [-1.34461963e+08],
       [-1.39503884e+08],
       [-1.44598259e+08],
       [-1.49613954e+08],
       [-1.54695215e+08]])

In [7]:
sgd = SGDRegressor()
sgd.fit(X, y)
sgd.predict(X)
sgd.score(X, y)

-18641337.427701164

In [8]:
lr = LinearRegression()
lr.fit(X, y) 
lr.predict(X)
lr.score(X, y)

0.9599822609873284

In [9]:
sgd.coef_

array([ 1.73503571e+08, -9.99211930e+07])

In [10]:
sgd.intercept_

array([1.71549816e+09])