### Multiple Linear Regression

In [1]:
# import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image
import seaborn as sns
# Apply seaborn's default plot theme. `set_theme` is the preferred name;
# `sns.set` is only a legacy alias for it.
sns.set_theme()

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# Load the diabetes dataset as a (features, target) pair instead of a Bunch object
X,y = load_diabetes(return_X_y=True)

In [3]:
# Display the feature matrix
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]], shape=(442, 10))

In [4]:
# (number of samples, number of features)
X.shape

(442, 10)

In [5]:
# Display the target vector
y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [6]:
# One target value per sample
y.shape

(442,)

## Linear Regression from scratch using OLS

In [7]:
class multi_linear_regression:
    '''
    Multiple linear regression fitted by ordinary least squares (OLS).

    The model is ``y = intercept + X . slope``. The ``fit`` method learns the
    parameters from training data and the ``predict`` method applies them to
    unseen data.

    Attributes:
        slope: one coefficient per feature column; None until ``fit`` is called.
        intercept: the bias term b0; None until ``fit`` is called.
    '''

    def __init__(self):
        self.slope = None
        self.intercept = None

    def fit(self, X_train, X_test):
        '''
        Learn the intercept and the slope/coefficients from training data.

        Input:
            X_train: 2-D array-like of shape (n_samples, n_features) with the
                independent variables.
            X_test: the dependent variable (training targets), one entry per
                row of X_train. The parameter name is misleading but is kept
                unchanged so existing calls keep working.
        Output: a human-readable summary string of the learned parameters.
            The parameters themselves are stored in self.intercept and
            self.slope.
        Raises:
            ValueError: if X_train is not 2-D or the number of targets does
                not match the number of rows in X_train.
        '''
        features = np.asarray(X_train, dtype=float)
        # Use the targets that were passed in. Reading a global `y_train` here
        # would fail with NameError (or silently train on unrelated data)
        # whenever the caller's variable has a different name.
        targets = np.asarray(X_test, dtype=float)

        if features.ndim != 2:
            raise ValueError('X_train must be 2-D with shape (n_samples, n_features)')
        if targets.ndim < 1 or targets.shape[0] != features.shape[0]:
            raise ValueError('the targets must have one entry per row of X_train')

        # Prepend a column of ones so the first coefficient acts as the intercept b0.
        design = np.insert(features, 0, 1, axis=1)

        # OLS solution: bita = (X.T X)^-1 X.T y. The pseudo-inverse of X equals
        # (X.T X)^-1 X.T when X has full column rank, and it still returns a
        # least-squares solution when features are collinear, where an explicit
        # inverse of X.T X would fail.
        bita = np.linalg.pinv(design).dot(targets)

        self.intercept = bita[0]  # b0
        self.slope = bita[1:]     # b1..bn, one per feature

        return f'The intercept is {self.intercept}, \nSlope/Coefficient is {self.slope}'

    def predict(self, X_test):
        '''
        Predict targets for unseen data using the learned parameters.

        Input: X_test, array-like of shape (n_samples, n_features).
        Output: the predictions, X_test . slope + intercept.
        Raises:
            RuntimeError: if called before ``fit``.
        '''
        if self.slope is None or self.intercept is None:
            raise RuntimeError('fit must be called before predict')

        prediction = np.dot(X_test, self.slope) + self.intercept

        return prediction

In [8]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Show the shapes of the resulting train and test feature matrices.
X_train.shape, X_test.shape

((309, 10), (133, 10))

In [9]:
# Create an instance of the from-scratch model (slope and intercept start as None)

multiLR = multi_linear_regression()

In [10]:
# Fit the from-scratch model on the training split and print the summary string that fit() returns

print(multiLR.fit(X_train, y_train))

The intercept is 153.7190162438038, 
Slope/Coefficient is [ -52.46478548 -193.50733393  579.49108514  272.453666   -504.64830389
  241.62372969  -69.76596029   86.61313961  721.92083806   26.78067442]


In [11]:
# Predict targets for the held-out test features and display them

prediction = multiLR.predict(X_test)
prediction

array([239.67646226, 250.52785246, 164.85108003, 120.27660503,
       181.73443919, 262.21130761, 112.23506474, 191.94794432,
       151.49899378, 236.97230238, 172.17064249, 181.76621901,
       112.11999874,  93.10380711, 242.61242687,  91.18595934,
       153.65788928,  64.67647092,  99.36484095, 212.26056138,
       197.04060446, 162.46365074, 164.1435797 , 157.73720559,
       207.25521787, 170.20643739, 111.97244286,  82.77403445,
       186.91422667, 164.71981545, 175.37450608,  82.34705002,
       144.29011545, 149.22311094, 144.09055592, 194.25826256,
       166.5246271 , 188.10529014, 126.97581238, 205.91418409,
        85.53291478, 167.78048883, 147.74631196, 183.85195035,
       177.14599535,  71.92736587, 139.84170986, 139.03352655,
       125.11919021, 231.42311603, 163.79321356,  78.92705801,
       151.43615002, 159.59655213, 237.28482326, 176.12400723,
       191.69400387, 118.93294883, 131.10216716, 174.80489902,
       216.57056952, 169.8778643 , 156.38489415, 113.57

In [12]:
# R^2 score of the from-scratch predictions against the true test targets

r2_score(y_test,prediction)

0.3928992721696297

### Checking the answer using scikit-learn

In [13]:
# scikit-learn's LinearRegression, used as the reference implementation
reg = LinearRegression()

In [14]:
# Fit the reference model on the same training split
reg.fit(X_train,y_train)

In [15]:
# Predict with the reference model on the same test features
y_pred = reg.predict(X_test)

In [16]:
# R^2 score of the reference predictions, to compare with the from-scratch score
r2_score(y_test,y_pred)

0.39289927216962905

In [17]:
# Coefficients learned by scikit-learn, to compare with the from-scratch slope
reg.coef_

array([ -52.46478548, -193.50733393,  579.49108514,  272.453666  ,
       -504.64830389,  241.62372969,  -69.76596029,   86.61313961,
        721.92083806,   26.78067442])

In [18]:
# Intercept learned by scikit-learn, to compare with the from-scratch intercept
reg.intercept_

np.float64(153.71901624380382)

From the above, we can see that our model written from scratch gives the same R² score, coefficients, and intercept as scikit-learn's `LinearRegression` model.