## using the normal equation to implement linear regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import model_selection

In [2]:
class multi_LR:
    """Multiple linear regression fitted in closed form with the normal equation.

    The model is ``y = X @ w + b``. A column of ones is appended to the input
    matrix so that the intercept ``b`` is learned together with the weights
    ``w``. After ``train`` the fitted coefficients are stored in
    ``bias_matrix``: one entry per input parameter, followed by the intercept
    as the last entry.
    """

    @staticmethod
    def _with_ones_column(arr):
        """Return ``arr`` as a 2-D float array with a trailing column of ones.

        ``arr`` must be 2-D: 'm' data points (rows) by 'n' input parameters
        (columns).
        """
        features = np.array(arr, dtype=float)
        # Insert at index n (the current number of columns) so the ones column
        # is always the (n+1)th column, whatever the number of input parameters.
        return np.insert(features, features.shape[1], 1, axis=1)

    def train(self, arr1, arr2):
        """Fit the coefficients on the training data.

        arr1: data points for the input parameters, 2-D with 'm' data points
              and 'n' input parameters. The (n+1)th column of ones is
              constructed here and must not be supplied by the caller.
        arr2: target values, one per data point ('m' values).

        Stores the result in ``self.bias_matrix``.
        Raises ``np.linalg.LinAlgError`` when X^T X is singular.
        """
        num_IP = self._with_ones_column(arr1)
        num_target = np.array(arr2, dtype=float)
        num_IPT = np.transpose(num_IP)
        # Normal equation: (X^T X) theta = X^T y. Solving the linear system
        # yields the same theta as multiplying by the explicit inverse, without
        # having to form that inverse.
        self.bias_matrix = np.linalg.solve(
            np.matmul(num_IPT, num_IP), np.matmul(num_IPT, num_target)
        )

    def predict(self, arr1):
        """Return the predicted target values for ``arr1``.

        arr1: the testing data / data for which values have to be predicted,
              2-D with the same number of input parameters used in ``train``.
        """
        num_test = self._with_ones_column(arr1)
        return np.matmul(num_test, self.bias_matrix)

    def score(self, arr1, arr2):
        """Return the coefficient of determination (R^2) of the predictions.

        arr1: data for which the true target values are given by arr2.
        arr2: true target values for arr1.

        The targets for arr1 are predicted first and then compared with arr2:
        ``1 - sum((predicted - target)**2) / sum((mean(target) - target)**2)``.
        """
        num_target = np.array(arr2, dtype=float)
        num_predicted = self.predict(arr1)
        mean = np.mean(num_target)
        numerator = np.sum((num_predicted - num_target) ** 2)
        denominator = np.sum((mean - num_target) ** 2)
        return 1 - (numerator / denominator)

### Using the California housing dataset from `sklearn.datasets` to check our implementation

In [3]:
from sklearn import datasets

In [4]:
# The data has already been cleaned.
housing = datasets.fetch_california_housing()
housing


{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [5]:
# Train/test split. A fixed random_state makes the shuffle, and therefore the
# fitted coefficients and score computed from it, reproducible across
# kernel restarts.
x_train, x_test, target_train, target_test = model_selection.train_test_split(
    housing.data, housing.target, random_state=42
)

In [6]:
# Fit on the training split and show the learned coefficients.
algo1 = multi_LR()
algo1.train(x_train, target_train)
print("bias matrix\n", algo1.bias_matrix)

# Predict the targets for the test inputs.
predicted = algo1.predict(x_test)
print("predicted values\n", predicted)

bias matrix
 [ 4.52707151e-01  9.47906996e-03 -1.36369792e-01  8.69299865e-01
 -3.85611874e-06 -3.48661548e-03 -4.22319164e-01 -4.36140665e-01
 -3.72478550e+01]
predicted values
 [1.55872997 0.95252803 2.09825805 ... 2.19263663 3.76285883 1.12730678]


In [7]:
# Coefficient of determination of the fitted model on the test split.
test_score = algo1.score(x_test, target_test)
print(test_score)

0.5899574101180252


In [11]:
# Small 3x2 integer array for experimenting with np.insert.
num_arr = np.array([(1, 2), (2, 3), (3, 4)])
num_arr.shape

(3, 2)

In [9]:
# Insert a column of zeros before column index 2, i.e. after the last column.
np.insert(arr=num_arr, obj=[2], values=0, axis=1)

array([[1, 2, 0],
       [2, 3, 0],
       [3, 4, 0]])