# Multiple Regression using Normal Equation

**Dataset:** [Diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) from scikit-learn

## 1. Import libraries

In [1]:
import numpy as np
import pandas as pd

## 2. Build model

In [2]:
class MultipleRegression():
    '''
    Ordinary least-squares multiple regression solved in closed form.

    The coefficients are the normal-equation solution (X^T X)^-1 X^T y,
    evaluated through the Moore-Penrose pseudo-inverse so that collinear
    features do not break the fit. After fit() the ``coefficients``
    attribute holds the intercept at index 0 followed by one weight per
    feature.
    '''
    def __init__(self):
        # Empty until fit() is called; afterwards [bias, w_1, ..., w_n].
        self.coefficients = []

    def fit(self, X, y):
        '''
        Used to calculate the coefficients of the multiple regression model.

        :param X: array-like of shape (n_samples, n_features); a 1-D input
                  is treated as n_samples observations of a single feature
        :param y: array-like, true values (one per sample)
        :return: None
        '''
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim == 1:
            X = self._reshape_X(X)

        X = self._concatenate_ones(X)
        # pinv(X) equals (X^T X)^-1 X^T whenever X has full column rank, but
        # avoids explicitly inverting X^T X, which is ill-conditioned and
        # undefined for collinear features.
        self.coefficients = np.linalg.pinv(X).dot(y)

    def predict(self, test_data):
        '''
        Makes predictions using the fitted linear equation.

        :param test_data: array-like, either one sample of shape
                          (n_features,) or a batch of shape
                          (n_samples, n_features)
        :return: a single prediction for one sample, or an array with one
                 prediction per row for a batch
        '''
        coefficients = np.asarray(self.coefficients, dtype=float)
        bias, weights = coefficients[0], coefficients[1:]
        test_data = np.asarray(test_data, dtype=float)

        # A single-feature model handed a 1-D vector of several values can
        # only mean "several samples", so turn it into a column.
        if test_data.ndim == 1 and weights.shape[0] == 1 and test_data.shape[0] != 1:
            test_data = self._reshape_X(test_data)

        # The dot product handles one row and a whole batch alike, and it
        # raises on a feature-count mismatch instead of silently truncating.
        return test_data.dot(weights) + bias

    def mean_squared_error(self, y, y_pred):
        '''
        Computes the mean squared error between true and predicted values.

        :param y: array-like, true values
        :param y_pred: array-like, predicted values (same shape as y)
        :return: float, the average of the squared differences
        :raises ValueError: if the inputs are empty or their shapes differ
        '''
        y = np.asarray(y, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        if y.shape != y_pred.shape:
            # Refuse to broadcast or truncate: that would hide a real bug.
            raise ValueError(
                'y and y_pred must have the same shape, got %s and %s'
                % (y.shape, y_pred.shape)
            )
        if y.size == 0:
            raise ValueError('mean_squared_error requires at least one value')
        return np.mean((y - y_pred) ** 2, axis=0)

    def _reshape_X(self, X):
        '''Turns a 1-D vector into a single-column 2-D array.'''
        return X.reshape(-1, 1)

    def _concatenate_ones(self, X):
        '''Prepends a column of ones to X so the first coefficient acts as the intercept.'''
        ones = np.ones((X.shape[0], 1))
        return np.concatenate((ones, X), 1)

## 3. Load diabetes dataset

In [3]:
from sklearn.datasets import load_diabetes

# Load the bundled dataset, then unpack the feature matrix and the target vector
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

### 3.1. Input and output shape

In [4]:
# Shapes of the feature matrix and the target vector, one per line
print(X.shape, y.shape, sep='\n')

(442, 10)
(442,)


### 3.2. Feature names

In [5]:
# List the names of the input features provided with the dataset
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

### 3.3. Train Test split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape) # training features: 80% of data
print(y_train.shape) # training targets: the same 80% of data

(353, 10)
(353,)


## 4. Create a multiple regression model using the `MultipleRegression` class

In [7]:
# Create an unfitted model; its coefficients list stays empty until fit() is called
model = MultipleRegression()

### 4.1. Fit model

In [8]:
# Compute the coefficients from the training split only
model.fit(X_train, y_train)

In [9]:
# Fitted parameters: the intercept first, followed by one weight per feature
model.coefficients

array([ 151.34565535,   37.90031426, -241.96624835,  542.42575342,
        347.70830529, -931.46126093,  518.04405547,  163.40353476,
        275.31003837,  736.18909839,   48.67112488])

## 5. Make predictions

In [10]:
# Predict the target for a single sample: the first row of the full feature matrix
model.predict(X[0])

210.74244148429636

In [11]:
# Predict one test row at a time and collect the results in a list
y_pred = [model.predict(sample) for sample in X_test]

In [12]:
# Put the true targets and the flattened predictions side by side
df = pd.DataFrame(data={'actual': y_test, 'predicted': np.asarray(y_pred).ravel()})

# Preview the first five comparisons
df.head()

Unnamed: 0,actual,predicted
0,219.0,139.548313
1,70.0,179.520306
2,202.0,134.041333
3,230.0,291.411936
4,111.0,123.787237


## 6. Evaluate model performance using Mean Squared Error

In [13]:
# Average squared difference between the held-out targets and their predictions
mse = model.mean_squared_error(y_test, y_pred)
print(mse)

2900.1732878832368


# In summary
## 1. Create Multiple Regression class

In [14]:
import numpy as np
import pandas as pd

# Create Multiple Regression model
class MultipleRegression():
    '''
    Ordinary least-squares multiple regression solved in closed form.

    The coefficients are the normal-equation solution (X^T X)^-1 X^T y,
    evaluated through the Moore-Penrose pseudo-inverse so that collinear
    features do not break the fit. After fit() the ``coefficients``
    attribute holds the intercept at index 0 followed by one weight per
    feature.
    '''
    def __init__(self):
        # Empty until fit() is called; afterwards [bias, w_1, ..., w_n].
        self.coefficients = []

    def fit(self, X, y):
        '''
        Used to calculate the coefficients of the multiple regression model.

        :param X: array-like of shape (n_samples, n_features); a 1-D input
                  is treated as n_samples observations of a single feature
        :param y: array-like, true values (one per sample)
        :return: None
        '''
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim == 1:
            X = self._reshape_X(X)

        X = self._concatenate_ones(X)
        # pinv(X) equals (X^T X)^-1 X^T whenever X has full column rank, but
        # avoids explicitly inverting X^T X, which is ill-conditioned and
        # undefined for collinear features.
        self.coefficients = np.linalg.pinv(X).dot(y)

    def predict(self, test_data):
        '''
        Makes predictions using the fitted linear equation.

        :param test_data: array-like, either one sample of shape
                          (n_features,) or a batch of shape
                          (n_samples, n_features)
        :return: a single prediction for one sample, or an array with one
                 prediction per row for a batch
        '''
        coefficients = np.asarray(self.coefficients, dtype=float)
        bias, weights = coefficients[0], coefficients[1:]
        test_data = np.asarray(test_data, dtype=float)

        # A single-feature model handed a 1-D vector of several values can
        # only mean "several samples", so turn it into a column.
        if test_data.ndim == 1 and weights.shape[0] == 1 and test_data.shape[0] != 1:
            test_data = self._reshape_X(test_data)

        # The dot product handles one row and a whole batch alike, and it
        # raises on a feature-count mismatch instead of silently truncating.
        return test_data.dot(weights) + bias

    def mean_squared_error(self, y, y_pred):
        '''
        Computes the mean squared error between true and predicted values.

        :param y: array-like, true values
        :param y_pred: array-like, predicted values (same shape as y)
        :return: float, the average of the squared differences
        :raises ValueError: if the inputs are empty or their shapes differ
        '''
        y = np.asarray(y, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        if y.shape != y_pred.shape:
            # Refuse to broadcast or truncate: that would hide a real bug.
            raise ValueError(
                'y and y_pred must have the same shape, got %s and %s'
                % (y.shape, y_pred.shape)
            )
        if y.size == 0:
            raise ValueError('mean_squared_error requires at least one value')
        return np.mean((y - y_pred) ** 2, axis=0)

    def _reshape_X(self, X):
        '''Turns a 1-D vector into a single-column 2-D array.'''
        return X.reshape(-1, 1)

    def _concatenate_ones(self, X):
        '''Prepends a column of ones to X so the first coefficient acts as the intercept.'''
        ones = np.ones((X.shape[0], 1))
        return np.concatenate((ones, X), 1)

## 2. Load dataset, create model, display results

In [15]:
# Load diabetes dataset
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# Train Test split (20% held out; fixed random_state keeps the split reproducible)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate Multiple Regression model
model = MultipleRegression()

# Fit model on the training split only
model.fit(X_train, y_train)

# Predict one test row at a time and store each prediction value in a list
# (the earlier whole-matrix call assigned to an unused variable was removed)
y_pred = [model.predict(row) for row in X_test]

# Compare actual values with predictions
df = pd.DataFrame({
    'actual': y_test,
    'predicted': np.ravel(y_pred)
})
# Display DataFrame: a bare expression in the middle of a cell is discarded,
# so render it explicitly (display is provided by IPython/Jupyter)
display(df.head())

# Calculate Mean Squared Error
mse = model.mean_squared_error(y_test, y_pred)

# Print results
print('Coefficients:', model.coefficients)
print('Mean Squared Error:', mse)

Coefficients: [ 151.34565535   37.90031426 -241.96624835  542.42575342  347.70830529
 -931.46126093  518.04405547  163.40353476  275.31003837  736.18909839
   48.67112488]
Mean Squared Error: 2900.1732878832368
