# Multiple Linear Regression

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Multiple linear regression formula
# y = b0 + b1*x1 + b2*x2 + ... + bn*xn + D
# where,
# y = dependent variable
# xi = explanatory variables (i = 1..n)
# b0 = y-intercept
# bi = slope coefficient for explanatory variable xi
# D = model's error term (residuals)

## Importing the dataset

In [None]:
# Read the CSV into a DataFrame.
dataset = pd.read_csv('50_Startups.csv')
# Features are every column except the last; the target is the last column.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values
print(X)

## Encoding categorical data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column 3 holds the non-numerical data, so it is one-hot encoded through a
# ColumnTransformer; every other column is passed through unchanged.
# Feature scaling is not applied here: it is not needed for Multiple Linear Regression.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
# Fit the encoder on X and replace X with the encoded matrix as a NumPy array.
X = np.array(ct.fit_transform(X))

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [None]:
from sklearn.linear_model import LinearRegression
# Create the model, then fit it on the training split:
# X_train holds the features, y_train the labels.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [9]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
# Show predictions (left column) next to the true test values (right column);
# column_stack turns each 1-D vector into one column of a 2-D array.
print(np.column_stack((y_pred, y_test)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


In [None]:
# Making a single prediction based on manually entered data.
# predict() expects a 2D array with one row per sample. The row must follow the
# layout of the encoded training matrix: the one-hot state columns first
# (ColumnTransformer puts its transformed columns before the passthrough ones),
# then R&D Spend, Administration, Marketing Spend.
# The state part must be a valid one-hot vector: only 0s and exactly one 1.
# NOTE(review): assumes the State column has three categories, giving 3 + 3 = 6
# model inputs -- confirm against X.shape[1]. The state chosen here is illustrative.
state_one_hot = [0, 0, 1]
spend = [12000, 1000, 4000000]  # R&D Spend, Administration, Marketing Spend
print(regressor.predict([state_one_hot + spend]))

We use the `regressor.predict` method and pass it a 2D array with one row per sample, laid out as [[STATE (one-hot encoded, one binary column per state), R&D Spend, Administration, Marketing Spend]].