# Multiple Linear Regression: Sklearn

Note: 

- Before using sklearn, first check the p-values and further regression statistics with statsmodels.
- Checking the model assumptions and data preprocessing are not covered in this notebook.

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and the
# old names were removed in later releases; use the new name when it exists and
# fall back to the legacy one on older installations.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

## Load data

In [2]:
# Remote CSV with the startup data; the preview below shows its columns
# (R&D Spend, Administration, Marketing Spend, State, Profit).
url = "https://raw.githubusercontent.com/mk-gurucharan/Regression/master/Startups_Data.csv"

# Read the CSV straight from the URL into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# List the distinct categories of the "State" column (the feature that gets one-hot encoded later).
df.loc[:, "State"].unique()

array(['New York', 'California', 'Florida'], dtype=object)

## Declare features and targets

In [4]:
# Positional split of the frame: every column except the last is a feature,
# the last column is the target. Both are converted to NumPy arrays.
X, y = df.iloc[:, :-1].to_numpy(), df.iloc[:, -1].to_numpy()

# Preview the first five feature rows (the array mixes numbers and strings).
X[:5]

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida']], dtype=object)

## One-Hot-Encoding of categorical data

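(Note: to avoid perfect multicollinearity in the model (the "dummy variable trap"), you should drop one of the dummy variables, e.g. the one with the most 1's. The code below keeps all three dummy columns.)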

In [5]:
# One-hot encode the categorical "State" feature, which sits at column index 3
# of X; all other columns are passed through unchanged and end up AFTER the
# dummy columns in the result.
# NOTE: X is overwritten here, so this cell must not be re-run on its own --
# after the first run column 3 no longer holds the state labels.
ct = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(), [3]),
    ],
    remainder="passthrough",
)
X = np.array(ct.fit_transform(X))

# Preview the first five encoded rows.
X[:5]

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42]], dtype=object)

## Train-Test Split

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: number of training rows, number of test rows, and training share.
n_train, n_test = len(X_train), len(X_test)
print(n_train)
print(n_test)
print(n_train / len(X))

40
10
0.8


## Build and train model

In [7]:
# Instantiate an ordinary least squares regressor and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator.
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Get model parameters

In [8]:
# Report the fitted intercept (bias), the coefficient of every feature column,
# and the goodness of fit (R-squared and adjusted R-squared).
bias = regressor.intercept_
print("Bias: {:.2f}".format(bias))

# One weight per column of the encoded feature matrix, in column order.
weights = regressor.coef_
for i, weight in enumerate(weights):
    print("Weight {}: {:.2f}".format(i, weight))

# R-squared is computed on the training split, i.e. the sample the model was
# actually fitted on. Scoring on the full X would mix fitted rows with the
# held-out test rows and yield a number that is neither a fit statistic nor
# an out-of-sample score.
R2 = regressor.score(X_train, y_train)
print("R2: {:.2f}".format(R2))

# Adjusted R-squared penalises R-squared for the number of predictors:
#   1 - (1 - R2) * (n - 1) / (n - p - 1)
# with n = number of fitted observations and p = number of feature columns,
# so both are taken from the training matrix as well.
n_obs, n_features = X_train.shape
adj_R2 = 1 - (1 - R2) * (n_obs - 1) / (n_obs - n_features - 1)
print("Adj-R2: {:.2f}".format(adj_R2))

Bias: 54343.30
Weight 0: -315.26
Weight 1: 623.53
Weight 2: -308.27
Weight 3: 0.81
Weight 4: -0.07
Weight 5: 0.03
R2: 0.95
Adj-R2: 0.94


## Make prediction

In [9]:
# Predict the held-out rows and put actuals, predictions and the absolute
# percentage error of each prediction side by side.
yhat = regressor.predict(X_test)

# Absolute deviation of each prediction from its actual value, in percent of the actual.
pct_error = np.abs((yhat - y_test) / y_test * 100)

eval_df = pd.DataFrame(
    {
        "Actuals": y_test,
        "Predictions": yhat,
        "%-Error": pct_error,
    }
)
eval_df

Unnamed: 0,Actuals,Predictions,%-Error
0,134307.35,126362.879083,5.915142
1,81005.76,84608.453836,4.447454
2,99937.59,99677.494252,0.260258
3,64926.08,46357.460686,28.599631
4,125370.37,128750.482885,2.696102
5,35673.41,50912.417419,42.718113
6,105733.54,109741.350327,3.790482
7,107404.34,100643.242816,6.294994
8,97427.84,97599.275746,0.175962
9,122776.86,113097.425244,7.883761


## Evaluate model

In [10]:
# Average the "%-Error" column; since that column holds absolute values,
# this is the mean absolute percentage error over the test rows.
eval_df["%-Error"].mean()

10.278189896423983