# Multiple Linear Regression

## Data pre-processing

Import libraries

In [12]:
import pandas as pd
import numpy as np
# Matplotlib for plotting data
import matplotlib.pyplot as plt
# Statsmodels for optimizing the multiple linear regression model
import statsmodels.api as sm
# Sklearn (split the dataset)
from sklearn.model_selection import train_test_split
# Sklearn (encode categorical data)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# Sklearn (linear regression model)
from sklearn.linear_model import LinearRegression

Import dataset

In [13]:
# Load the data set. Every column except the last is used as a feature and the
# last column is the target, so the two slices stay consistent with each other
# (the target was previously picked by the hard-coded position 4).
dataset = pd.read_csv('../../data/50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

Encode categorical data

In [14]:
# One-hot encode the categorical feature in column 3 and pass the remaining
# columns through unchanged. OneHotEncoder accepts the raw category values
# directly, so no separate LabelEncoder pass (or in-place edit of X) is needed.
ct_X = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [3])],
    remainder='passthrough',
)
X = np.array(ct_X.fit_transform(X), dtype=float)

# Drop the first dummy column to avoid the dummy variable trap (the full set
# of dummies is perfectly collinear with an intercept term).
X = X[:, 1:]

Split dataset into random train and test subsets

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Building model

Create a linear regression model with the training set

In [16]:
# Baseline model: ordinary linear regression on all encoded features.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

### Improving results by optimising the multiple linear regression model

Build the optimal multiple linear regression model using backward elimination:
- Add a column of ones to the features so that the model includes an intercept (constant) term

In [17]:
# Prepend a column of ones so the statsmodels OLS fit includes an intercept
# (sm.OLS does not add one by itself). The row count is taken from X instead
# of being hard-coded, so this works for any data set size.
# NOTE: this cell rebinds X; running it twice prepends a second column of ones.
X = np.append(arr=np.ones((X.shape[0], 1)), values=X, axis=1)

- Function implementing the backward elimination method based on p-values

In [18]:
def backwardElimination(x, sl, target=None):
    """Select predictors by backward elimination on OLS p-values.

    Repeatedly fits an ordinary least squares model of the target on ``x``
    and removes the single column with the highest p-value, until the highest
    remaining p-value is at or below ``sl`` (or no columns are left). The
    summary of the final fitted model is printed.

    Parameters
    ----------
    x : numpy.ndarray, 2-D, numeric
        Design matrix with one column per candidate variable. No intercept is
        added here; if ``x`` contains a constant column it is a candidate for
        removal like any other column.
    sl : float
        Significance level; a column is removed while its p-value exceeds it.
    target : 1-D array-like, optional
        Dependent variable. Defaults to the notebook-level ``y`` so existing
        two-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        ``x`` restricted to the columns that survived elimination.
    """
    if target is None:
        # Fall back to the notebook-level target for backward compatibility.
        target = y
    x = np.asarray(x)
    while x.shape[1] > 0:
        regressor_OLS = sm.OLS(target, x.astype(float)).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        # Remove exactly one column per fit: the p-values are only valid for
        # the model just fitted, and column positions shift after a deletion.
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= sl:
            # Every remaining variable is significant: report this model.
            print(regressor_OLS.summary())
            break
        x = np.delete(x, worst, axis=1)
    return x

- Optimize the set of independent variables with the function

In [19]:
# Significance level used as the p-value cut-off during elimination.
SL = 0.05
# Start from columns 0-5 of X (column 0 is the column of ones added above).
X_opt = X[:, list(range(6))]
# Run the elimination, then drop column 0 of the result. This assumes the
# column of ones survived elimination; it is not needed afterwards because
# LinearRegression fits its own intercept.
X_Modeled = backwardElimination(X_opt, SL)[:, 1:]

- Build optimized model

In [20]:
# Split the reduced feature matrix with the same test size and seed as the
# baseline split. Note that y_train and y_test are reassigned here.
X_opt_train, X_opt_test, y_train, y_test = train_test_split(
    X_Modeled,
    y,
    test_size=0.2,
    random_state=0,
)
# Fit a second linear regression on the selected features only.
opt_model = LinearRegression()
opt_model.fit(X=X_opt_train, y=y_train)

## Evaluate the results

Predict on the test set with both the baseline model and the optimized model

In [21]:
# Baseline predictions use the full test features; the optimized model is
# given the reduced test features it was trained on.
y_pred = model.predict(X=X_test)
y_opt_pred = opt_model.predict(X=X_opt_test)

Preview of results

In [22]:
# Side-by-side comparison of the true test targets with both models'
# predictions, rounded to two decimals for readability.
results = pd.DataFrame(
    data={
        'Test': y_test,
        'Prediction': np.round(y_pred, 2),
        'Opt prediction': np.round(y_opt_pred, 2),
    },
)

display(results)

Unnamed: 0,Test,Prediction,Opt prediction
0,103282.38,103015.2,104667.28
1,144259.4,132582.28,134150.83
2,146121.95,132447.74,135207.8
3,77798.83,71976.1,72170.54
4,191050.39,178537.48,179090.59
5,105008.31,116161.24,109824.77
6,81229.06,67851.69,65644.28
7,97483.56,98791.73,100481.43
8,110352.25,113969.44,111431.75
9,166187.94,167921.07,169438.15
