# Multiple Linear Regression

# Import libraries

In [92]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

## Import dataset

In [93]:
df = pd.read_csv('50_Startups.csv')

# Independent variables
X = df.iloc[:, :-1].values
# Dependent variable
y = df.iloc[:, -1].values

## Preprocessing data

In [94]:
# Check null
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [95]:
# Check not a number
df.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [96]:
# If column has value equal 0, we fill the zero by the mean
# for col in df.select_dtypes(include='float64').columns:
#     for i in range(len(df[col])):
#         if df[col][i] == 0:
#             df.loc[i, col] = df[col].mean()

for col in df.select_dtypes(include='float64'):
        df[col] = df[col].mask(df[col] == 0, df[col].mean())

In [97]:
print(df)

      R&D Spend  Administration  Marketing Spend       State     Profit
0   165349.2000       136897.80      471784.1000    New York  192261.83
1   162597.7000       151377.59      443898.5300  California  191792.06
2   153441.5100       101145.55      407934.5400     Florida  191050.39
3   144372.4100       118671.85      383199.6200    New York  182901.99
4   142107.3400        91391.77      366168.4200     Florida  166187.94
5   131876.9000        99814.71      362861.3600    New York  156991.12
6   134615.4600       147198.87      127716.8200  California  156122.51
7   130298.1300       145530.06      323876.6800     Florida  155752.60
8   120542.5200       148718.95      311613.2900    New York  152211.77
9   123334.8800       108679.17      304981.6200  California  149759.96
10  101913.0800       110594.11      229160.9500     Florida  146121.95
11  100671.9600        91790.61      249744.5500  California  144259.40
12   93863.7500       127320.38      249839.4400     Florida  14

## Encode categorical data

In [98]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(transformers=[
        ('encoder', OneHotEncoder(), [3])
], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [99]:
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

## Splitting the dataset into Training set and Test set

In [100]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Training the Multiple Linear Regression model on the Training set

In [101]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Predicting the test set result

In [102]:
y_pred = regressor.predict(X_test)

In [103]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_pred),1)), axis=1))

[[104282.76 103282.38]
 [132536.88 144259.4 ]
 [133910.85 146121.95]
 [ 72584.77  77798.83]
 [179920.93 191050.39]
 [114549.31 105008.31]
 [ 66444.43  81229.06]
 [ 98404.97  97483.56]
 [114499.83 110352.25]
 [169367.51 166187.94]
 [ 96522.63  96778.92]
 [ 88040.67  96479.51]
 [110949.99 105733.54]
 [ 90419.19  96712.8 ]
 [128020.46 124266.9 ]]
