## 1. Import libraries and load the data set

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Read the startup data from the CSV file in the working directory.
dataset = pd.read_csv('50_Startups.csv')

# Every column except the last forms the feature matrix; the last column is the target.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
x, y = feature_frame.values, target_series.values

## 2. Encode the data and split into training/test data sets

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# One-hot encode column 3 of the raw feature matrix; all other columns pass through unchanged.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')

# Encode from `dataset` rather than from `x` itself: with `x = f(x)` a second execution of
# this cell would one-hot encode whatever now sits in column 3 of the already-encoded array
# and silently corrupt `x`. Reading from `dataset` keeps the cell safe to re-run.
x = np.array(ct.fit_transform(dataset.iloc[:, :-1].values))

# Hold out 35% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35, random_state=0)

## 3. Train the Multiple Linear Regression model

In [5]:
from sklearn.linear_model import LinearRegression

# The `3:` slice skips the first three columns of the encoded matrix, so only the
# remaining columns are used as predictors.
train_predictors = x_train[:, 3:]

# `fit` returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(train_predictors, y_train)

[[55493.95 103057.49 214634.81]
 [46014.02 85047.44 205517.64]
 [75328.87 144135.98 134050.07]
 [46426.07 157693.92 210797.67]
 [91749.16 114175.79 294919.57]
 [130298.13 145530.06 323876.68]
 [119943.24 156547.42 256512.92]
 [1000.23 124153.04 1903.93]
 [542.05 51743.15 0.0]
 [65605.48 153032.06 107138.38]
 [114523.61 122616.84 261776.23]
 [61994.48 115641.28 91131.24]
 [63408.86 129219.61 46085.25]
 [78013.11 121597.55 264346.06]
 [23640.93 96189.63 148001.11]
 [76253.86 113867.3 298664.47]
 [15505.73 127382.3 35534.17]
 [120542.52 148718.95 311613.29]
 [91992.39 135495.07 252664.93]
 [64664.71 139553.16 137962.62]
 [131876.9 99814.71 362861.36]
 [94657.16 145077.58 282574.31]
 [28754.33 118546.05 172795.67]
 [0.0 116983.8 45173.06]
 [162597.7 151377.59 443898.53]
 [93863.75 127320.38 249839.44]
 [44069.95 51283.14 197029.42]
 [77044.01 99281.34 140574.81]
 [134615.46 147198.87 127716.82]
 [67532.53 105751.03 304768.73]
 [28663.76 127056.21 201126.82]
 [78389.47 153773.43 299737.29]


## 4. Start predictions

In [6]:
# Predict on the held-out rows using the same column slice the model was trained on.
test_predictors = x_test[:, 3:]
y_pred = model.predict(test_predictors)

## 5. Display details

In [16]:
from sklearn.metrics import mean_squared_error
from itertools import zip_longest

# Report the fitted intercept (b0) followed by one coefficient per predictor (b1, b2, ...).
print('Coefficient b0: ', model.intercept_, '\n')

for position, coefficient in enumerate(model.coef_, start=1):
    print(f'Coefficient b{position}: ', coefficient, '\n')

# Actual vs. predicted targets side by side. Building the frame from a dict makes pandas
# raise on a length mismatch instead of zero-padding the shorter sequence, which would
# otherwise produce meaningless residuals.
df = pd.DataFrame({'y-value': y_test, 'y-predicted-value': y_pred})
df["residual_epsilon"] = (df['y-value'] - df['y-predicted-value'])

# Disabled: sum of squared errors on the test set (mean squared error times the row count).
# print('Least square error / Q(b0, b1): ', mean_squared_error(y_test, y_pred) * len(x_test), '\n')

# ColumnTransformer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back only on old releases.
if hasattr(ct, 'get_feature_names_out'):
    feature_names = ct.get_feature_names_out()
else:
    feature_names = ct.get_feature_names()

# Names of the first three output columns (the ones the model skips), then the raw test
# matrix and the prediction table.
print(pd.DataFrame(list(feature_names[:3])).transpose(), "\n")
print(pd.DataFrame(x_test), "\n")
print(df)

Coefficient b0:  42989.00816508669 

Coefficient b1:  0.7788410440821667 

Coefficient b2:  0.02939189909755259 

Coefficient b3:  0.0347102548694203 

                        0                    1                     2
0  encoder__x0_California  encoder__x0_Florida  encoder__x0_New York 

     0    1    2          3          4          5
0  0.0  1.0  0.0   66051.52  182645.56   118148.2
1  1.0  0.0  0.0  100671.96   91790.61  249744.55
2  0.0  1.0  0.0  101913.08  110594.11  229160.95
3  0.0  1.0  0.0   27892.92   84710.77  164470.71
4  0.0  1.0  0.0  153441.51  101145.55  407934.54
5  0.0  0.0  1.0    72107.6  127864.55  353183.81
6  0.0  0.0  1.0   20229.59   65947.93   185265.1
7  0.0  0.0  1.0   61136.38  152701.92   88218.23
8  0.0  1.0  0.0   73994.56  122782.75  303319.26
9  0.0  1.0  0.0  142107.34   91391.77  366168.42 

     y-value  y-predicted-value  residual_epsilon
0  103282.38      103901.896970       -619.516970
1  144259.40      132763.059931      11496.340069
2  146