# Multiple Linear Regression

## Importing the libraries

In [134]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

## Importing the dataset

In [135]:
# Read the insurance table from disk.
dataset = pd.read_csv('insurance.csv')

# Feature matrix = every column except the last; target vector = the last column.
X, y = dataset.iloc[:, :-1].to_numpy(), dataset.iloc[:, -1].to_numpy()

# Bare last expression so the notebook renders the frame.
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## Encoding categorical data in the independent variables

In [136]:
# One-hot encode the categorical feature columns at positions 1, 4 and 5 of the
# raw feature matrix (sex, smoker and region in the table shown above).
# remainder='passthrough' keeps the other columns, placed after the encoded ones.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1, 4, 5])],
    remainder='passthrough',
    sparse_threshold=0,  # always produce a dense result so np.array(...) yields a 2-D matrix
)

# Build the input from `dataset` instead of from `X` itself: the original
# `X = f(X)` form was not idempotent, so re-running this cell would try to
# re-encode the already-encoded matrix.
X = np.array(ct.fit_transform(dataset.iloc[:, :-1].values))
print(pd.DataFrame(X))



       0    1    2    3    4    5    6    7   8       9  10
0     1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  19    27.9  0
1     0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  18   33.77  1
2     0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  28    33.0  3
3     0.0  1.0  1.0  0.0  0.0  1.0  0.0  0.0  33  22.705  0
4     0.0  1.0  1.0  0.0  0.0  1.0  0.0  0.0  32   28.88  0
...   ...  ...  ...  ...  ...  ...  ...  ...  ..     ... ..
1333  0.0  1.0  1.0  0.0  0.0  1.0  0.0  0.0  50   30.97  3
1334  1.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  18   31.92  0
1335  1.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  18   36.85  0
1336  1.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  21    25.8  0
1337  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  61   29.07  0

[1338 rows x 11 columns]


## Splitting the dataset into the Training set and Test set

In [137]:

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Training the Multiple Linear Regression model on the Training set

In [138]:
# Fit a linear regression model on the training split.
# LinearRegression is imported in the imports cell at the top of the notebook.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [139]:
# Print floats with two decimals and no scientific notation (global NumPy print setting).
np.set_printoptions(precision=2, suppress=True)

# Predict on the held-out rows, then print the predictions (left column)
# side by side with the actual target values (right column).
y_pred = regressor.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[ 8969.55  9095.07]
 [ 7068.75  5272.18]
 [36858.41 29330.98]
 [ 9454.68  9301.89]
 [26973.17 33750.29]
 [10864.11  4536.26]
 [  170.28  2117.34]
 [16903.45 14210.54]
 [ 1092.43  3732.63]
 [11218.34 10264.44]
 [28101.68 18259.22]
 [ 9377.73  7256.72]
 [ 5263.06  3947.41]
 [38416.04 46151.12]
 [40255.82 48673.56]
 [37098.25 44202.65]
 [15240.39  9800.89]
 [35912.88 42969.85]
 [ 9112.52  8233.10]
 [31461.92 21774.32]
 [ 3847.69  5080.10]
 [10130.12  7441.50]
 [ 2370.54  1256.30]
 [ 7140.22  2755.02]
 [11301.77 11085.59]
 [12961.65 10923.93]
 [14509.47 12644.59]
 [ 6159.90 18804.75]
 [ 9963.86  9715.84]
 [ 2177.86  1131.51]
 [ 9115.94 15828.82]
 [13073.69 11842.62]
 [ 4561.82  2020.55]
 [ 3408.21  5693.43]
 [ 4459.81  2904.09]
 [13032.07  7448.40]
 [ 1979.99  2597.78]
 [ 8813.28  7337.75]
 [33271.29 23887.66]
 [32585.52 38709.18]
 [ 3908.76  4687.80]
 [ 4326.11  2643.27]
 [14142.81 11674.13]
 [11423.45 12124.99]
 [ 8774.14  4890.00]
 [12097.28 12333.83]
 [ 5281.57  3579.83]
 [ 3150.56  4