In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the insurance records; the relative path is resolved against the
# kernel's current working directory.
dataset = pd.read_csv(filepath_or_buffer='insurance.csv')

# Preview the first five rows (five is the default for head()).
dataset.head(n=5)

In [None]:
# Check for missing values in the dataset.
# Jupyter only echoes the LAST expression of a cell, so a bare
# `dataset.isnull().sum()` here would be computed and then thrown away.
# Print it explicitly so the per-column null counts actually show up.
print(dataset.isnull().sum())

# Visualize the distribution of charges.
# displot is figure-level (it creates its own figure), so plt.title targets
# the axes of the figure it just created.
sns.displot(dataset, x="charges", color = 'b')
plt.title('Distribution of charges')
plt.show()

In [None]:
# Charges grouped by number of children, with one box per sex.
# Title the Axes that seaborn returns rather than going through pyplot's
# implicit "current axes"; the trailing semicolon hides the Text repr.
box_ax = sns.boxplot(data=dataset, x='children', y='charges', hue='sex', palette='rainbow')
box_ax.set_title('Box plot of charges vs children');

In [None]:
# One-hot encode the categorical features. drop_first=True drops one level per
# feature, which avoids the dummy-variable trap (the dummies of a feature
# would otherwise sum to the intercept column).
categorical_columns = ['sex', 'children', 'smoker', 'region']

# Build the encoded frame in a single chain: encode first, then replace
# `charges` with its natural log. assign() overwrites the existing column in
# place, so the column order is the same as after get_dummies.
dataset_encode = pd.get_dummies(
    dataset,
    columns=categorical_columns,
    prefix='ENCODED',
    prefix_sep='_',
    drop_first=True,
    dtype='int8',
).assign(charges=lambda frame: np.log(frame['charges']))

# Train/test split
from sklearn.model_selection import train_test_split
x = dataset_encode.drop('charges',axis=1) # Independent variables (features)
y = dataset_encode['charges'] # Dependent variable (target)

# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All reproduces the same split, and therefore the same R^2 and
# MSE, instead of drawing a new random split on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Add x0 = 1 to the dataset: prepend a column of ones so the first
# coefficient of the fitted parameter vector acts as the intercept.
x_train_0 = np.c_[np.ones((x_train.shape[0], 1)), x_train]
x_test_0 = np.c_[np.ones((x_test.shape[0], 1)), x_test]

# Build the model: ordinary least squares, i.e. the theta that satisfies the
# normal equation (x^T * x) * theta = x^T * y.
# Solve the least-squares problem directly instead of forming inv(x^T * x):
# the explicit inverse amplifies rounding error and raises LinAlgError when
# x^T * x is singular (e.g. a dummy column that is all zeros in this split).
# lstsq returns the minimum-norm solution in that case and the same theta,
# up to floating-point rounding, when the matrix is well conditioned.
theta = np.linalg.lstsq(x_train_0, y_train.to_numpy(), rcond=None)[0]

# Evaluate the fitted coefficients on the held-out rows.
# `charges` was log-transformed before the split, so the predictions and both
# metrics below are on the log(charges) scale.
y_predition = x_test_0 @ theta

# R^2 (coefficient of determination) = 1 - SSE / SST, where SSE is the sum of
# squared residuals and SST the total sum of squares around the test mean.
residuals = y_predition - y_test
deviations_from_mean = y_test - y_test.mean()
SSE = np.sum(residuals ** 2)
SST = np.sum(deviations_from_mean ** 2)
R_square = 1 - SSE / SST

print("The R square value is :", R_square)

# Mean squared error: the same sum of squared residuals, averaged over the
# number of test rows.
n_test_rows = x_test.shape[0]
MSE = SSE / n_test_rows

print("The Mean Square Error :", MSE)