In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the insurance dataset from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='insurance.csv')
data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# (number of rows, number of columns) of the loaded frame
data.shape

(1338, 7)

In [5]:
# Assign numerical values for categorical data
# [sex] --> 1: male, 0: female
# [smoker] --> 1: yes, 0: no
# [region] --> 1: southwest, 2: southeast, 3: northwest, 4: northeast
# NOTE(review): region is nominal, so a single integer code gives it an
# artificial ordering when used as one regression feature -- consider one-hot
# encoding instead.

category_codes = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'yes': 1, 'no': 0},
    'region': {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4},
}

for column, codes in category_codes.items():
    # Values that are already codes pass through unchanged, so running this
    # cell a second time is a no-op instead of turning every entry into None.
    encoded = data[column].map(lambda value, codes=codes: codes.get(value, value))

    # Fail loudly on anything that is neither a known label nor a known code
    # (including missing values) rather than letting it reach the regression.
    unexpected = set(encoded.unique()) - set(codes.values())
    if unexpected:
        raise ValueError(
            f"Unexpected values in column {column!r}: {sorted(map(repr, unexpected))}"
        )

    data[column] = encoded.astype('int64')

data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,1,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,3,21984.47061
4,32,1,28.88,0,0,3,3866.8552


In [7]:
# Shuffle the rows reproducibly, then split them into training and validation sets.

np.random.seed(0)  # fixed seed so the shuffle below is the same on every run
data_np = data.to_numpy()
np.random.shuffle(data_np)  # in-place shuffle along the first axis (rows)

# The first two thirds of the rows (rounded up) are used for training,
# the remainder for validation.
trainNum = int(np.ceil(len(data_np) * (2/3)))
trainingData = data_np[:trainNum]
validationData = data_np[trainNum:]

# The last column is the target; every other column is a feature.
# The -1: slice keeps the targets two-dimensional (one column).
x_train = trainingData[:, :-1]
y_train = trainingData[:, -1:]
x_valid = validationData[:, :-1]
y_valid = validationData[:, -1:]


In [9]:
# Add a bias (intercept) column to the feature data and fit linear regression
# by least squares.

# The design matrices are rebuilt from trainingData / validationData instead of
# prepending to x_train / x_valid in place, so re-running this cell does not
# stack a second column of ones onto matrices that already carry one.
x_train = np.hstack((np.ones((trainingData.shape[0], 1)), trainingData[:, :-1]))
x_valid = np.hstack((np.ones((validationData.shape[0], 1)), validationData[:, :-1]))

# Solve min ||x_train @ coef - y_train||_2 directly. This avoids forming and
# explicitly inverting x_train.T @ x_train, which squares the condition number
# of the problem and raises LinAlgError when that matrix is singular.
# lstsq returns (solution, residuals, rank, singular_values); only the solution
# is needed here. y_train is a single column, so coef is a single column too.
coef, *_ = np.linalg.lstsq(x_train, y_train, rcond=None)
coef

array([[-12492.51213365],
       [   272.70267901],
       [  -128.56089024],
       [   287.34786078],
       [   518.96969654],
       [ 24057.32777122],
       [   312.97570871]])

In [10]:
# Predict the target on both splits with the fitted coefficients, then report
# RMSE and SMAPE for training and validation data.


def _rmse(actual, predicted):
    """Square root of the mean squared difference between actual and predicted."""
    return np.sqrt(np.mean((actual - predicted)**2))


def _smape(actual, predicted):
    """Average of |actual - predicted| / (|actual| + |predicted|) over all rows."""
    ratios = np.abs(actual - predicted) / (np.abs(actual) + np.abs(predicted))
    return (np.sum(ratios)) * (1 / len(actual))


y_train_pred = x_train @ coef
y_valid_pred = x_valid @ coef

# NOTE(review): the validation variable names are misspelled; they are kept
# as-is in case other cells refer to them.
rmse_training = _rmse(y_train, y_train_pred)
rmse_validaation = _rmse(y_valid, y_valid_pred)

smape_training = _smape(y_train, y_train_pred)
smape_validaiton = _smape(y_valid, y_valid_pred)

rmse_training, rmse_validaation, smape_training, smape_validaiton

(5757.954440690525, 6606.030095968491, 0.1807053406103454, 0.18320284506099282)