In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
data.shape

(1338, 7)

In [5]:
# Assign numerical values for categorical data
# [sex] --> 1: male, 0: female
# [smoker] --> 1: yes, 0: no
# [region] --> 1: southwest, 2: southeast, 3: northwest, 4: northeast

data['sex'] = data['sex'].apply({'male': 1, 'female': 0}.get)
data['smoker'] = data['smoker'].apply({'yes': 1, 'no': 0}.get)
data['region'] = data['region'].apply({'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4}.get)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,1,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,3,21984.47061
4,32,1,28.88,0,0,3,3866.8552


In [6]:
# function for running s-folds cross validation

def s_folds(data, num_folds):

    rmse = []
    data_np = data.to_numpy()

    for i in range(20):

        # shuffle data
        np.random.seed(i)
        np.random.shuffle(data_np)

        S = num_folds  

        fold_size = int(len(data_np) / S)

        for current_fold in range(S):

            # create start and end points to cycle through each s-fold
            start = current_fold * fold_size
            end = start + fold_size
            
            validationData = data_np[start:end]
            trainingData = np.concatenate((data_np[:start], data_np[end:]))

            # split data into training and validation
            x_train, y_train = trainingData[:, :-1], trainingData[:, -1:]
            x_valid, y_valid = validationData[:, :-1], validationData[:, -1:]

            # add bias feature
            x_train = np.hstack((np.ones((x_train.shape[0], 1)), x_train))
            x_valid = np.hstack((np.ones((x_valid.shape[0], 1)), x_valid))

            # compute linear regression
            coef = np.linalg.inv(x_train.T @ x_train) @ (x_train.T @ y_train)
            y_valid_pred = x_valid @ coef

            rmse_validaation = np.sqrt(np.mean((y_valid - y_valid_pred)**2))

        rmse_validaation / S

        rmse.append(rmse_validaation)

    return [np.mean(rmse), np.std(rmse)]


In [7]:
# display results of different s-fold runs

s3 = s_folds(data, 3)  
s223 = s_folds(data, 223)
sN = s_folds(data, len(data))

print('S = 3 Mean RMSE: {} and std: {}'.format(s3[0], s3[1]))
print('S = 223 Mean RMSE: {} and std: {}'.format(s223[0], s223[1]))
print('S = N Mean RMSE: {} and std: {}'.format(sN[0], sN[1]))

S = 3 Mean RMSE: 6193.535734413218 and std: 255.90795638679717
S = 223 Mean RMSE: 4974.015955045201 and std: 1742.9070778974594
S = N Mean RMSE: 3373.110297886759 and std: 3900.910320187101
