In [67]:
import pandas as pd

# Reading the data as a pandas DataFrame
insurance_df =  pd.read_csv('insurance.csv')


In [68]:
insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


### **Preprocessing**

In [69]:
insurance_df['sex'] = insurance_df['sex'] == 'female'
insurance_df['smoker'] = insurance_df['smoker'] == 'yes'

insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,True,27.900,0,True,southwest,16884.92400
1,18,False,33.770,1,False,southeast,1725.55230
2,28,False,33.000,3,False,southeast,4449.46200
3,33,False,22.705,0,False,northwest,21984.47061
4,32,False,28.880,0,False,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,False,30.970,3,False,northwest,10600.54830
1334,18,True,31.920,0,False,northeast,2205.98080
1335,18,True,36.850,0,False,southeast,1629.83350
1336,21,True,25.800,0,False,southwest,2007.94500


In [70]:
# applying one-hot encoding to the region feature
insurance_df = pd.get_dummies(insurance_df, columns=['region'], prefix=['region'])

# making all data integers to deal with just numeric data
insurance_df = insurance_df.astype(int)
insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,1,27,0,1,16884,0,0,0,1
1,18,0,33,1,0,1725,0,0,1,0
2,28,0,33,3,0,4449,0,0,1,0
3,33,0,22,0,0,21984,0,1,0,0
4,32,0,28,0,0,3866,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30,3,0,10600,0,1,0,0
1334,18,1,31,0,0,2205,1,0,0,0
1335,18,1,36,0,0,1629,0,0,1,0
1336,21,1,25,0,0,2007,0,0,0,1


# Separating the target from the data

In [71]:
X = insurance_df.drop(columns=['charges'])
y = insurance_df['charges']

## Splitting the data to train and test sets

In [72]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=0)





# Normalizing
normalizing would help stabilize the training process and make the weights converge faster.

In [73]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on X_train and transform X_train
X_train_normalized = scaler.fit_transform(X_train)

# Transform X_test using the same scaler
X_test_normalized = scaler.transform(X_test)


# Training using ScikitLearn


In [74]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()

reg.fit(X_train_normalized,y_train)

train_accuracy = reg.score(X_train_normalized,y_train)
test_accuracy = reg.score(X_test_normalized,y_test)

print("train_accuracy: " + str(train_accuracy))
print("test_accuracy: "+ str(test_accuracy))


train_accuracy: 0.7371018840826646
test_accuracy: 0.7995787849960009


Defining a function to calculate the R squared metric to evaluate the model accuracy:


In [75]:
def r_squared(y_true, y_pred):
    # Calculate the mean of y_true
    mean_y = np.mean(y_true)

    # Calculate SSR (sum of squared residuals)
    ssr = np.sum((y_true - y_pred) ** 2)

    # Calculate SST (total sum of squares)
    sst = np.sum((y_true - mean_y) ** 2)

    # Calculate R-squared
    r2 = 1 - (ssr / sst)

    return r2

## Python form scratch implementation:

In [76]:
import numpy as np

# Linear Regression

class My_LinearRegression() :

    def __init__( self, learning_rate, iterations) :
        """This function initializes the object instance with arguments learning_rate, number of iterations"""

        self.learning_rate = learning_rate

        self.iterations = iterations

    def fit( self, X, Y ) :
        self.n = X.shape[0] #n: Number of trainig examples
        self.m = X.shape[1] #m: Number of features

        # weight initialization, initialize w,b to zeros
        self.W = np.zeros(self.m)
        self.b = 0

        self.X = X
        self.Y = Y

        # gradient descent learning
        for i in range( self.iterations ):
            self.update_weights()
        return self

    def update_weights( self ) :

        Y_pred = self.predict( self.X )

        # calculate gradients
        dW = -2 * (1/self.n) * np.dot(self.X.T, self.Y - Y_pred )

        db = -2 * (1/self.n) * np.sum(self.Y - Y_pred)

        self.W -= self.learning_rate * dW

        self.b -= self.learning_rate * db

        return self

    def predict( self, X ) :
        y_pred = np.dot(X,self.W) + self.b
        return y_pred

# Evaluating accuracy for the Python-from-scratch model

In [77]:
my_reg = My_LinearRegression(0.5,1000)
my_reg.fit(X_train_normalized,y_train)

train_score = r_squared(y_train,my_reg.predict(X_train_normalized))
y_pred = my_reg.predict(X_test_normalized)
test_score = r_squared(y_test,y_pred)

print('Train_Score: '+str(train_score))
print('Test_Score: '+str(test_score))

Train_Score: 0.7371018840826645
Test_Score: 0.7995787849960009


Both the scikit learn and the from-scratch implementations gave the same accuracy because they both follow the basic rules of linear regression models