In [3]:
# To perform linear regression on the insurance dataset,
# we first need to preprocess the data by converting the categorical variables (sex, smoker, and region) into numerical variables using one-hot encoding.
# We also need to split the data into training and testing sets.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset
# Reads 'insurance.csv' (path is relative to the current working directory)
# into a DataFrame; later cells use its 'charges' column as the regression
# target and the remaining columns as features.
insurance_df = pd.read_csv('insurance.csv')

In [5]:
from sklearn.model_selection import train_test_split

# Features: every column except the target, with the non-numeric columns
# expanded into one-hot indicator columns by get_dummies.
X = pd.get_dummies(insurance_df.drop(columns=['charges']))
# Target: the value the model is trained to predict.
y = insurance_df['charges']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Once we have preprocessed and split the data, 
# we can now perform linear regression using scikit-learn's 'LinearRegression' class.

In [7]:
# Linear regression
# Fit an ordinary least-squares model on the training split only, so the
# test split stays unseen until the evaluation step.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()  # estimator created with its default settings
lr.fit(X_train, y_train)


In [8]:
# After training the model,
# we can make predictions on the test set and evaluate the performance of the model
# using mean absolute error (MAE), root mean squared error (RMSE), and mean squared error (MSE).

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict charges for the held-out test rows.
y_pred = lr.predict(X_test)

# Compute each metric exactly once. RMSE is derived from the stored MSE
# instead of a second mean_squared_error call, so the two reported values
# always come from the same computation.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Labels and ordering are kept as before so the printed output is unchanged.
print("MAE: ", mae)
print("RMSE: ", rmse)
print("MSE: ", mse)


MAE:  4181.194473753647
RMSE:  5796.2846592762735
MSE:  33596915.85136147


In [10]:
# The model has a mean absolute error of 4181.19, 
# a root mean squared error of 5796.28, and
# a mean squared error of 33596915.85. 
# These evaluation metrics tell us how well the model is performing in terms of the difference between the predicted charges and the actual charges.