In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [2]:
"""
The dataset consists of age, sex, BMI(body mass index), children, 
smoker and region feature, which are independent and charge as a dependent feature.
We will predict individual medical costs billed by health insurance.
"""
df = pd.read_csv('insurance.csv')
print(df)

      age     sex   bmi  children smoker     region  expenses
0      19  female  27.9         0    yes  southwest  16884.92
1      18    male  33.8         1     no  southeast   1725.55
2      28    male  33.0         3     no  southeast   4449.46
3      33    male  22.7         0     no  northwest  21984.47
4      32    male  28.9         0     no  northwest   3866.86
...   ...     ...   ...       ...    ...        ...       ...
1333   50    male  31.0         3     no  northwest  10600.55
1334   18  female  31.9         0     no  northeast   2205.98
1335   18  female  36.9         0     no  southeast   1629.83
1336   21  female  25.8         0     no  southwest   2007.95
1337   61  female  29.1         0    yes  northwest  29141.36

[1338 rows x 7 columns]


In [3]:
print('\nNumber of rows and columns in the data set: ',df.shape)


Number of rows and columns in the data set:  (1338, 7)


In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [None]:
df.describe()

In [5]:
# Check for missing value
missing_values = df.isnull().sum()
print(missing_values)  # This will show the count of missing values in each column

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64


In [6]:
# ------ features and output (label) ------
input_df = df.drop(columns='expenses')
target_df = df.expenses

In [7]:
input_df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest
1334,18,female,31.9,0,no,northeast
1335,18,female,36.9,0,no,southeast
1336,21,female,25.8,0,no,southwest


In [8]:
target_df

0       16884.92
1        1725.55
2        4449.46
3       21984.47
4        3866.86
          ...   
1333    10600.55
1334     2205.98
1335     1629.83
1336     2007.95
1337    29141.36
Name: expenses, Length: 1338, dtype: float64

In [9]:
from sklearn.preprocessing import LabelEncoder

# Define the columns for encoding
columns_to_encode = ["sex", "smoker", "region"]  # Replace with your column names

# Create a LabelEncoder object
le = LabelEncoder()

# Loop through each column and encode
for col in columns_to_encode:
  encoded_data = le.fit_transform(input_df[col])
  input_df["encoded_" + col] = encoded_data

In [10]:
input_df

Unnamed: 0,age,sex,bmi,children,smoker,region,encoded_sex,encoded_smoker,encoded_region
0,19,female,27.9,0,yes,southwest,0,1,3
1,18,male,33.8,1,no,southeast,1,0,2
2,28,male,33.0,3,no,southeast,1,0,2
3,33,male,22.7,0,no,northwest,1,0,1
4,32,male,28.9,0,no,northwest,1,0,1
...,...,...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,1,0,1
1334,18,female,31.9,0,no,northeast,0,0,0
1335,18,female,36.9,0,no,southeast,0,0,2
1336,21,female,25.8,0,no,southwest,0,0,3


In [11]:
# Define the columns to drop
columns_to_drop = ["sex", "smoker", "region"]  # Replace with your column names

# Drop the columns
input_df = input_df.drop(columns_to_drop, axis=1)

In [12]:
input_df

Unnamed: 0,age,bmi,children,encoded_sex,encoded_smoker,encoded_region
0,19,27.9,0,0,1,3
1,18,33.8,1,1,0,2
2,28,33.0,3,1,0,2
3,33,22.7,0,1,0,1
4,32,28.9,0,1,0,1
...,...,...,...,...,...,...
1333,50,31.0,3,1,0,1
1334,18,31.9,0,0,0,0
1335,18,36.9,0,0,0,2
1336,21,25.8,0,0,0,3


In [19]:
from sklearn.preprocessing import MinMaxScaler

# Specify the column name for scaling
column_name = "expenses"  # Replace with the actual column name

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit the scaler on the column data (learn range)
scaler.fit(df[[column_name]])  # Double square brackets to select a DataFrame

# Transform the column data using the fitted scaler
scaled_data = scaler.transform(df[[column_name]])


In [20]:
target_df = scaled_data

In [21]:
target_df

array([[0.25161073],
       [0.00963598],
       [0.05311519],
       ...,
       [0.00810809],
       [0.01414366],
       [0.44724875]])

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(input_df, target_df, test_size=0.2, random_state=42)

In [23]:
#For linear regression, Y=the value we want to predict
#X= all independent variables upon which Y depends. 
#3 steps for linear regression....
#Step 1: Create the instance of the model
#Step 2: .fit() to train the model or fit a linear model
#Step 3: .predict() to predict Y for given X values. 

In [24]:
#object
reg = LinearRegression()
#fit
reg.fit(X_train, Y_train)

In [26]:
prediction_test = reg.predict(X_test)

In [27]:
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import median_absolute_error
import matplotlib.pyplot as plt

In [28]:
prediction_test = reg.predict(X_test)    
print(Y_test, prediction_test)
#----------------------------------------------------
#Calculating Mean Absolute Error
MAEValue = mean_absolute_error(Y_test, prediction_test, multioutput='uniform_average') # it can be raw_values
print('Mean Absolute Error Value is : ', MAEValue)

#----------------------------------------------------
#Calculating Mean Squared Error
MSEValue = mean_squared_error(Y_test, prediction_test, multioutput='uniform_average') # it can be raw_values
print('Mean Squared Error Value is : ', MSEValue)

#----------------------------------------------------
#Calculating Median Squared Error
MdSEValue = median_absolute_error(Y_test, prediction_test)
print('Median Squared Error Value is : ', MdSEValue )

[[1.27268687e-01]
 [6.62474924e-02]
 [4.50275473e-01]
 [1.30569960e-01]
 [5.20816759e-01]
 [5.45006940e-02]
 [1.58897507e-02]
 [2.08922120e-01]
 [4.16731047e-02]
 [1.45934240e-01]
 [2.73547389e-01]
 [9.79248366e-02]
 [4.51014357e-02]
 [7.18759537e-01]
 [7.59022873e-01]
 [6.87657945e-01]
 [1.38535028e-01]
 [6.67979918e-01]
 [1.13509872e-01]
 [3.29655622e-01]
 [6.31815001e-02]
 [1.00874306e-01]
 [2.14577957e-03]
 [2.60684364e-02]
 [1.59041485e-01]
 [1.56461058e-01]
 [1.83926334e-01]
 [2.82255171e-01]
 [1.37177455e-01]
 [1.53874247e-04]
 [2.34753201e-01]
 [1.71125242e-01]
 [1.43447830e-02]
 [7.29715096e-02]
 [2.84479005e-02]
 [1.00984444e-01]
 [2.35585622e-02]
 [9.92182422e-02]
 [3.63388879e-01]
 [5.99970853e-01]
 [5.69195844e-02]
 [2.42846763e-02]
 [1.68435795e-01]
 [1.75632449e-01]
 [6.01471127e-02]
 [1.78965965e-01]
 [3.92341021e-02]
 [5.21924207e-02]
 [6.54486711e-01]
 [5.33346656e-02]
 [2.03760948e-01]
 [9.53844749e-03]
 [4.36639884e-01]
 [9.37068625e-03]
 [1.51198368e-01]
 [3.864647