In [1]:
!pip install scikit-learn
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm



In [3]:
# Load the insurance data; the relative path is resolved against the
# kernel's current working directory. The bare name on the last line
# displays the frame as the cell output.
health_df = pd.read_csv(filepath_or_buffer="Health_insurance.csv")
health_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of health_df.
health_df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [9]:
# Count the missing values in each column.
health_df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [11]:
# Feature groups: numeric columns to be standardised ...
cont_scal = [
    "age",
    "bmi",
    "children",
]
# ... and categorical columns to be one-hot encoded.
cat_scal = [
    "sex",
    "region",
    "smoker",
]

In [27]:
# Preprocessing recipe: standardise the numeric columns and one-hot encode
# the categorical ones. With the encoder's default settings every category
# gets its own indicator column (e.g. both sex_female and sex_male).
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), cont_scal),
        ("cat", OneHotEncoder(), cat_scal),
    ]
)


In [29]:
# Fit the preprocessor on the full frame and transform it in one step.
# NOTE(review): this fit happens before the train/test split performed
# further down, so the scaler's statistics also include rows that end up in
# the test set -- consider fitting on the training split only.
health_df_processed = preprocessor.fit_transform(X=health_df)

In [31]:
# Hand-written list of the column names expected after preprocessing.
# It is not referenced by the cells shown here; the DataFrame below is
# labelled with names derived from the fitted encoder instead.
column_names = [
    "age",
    "bmi",
    "children",
    "sex_female",
    "sex_male",
    "region_northeast",
    "region_northwest",
    "region_southeast",
    "region_southwest",
    "smoker_no",
    "smoker_yes",
]

In [33]:
# Numeric feature names keep their original labels after scaling.
# This is an alias of cont_scal (the same list object), not a copy.
num_features = cont_scal


In [35]:
# Ask the fitted one-hot encoder for the names of the indicator columns it
# produced for the categorical inputs.
cat_features = (
    preprocessor
    .named_transformers_["cat"]
    .get_feature_names_out(input_features=cat_scal)
)

In [37]:
# Full column order of the transformed matrix: numeric names first, then the
# encoder-generated names, matching the transformer order in `preprocessor`.
column_combine = [*num_features, *cat_features]

In [39]:
# Wrap the transformed matrix in a DataFrame with readable column labels.
health_df_processed_df = pd.DataFrame(
    data=health_df_processed,
    columns=column_combine,
)

In [41]:
# Plain-text preview of the first five preprocessed rows.
print(health_df_processed_df.head(n=5))

        age       bmi  children  sex_female  sex_male  region_northeast  \
0 -1.438764 -0.453320 -0.908614         1.0       0.0               0.0   
1 -1.509965  0.509621 -0.078767         0.0       1.0               0.0   
2 -0.797954  0.383307  1.580926         0.0       1.0               0.0   
3 -0.441948 -1.305531 -0.908614         0.0       1.0               0.0   
4 -0.513149 -0.292556 -0.908614         0.0       1.0               0.0   

   region_northwest  region_southeast  region_southwest  smoker_no  smoker_yes  
0               0.0               0.0               1.0        0.0         1.0  
1               0.0               1.0               0.0        1.0         0.0  
2               0.0               1.0               0.0        1.0         0.0  
3               1.0               0.0               0.0        1.0         0.0  
4               1.0               0.0               0.0        1.0         0.0  


In [43]:
# Feature matrix: every preprocessed column, in column_combine order.
X = health_df_processed_df.loc[:, column_combine]

In [47]:
# Regression target: the raw (unscaled) charges column.
y = health_df.loc[:, "charges"]

In [49]:
# Sanity check: X and y must have the same number of rows.
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (1338, 11), y shape: (1338,)


In [51]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [53]:
# Ordinary least-squares model fitted on the training split only.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [55]:
# Coefficient of determination (R^2) of the fitted model on the held-out rows.
print(reg.score(X=X_test, y=y_test))

0.7345433061333784


In [57]:
# Predicted charges for the held-out rows.
y_pred1 = reg.predict(X=X_test)

In [59]:
# Mean absolute error on the held-out rows, rounded to two decimals.
# The built-in round() is used instead of the `.round()` method: that method
# exists only on NumPy scalars and raises AttributeError when the metric is
# returned as a plain Python float. round() handles both.
mae1 = round(mean_absolute_error(y_test, y_pred1), 2)

In [75]:
# Report the rounded MAE (same units as the charges column).
print(f"mean absolute error:{mae1}")

mean absolute error:3943.23


In [77]:
# Mean absolute percentage error: average of |actual - predicted| / actual
# over the held-out rows, expressed as a percentage.
mape1 = (
    np.mean(np.abs(y_test - y_pred1) / y_test)
    * 100
)
print(f"mean absolute percentage error: {mape1}%")

mean absolute percentage error: 46.98196731280588%
