In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from feature_engineering import FeatureEngineering
from sklearn.metrics import mean_squared_error

In [2]:
# Read the insurance table and round the target column to two decimals
# as part of the same expression.
data = pd.read_csv('insurance.csv').assign(
    charges=lambda frame: frame['charges'].round(2)
)

# Features are every column except the target; the target is `charges`.
X = data.drop(columns='charges')
y = data['charges']

In [3]:
# Show the feature row at position 2; indexing with a list keeps the result
# a one-row DataFrame rather than collapsing it to a Series.
X.iloc[[2]]

Unnamed: 0,age,sex,bmi,children,smoker,region
2,28,male,33.0,3,no,southeast


In [4]:
# Target value for the row at position 2. Positional `.iloc` is used so the
# lookup follows the same rule as `X.iloc[[2]]`; plain `y[[2]]` selects by
# index *label*, which only coincides with position 2 while the index is the
# default 0..n-1 range produced by `read_csv`.
y.iloc[[2]]

2    4449.46
Name: charges, dtype: float64

In [5]:
# Columns to standardise. They are not present in the raw CSV columns, so
# they are presumably created by the FeatureEngineering step that runs
# before this transformer -- TODO confirm against feature_engineering.py.
scaled_columns = ['age_bmi', 'age_bmi_smoker']

# Scale only the columns above; every other column is passed through
# unchanged (remainder='passthrough').
# NOTE(review): the passthrough columns feed LinearRegression directly, so
# they are assumed to be numeric by this point -- verify.
preprocessor = ColumnTransformer(
    transformers=[('scale', StandardScaler(), scaled_columns)],
    remainder='passthrough',
)

In [6]:
# Ordered stages of the model: custom feature engineering, then the column
# preprocessor, then an ordinary least-squares regressor.
pipeline_steps = [
    ('feature_engineering', FeatureEngineering()),
    ('preprocess', preprocessor),
    ('model', LinearRegression()),
]
pipe = Pipeline(steps=pipeline_steps)

In [7]:
# Hold out 30% of the rows for evaluation. Shuffling uses a fixed seed so the
# same split is produced on every run.
split_options = {'test_size': 0.3, 'shuffle': True, 'random_state': 111}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Fit every pipeline stage on the training split.
pipe.fit(X_train, y_train)

# Persist the fitted pipeline (all stages included) to disk.
with open('insurance_model.pkl', mode='wb') as model_file:
    pickle.dump(pipe, model_file)

In [9]:
# Read the pickled pipeline back from disk into `model`.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a
# trusted source.
with open('insurance_model.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

In [10]:
# Score the in-memory pipeline on the held-out split.
pred = pipe.predict(X_test)

# scikit-learn metrics are declared as metric(y_true, y_pred). MSE gives the
# same number either way, but keeping the documented order stays correct if
# the metric is later swapped for an asymmetric one.
print('RMSE = ', np.sqrt(mean_squared_error(y_test, pred)))

RMSE =  5600.288686865197


In [11]:
# Score the pipeline reloaded from the pickle file on the same held-out
# split; comparing this RMSE with the in-memory pipeline's RMSE checks that
# the save/load round trip preserved the fitted model.
pred = model.predict(X_test)

# scikit-learn metrics are declared as metric(y_true, y_pred). MSE gives the
# same number either way, but keeping the documented order stays correct if
# the metric is later swapped for an asymmetric one.
print('RMSE = ', np.sqrt(mean_squared_error(y_test, pred)))

RMSE =  5600.288686865197


In [12]:
# Build a single-row sample input. Wrapping the position in a list keeps the
# result a DataFrame, which is the same type the pipeline was fitted on.
sample_position = 2
one_input = X.iloc[[sample_position]]
one_input

Unnamed: 0,age,sex,bmi,children,smoker,region
2,28,male,33.0,3,no,southeast


In [13]:
# Predict with the reloaded pipeline. The input has a single row, so the
# first element of the result is the prediction for that row.
pred1 = model.predict(one_input)
predicted_charge = pred1[0]
print('Predicted Charge = ', predicted_charge)

Predicted Charge =  7280.608317221344
