In [134]:
import os # to work with the file system

import joblib # to save the model
import opendatasets as od # to download the dataset from Kaggle
import pandas as pd # to work with tabular data
from sklearn.compose import ColumnTransformer # to create a preprocessor
from sklearn.linear_model import LinearRegression # to create a linear regression model
from sklearn.metrics import root_mean_squared_error as rmse # to evaluate the model
from sklearn.pipeline import Pipeline # to create a pipeline
from sklearn.preprocessing import Normalizer, OrdinalEncoder, StandardScaler, MinMaxScaler, OneHotEncoder # to preprocess the data
# Fetch the insurance dataset from Kaggle (the files are placed under ./insurance)
DATASET_URL = "https://www.kaggle.com/datasets/mirichoi0218/insurance"
od.download(DATASET_URL, force=True)

Dataset URL: https://www.kaggle.com/datasets/mirichoi0218/insurance
Downloading insurance.zip to ./insurance


100%|██████████| 16.0k/16.0k [00:00<00:00, 812kB/s]







In [140]:
# Show the kernel's working directory: relative paths such as
# "insurance/insurance.csv" are resolved against it.
# `os` comes from the notebook's top imports cell rather than a mid-notebook import.
current_directory = os.getcwd()
print(current_directory)

/home/alex/Projects/mlops/lab5


In [135]:
%%writefile test_datasets.py
import os # to work with the file system
import pandas as pd # to work with tabular data
import joblib # to save the model
from sklearn.metrics import root_mean_squared_error as rmse # to evaluate the model

# Resolve the dataset location against the directory the tests are launched from.
# os.path.join is used instead of gluing strings with "/" so the separator is
# always the right one for the platform.
current_directory = os.getcwd() # get the current directory
dataset_path = os.path.join(current_directory, "insurance", "insurance.csv")
df = pd.read_csv(dataset_path) # to load the dataset


def check_missing_values(df):
    """
    Count the missing cells in the dataset, summed over all columns
    """
    missing_per_column = df.isna().sum() # one count per column
    return missing_per_column.sum()

def test_check_missing_values():
    """
    The loaded dataset is expected to contain no missing values
    """
    missing_count = check_missing_values(df)
    assert missing_count == 0

def get_column_names(df):
    """
    Return the dataset's column labels as a plain list, in column order
    """
    return [column for column in df.columns]

def test_get_column_names():
    """
    The loaded dataset must expose exactly these columns, in this order
    """
    expected_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
    assert get_column_names(df) == expected_columns

def get_data_types(df):
    """
    Return the dtype name of every column in the dataset, in column order
    """
    dtype_names = []
    for column_dtype in df.dtypes:
        dtype_names.append(column_dtype.name)
    return dtype_names

def test_check_data_type():
    """
    Every column of the loaded dataset must have the expected dtype
    """
    expected_dtypes = ['int64', 'object', 'float64', 'int64', 'object', 'object', 'float64']
    assert get_data_types(df) == expected_dtypes

def check_duplicates(df):
    """
    Count the rows that repeat an earlier row of the dataset
    """
    duplicate_mask = df.duplicated() # True for each repeat of a previously seen row
    return duplicate_mask.sum()

def test_check_duplicates():
    """
    The loaded dataset is expected to contain exactly one duplicated row
    """
    duplicate_rows = check_duplicates(df)
    assert duplicate_rows == 1

# load the model from the file
# The path is built from the same base directory as the dataset so both
# artifacts are located the same way.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code - only load model files produced by a trusted source.
model_path = os.path.join(current_directory, 'insurance', 'model.pkl')
model = joblib.load(model_path)

def metrics_model(df, noise, model):
    """
    Evaluate the model on the dataset and on a noised copy of it

    Rows 700-999 of the columns at positions 0, 2 and 3 are multiplied by
    `noise` in a copy of `df`; the input frame itself is not modified.

    Parameters:
        df: dataset containing the feature columns and a 'charges' target column
        noise: multiplicative factor applied to the noised cells
        model: fitted estimator exposing predict(X)

    Returns:
        (rmse_noised, rmse_clean): RMSE of the predictions for the noised
        features and for the original features, both measured against the
        original 'charges' values
    """
    noised_rows = slice(700, 1000) # block of rows that receives the noise
    noised_column_positions = [0, 2, 3] # positional indices of the columns to distort

    df_noised = df.copy() # create a copy of the dataset
    # Cast the affected columns to float first so a fractional noise factor can be
    # written back without an incompatible-dtype assignment into integer columns
    noised_column_names = df_noised.columns[noised_column_positions]
    df_noised[noised_column_names] = df_noised[noised_column_names].astype('float64')
    # Use the caller-supplied factor (it was previously ignored in favour of a hard-coded 3)
    df_noised.iloc[noised_rows, noised_column_positions] *= noise # add noise to the dataset

    X, y = df.drop('charges', axis=1), df['charges'] # separate features and target
    X_noised = df_noised.drop('charges', axis=1) # the target is left untouched, so y is reused
    return rmse(y, model.predict(X_noised)), rmse(y, model.predict(X))

def test_metrics_model():
    """
    The RMSE on the noised data must stay within 20% of the RMSE on the clean data
    """
    max_degradation = 1.2 # allowed ratio of noised RMSE to clean RMSE
    # Evaluate once and unpack both metrics instead of running the model twice
    rmse_noised, rmse_clean = metrics_model(df, 2, model)
    assert rmse_noised <= rmse_clean * max_degradation



Overwriting test_datasets.py


In [136]:
insurance_csv = "insurance/insurance.csv" # path relative to the notebook's working directory
df = pd.read_csv(insurance_csv) # Load the dataset
df.head(10) # Display the first ten rows of the dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [137]:
# Preprocess the data with a ColumnTransformer
transforms = ColumnTransformer([
    # Normalizer rescales each *row* to unit norm, so applied to the single
    # 'age' column it would turn every positive age into 1.0 and erase the
    # feature. Standardize the column instead; the step keeps its name 'norm1'.
    ('norm1', StandardScaler(), ['age']),
    ('le', OrdinalEncoder(), ['sex', 'smoker']), # categories -> integer codes
    ('scaler', StandardScaler(), ['bmi']), # zero mean, unit variance
    ('minmax', MinMaxScaler(), ['children']), # rescale to the [0, 1] range
    ('ohe', OneHotEncoder(), ['region']) # one indicator column per region
])
# Create a pipeline: preprocessing followed by a linear regression
model = Pipeline([
    ('preprocessor', transforms),
    ('regressor', LinearRegression())
])

In [138]:
# separate features and target
X = df.drop(columns='charges') # every column except the target
y = df['charges'] # the target column

In [139]:
model.fit(X, y) # train the model
# A bare expression in the middle of a cell is not displayed, so keep the
# score in a variable and print it explicitly
train_rmse = rmse(y, model.predict(X)) # evaluate the model on the training data
print(f"Training RMSE: {train_rmse:.2f}")
joblib.dump(model, 'insurance/model.pkl') # save the fitted pipeline; test_datasets.py loads this file

['insurance/model.pkl']