In [77]:
import opendatasets as od # to download the dataset from Kaggle
import pandas as pd # to work with tabular data
from sklearn.preprocessing import Normalizer, OrdinalEncoder, StandardScaler, MinMaxScaler, OneHotEncoder # to preprocess the data
from sklearn.compose import ColumnTransformer # to create a preprocessor
from sklearn.pipeline import Pipeline # to create a pipeline
from sklearn.linear_model import LinearRegression # to create a linear regression model
from sklearn.metrics import root_mean_squared_error as rmse # to evaluate the model
# Download the dataset from Kaggle
od.download("https://www.kaggle.com/datasets/mirichoi0218/insurance", force=True)

Dataset URL: https://www.kaggle.com/datasets/mirichoi0218/insurance
Downloading insurance.zip to ./insurance


100%|██████████| 16.0k/16.0k [00:00<00:00, 957kB/s]







In [47]:
%%writefile test_datasets.py # to save the test file
import pandas as pd # to work with tabular data
df = pd.read_csv("insurance/insurance.csv") # to load the dataset


def check_missing_values(df):
    """
    Check the number of missing values in the dataset
    """
    return df.isnull().sum().sum()

def test_check_missing_values():
    """
    Test the check_missing_values function
    """
    assert check_missing_values(df) == 0

def get_column_names(df):
    """
    Get the names of the columns in the dataset
    """
    return df.columns.to_list()

def test_get_column_names():
    """
    Test the get_column_names function
    """
    assert get_column_names(df) == ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

def get_data_types(df):
    """
    Get the data types of the columns in the dataset
    """
    return [dtype.name for dtype in df.dtypes]

def test_check_data_type():
    """
    Test the get_data_types function
    """
    assert get_data_types(df) == ['int64', 'object', 'float64', 'int64', 'object', 'object', 'float64']

def check_duplicates(df):
    """
    Check the number of duplicate rows in the dataset
    """
    return df.duplicated().sum()

def test_check_duplicates():
    """
    Test the check_duplicates function
    """
    assert check_duplicates(df) == 1

Overwriting test_datasets.py


In [69]:
df = pd.read_csv("insurance/insurance.csv") # Load the dataset
df.head(2) # Display the first two rows of the dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [70]:
# Preprocess the data with a ColumnTransformer
transforms = ColumnTransformer([
    ('norm1', Normalizer(), ['age']),
    ('le', OrdinalEncoder(), ['sex', 'smoker']),
    ('scaler', StandardScaler(), ['bmi']),
    ('minmax', MinMaxScaler(), ['children']),
    ('ohe', OneHotEncoder(), ['region'])
])
# Create a pipeline
model = Pipeline([
    ('preprocessor', transforms),
    ('regressor', LinearRegression())
])

In [71]:
X, y = df.drop('charges', axis=1), df['charges'] # separate features and target

In [76]:
model.fit(X, y) # train the model
rmse(y, model.predict(X)) # evaluate the model

7021.428149754168