**First, we will read the unseen data and split the label 'charges' from the features (y_test and X_test).**

In [None]:
import pandas as pd
# Read the unseen dataset from a CSV file located relative to the notebook's working directory.
data = pd.read_csv('real-data.csv')

In [None]:
# Every column except 'charges' becomes the feature frame; 'charges' itself is
# kept as an independent copy to serve as the ground-truth labels.
X_test = data.drop(columns=['charges'])
y_test = data.loc[:, 'charges'].copy()

**Before testing our model on the unseen data, we must load our saved model as well as the fitted preprocessing objects (imputers, scaler, and encoder) that we saved alongside it.**

In [1]:
import pickle

# Load the previously fitted preprocessing objects from their pickle files.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only load .pkl files that you created yourself or otherwise trust.

def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)


imputer = _load_pickle('imputer.pkl')
scaler = _load_pickle('scaler.pkl')
one_hot_encoder = _load_pickle('one_hot_encoder.pkl')
imp_neg_mode = _load_pickle('imp_neg_mode.pkl')
imp_region_mode = _load_pickle('imp_region_mode.pkl')
imp_typos_mode = _load_pickle('imp_typos_mode.pkl')


In [6]:
from tensorflow.keras.models import load_model

# Loading our chosen model from the saved Keras file in the working directory
model5 = load_model('trained_model.keras')

model5.summary()  # prints the layer table so we can confirm we have saved and loaded the correct/chosen model


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 100)               1200      
                                                                 
 dropout_4 (Dropout)         (None, 100)               0         
                                                                 
 dense_6 (Dense)             (None, 128)               12928     
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 64)                8256      
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 32)               

**The unseen test data must now undergo the same preprocessing as the data we were given to train the model. Since we assume that the unseen data can have any of the issues the given dataset had, we will address those issues. However, we will NOT refit the objects used (scalers, imputers, and encoders). To ensure a fair evaluation of the model's predictions, we will reuse them exactly as they were.**

In [7]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
  
# Function to preprocess testing data
def preprocessing_testing(unseen_data, imp_neg_mode, imp_region_mode, imp_typos_mode, imputer, scaler, one_hot_encoder,
                          sex_typos=('dasdas', 'wqeqwrf')):
    """Apply the already-fitted preprocessing steps to unseen feature data.

    The input frame is never modified: all work happens on a copy, so calling
    this function (or re-running the notebook cell that calls it) repeatedly
    on the same frame always yields the same result.

    Parameters
    ----------
    unseen_data : pandas.DataFrame
        Feature frame containing at least the columns 'age', 'children',
        'region' and 'sex'.
    imp_neg_mode, imp_region_mode, imp_typos_mode : objects with ``transform``
        Fitted imputers applied to the single-column frames 'children',
        'region' and 'sex' respectively.
    imputer, scaler : objects with ``transform``
        Fitted transformers applied, in that order, to all float64/int64
        columns.
    one_hot_encoder : object with ``transform`` and ``get_feature_names_out``
        Fitted encoder applied to all object-dtype columns. ``transform`` must
        return a dense 2-D array (assumed; a sparse result would not fit the
        DataFrame constructor used here -- TODO confirm against the saved encoder).
    sex_typos : iterable of str, optional
        Values of 'sex' that are treated as typos and replaced by ``None``
        before imputation. Defaults to the two typos handled originally.

    Returns
    -------
    pandas.DataFrame
        The processed numerical columns followed by the encoded categorical
        columns, sharing the index of ``unseen_data``.
    """
    # Work on a copy: mutating the caller's frame would make a second call
    # scale data that has already been scaled.
    prepared = unseen_data.copy()

    ################################# Handling 'age' & 'children' Outliers #########################################

    prepared['age'] = prepared['age'].abs()
    # Negative child counts become NaN. mask() upcasts an integer column to
    # float when needed, avoiding an incompatible-dtype assignment through .loc.
    prepared['children'] = prepared['children'].mask(prepared['children'] < 0)
    # The NaNs are then filled by the fitted 'children' imputer.
    prepared[['children']] = imp_neg_mode.transform(prepared[['children']])

    ################################# Mode-Imputing 'region' Missing Values ##########################################

    prepared[['region']] = imp_region_mode.transform(prepared[['region']])

    ###################################### Mode-Imputing 'sex' Typos #################################################

    # Known typos are blanked out with None, then filled by the fitted 'sex' imputer.
    typo_rows = prepared['sex'].isin(list(sex_typos))
    prepared.loc[typo_rows, 'sex'] = None
    prepared[['sex']] = imp_typos_mode.transform(prepared[['sex']])

    # Separate numerical and categorical columns (after the steps above, so
    # the dtypes reflect the imputed values).
    numerical_cols = prepared.select_dtypes(include=["float64", "int64"]).columns
    categorical_cols = prepared.select_dtypes(include=["object"]).columns

    ############################################### Data Cleaning #################################################
    # Impute remaining missing numerical values with the fitted imputer.
    prepared[numerical_cols] = imputer.transform(prepared[numerical_cols])

    ############################################## Feature Scaling ################################################
    # Scale numerical features with the fitted scaler.
    prepared[numerical_cols] = scaler.transform(prepared[numerical_cols])

    ############################################ Categorization ###################################################
    # One-hot encode categorical features with the fitted encoder.
    encoded_categorical = one_hot_encoder.transform(prepared[categorical_cols])
    encoded_df = pd.DataFrame(
        encoded_categorical,
        columns=one_hot_encoder.get_feature_names_out(categorical_cols),
        index=prepared.index
    )

    # Numerical columns first, encoded categorical columns after them.
    return pd.concat([prepared.drop(columns=categorical_cols), encoded_df], axis=1)
    

In [None]:
# Run the unseen features through the preprocessing pipeline, handing over
# each loaded preprocessing object by name.
X_test_transformed = preprocessing_testing(
    unseen_data=X_test,
    imp_neg_mode=imp_neg_mode,
    imp_region_mode=imp_region_mode,
    imp_typos_mode=imp_typos_mode,
    imputer=imputer,
    scaler=scaler,
    one_hot_encoder=one_hot_encoder,
)

**Using our model to predict the charges of the unseen data.**

In [None]:
# Run the loaded model on the preprocessed features; the outputs are kept in `predictions`.
predictions = model5.predict(X_test_transformed)

**Comparing our model's predictions to the actual charges then evaluating its performance.**

In [None]:
# evaluate() is unpacked into exactly two values (loss, MAE). This assumes the loaded
# model was compiled with a single mean-absolute-error metric -- TODO confirm.

test_loss, test_mae = model5.evaluate(X_test_transformed, y_test)

print(f'Mean Absolute Error (MAE) on the test set: {test_mae:.4f}')

In [None]:
# Print the first five true charges next to the model's corresponding outputs.
first_actuals = y_test[:5]
first_preds = predictions[:5]
for actual_charge, predicted_charge in zip(first_actuals, first_preds):
    print(f'Actual: {actual_charge}, pred: {predicted_charge}')