In [None]:
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from keras.layers import Dense
from keras.layers import Input
from keras.losses import MeanSquaredError
from keras.models import Sequential
from keras.optimizers import Adam

In [None]:
# Notebook-wide settings shared by the cells below

# Input data location and output location for generated result files
data_folder, res_folder = "../Data/", "../Res/"

# Column-name prefixes used to pick the model input columns
features = [
    'bg', 'insulin', 'carbs', 'hr',
    'steps', 'cals', 'activity',
]

# Date stamp (YYYYMMDD) placed at the start of result file names
current_date = datetime.today().strftime('%Y%m%d')

In [None]:
# Read activity types

# The file is read as one activity label per line
with open(data_folder + 'activities.txt', 'r') as file:
    activities = file.read().splitlines()

# Add missing activities.
# dict.fromkeys keeps the first occurrence of every label in its original
# order, so a label that is already listed in the file is not added twice
# (OneHotEncoder rejects duplicated predefined categories).
activities = list(dict.fromkeys(activities + ['Not Available', 'CoreTraining', 'Cycling']))

print("Activity Types: ", activities)

In [None]:
# Load the cleaned training and test CSV files into DataFrames

train_df, test_df = (
    pd.read_csv(data_folder + csv_name)
    for csv_name in ('20241127_train_cleaned.csv', '20241127_test_cleaned.csv')
)

# Report (rows, columns) of both frames, then preview the first training rows
print("Training Data Shape: ", train_df.shape)
print("Test Data Shape: ", test_df.shape)
train_df.head()

In [None]:
# One hot encode the activity columns

encode_activity = True

# Only raw (non-numeric) activity columns are selected. The encoded output
# columns also start with 'activity' but are numeric, so re-running this cell
# finds nothing left to encode instead of failing on already-encoded data.
activity_cols = [
    col for col in train_df.columns
    if col.startswith('activity') and not pd.api.types.is_numeric_dtype(train_df[col])
]

if encode_activity and activity_cols:

    # Every activity column shares the same fixed category list, so train and
    # test produce identical output columns; a label outside the list raises.
    encoder = OneHotEncoder(categories=[activities]*len(activity_cols), sparse_output=False, handle_unknown='error')

    # Fit and transform the training data
    encoded_train = encoder.fit_transform(train_df[activity_cols])
    encoded_col_names = encoder.get_feature_names_out(activity_cols)
    encoded_train_df = pd.DataFrame(encoded_train, columns=encoded_col_names)

    # Transform the test data with the encoder fitted above
    encoded_test = encoder.transform(test_df[activity_cols])
    encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_col_names)

    # Drop the original 'activity' columns and concatenate the encoded columns.
    # The index is reset so rows line up with the freshly built encoded frames.
    train_df = train_df.drop(activity_cols, axis=1).reset_index(drop=True)
    test_df = test_df.drop(activity_cols, axis=1).reset_index(drop=True)
    train_df = pd.concat([train_df, encoded_train_df], axis=1)
    test_df = pd.concat([test_df, encoded_test_df], axis=1)

print("Training Data Shape: ", train_df.shape)
print("Test Data Shape: ", test_df.shape)
train_df.head()

In [None]:
def save_result(test_df, pred, file_name):
    """Write predictions to a submission CSV with the columns 'id' and 'bg+1:00'.

    Args:
        test_df: DataFrame with an 'id' column, one row per prediction.
        pred: Predictions in the row order of test_df. Any array-like holding
            one value per row is accepted, including an (n, 1) array.
        file_name: Path of the CSV file to write.

    Returns:
        None. The file is written as a side effect; test_df is not modified.

    Raises:
        ValueError: If the number of predictions differs from the number of rows.
    """
    # Flatten so that (n, 1) model outputs and 1-D inputs are handled alike
    values = np.asarray(pred).reshape(-1)
    if len(values) != len(test_df):
        raise ValueError(
            f"Got {len(values)} predictions for {len(test_df)} test rows"
        )

    # Build a separate frame rather than adding a column to the caller's test_df
    res_df = pd.DataFrame({'id': test_df['id'].to_numpy(), 'bg+1:00': values})

    res_df.to_csv(file_name, index=False)
    print(f"Result saved to {file_name}")

    return None

# save_result(test_df, test_df['bg-0:00'], res_folder + current_date + '_simple_baseline.csv')

In [None]:
# Prepare validation data that resembles the test data:
# a few patients are kept entirely out of training (unseen patients), while the
# remaining patients are split randomly into training and validation rows.

# Patients held out of training and used only for validation.
# These patients are found to have few data points compared to others.
# NOTE(review): the IDs are written in one consistent case and compared
# case-insensitively below; the previous mixed-case list ('p01', 'P06', 'P05')
# could silently miss patients. Confirm the IDs against train_df['p_num'].
skip_p_nums = ['p01', 'p05', 'p06']

# Select the feature columns for training (columns whose name starts with a feature prefix)
feature_cols = []
for feature in features:
    # Skip categorical features if not encoded
    if not encode_activity and feature == 'activity':
        continue
    feature_cols.extend([col for col in train_df.columns if col.startswith(feature)])
# The 'bg' prefix also matches the target column, which must not be an input
feature_cols.remove('bg+1:00')

# Boolean row mask of the held-out patients, computed once and reused below
held_out_rows = train_df['p_num'].astype(str).str.lower().isin([p.lower() for p in skip_p_nums])
print("Held-out Patient Rows: ", int(held_out_rows.sum()))

# Split the data of the remaining patients into training and validation sets
X = train_df.loc[~held_out_rows, feature_cols]
y = train_df.loc[~held_out_rows, 'bg+1:00']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=0)

# Append the held-out patients' rows to the validation data
X_val = pd.concat([X_val, train_df.loc[held_out_rows, feature_cols]])
y_val = pd.concat([y_val, train_df.loc[held_out_rows, 'bg+1:00']])

print("Training Shape: ", X_train.shape)
print("Validation Shape: ", X_val.shape)

In [None]:
# Min-max normalize the features before training

# The scaler is fitted on the training split only; the validation split is
# transformed with the parameters learned from the training split.
scaler = MinMaxScaler()

# Scale and wrap the resulting arrays back into DataFrames with the original column names
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

print("Scaled Training Data Shape: ", X_train_scaled.shape)
print("Scaled Validation Data Shape: ", X_val_scaled.shape)

In [None]:
# TODO: Sequential data modeling (using an RNN or LSTM) needs to be explored

def get_MLP_model(input_shape, verbose=False, hidden_units=(128, 64, 32), learning_rate=0.001):
    """Build and compile a fully connected regression network.

    Args:
        input_shape: Number of input features (an integer, despite the name).
        verbose: When True, print the Keras model summary.
        hidden_units: Width of each hidden ReLU layer, in order.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model with one linear output unit and a mean
        squared error loss.
    """
    # An explicit Input layer replaces the legacy `input_dim` argument on the
    # first Dense layer, which newer Keras releases warn against.
    layers = [Input(shape=(int(input_shape),))]
    layers.extend(Dense(units, activation='relu') for units in hidden_units)
    layers.append(Dense(1, activation='linear'))
    model = Sequential(layers)

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=MeanSquaredError())

    if verbose:
        model.summary()

    return model

# Get the model instance, sized to the number of scaled feature columns
model = get_MLP_model(X_train_scaled.shape[1], True)

# Train the model.
# Per-epoch validation loss is computed on the prepared validation set (random
# hold-out rows plus the held-out patients) instead of an extra slice of the
# training rows, so the recorded 'val_loss' refers to the same data as the
# validation RMSE reported below and no training rows are withheld from fitting.
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled.to_numpy(), y_val.to_numpy()),
    epochs=25, batch_size=32, verbose=1,
)

# Evaluate the model; the single-output predictions are flattened to 1-D
train_pred = model.predict(X_train_scaled).ravel()
val_pred = model.predict(X_val_scaled).ravel()
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
print(f"Train RMSE: {train_rmse}, Validation RMSE: {val_rmse}")

In [None]:
# Compare training and validation loss per epoch to judge bias and variance

fig, ax = plt.subplots(figsize=(10, 6))

# history.history holds one loss value per epoch for each recorded metric
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')

ax.set_title('Model Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend(loc='upper right')
ax.grid(True)

plt.show()

In [None]:
# Test data prediction

# Refit a scaler and a fresh model on ALL labelled rows. Dedicated names are
# used so that the scaler, scaled matrices and model of the validation
# experiment are not overwritten; re-running the earlier training cell would
# otherwise pair the full-data matrix with the smaller y_train.
final_scaler = MinMaxScaler()
X_full_scaled = final_scaler.fit_transform(train_df[feature_cols])
X_test_scaled = final_scaler.transform(test_df[feature_cols])
y_full = train_df['bg+1:00']

# The input size comes from the columns actually fed to this model
final_model = get_MLP_model(len(feature_cols), True)
final_model.fit(X_full_scaled, y_full, epochs=10, batch_size=32, verbose=1)

# The model has a single output unit; flatten to one prediction per test row
test_pred = final_model.predict(X_test_scaled).ravel()
save_result(test_df, test_pred, res_folder + current_date + '_nn_keep-all-with-norm.csv')

full_train_rmse = np.sqrt(mean_squared_error(y_full, final_model.predict(X_full_scaled).ravel()))
print(f"Train RMSE: {round(full_train_rmse, 4)}")