In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Notebook-wide configuration

# Relative paths of the input data folder and the results folder
data_folder = "../Data/"
res_folder = "../Res/"

# Column-name prefixes; later cells select every column that starts with one of these
features = [
    'bg',
    'insulin',
    'carbs',
    'hr',
    'steps',
    'cals',
    'activity',
]

# Today's date formatted as YYYYMMDD
current_date = f"{datetime.now():%Y%m%d}"

In [None]:
# Load the activity type names, one per line of the text file

with open(data_folder + 'activities.txt', 'r') as activities_file:
    activities = activities_file.read().splitlines()

# Append the activity names that are added on top of the file's contents
activities += ['Not Available', 'CoreTraining', 'Cycling']

print("Activity Types: ", activities)

In [None]:
# Load the cleaned training and test CSV files

train_path = data_folder + '20241127_train_cleaned.csv'
test_path = data_folder + '20241127_test_cleaned.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Report the size of both frames, then preview the training data
for label, frame in (("Training Data Shape: ", train_df), ("Test Data Shape: ", test_df)):
    print(label, frame.shape)
train_df.head()

In [None]:
# Replace every activity column with its one-hot encoding

encode_activity = True

if encode_activity:

    # Every column whose name starts with 'activity' is encoded against the
    # same category list
    activity_cols = [col for col in train_df.columns if col.startswith('activity')]
    encoder = OneHotEncoder(
        categories=[activities] * len(activity_cols),
        sparse_output=False,
        handle_unknown='error',
    )

    # Fit on the training data, then reuse the fitted encoder for the test data
    encoded_train = encoder.fit_transform(train_df[activity_cols])
    encoded_test = encoder.transform(test_df[activity_cols])

    # Wrap both encoded arrays in frames labelled with the encoder's output names
    encoded_col_names = encoder.get_feature_names_out(activity_cols)
    encoded_train_df = pd.DataFrame(encoded_train, columns=encoded_col_names)
    encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_col_names)

    # Swap the raw activity columns for the encoded ones; the index is reset
    # first so the column-wise concat lines up with the freshly built frames
    train_df = pd.concat(
        [train_df.drop(columns=activity_cols).reset_index(drop=True), encoded_train_df],
        axis=1,
    )
    test_df = pd.concat(
        [test_df.drop(columns=activity_cols).reset_index(drop=True), encoded_test_df],
        axis=1,
    )

print("Training Data Shape: ", train_df.shape)
print("Test Data Shape: ", test_df.shape)
train_df.head()

In [None]:
def save_result(test_df, pred, file_name):
    """Write predictions to a CSV file with the columns 'id' and 'bg+1:00'.

    Args:
        test_df: DataFrame that contains an 'id' column. It is not modified.
        pred: Predicted values, one per row of `test_df`.
        file_name: Path of the CSV file to write.

    Returns:
        None.
    """
    # Build the output on a new frame via `assign` so the prediction column
    # is not added to the caller's DataFrame as a side effect
    res_df = test_df[['id']].assign(**{'bg+1:00': pred})

    res_df.to_csv(file_name, index=False)
    print(f"Result saved to {file_name}")

    return None

# save_result(test_df, test_df['bg-0:00'], res_folder + current_date + '_simple_baseline.csv')

In [None]:
# Build a validation set that resembles the test data: a few patients are
# held out of training entirely, and the remaining rows are split randomly
# into training and validation data

# Patients held out of training; all of their rows go to the validation set.
# These patients are found to have few data points compared to others
skip_p_nums = ['p01', 'p06', 'p05']

# Select the feature columns for training
feature_cols = []
for feature in features:
    # Skip categorical features if not encoded
    if not encode_activity and feature == 'activity':
        continue
    feature_cols.extend([col for col in train_df.columns if col.startswith(feature)])
# The 'bg' prefix also matches the target column, which must not be a feature
feature_cols.remove('bg+1:00')

# Match patient ids case-insensitively so an id is held out regardless of
# how it is cased in the list or in the data; compute the mask only once
skip_ids = [p_num.lower() for p_num in skip_p_nums]
is_skipped = train_df['p_num'].str.lower().isin(skip_ids)

# Split the data of the remaining patients into training and validation sets
X = train_df.loc[~is_skipped, feature_cols]
y = train_df.loc[~is_skipped, 'bg+1:00']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=0)

# Append all rows of the held-out patients to the validation data
X_val = pd.concat([X_val, train_df.loc[is_skipped, feature_cols]])
y_val = pd.concat([y_val, train_df.loc[is_skipped, 'bg+1:00']])

print("Training Shape: ", X_train.shape)
print("Validation Shape: ", X_val.shape)

In [None]:
# Hyper-parameter search for the random forest (only runs when the flag below is set)

# Candidate values explored by the grid search
param_grid = dict(
    n_estimators=[5, 10, 25],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

run_param_search = False
if run_param_search:

    # Base estimator whose parameters are varied by the search
    rf = RandomForestRegressor(max_features='log2', random_state=0)

    # 3-fold cross-validated grid search scored by negative RMSE
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        verbose=3,
        scoring='neg_root_mean_squared_error',
    )
    grid_search.fit(X_train, y_train)

    # Keep the best parameter combination and the estimator fitted with it
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    print("Best Parameters: ", best_params)
    print("Best Model: ", best_model)

In [None]:
# Fit the random forest on the training split (fit returns the estimator itself)
model = RandomForestRegressor(n_estimators=25, max_features='log2', random_state=0).fit(X_train, y_train)

# Predict on both splits
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

# RMSE is the square root of the mean squared error
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
train_rmse = np.sqrt(train_mse)
val_rmse = np.sqrt(val_mse)
print(f"Train RMSE: {round(train_rmse, 4)}, Validation RMSE: {round(val_rmse, 4)}")

In [None]:
# Refit on the full training frame and predict the test set

X_full = train_df[feature_cols]
y_full = train_df['bg+1:00']

model = RandomForestRegressor(n_estimators=25, max_features='log2', random_state=0)
# model = RandomForestRegressor(n_estimators=25, random_state=0)
model.fit(X_full, y_full)

test_pred = model.predict(test_df[feature_cols])
# save_result(test_df, test_pred, res_folder + current_date + '_rf_keep-all.csv')

# RMSE of the refitted model on the data it was trained on
full_train_pred = model.predict(X_full)
train_rmse = np.sqrt(mean_squared_error(y_full, full_train_pred))
print(f"Train RMSE: {round(train_rmse, 4)}")