In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Notebook-wide settings shared by the cells below

# Relative folders for the input CSV files and for generated result files
data_folder = "../Data/"
res_folder = "../Res/"

# Column-name prefixes; a column is picked up for a prefix via str.startswith
features = [
    'bg',
    'insulin',
    'carbs',
    'hr',
    'steps',
    'cals',
    'activity',
]

# Today's date as YYYYMMDD, intended as a prefix for result file names
current_date = datetime.now().strftime('%Y%m%d')

In [None]:
# Load the cleaned training and test CSV files from the data folder

train_df = pd.read_csv(
    filepath_or_buffer=data_folder + '20241122_train_cleaned.csv',
)
test_df = pd.read_csv(
    filepath_or_buffer=data_folder + '20241122_test_cleaned.csv',
)

# Report (rows, columns) of both frames, then preview the training data
print("Training Data Shape: ", train_df.shape)
print("Test Data Shape: ", test_df.shape)
train_df.head()

In [None]:
# Reference score: use the 'bg-0:00' column (the last seen value) unchanged
# as the prediction for the 'bg+1:00' target and measure the RMSE on the training set

train_rmse_last_seen = np.sqrt(
    mean_squared_error(
        y_true=train_df['bg+1:00'],
        y_pred=train_df['bg-0:00'],
    )
)
print(f"Baseline RMSE: {round(train_rmse_last_seen, 4)}")

In [None]:
def save_result(test_df, pred, file_name):
    """Write a submission CSV with the columns ``id`` and ``bg+1:00``.

    Args:
        test_df: DataFrame that contains an ``id`` column. It is only read;
            no column is added to or changed on the caller's frame.
        pred: Predicted values, one per row of ``test_df`` (any value that
            can be assigned as a DataFrame column, e.g. list, array, Series).
        file_name: Destination path handed to ``DataFrame.to_csv``.

    Returns:
        None. The result is written to ``file_name`` and a confirmation
        line is printed.
    """
    # Build the submission in a fresh frame instead of assigning the
    # predictions onto test_df: saving must not leave a 'bg+1:00' target
    # column behind on the caller's data.
    res_df = test_df[['id']].assign(**{'bg+1:00': pred})

    res_df.to_csv(file_name, index=False)
    print(f"Result saved to {file_name}")

    return None

# save_result(test_df, test_df['bg-0:00'], res_folder + current_date + '_simple_baseline.csv')

In [None]:
# Estimate the baseline model's performance with stratified K-fold cross validation

# Build the list of model input columns, one feature prefix at a time
feature_cols = []
for feature in features:

    # Uncomment to restrict the inputs to the bg columns only:
    # if feature != 'bg':
    #     continue

    # 'activity' is categorical, so its columns are left out
    if feature != 'activity':
        feature_cols += [col for col in train_df.columns if col.startswith(feature)]

# The 'bg' prefix also matches the target column, which must not be a model input
feature_cols.remove('bg+1:00')
print(f"Feature Columns: {feature_cols}")

run_kfold_cv = True
if run_kfold_cv:
    # Five shuffled folds, stratified on the p_num column
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for fold, (train_index, val_index) in enumerate(kf.split(train_df, train_df['p_num'])):
        print("**********")
        print(f"Fold {fold}")

        # Slice out the rows belonging to this fold's two partitions
        train_fold, val_fold = train_df.iloc[train_index], train_df.iloc[val_index]
        print(f"Train Fold Shape: {train_fold.shape}, Validation Fold Shape: {val_fold.shape}")

        # Fit the baseline random forest on the training partition only
        model = RandomForestRegressor(n_estimators=10, random_state=0)
        model.fit(train_fold[feature_cols], train_fold['bg+1:00'])

        # Predict both partitions with the fitted model
        train_pred = model.predict(train_fold[feature_cols])
        val_pred = model.predict(val_fold[feature_cols])

        # RMSE on the rows the model was fitted on vs. the held-out rows
        train_rmse = np.sqrt(
            mean_squared_error(y_true=train_fold['bg+1:00'], y_pred=train_pred)
        )
        val_rmse = np.sqrt(
            mean_squared_error(y_true=val_fold['bg+1:00'], y_pred=val_pred)
        )
        print(f"Train RMSE: {round(train_rmse, 4)}, Validation RMSE: {round(val_rmse, 4)}")

In [None]:
# Fit the baseline model on the full training set and predict the test set

# fit() returns the estimator itself, so construction and fitting can be chained
baseline_model = RandomForestRegressor(n_estimators=10, random_state=0).fit(
    train_df[feature_cols],
    train_df['bg+1:00'],
)

pred = baseline_model.predict(test_df[feature_cols])
# save_result(test_df, pred, res_folder + current_date + '_baseline_rf_keep-all-except-cat.csv')

# RMSE of the final model on the same rows it was fitted on
train_rmse = np.sqrt(
    mean_squared_error(
        y_true=train_df['bg+1:00'],
        y_pred=baseline_model.predict(train_df[feature_cols]),
    )
)
print(f"Train RMSE: {round(train_rmse, 4)}")