In [112]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
import xgboost as xgb
import optuna
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [9]:
path = '/Users/peggylee/Desktop/python/Kaggle_comptete/02_Data'
train_df = pd.read_csv(f"{path}/original_data/train.csv")
test_df = pd.read_csv(f"{path}/original_data/test.csv")

  train_df = pd.read_csv(f"{path}/original_data/train.csv")


## Data Pre Processing

In [162]:
class DataTransform:
    """
    1. drop columns that missing rate were high
    2. transform time format(hour, minute) by using sin/cos 
    3. impute data by back/forward as this was a time-series data
    
    """
    def  __init__(self, train, test):
        self.train = train
        self.test = test
        
    def dropcolumn(self,df):
        drop_activity = df.columns[df.columns.str.startswith('activity-')]
        drop_carbs = df.columns[df.columns.str.startswith('carbs-')]
        drop_step = df.columns[df.columns.str.startswith('steps-')]
        columns_to_drop = list(drop_activity) + list(drop_carbs) + list(drop_step)
        df_drop = df.drop(columns = columns_to_drop)
        
        return df_drop
    
    def create_time(self, df):
        df['hr'] = pd.to_datetime(df['time'],format = '%H:%M:%S').dt.hour
        df['min'] = pd.to_datetime(df['time'],format = '%H:%M:%S').dt.minute
        #df['sc'] = pd.to_datetime(df['time'],format = '%H:%M:%S').dt.second
        df['hour_sin'] = np.sin(2 * np.pi * df['hr'] / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['hr'] / 24)
        df['min_sin'] = np.sin(2 * np.pi * df['min'] / 60)
        df['min_cos'] = np.cos(2 * np.pi * df['min'] / 60)
        df = df.drop(columns = ['hr','min'])
        
        return df
    def impute(self,df):
        bg_impute = df[df.columns[df.columns.str.startswith('bg-')]]
        bg_impute = bg_impute.T.fillna(method = 'ffill',axis = 0).fillna(method='bfill', axis=0).T
        insu_impute = df[df.columns[df.columns.str.startswith('insulin-')]]
        insu_impute = insu_impute.T.fillna(method = 'ffill',axis = 0).fillna(method='bfill', axis=0).T
        cals_impute = df[df.columns[df.columns.str.startswith('cals-')]]
        cals_impute = cals_impute.T.fillna(method = 'ffill',axis = 0).fillna(method='bfill', axis=0).T
        cals_impute = cals_impute.fillna(method = 'bfill',axis = 0).fillna(method = 'ffill',axis = 0)
        hr_impute = df[df.columns[df.columns.str.startswith('hr-')]]
        hr_impute = hr_impute.T.fillna(method = 'ffill',axis = 0).fillna(method='bfill', axis=0).T
        hr_impute = hr_impute.fillna(method = 'bfill',axis = 0).fillna(method = 'ffill',axis = 0)
        impute_final = pd.concat([bg_impute,insu_impute,cals_impute,hr_impute],axis = 1)
        return impute_final
    
    def final(self,df,type_data):
        drop_df = self.dropcolumn(df)
        time_new_df = self.create_time(drop_df)
        impute_df = self.impute(time_new_df)
        time_df = time_new_df[['hour_sin','hour_cos','min_sin','min_cos']]
        final_pre = pd.concat([impute_df,time_df],axis = 1)
        if type_data == 'train':
            outcome_df = df[['bg+1:00']]
            final = pd.concat([final_pre,outcome_df],axis = 1)
            return final 
        else:
            return final_pre

In [167]:
transformer = DataTransform(train_df, test_df)
train_impute = transformer.final(train_df,"train")
test_impute = transformer.final(test_df,"test")

In [168]:
train_X = train_impute.drop(columns = ['bg+1:00'])
train_Y = train_impute[['bg+1:00']]
print("train_X: ", train_X.shape)
print("train_Y: ", train_Y.shape)
X_train, X_valid, Y_train,Y_valid = train_test_split(train_X,train_Y, test_size = 0.2,random_state = 2024)

train_X:  (177024, 292)
train_Y:  (177024, 1)


In [169]:
def objective(trial):
    """
    A function to train model by using different hyperparameters combinations by Optuna.
    """
    
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'eta': trial.suggest_loguniform('eta', 0.05, 0.3),  # learning rate
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 8),
        'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.09, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 4, 10),
        'alpha': trial.suggest_loguniform('alpha', 1e-2, 10),
        'tree_method': 'hist'
    }
    reg = xgb.XGBRegressor(**params)
    reg.fit(X_train,Y_train,
           eval_set = [(X_valid,Y_valid)],
           verbose = False)
    y_pred_valid = reg.predict(X_valid)
    rmse = mean_squared_error(Y_valid, y_pred_valid, squared=False)
    return rmse

In [170]:
%%time
# Creating Optuna object and defining its parameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials = 10,timeout=900)

# Showing optimization results
print('Number of finished trials:', len(study.trials))
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)
best_params = study.best_params

[I 2024-11-15 12:43:14,585] A new study created in memory with name: no-name-48eef1b6-079d-486a-a513-68b251727496
[I 2024-11-15 12:49:36,044] Trial 0 finished with value: 1.3728244516794064 and parameters: {'n_estimators': 988, 'eta': 0.24920528128068808, 'max_depth': 9, 'min_child_weight': 0.3629705819190647, 'subsample': 0.8010834782461491, 'colsample_bytree': 0.5075052446222936, 'lambda': 4.448749620072329, 'alpha': 2.281933179976858}. Best is trial 0 with value: 1.3728244516794064.
[I 2024-11-15 12:53:43,340] Trial 1 finished with value: 1.5525076174910437 and parameters: {'n_estimators': 1465, 'eta': 0.22075360263561594, 'max_depth': 5, 'min_child_weight': 0.006963114827416474, 'subsample': 0.6887101924939971, 'colsample_bytree': 0.3613161906518685, 'lambda': 7.716280094344977, 'alpha': 2.186105853910366}. Best is trial 0 with value: 1.3728244516794064.
[I 2024-11-15 12:56:54,041] Trial 2 finished with value: 1.6455171162582634 and parameters: {'n_estimators': 907, 'eta': 0.269249

Number of finished trials: 4
Best trial parameters: {'n_estimators': 988, 'eta': 0.24920528128068808, 'max_depth': 9, 'min_child_weight': 0.3629705819190647, 'subsample': 0.8010834782461491, 'colsample_bytree': 0.5075052446222936, 'lambda': 4.448749620072329, 'alpha': 2.281933179976858}
Best score: 1.3728244516794064
CPU times: user 55min 24s, sys: 30.1 s, total: 55min 54s
Wall time: 15min 37s


In [171]:
best_params = study.best_params
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_train, Y_train, 
                eval_set=[(X_valid, Y_valid)], 
                eval_metric='rmse', 
                verbose=False)

In [173]:
y_pred = final_model.predict(test_impute)

In [175]:
combined_test_pred_df = pd.DataFrame(
    {'bg+1:00' : y_pred,
     'id': test_df['id']})

In [177]:
combined_test_pred_df.to_csv("/Users/peggylee/Desktop/python/Kaggle_comptete/02_Data/temp/submission.csv", index=False)