In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
path = '/Users/peggylee/Desktop/python/Kaggle_comptete/02_Data/original_data'
sample = pd.read_csv(f"{path}/sample_submission.csv")
train = pd.read_csv(f"{path}/train.csv")
test = pd.read_csv(f"{path}/test.csv")

In [3]:
print("train shape: ",train.shape)
print("test shape: ", test.shape)
print("train unique participants: ", len(train['p_num'].unique()))
print("train unique participants: ", train['p_num'].unique())
print("test unique participants: ", len(test['p_num'].unique()))
print("test unique participants: ", test['p_num'].unique())

train shape:  (177024, 508)
test shape:  (3644, 507)
train unique participants:  9
train unique participants:  ['p01' 'p02' 'p03' 'p04' 'p05' 'p06' 'p10' 'p11' 'p12']
test unique participants:  15
test unique participants:  ['p01' 'p02' 'p04' 'p05' 'p06' 'p10' 'p11' 'p12' 'p15' 'p16' 'p18' 'p19'
 'p21' 'p22' 'p24']


In [4]:
## train data ##
id_train = train[['id','p_num']]
id_train1 = id_train.groupby('p_num').agg(count = ('p_num','size')).reset_index()
id_train1['count'].describe()

count        9.000000
mean     19669.333333
std       8482.964959
min       8288.000000
25%       8459.000000
50%      24686.000000
75%      25454.000000
max      26028.000000
Name: count, dtype: float64

In [5]:
## test data ##
id_test = test[['id','p_num']]
id_test1 = id_test.groupby('p_num').agg(count = ('p_num','size')).reset_index()
id_test1['count'].describe()

count     15.000000
mean     242.933333
std       30.674481
min      179.000000
25%      229.000000
50%      244.000000
75%      259.500000
max      294.000000
Name: count, dtype: float64

In [6]:
fixed_train = ['id','p_num','time','bg+1:00']
fixed_test = ['id','p_num','time']

### train ###
bg_bf_train = train.columns[train.columns.str.startswith('bg-')]
insulin_bf_train = train.columns[train.columns.str.startswith('insulin-')]
carbs_bf_train = train.columns[train.columns.str.startswith('carbs-')]
hr_bf_train = train.columns[train.columns.str.startswith('hr-')]
step_bf_train = train.columns[train.columns.str.startswith('steps-')]
cals_bf_train = train.columns[train.columns.str.startswith('cals-')]
activity_bf_train = train.columns[train.columns.str.startswith('activity-')]

### test ###
bg_bf_test = test.columns[test.columns.str.startswith('bg-')]
insulin_bf_test = test.columns[test.columns.str.startswith('insulin-')]
carbs_bf_test = test.columns[test.columns.str.startswith('carbs-')]
hr_bf_test = test.columns[test.columns.str.startswith('hr-')]
step_bf_test = test.columns[test.columns.str.startswith('steps-')]
cals_bf_test = test.columns[test.columns.str.startswith('cals-')]
activity_bf_test = test.columns[test.columns.str.startswith('activity-')]

In [7]:
def melt_column(colnames,df,newcolname,newcolvalue,fixed_col):
    df_melt = pd.melt(df,id_vars = fixed_col, value_vars = colnames, var_name = newcolname, value_name = newcolvalue)
    return(df_melt)


def reshape_df(df,type_data):
    temp1 = df
    temp1.loc[:, 'time_temp'] = pd.to_datetime(temp1['time'], format="%H:%M:%S").dt.time
    ### day made ###
    day = [1]*len(temp1)
    k = 1
    for i in range(1, len(temp1)):
        if temp1['time_temp'].iloc[i] <= temp1['time_temp'].iloc[i - 1]:
            k += 1
        day[i] = k
    temp1['day'] = day
    max_time = max(day)
    
    ### time sequence ###
    time_sequence = pd.date_range("00:00", "23:55", freq="5T").time
    time_sequence_data = pd.DataFrame({
        'time_seq': np.tile([t.strftime("%H:%M:%S") for t in time_sequence], max_time),
        'day': np.repeat(np.arange(1, max_time + 1), len(time_sequence)),
        'bg_measure': np.nan
    })
    temp1.loc[:,'time_pre'] = pd.to_datetime(temp1['time'], format = "%H:%M:%S")
    ### outcome ###
    if type_data == "train":
        temp1.loc[:,'time_pre_seconds'] = temp1['time_pre'].dt.hour * 3600 + temp1['time_pre'].dt.minute * 60 + temp1['time_pre'].dt.second
        temp1.loc[:,'time_pre_seconds'] += 3600  # 加一小時
        temp1.loc[:,'day_outcome'] = np.where(temp1['time_pre_seconds'] >= 86400, temp1['day'] + 1, temp1['day']) #day_new
        temp1.loc[:,'time_pre_outcome'] = pd.to_timedelta(temp1['time_pre_seconds'], unit='s')
        temp1.loc[:,'time_pre_outcome'] = temp1['time_pre_outcome'].apply(lambda x: str(x).split()[2])
    if type_data == "test":
        temp1.loc[:,'time_pre_seconds'] = temp1['time_pre'].dt.hour * 3600 + temp1['time_pre'].dt.minute * 60 + temp1['time_pre'].dt.second
        temp1.loc[:,'time_pre_seconds'] += 3600  # 加一小時
        temp1.loc[:,'day_outcome'] = np.where(temp1['time_pre_seconds'] >= 86400, temp1['day'] + 1, temp1['day']) #day_new
        temp1.loc[:,'time_pre_outcome'] = pd.to_timedelta(temp1['time_pre_seconds'], unit='s')
        temp1.loc[:,'time_pre_outcome'] = temp1['time_pre_outcome'].apply(lambda x: str(x).split()[2])
        temp2 = temp1[['day_outcome','time_pre_outcome',"id","p_num"]]
    ### reshape: wide to long ### 
    col_temp = temp1.columns[3:507]
    fixed_colname = temp1.drop(columns=col_temp).columns # fixed_colname_train


    ### feature ###
    bg_bf = temp1.columns[temp1.columns.str.startswith('bg-')]
    insulin_bf = temp1.columns[temp1.columns.str.startswith('insulin-')]
    carbs_bf = temp1.columns[temp1.columns.str.startswith('carbs-')]
    hr_bf = temp1.columns[temp1.columns.str.startswith('hr-')]
    step_bf = temp1.columns[temp1.columns.str.startswith('steps-')]
    cals_bf = temp1.columns[temp1.columns.str.startswith('cals-')]
    activity_bf = temp1.columns[temp1.columns.str.startswith('activity-')]
    
    ### wideform to longform ###
    bg_melt = melt_column(bg_bf,temp1,"bg_time","bg_value",fixed_col = fixed_colname)
    insulin_melt = melt_column(insulin_bf,temp1,"insulin_time","insulin_value",fixed_col = fixed_colname)
    carbs_melt = melt_column(carbs_bf,temp1,"carbs_time","carbs_value",fixed_col = fixed_colname)
    hr_melt = melt_column(hr_bf,temp1,"hr_time","hr_value",fixed_col = fixed_colname)
    step_melt = melt_column(step_bf,temp1,"step_time","step_value",fixed_col = fixed_colname)
    cals_melt = melt_column(cals_bf,temp1,"cals_time","cals_value",fixed_col = fixed_colname)
    activity_melt = melt_column(activity_bf,temp1,"activity_time","activity_value",fixed_col = fixed_colname)

    ### combine ###
    melt_pre1 = pd.concat([bg_melt,insulin_melt.drop(columns = fixed_colname)],axis = 1)
    melt_pre1 = pd.concat([melt_pre1,carbs_melt.drop(columns = fixed_colname)],axis = 1)
    melt_pre1 = pd.concat([melt_pre1,hr_melt.drop(columns = fixed_colname)],axis = 1)
    melt_pre1 = pd.concat([melt_pre1,step_melt.drop(columns = fixed_colname)],axis = 1)
    melt_pre1 = pd.concat([melt_pre1,cals_melt.drop(columns = fixed_colname)],axis = 1)
    melt_final = pd.concat([melt_pre1,activity_melt.drop(columns = fixed_colname)],axis = 1)
    ### organize train data ###
    melt_final.loc[:,'bg_time'] = melt_final['bg_time'].str.extract(r'(.*)-(.*)')[1]
    melt_final.loc[:,'bg_time1'] = pd.to_datetime(melt_final['bg_time'].str.pad(5, fillchar='0') + ":00", format = "%H:%M:%S")
    melt_final.loc[:,'bg_time2'] = (melt_final['time_pre'] - melt_final['bg_time1']).dt.total_seconds()
    melt_final.loc[:,'final_time'] = pd.to_timedelta(melt_final['bg_time2'], unit='s')
    melt_final.loc[:,'day_final'] = np.where(melt_final['bg_time2']< 0,melt_final['day'] - 1, melt_final['day'])
    melt_final.loc[:,'final_time1'] = melt_final['final_time'].apply(lambda x: str(x).split()[-1].replace('+', '').replace('-', ''))

    
    ### outcome ###
    if type_data == "train":
        melt_final_outcome = melt_final[['day_outcome','time_pre_outcome','bg+1:00','insulin_value','carbs_value','hr_value','step_value','cals_value','activity_value']]
        melt_final_outcome = melt_final_outcome.sort_values(by = ['day_outcome','time_pre_outcome']).drop_duplicates(subset = ['day_outcome','time_pre_outcome'],keep = 'first').reset_index(drop = True)

    ### original ###
    melt_final_original = melt_final[['day_final','final_time1','bg_value','insulin_value','carbs_value','hr_value','step_value','cals_value','activity_value']]
    ### arrange and drop duplicates two dataset ###
    melt_final_original =  melt_final_original.sort_values(by=['day_final','final_time1']).drop_duplicates(subset=['day_final','final_time1'], keep='first').reset_index(drop=True)
    melt_final_original2 = pd.merge(melt_final_original,time_sequence_data , left_on= ['day_final', 'final_time1'], right_on = ['day','time_seq'], how='outer')
    melt_final_original2 = melt_final_original2.drop(columns = ['day_final','final_time1','bg_measure']) 
    melt_final_original2 = melt_final_original2.sort_values(by = ['day','time_seq']).drop_duplicates(subset=['day','time_seq'], keep='first').reset_index(drop=True)
    melt_final_original2 = melt_final_original2.dropna(subset=['day','time_seq'], how='all')
    ## rename data ##
    if type_data == "train":    
        melt_final_outcome1 = melt_final_outcome.rename(columns = {
        'bg+1:00' : 'bg_value',
        'time_pre_outcome' : 'time_seq',
        'day_outcome' :'day'
        })
        col_order = melt_final_original2.columns
        melt_final_outcome1 = melt_final_outcome1[col_order]
    
        ### only for train data ###
        train_final = pd.concat([melt_final_original2,melt_final_outcome1],axis = 0).sort_values(by = ['day','time_seq']).drop_duplicates(subset = ['day','time_seq'],keep = 'first').reset_index(drop = True)
    
    if type_data == "train":
        return{'train': train_final}
    else:
        return{'test': melt_final_original2,'output':temp2}




In [8]:
train_dict = {f'p{str(i).zfill(2)}_train': train[train['p_num'] == f'p{str(i).zfill(2)}'] for i in [1, 2, 3, 4, 5, 6, 10, 11, 12]}
reshaped_train_dict = {}
for key, df in train_dict.items():
    reshaped_train_dict[key] = reshape_df(df, "train")

In [9]:
test_dict = {f'p{str(i).zfill(2)}_test': test[test['p_num'] == f'p{str(i).zfill(2)}'] for i in [1, 2, 4, 5, 6, 10, 11, 12, 15, 16, 18 ,19, 21, 22, 24]}
reshaped_test_dict = {}

In [10]:
for key, df in test_dict.items():
    reshaped_test_dict[key] = reshape_df(df, "test")

In [11]:
missing_rate_dict = {}

for key, sub_dict in reshaped_train_dict.items():
    if 'train' in sub_dict:
        df = sub_dict['train']
        missing_rate = df.isnull().sum() / len(df) * 100
        missing_rate_dict[key] = missing_rate
        
missing_rate_dict
## carbs, step, activity_value missing value are high so dropout 

{'p01_train': bg_value          51.129316
 insulin_value      0.158652
 carbs_value       98.352900
 hr_value           5.754752
 step_value        30.980471
 cals_value         1.300949
 activity_value    96.030807
 time_seq           0.000000
 day                0.000000
 dtype: float64,
 'p02_train': bg_value           1.054079
 insulin_value      0.307440
 carbs_value       98.732050
 hr_value          78.727467
 step_value        84.146807
 cals_value        71.492133
 activity_value    99.060495
 time_seq           0.000000
 day                0.000000
 dtype: float64,
 'p03_train': bg_value           0.753941
 insulin_value      0.131369
 carbs_value       98.808164
 hr_value          10.793161
 step_value        41.902749
 cals_value         5.652654
 activity_value    99.573528
 time_seq           0.000000
 day                0.000000
 dtype: float64,
 'p04_train': bg_value           0.601102
 insulin_value      0.004021
 carbs_value       98.144425
 hr_value          49.39286

In [12]:
train_dict_pre = {}

for key,train_dict in reshaped_train_dict.items():
    
    if 'train' in train_dict:
        df = train_dict['train']
        df = df.fillna(method='ffill').fillna(method='bfill')
        train_dict_pre[key] = df.drop(columns = ['carbs_value','step_value','activity_value','time_seq','day'])

In [13]:
test_dict_pre = {}

for key,test_dict in reshaped_test_dict.items():
    if 'test' in test_dict:
        df = test_dict['test']
        df = df.fillna(method='ffill').fillna(method='bfill')
        test_dict_pre[key] = df.drop(columns = ['carbs_value','step_value','activity_value','time_seq','day'])

In [14]:
combined_train = pd.concat(train_dict_pre.values(), ignore_index=True)
scaler = MinMaxScaler()
combined_train_scaled = scaler.fit_transform(combined_train)
combined_train_scaled_df = pd.DataFrame(combined_train_scaled, columns=combined_train.columns)

In [15]:
def lstm_format(df, time_steps):
    X_data = []
    y_data = []
    L = len(df)
    
    for i in range(L - time_steps):
        X_data.append(df[i:i+time_steps])
        y_data.append(df[i+time_steps:i+time_steps+1]['bg_value'])
    return np.array(X_data), np.array(y_data)

In [16]:
lstm_x_train,lstm_y_train = lstm_format(combined_train_scaled_df,36)

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

2024-11-06 17:19:05.765341: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
time_steps = 36  # 時間步長
num_features = len(combined_train_scaled_df.columns) # 特徵數量

In [19]:
model = Sequential([
    GRU(units=64, activation='tanh', input_shape=(time_steps, num_features), return_sequences=False),
    Dropout(0.2),
    Dense(units=64),
    Dense(units=1)  # 最終輸出一個值（例如血糖預測值）
])

In [20]:
model.summary()

In [21]:
model.compile(
    loss='mean_squared_error',
    optimizer=Adam(learning_rate=0.002)
)

In [22]:
history = model.fit(
    x=lstm_x_train,  # 輸入的訓練數據（形狀為 [samples, time_steps, num_features]）
    y=lstm_y_train,   # 預測目標（例如 1 小時後的血糖值）
    epochs=50,         # 訓練迭代次數
    batch_size=32,
    validation_split=0.2
)

Epoch 1/50
[1m10308/10308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 18ms/step - loss: 7.9928e-04 - val_loss: 2.0089e-04
Epoch 2/50
[1m10308/10308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 17ms/step - loss: 4.5544e-04 - val_loss: 1.8494e-04
Epoch 3/50
[1m10308/10308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 16ms/step - loss: 4.4439e-04 - val_loss: 3.3345e-04
Epoch 4/50
[1m10308/10308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 16ms/step - loss: 4.3289e-04 - val_loss: 1.8689e-04
Epoch 5/50
[1m10308/10308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 20ms/step - loss: 4.2890e-04 - val_loss: 1.9336e-04
Epoch 6/50
[1m10308/10308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 18ms/step - loss: 4.0948e-04 - val_loss: 1.9438e-04
Epoch 7/50
[1m10308/10308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 22ms/step - loss: 4.0046e-04 - val_loss: 2.5985e-04
Epoch 8/50
[1m10308/10308[0m [32m━━━━━━━━━━━━━━━━━━━

In [23]:
for key, df in test_dict_pre.items():
    clean_key = key.replace("_test","")
    df['p_num'] = clean_key
## test ##
combined_test = pd.concat(test_dict_pre.values(), ignore_index=True)
combined_no_pnum = combined_test.drop(columns = ['p_num'])
combined_test_scaled = scaler.transform(combined_no_pnum)
combined_test_scaled_df = pd.DataFrame(combined_test_scaled, columns=combined_no_pnum.columns)

In [24]:
lstm_x_test,lstm_y_test = lstm_format(combined_test_scaled_df,36)

In [25]:
predicted_y = model.predict(lstm_x_test)

[1m16514/16514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 5ms/step


In [26]:
# 反標準化 combined_test_scaled
predicted_inverse = predicted_y*(scaler.data_max_[0] - scaler.data_min_[0])+scaler.data_min_[0]
predicted_inverse1 = np.concatenate((np.full((36, 1), np.nan), predicted_inverse), axis=0)
predicted_inverse_df = pd.DataFrame(predicted_inverse1, columns=['predicted'])

In [27]:
print("combined test",len(combined_test_scaled_df))
print("predicted_y",len(predicted_y))


combined test 528480
predicted_y 528444


In [28]:
test_predicted = {}
test_outcome = {}
for key,test_dict in reshaped_test_dict.items():
    if 'test' in test_dict:
        df = test_dict['test']
        df = df.drop(columns = ['carbs_value','step_value','activity_value'])
        clean_key = key.replace("_test","")
        df['p_num'] = clean_key
        test_predicted[key] = df
    if 'output' in test_dict:
        temp = test_dict['output']
        temp1 = temp.rename(columns = {
            'time_pre_outcome' : 'time_seq',
            'day_outcome' :'day'
            })
        clean_key = key.replace("_test","")
        temp1['p_num'] = clean_key
        test_outcome[key] = temp1

In [29]:
test_predict_df = pd.concat(test_predicted.values(), ignore_index=True)
test_predict_df1 = pd.concat([test_predict_df,predicted_inverse_df],axis = 1)
test_outcome_df = pd.concat(test_outcome.values())

In [33]:
test_predict_df1[['day']] = test_predict_df1[['day']].astype(int).astype('object')
test_predict_df1[['time_seq']] = test_predict_df1[['time_seq']].astype('object')
test_outcome_df['time_seq'] = test_outcome_df['time_seq'].astype(str).str.split(' ').str[-1]
test_outcome_df[['day', 'time_seq']] = test_outcome_df[['day', 'time_seq']].astype('object')
test_final = pd.merge(test_predict_df1,test_outcome_df, on = ['day', 'time_seq','p_num'],how = 'inner')

In [34]:
test_final1 = test_final[['id','predicted']].rename(columns = {
    'predicted' : 'bg+1:00'
})
test_final1.to_csv(f"{path}/submission.csv", index=False)