# Load data set

In [1]:
import pandas as pd

# load dataset
df = pd.read_csv('2022_04_22_hour_heartbeat_merged.csv')

# Convert 'time' column to datetime format
df['Time'] = pd.to_datetime(df['time'])
df.describe()

Unnamed: 0,Beats,Id,Intensity,Steps
count,6252.0,6252.0,6246.0,6246.0
mean,73.84517,5638328000.0,16.101185,431.416747
std,14.799422,1864900000.0,25.19096,811.960176
min,46.0,2022484000.0,0.0,0.0
25%,63.0,4558610000.0,0.0,0.0
50%,71.0,5577150000.0,7.0,130.0
75%,81.0,6962181000.0,22.0,542.0
max,167.0,8877689000.0,180.0,10554.0


# Clean datset (fill in missing values)

In [2]:
# Fill the missing values with zeros for the specific patient 5553957443 in the full dataset
df.loc[df['Id'] == 5553957443, ['Intensity', 'Steps']] = df.loc[df['Id'] == 5553957443, ['Intensity', 'Steps']].fillna(0)

# Verify by checking for any remaining missing values for patient 5553957443
remaining_missing_values_patient = df[df['Id'] == 5553957443].isnull().sum()

remaining_missing_values_patient

time         0
Beats        0
Id           0
Intensity    0
Steps        0
Time         0
dtype: int64

In [3]:
# Checking for missing values in the dataset
missing_values_by_patient = df[df.isnull().any(axis=1)]

# Applying the previous valid observation to fill the remaining missing values for each patient with missing data
for patient_id, patient_data in missing_values_by_patient.groupby('Id'):
    
    patient_df = df[df['Id'] == patient_id]
    missing_indices = patient_data.index
    # Apply the previous valid observation forward to fill the missing values
    df.loc[patient_df.index, ['Intensity', 'Steps']] = patient_df[['Intensity', 'Steps']].fillna(method='ffill')

# Check if all missing values are gone
all_missing_values_filled = df.isna().sum()
all_missing_values_filled

time         0
Beats        0
Id           0
Intensity    0
Steps        0
Time         0
dtype: int64

# Split and scale datasets

In [4]:
from sklearn . preprocessing import MinMaxScaler


split_index = int(len(df) * 0.7)

# Split the data into training and testing sets
train_data = df.iloc[:split_index]
test_data = df.iloc[split_index:]

# Define the columns we want to scale
columns_to_scale = ['Beats', 'Intensity', 'Steps']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training data aand transform it
train_data_scaled = scaler.fit_transform(train_data[columns_to_scale])

# Use the same scaler to transform the testing data
test_data_scaled = scaler.transform(test_data[columns_to_scale])

train_data_scaled_df = pd.DataFrame(train_data_scaled, index=train_data.index, columns=columns_to_scale)
test_data_scaled_df = pd.DataFrame(test_data_scaled, index=test_data.index, columns=columns_to_scale)

train_data_scaled_df['Time'] = train_data['Time']
test_data_scaled_df['Time'] = test_data['Time']

train_data_scaled_df.head(), test_data_scaled_df.head()


(      Beats  Intensity  Steps                Time
 0  0.456790        0.0    0.0 2016-04-12 01:00:00
 1  0.395062        0.0    0.0 2016-04-12 02:00:00
 2  0.246914        0.0    0.0 2016-04-12 03:00:00
 3  0.271605        0.0    0.0 2016-04-12 04:00:00
 4  0.271605        0.0    0.0 2016-04-12 05:00:00,
          Beats  Intensity     Steps                Time
 4376  0.444444   0.255556  0.234160 2016-04-19 09:00:00
 4377  0.555556   0.627778  0.702996 2016-04-19 10:00:00
 4378  0.333333   0.044444  0.016185 2016-04-19 11:00:00
 4379  0.271605   0.011111  0.007748 2016-04-19 12:00:00
 4380  0.283951   0.066667  0.044421 2016-04-19 13:00:00)

# Feature engineering

In [5]:
# intensity an hour before
train_data_scaled_df['Intensity_1h_before'] = train_data_scaled_df['Intensity'].shift(1)
test_data_scaled_df['Intensity_1h_before'] = test_data_scaled_df['Intensity'].shift(1)

# average beart beat from the last 3hours
# train_data_scaled_df['Avg_3h_heartbeat'] = train_data_scaled_df['Beats'].rolling(3).mean()
# test_data_scaled_df['Avg_3h_heartbeat'] = test_data_scaled_df['Beats'].rolling(3).mean()

# time based feature, hour of the day
train_data_scaled_df['Hour_of_day'] = train_data_scaled_df['Time'].dt.hour
test_data_scaled_df['Hour_of_day'] = test_data_scaled_df['Time'].dt.hour
all_missing_values_filled = train_data_scaled_df.isna().sum()
all_missing_values_filled

Beats                  0
Intensity              0
Steps                  0
Time                   0
Intensity_1h_before    1
Hour_of_day            0
dtype: int64

# Feature selection


In [6]:
# Calculate the correlation matrix
correlation_matrix_train = train_data_scaled_df.corr()

# Get the correlation of all features with 'TotalIntensity'
correlation_with_target_train = correlation_matrix_train['Intensity'].sort_values(ascending=False)

# Display the correlation values
correlation_with_target_train

  correlation_matrix_train = train_data_scaled_df.corr()


Intensity              1.000000
Steps                  0.899235
Intensity_1h_before    0.411949
Hour_of_day            0.192062
Beats                  0.170927
Name: Intensity, dtype: float64

# data preperation for random forest

In [7]:
from sklearn.model_selection import train_test_split


#drop rows with nan values
train_data_scaled_df = train_data_scaled_df.dropna()

y = train_data_scaled_df['Intensity'] 
X = train_data_scaled_df.drop(['Intensity', 'Time'], axis=1)  # Features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)



# Feature Importance

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
rf.fit(X_train, y_train)


In [9]:
# Get feature importances
importances = rf.feature_importances_

# Convert the importances into a DataFrame
feature_importances = pd.DataFrame(importances, index=X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances)


                     importance
Steps                  0.870404
Intensity_1h_before    0.051188
Beats                  0.050610
Hour_of_day            0.027799


# lstm data preperation

In [10]:
import numpy as np
train_data_scaled_df = train_data_scaled_df.drop(['Time'], axis=1)
test_data_scaled_df = test_data_scaled_df.drop(['Time'], axis=1)

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

np_array = np.array(train_data_scaled_df['Hour_of_day']).reshape(-1, 1)

# Fit the scaler on the training data aand transform it
train_data_scaled = scaler.fit_transform(np_array)

test_np_array = np.array(test_data_scaled_df['Hour_of_day']).reshape(-1, 1)

# Use the same scaler to transform the testing data
test_data_scaled = scaler.transform(test_np_array)

train_data_scaled_df['Hour_of_day'] = train_data_scaled
test_data_scaled_df['Hour_of_day'] = test_data_scaled

train_data_scaled_df.head(), test_data_scaled_df.head()

(      Beats  Intensity  Steps  Intensity_1h_before  Hour_of_day
 1  0.395062        0.0    0.0                  0.0     0.086957
 2  0.246914        0.0    0.0                  0.0     0.130435
 3  0.271605        0.0    0.0                  0.0     0.173913
 4  0.271605        0.0    0.0                  0.0     0.217391
 5  0.358025        0.0    0.0                  0.0     0.260870,
          Beats  Intensity     Steps  Intensity_1h_before  Hour_of_day
 4376  0.444444   0.255556  0.234160                  NaN     0.391304
 4377  0.555556   0.627778  0.702996             0.255556     0.434783
 4378  0.333333   0.044444  0.016185             0.627778     0.478261
 4379  0.271605   0.011111  0.007748             0.044444     0.521739
 4380  0.283951   0.066667  0.044421             0.011111     0.565217)

In [11]:
# Define sequence length and prepare the final sequences for training
import numpy as np

sequence_length = 20  # Length of each sequence

train_data_scaled_df.dropna(inplace=True)
train_data_scaled_df.reset_index(drop=True, inplace=True)
print(train_data_scaled_df)
y = train_data_scaled_df['Intensity'] 
X = train_data_scaled_df.drop(['Intensity'], axis=1)  # Features
X_seq = []
y_seq = []

for i in range(sequence_length, len(train_data_scaled_df)):
    print(X[i-sequence_length:i])
    X_seq.append(X[i-sequence_length:i])
    y_seq.append(y[i])

X_train_seq = np.array(X_seq)
y_train_seq = np.array(y_seq)

# Display shape of the sequences
X_train_seq.shape, y_train_seq.shape

         Beats  Intensity     Steps  Intensity_1h_before  Hour_of_day
0     0.395062   0.000000  0.000000                  0.0     0.086957
1     0.246914   0.000000  0.000000                  0.0     0.130435
2     0.271605   0.000000  0.000000                  0.0     0.173913
3     0.271605   0.000000  0.000000                  0.0     0.217391
4     0.358025   0.000000  0.000000                  0.0     0.260870
...        ...        ...       ...                  ...          ...
4370  0.407407   0.000000  0.000000                  0.0     0.173913
4371  0.382716   0.000000  0.000000                  0.0     0.217391
4372  0.271605   0.000000  0.000000                  0.0     0.260870
4373  0.333333   0.000000  0.000000                  0.0     0.304348
4374  0.407407   0.122222  0.113464                  0.0     0.347826

[4375 rows x 5 columns]
       Beats     Steps  Intensity_1h_before  Hour_of_day
0   0.395062  0.000000             0.000000     0.086957
1   0.246914  0.00000

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [12]:
"""
import numpy as np

sequence_length = 5 # Length of each sequence

X_seq = []
y_seq = []

# Loop with steps of size 'sequence_length' to avoid overlap
for i in range(sequence_length, len(train_data_scaled_df), sequence_length):
    print(i)
    X_seq.append(X[i-sequence_length:i])
    y_seq.append(y[i])

X_train_seq = np.array(X_seq)
y_train_seq = np.array(y_seq)

# Display shape of the sequences
X_train_seq.shape, y_train_seq.shape
"""

5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
255
260
265
270
275
280
285
290
295
300
305
310
315
320
325
330
335
340
345
350
355
360
365
370
375
380
385
390
395
400
405
410
415
420
425
430
435
440
445
450
455
460
465
470
475
480
485
490
495
500
505
510
515
520
525
530
535
540
545
550
555
560
565
570
575
580
585
590
595
600
605
610
615
620
625
630
635
640
645
650
655
660
665
670
675
680
685
690
695
700
705
710
715
720
725
730
735
740
745
750
755
760
765
770
775
780
785
790
795
800
805
810
815
820
825
830
835
840
845
850
855
860
865
870
875
880
885
890
895
900
905
910
915
920
925
930
935
940
945
950
955
960
965
970
975
980
985
990
995
1000
1005
1010
1015
1020
1025
1030
1035
1040
1045
1050
1055
1060
1065
1070
1075
1080
1085
1090
1095
1100
1105
1110
1115
1120
1125
1130
1135
1140
1145
1150
1155
1160
1165
1170
1175
1180
1185
1190
1195
1200
1205
1210
1215
1220

((874, 5, 4), (874,))

In [13]:
# Define sequence length and prepare the final sequences for test data
import numpy as np
print(test_data_scaled_df)
test_data_scaled_df.dropna(inplace=True)
test_data_scaled_df.reset_index(drop=True, inplace=True)
y = test_data_scaled_df['Intensity'] 
X = test_data_scaled_df.drop(['Intensity'], axis=1)  # Features
print(test_data_scaled_df.loc[11])

sequence_length =5 # Length of each sequence

X_seq = []
y_seq = []

for i in range(sequence_length, len(test_data_scaled_df)):
    X_seq.append(X[i-sequence_length:i])
    y_seq.append(y[i])

X_test_seq = np.array(X_seq)
y_test_seq = np.array(y_seq)

# Display shape of the sequences
X_test_seq.shape, y_test_seq.shape

         Beats  Intensity     Steps  Intensity_1h_before  Hour_of_day
4376  0.444444   0.255556  0.234160                  NaN     0.391304
4377  0.555556   0.627778  0.702996             0.255556     0.434783
4378  0.333333   0.044444  0.016185             0.627778     0.478261
4379  0.271605   0.011111  0.007748             0.044444     0.521739
4380  0.283951   0.066667  0.044421             0.011111     0.565217
...        ...        ...       ...                  ...          ...
6247  0.222222   0.088889  0.131715             0.105556     0.347826
6248  0.197531   0.022222  0.028237             0.088889     0.391304
6249  0.259259   0.066667  0.088499             0.022222     0.434783
6250  0.296296   0.161111  0.242252             0.066667     0.478261
6251  0.493827   0.516667  0.539773             0.161111     0.521739

[1876 rows x 5 columns]
Beats                  0.283951
Intensity              0.011111
Steps                  0.003788
Intensity_1h_before    0.044444
Hour_of

((1870, 5, 4), (1870,))

# Build LSTM model

In [14]:
# Define the model
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # out stands for hidden state h in every time step
        # _ stands for cell state c in every time step
        out, _ = self.lstm(x)

        # out[:, -1, :] selects the hidden state of the last time step for each sequence in the batch.
        # self.linear maps the hidden state to the output dimesnion feature
        out = self.linear(out[:, -1, :])
        return out

In [15]:
# model configuration
input_size = X_train_seq.shape[2]  # number of features
hidden_size = 64  # number of features in hidden states
num_layers = 2  # number of stacked LSTM layers
output_size = 1  # number of output features

# Initialize the model
model = LSTM(input_size, hidden_size, num_layers, output_size)

# Define the loss function and the optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Train LSTM model

split into train and eval

In [16]:
from sklearn.model_selection import train_test_split
# Split data into training and evaluation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_seq, y_train_seq, test_size=0.2, shuffle =False)

In [17]:
# Convert to PyTorch tensors
import torch
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train).view(-1, 1)

X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val).view(-1,1)
# Prepare DataLoader for training data
batch_size = 64
train_dataset = TensorDataset(X_train_tensor , y_train_tensor)
train_loader = DataLoader(train_dataset , batch_size=batch_size , shuffle=True)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

train model

In [18]:
# Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 50
model.to(device)
for epoch in range(num_epochs):
    total_loss = 0.0 # To accumulate loss for each epoch
    model.train()
    for batch_X , batch_y in train_loader:
        # Move data to the appropriate device if necessary (e.g., GPU)
        batch_X , batch_y = batch_X.to(device), batch_y.to(device)
        # Zero the parameter gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * batch_X.size(0)
        # Calculate average loss for the epoch
    epoch_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Training_Loss: {epoch_loss:.4f}")
    
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X , batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item() * batch_X.size(0)
        epoch_loss = total_loss / len(val_loader.dataset)
        print(f"Epoch [{epoch+1}/{num_epochs}], Eval_Loss: {epoch_loss:.4f}")
            
            
        

Epoch [1/50], Training_Loss: 0.0198
Epoch [1/50], Eval_Loss: 0.0097
Epoch [2/50], Training_Loss: 0.0196
Epoch [2/50], Eval_Loss: 0.0089
Epoch [3/50], Training_Loss: 0.0193
Epoch [3/50], Eval_Loss: 0.0091
Epoch [4/50], Training_Loss: 0.0190
Epoch [4/50], Eval_Loss: 0.0092
Epoch [5/50], Training_Loss: 0.0190
Epoch [5/50], Eval_Loss: 0.0081
Epoch [6/50], Training_Loss: 0.0184
Epoch [6/50], Eval_Loss: 0.0077
Epoch [7/50], Training_Loss: 0.0180
Epoch [7/50], Eval_Loss: 0.0077
Epoch [8/50], Training_Loss: 0.0177
Epoch [8/50], Eval_Loss: 0.0072
Epoch [9/50], Training_Loss: 0.0177
Epoch [9/50], Eval_Loss: 0.0069
Epoch [10/50], Training_Loss: 0.0177
Epoch [10/50], Eval_Loss: 0.0087
Epoch [11/50], Training_Loss: 0.0176
Epoch [11/50], Eval_Loss: 0.0070
Epoch [12/50], Training_Loss: 0.0175
Epoch [12/50], Eval_Loss: 0.0068
Epoch [13/50], Training_Loss: 0.0173
Epoch [13/50], Eval_Loss: 0.0071
Epoch [14/50], Training_Loss: 0.0172
Epoch [14/50], Eval_Loss: 0.0070
Epoch [15/50], Training_Loss: 0.0171
E

In [19]:
## Evaluate on evaluation set

In [20]:
X_test_tensor = torch.FloatTensor(X_test_seq)
y_test_tensor = torch.FloatTensor(y_test_seq).view(-1, 1)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
model.eval()
total_loss = 0.0
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X , batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        total_loss += loss.item() * batch_X.size(0)
    mse_loss = total_loss / len(test_loader.dataset)
    print(f"Test MSE {mse_loss:.4f}")

Test MSE 0.0209
