In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Input, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy import stats
import json
import warnings

# Turn off warnings
warnings.filterwarnings("ignore")

# Load the JSON file
with open('../../datasets/hr/heartrate.json') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)
df['dateTime'] = pd.to_datetime(df['dateTime'])
df['bpm'] = df['value'].apply(lambda x: x['bpm'])
df['confidence'] = df['value'].apply(lambda x: x['confidence'])

# Filter out records with confidence less than 2
df = df[df['confidence'] >= 2]

# Outlier detection and removal using Z-score
z_scores = np.abs(stats.zscore(df['bpm']))
df = df[(z_scores < 3)]

# Imputation: Forward fill to handle any missing values
df['bpm'].fillna(method='ffill', inplace=True)

# Sort by dateTime
df = df.sort_values(by='dateTime')

# Check time-series consistency
df['time_diff'] = df['dateTime'].diff().dt.total_seconds()

# Define expected time interval (e.g., 5 seconds)
expected_interval = 5  # in seconds

# Find and remove inconsistent intervals
inconsistent_intervals = df[df['time_diff'] != expected_interval]
num_inconsistent = len(inconsistent_intervals)
df = df[df['time_diff'] == expected_interval]

print(f"Number of inconsistent time intervals deleted: {num_inconsistent}")

# Drop the time_diff column as it's no longer needed
df.drop(columns=['time_diff'], inplace=True)

# Feature engineering: Add lag features (e.g., previous bpm values)
df['bpm_lag1'] = df['bpm'].shift(1)
df.dropna(inplace=True)

# Normalize the bpm values for GAN training
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_bpm = scaler.fit_transform(df[['bpm']])

# Split data into sequences for GAN input
sequence_length = 10
X_train = []
y_train = []
for i in range(sequence_length, len(scaled_bpm)):
    X_train.append(scaled_bpm[i-sequence_length:i, 0])
    y_train.append(scaled_bpm[i, 0])

X_train = np.array(X_train)
y_train = np.array(y_train)

# Reshape X_train for LSTM input [samples, time steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

# Build the Generator model
def build_generator():
    model = Sequential()
    model.add(LSTM(units=64, return_sequences=True, input_shape=(sequence_length, 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=64))
    model.add(Dropout(0.2))
    model.add(Dense(units=1, activation='tanh'))
    return model

# Build the Discriminator model
def build_discriminator():
    model = Sequential()
    model.add(LSTM(units=64, return_sequences=True, input_shape=(sequence_length, 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=64))
    model.add(Dropout(0.2))
    model.add(Dense(units=1, activation='sigmoid'))
    return model

# Compile the GAN model
def build_gan(generator, discriminator):
    discriminator.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5), metrics=['accuracy'])
    discriminator.trainable = False
    
    gan_input = Input(shape=(sequence_length, 1))
    generated_sequence = generator(gan_input)
    
    gan_output = discriminator(generated_sequence)
    
    gan = Model(gan_input, gan_output)
    gan.compile(loss='binary_crossentropy', optimizer=Adam(0.0001, 0.5))
    return gan

# Initialize the models
generator = build_generator()
discriminator = build_discriminator()
gan = build_gan(generator, discriminator)

# Training the GAN model
epochs = 1000  # Reduced epochs
batch_size = 32  # Increased batch size for faster training

real_labels = np.ones((batch_size, 1))
fake_labels = np.zeros((batch_size, 1))

for epoch in range(epochs):
    # Train the discriminator
    idx = np.random.randint(0, X_train.shape[0], batch_size)
    real_sequences = X_train[idx]
    
    noise = np.random.normal(0, 1, (batch_size, sequence_length, 1))
    generated_sequences = generator.predict(noise)
    
    d_loss_real = discriminator.train_on_batch(real_sequences, real_labels)
    d_loss_fake = discriminator.train_on_batch(generated_sequences, fake_labels)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
    
    # Train the generator
    g_loss = gan.train_on_batch(noise, real_labels)
    
    # Print the progress
    if epoch % 100 == 0:
        print(f"{epoch} [D loss: {d_loss[0]}, acc.: {100 * d_loss[1]}%] [G loss: {g_loss}]")

# Generate a new heart rate sequence using the trained generator
noise = np.random.normal(0, 1, (1, sequence_length, 1))
generated_bpm_sequence = generator.predict(noise)
generated_bpm_sequence = scaler.inverse_transform(generated_bpm_sequence)

print("Generated BPM sequence: ", generated_bpm_sequence)

# Evaluate the model
predictions = []
real_values = []

for i in range(sequence_length, len(X_train)):
    noise = np.random.normal(0, 1, (1, sequence_length, 1))
    generated_sequence = generator.predict(noise)
    predicted_bpm = scaler.inverse_transform(generated_sequence)[0][0]
    predictions.append(predicted_bpm)
    real_values.append(df.iloc[i]['bpm'])

# Convert to numpy arrays for evaluation
predictions = np.array(predictions)
real_values = np.array(real_values)

# Calculate evaluation metrics
mse = mean_squared_error(real_values, predictions)
mae = mean_absolute_error(real_values, predictions)
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Number of inconsistent time intervals deleted: 60949
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 915ms/step
0 [D loss: 0.6899533271789551, acc.: 65.625%] [G loss: [array(0.6910176, dtype=float32), array(0.6910176, dtype=float32), array(0.625, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[