In [15]:
import cv2
import mediapipe as mp

# Step 1: Load the video file and extract each frame
cap = cv2.VideoCapture('data1.avi')

# Step 2: Initialize MediaPipe's face detection model
mp_face_detection = mp.solutions.face_detection
face_detection = mp_face_detection.FaceDetection()

# Step 3: Detect faces in each frame and select ROI for each face
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Convert the image to RGB format for MediaPipe
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Detect faces using MediaPipe's face detection model
    results = face_detection.process(frame_rgb)

    # Iterate through each detected face and select ROI
    for detection in results.detections:
        bbox = detection.location_data.relative_bounding_box
        h, w, _ = frame.shape
        x, y, w, h = int(bbox.xmin * w), int(bbox.ymin * h), int(bbox.width * w), int(bbox.height * h)

        # Select ROI for each face (forehead, cheek, and nose)
        forehead_roi = frame[y:y+h//3, x:x+w]
        cheek_roi = frame[y+h//3:y+2*h//3, x:x+w]
        nose_roi = frame[y+2*h//3:y+h, x:x+w]

        # Draw bounding box and ROI on the frame
        # cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
        cv2.rectangle(frame, (x, y), (x+w, y+h//3), (255, 0, 0), 2) # Forehead ROI
        cv2.rectangle(frame, (x, y+h//3), (x+w, y+2*h//3), (0, 0, 255), 2) # Cheek ROI
        cv2.rectangle(frame, (x, y+2*h//3), (x+w, y+h), (255, 255, 0), 2) # Nose ROI

    # Show the frame with detected face area and ROI area
    cv2.imshow('frame', frame)
    if cv2.waitKey(1) == ord('q'):
        break

# Release the video capture and destroy all windows
cap.release()
cv2.destroyAllWindows()


In [16]:
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet18
import numpy as np
import torchvision
import torchvision.transforms as transforms

# Step 1: Load the pre-trained ResNet18 model
# model = resnet18(pretrained=True)
# model.fc = torch.nn.Identity() # Remove the last fully connected layer
# Remove the last layer of the ResNet18 model
resnet18 = torchvision.models.resnet18()
modules = list(resnet18.children())[: -4]   # Select up to the 14th layer
model = torch.nn.Sequential(*modules)

# Step 2: Normalize the pixel values in each map
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Step 3: Extract spatial-temporal features from ROIs in each frame
feature_maps = []
cap = cv2.VideoCapture('data1.avi')
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Convert the ROIs to PyTorch tensor and normalize the pixel values
    forehead_tensor = normalize(transforms.ToTensor()(forehead_roi)).unsqueeze(0)
    cheek_tensor = normalize(transforms.ToTensor()(cheek_roi)).unsqueeze(0)
    nose_tensor = normalize(transforms.ToTensor()(nose_roi)).unsqueeze(0)

    # Extract features using the pre-trained ResNet18 model
    with torch.no_grad():
        forehead_features = model(forehead_tensor).squeeze().numpy()
        cheek_features = model(cheek_tensor).squeeze().numpy()
        nose_features = model(nose_tensor).squeeze().numpy()

    # Step 2: Normalize the pixel values in each map to have a mean of zero and a standard deviation of one
    forehead_features = (forehead_features - np.mean(forehead_features)) / np.std(forehead_features)
    cheek_features = (cheek_features - np.mean(cheek_features)) / np.std(cheek_features)
    nose_features = (nose_features - np.mean(nose_features)) / np.std(nose_features)

    # Step 3: Create a spatial-temporal map by combining the normalized maps
    spatial_temporal_map = np.concatenate((forehead_features, cheek_features,nose_features), axis=0)
    feature_maps.append(spatial_temporal_map)

# Convert the feature maps to a NumPy array
feature_maps = np.array(feature_maps)

# Release the video capture and destroy all windows
cap.release()
cv2.destroyAllWindows()


In [18]:
# print(feature_maps)
print(feature_maps.shape)

(1205, 384, 6, 18)


In [84]:
import numpy as np
from scipy import signal
from sklearn.decomposition import PCA

# Step 1: Load spatial-temporal maps data
feature_maps = feature_maps
total_samples = feature_maps.shape[0]
# Step 2: Extract RPPG signals using PCA
pca = PCA(n_components=1)
rppg_signal = pca.fit_transform(feature_maps.reshape(-1, feature_maps.shape[-1])).squeeze()

# Step 3: Filter and process RPPG signals using band-pass filtering
sampling_rate = total_samples / 60 # Hz
fs = sampling_rate

nyquist = fs / 2
low_freq= 5  # Lower frequency bound in Hz
high_freq = 15  # Upper frequency bound in Hz
order = 4  # Filter order

    # Define bandpass filter
nyquist_freq = 0.5 * sampling_rate
low = low_freq / nyquist_freq
high = high_freq / nyquist_freq

b, a = signal.butter(order, [low, high], btype='band')
filtered_signal = signal.filtfilt(b, a, rppg_signal)

# Step 4: Estimate HR from the processed RPPG signals using FFT
nfft = 2**np.ceil(np.log2(len(filtered_signal)))
freq = np.fft.rfftfreq(int(nfft), 1/fs)
fft = np.abs(np.fft.rfft(filtered_signal, int(nfft)))**2
dominant_freq = freq[np.argmax(fft)]
HR = dominant_freq * 60

# Step 5: Save the output (BMP) in a variable named "BMP"
BMP = HR  # Replace with any additional computations or post-processing of the HR value
print(BMP)


ValueError: Digital filter critical frequencies must be 0 < Wn < 1

In [64]:
import numpy as np
from sklearn.decomposition import PCA

# load the spatial-temporal feature maps generated in step 4
spatial_temporal_map = feature_maps

# preprocess the spatial-temporal maps by normalizing pixel values
# normalized_maps = []
# for i in range(spatial_temporal_map.shape[0]):
#     map_i = spatial_temporal_map[i]
#     map_i = (map_i - np.mean(map_i)) / np.std(map_i)
#     normalized_maps.append(map_i)
# normalized_maps = np.array(normalized_maps)

# reshape the normalized maps to prepare them for PCA
reshaped_maps = spatial_temporal_map.reshape(spatial_temporal_map.shape[0], -1)

# apply PCA to the reshaped maps to extract RPPG signals
pca = PCA(n_components=1)
rppg_signals = pca.fit_transform(reshaped_maps)

print('Extracted RPPG signals:', rppg_signals)
print(rppg_signals.shape)
step5_output = rppg_signals

Extracted RPPG signals: [[0.00027987]
 [0.00027988]
 [0.00027988]
 ...
 [0.00027988]
 [0.00027988]
 [0.00027988]]
(1205, 1)


In [65]:
import scipy.signal as signal



# Define the band-pass filter parameters
fs = 20  # sampling rate
lowcut = 8  # lower cutoff frequency (Hz)
highcut = 12 # higher cutoff frequency (Hz)

# Define the filter coefficients
b, a = signal.butter(4, [lowcut, highcut], btype='band')

# Filter the RPPG signals using the band-pass filter
filtered_signals = signal.filtfilt(b, a, step5_output.flatten())

# Apply baseline drift correction using detrend function
processed_signals = signal.detrend(filtered_signals)

# Save the processed signals in a variable named "step6_output"
step6_output = processed_signals.reshape(-1, 1)
print(step6_output)
print(step6_output.shape)


ValueError: Digital filter critical frequencies must be 0 < Wn < 1

In [56]:
import numpy as np
from scipy.fft import fft, fftfreq

# Define sampling rate
fs = 30  # assuming 30 frames per second

# Compute power spectral density using FFT
freqs = fftfreq(step6_output.shape[0], 1/fs)
fft_vals = fft(step6_output[:, 0])
psd = np.abs(fft_vals) ** 2

# Find the peak frequency in the power spectral density
max_idx = np.argmax(psd)
hr = freqs[max_idx] * 60

# Print estimated heart rate
print("Estimated heart rate: {:.2f} bpm".format(hr))

# Save output in a variable
step7_output = hr


Estimated heart rate: 43.32 bpm


In [44]:
import cv2
import mediapipe as mp
import numpy as np

# Load the video file
cap = cv2.VideoCapture('data1.avi')

# Initialize MediaPipe face detection model
mp_face_detection = mp.solutions.face_detection.FaceDetection()

# Create an empty list to store ROIs
ROI = []

# Loop through each frame of the video
while True:
    # Read a frame from the video
    ret, frame = cap.read()
    if not ret:
        break

    # Convert the frame to RGB for MediaPipe face detection model
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Detect faces in the frame using MediaPipe face detection model
    results = mp_face_detection.process(frame_rgb)

    # Extract ROIs for each detected face
    for detection in results.detections:
        # Get the bounding box coordinates for the face detection
        bbox = detection.location_data.relative_bounding_box
        x1, y1, w, h = int(bbox.xmin * frame.shape[1]), int(bbox.ymin * frame.shape[0]), \
                        int(bbox.width * frame.shape[1]), int(bbox.height * frame.shape[0])
        x2, y2 = x1 + w, y1 + h

        # Define the ROIs for the forehead, cheek, and nose
        forehead_roi = frame[y1:y1+int(h*0.2), x1:x2]
        cheek_roi = frame[y1+int(h*0.2):y2-int(h*0.2), x1:x2]
        # nose_roi = frame[y2-int(h*0.2):y2, x1:x2]

        # Combine the ROIs into one image
        roi = np.vstack((forehead_roi, cheek_roi))

        # Add the ROI to the list of ROIs
        ROI.append(roi)

        # Draw the bounding box and ROIs on the frame
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.rectangle(frame, (x1, y1), (x2, y1+int(h*0.2)), (0, 255, 255), 2) #forehead_roi
        cv2.rectangle(frame, (x1, y1+int(h*0.2)), (x2, y2-int(h*0.2)), (0, 255, 255), 2) #cheek_roi
        # cv2.rectangle(frame, (x1, y2-int(h*0.2)), (x2, y2), (0, 255, 255), 2) #nose_roi

    # Show the frame with bounding box and ROIs
    cv2.imshow('frame', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the video capture and destroy all windows
cap.release()
cv2.destroyAllWindows()



In [45]:
print(ROI, len(ROI))

[array([[[ 27,  22,  21],
        [ 26,  21,  20],
        [ 23,  18,  17],
        ...,
        [ 15,  18,  16],
        [  0,   2,   0],
        [ 40,  43,  41]],

       [[ 22,  17,  16],
        [ 22,  17,  16],
        [ 22,  17,  16],
        ...,
        [  7,   9,  11],
        [  8,  10,  12],
        [  2,   4,   6]],

       [[ 22,  17,  16],
        [ 22,  17,  16],
        [ 22,  17,  16],
        ...,
        [  7,   9,  11],
        [  8,  10,  12],
        [  2,   4,   6]],

       ...,

       [[199, 185, 178],
        [199, 185, 178],
        [199, 185, 178],
        ...,
        [169, 166, 165],
        [169, 166, 165],
        [169, 166, 165]],

       [[195, 181, 174],
        [195, 181, 174],
        [195, 181, 174],
        ...,
        [172, 169, 168],
        [172, 169, 168],
        [172, 169, 168]],

       [[195, 181, 174],
        [195, 181, 174],
        [195, 181, 174],
        ...,
        [172, 169, 168],
        [172, 169, 168],
        [172, 169, 168]

In [49]:
import numpy
ROI = numpy.ndarray(ROI)

ValueError: maximum supported dimension for an ndarray is 32, found 1205

In [48]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms

# Load the pre-trained ResNet18 model
resnet18 = models.resnet18(pretrained=True)
resnet18.eval()

# Define a transform to pre-process the ROI image
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

# Create an empty list to store spatial-temporal representations
spatial_temporal_representations = []

# Loop through each ROI in the list of ROIs
for roi in ROI:
    # Pre-process the ROI image
    roi_tensor = transform(roi).unsqueeze(0)

    # Extract features using the ResNet18 model
    with torch.no_grad():
        features = resnet18(roi_tensor)

    # Add the features to the list of spatial-temporal representations
    spatial_temporal_representations.append(features)

# Stack the list of spatial-temporal representations along the time dimension to create a single tensor
spatial_temporal_representations = torch.stack(spatial_temporal_representations, dim=0)



TypeError: Unexpected type <class 'numpy.ndarray'>