In [11]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [12]:
def extract_frame(video_path, frame_number=0):
    """Read a single frame from a video and crop it to the scan area.

    The crop rectangle is looked up in ``mft_video_crop_coordinates.csv``
    (columns ``video_path``, ``x_start``, ``x_end``, ``y_start``, ``y_end``)
    by exact string match on ``video_path``. When no row matches, a warning
    is printed and the uncropped frame is returned.

    Parameters
    ----------
    video_path : str
        Path passed to ``cv2.VideoCapture``; also the lookup key in the
        crop-coordinates CSV.
    frame_number : int, default 0
        Frame position requested through ``cv2.CAP_PROP_POS_FRAMES``.

    Returns
    -------
    None
        If the video could not be opened (kept as a bare ``None`` so that
        existing ``result is None`` checks keep working).
    tuple
        ``(frame, total_frames, fps)`` otherwise. ``frame`` is ``None`` when
        the requested frame could not be read; ``total_frames`` and ``fps``
        are the values reported by the capture object.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error: Could not open video.")
            return None

        # Video properties as reported by the container
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Seek to the requested frame and decode it
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ret, frame = cap.read()
    finally:
        # Always release the capture, including on the early-return and
        # error paths; the decoded frame is already held in memory.
        cap.release()

    if not ret or frame is None:
        print("Error: Could not read frame.")
        return None, total_frames, fps

    # Crop the frame to the scan area using mft_video_crop_coordinates.csv
    df_crop = pd.read_csv('mft_video_crop_coordinates.csv')
    row = df_crop[df_crop['video_path'] == video_path]
    if not row.empty:
        # Use the first matching row if the video is listed more than once
        x_start = int(row['x_start'].values[0])
        x_end = int(row['x_end'].values[0])
        y_start = int(row['y_start'].values[0])
        y_end = int(row['y_end'].values[0])
        # Image arrays are indexed [row (y), column (x)]
        frame = frame[y_start:y_end, x_start:x_end]
    else:
        print("Warning: No crop coordinates found for this video. Using full frame.")

    return frame, total_frames, fps

# video_path ='/cosma7/data/dp004/rrtx34/ultrasound/JCUH/024/JCUH_024_LUS_3/20240118_142615_0003.AVI'
# # Extract single frame (frame 10)
# frame, total_frames, fps = extract_frame(video_path, 10)

# ## checking where to cut frame
# plt.figure(figsize=(10,8))
# plt.imshow(frame)
# plt.grid(True,'both', color='white', linestyle='-', linewidth=0.5, alpha=0.7)
# # Control grid spacing (every N pixels)
# plt.grid(True, which='major', color='white', linewidth=0.5)
# ax = plt.gca()
# ax.set_xticks(np.arange(0, frame.shape[1], 20))  # Every 20 pixels horizontally
# ax.set_yticks(np.arange(0, frame.shape[0], 20))  # Every 20 pixels vertically
# plt.title('Full Frame')
# plt.show()



# count total pixels in cut frame
# total_pixels = frame.shape[0] * frame.shape[1]
# print("Total pixels in cut frame:", total_pixels)

# # find mean pixel value of the frame
# mean_pixel_value = np.mean(frame)
# # # find variance of pixel values in the frame
# variance_pixel_value = np.var(frame)
# std_dev_pixel_value = np.std(frame)
# print("Mean pixel value of the frame:", mean_pixel_value)
# print("Standard deviation of pixel values in the frame:", std_dev_pixel_value)

In [13]:
# Summarise pixel-intensity statistics for a video: spread of the per-frame
# means across frames, and the average spread within a frame.
def calculate_video_stats(video_path):
    """Compute two pixel-intensity summary statistics for a video.

    Every frame index in ``range(total_frames)`` is fetched with
    ``extract_frame``; frames that cannot be read are skipped.

    Parameters
    ----------
    video_path : str
        Path of the video to analyse.

    Returns
    -------
    None
        If the video cannot be opened, or if no frame could be read.
    tuple
        ``(std_of_frame_means, mean_of_frame_stds)``. Both values are
        standard deviations (``np.std``), not variances: the first is the
        standard deviation of the per-frame mean pixel value across frames,
        the second is the mean of the per-frame pixel standard deviations.
    """
    # Open the video once just to find out how many frames it reports
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Could not open video {video_path}")
            return None
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        cap.release()

    frame_means = []
    frame_stds = []

    for frame_num in range(total_frames):
        frame_data = extract_frame(video_path, frame_number=frame_num)
        # extract_frame returns None (open failure) or (None, ...) (read failure)
        if frame_data is None or frame_data[0] is None:
            continue

        frame = frame_data[0]
        frame_means.append(np.mean(frame))
        frame_stds.append(np.std(frame))

    # With no readable frames np.std/np.mean would return NaN (with a
    # RuntimeWarning); report failure the same way as an unopenable video.
    if not frame_means:
        print(f"Error: No readable frames in video {video_path}")
        return None

    std_of_frame_means = np.std(frame_means)
    mean_of_frame_stds = np.mean(frame_stds)
    return std_of_frame_means, mean_of_frame_stds

# video_variance = calculate_video_stats(video_path)
# print("Variance of mean pixel values across video frames:", video_variance[0], "Average variance of pixel values within frames:", video_variance[1])

# video_variance = calculate_video_stats(video_path)
# print("Variance of mean pixel values across video frames:", video_variance[0], "Average variance of pixel values within frames:", video_variance[1])

In [14]:
# compare video stats between duplicate videos
# video_3 = calculate_video_stats('/cosma7/data/dp004/rrtx34/ultrasound/JCUH/034/JCUH_034_LUS_4_Supine/20240430_171120_0003.MP4')
# video_4 = calculate_video_stats('/cosma7/data/dp004/rrtx34/ultrasound/JCUH/034/JCUH_034_LUS_4_Supine/20240430_171120_0004.MP4')
# print("Video 3 - Variance of mean pixel values across video frames:", video_3[0], "Average variance of pixel values within frames:", video_3[1])
# print("Video 4 - Variance of mean pixel values across video frames:", video_4[0], "Average variance of pixel values within frames:", video_4[1])

def select_duplicate_videos(vid1_path, vid2_path):
    """Choose the preferred video out of two duplicates.

    Statistics come from ``calculate_video_stats``, which returns
    ``(spread of frame means across the video, average spread within a
    frame)`` or ``None`` on failure. The metric on which the two videos
    differ most decides:

    * within-frame spread (index 1): the video with the higher value wins;
    * across-frame spread (index 0): the video with the lower value wins.

    When both metrics differ by the same amount the within-frame metric is
    used, and a complete tie resolves to ``vid1_path``.

    Parameters
    ----------
    vid1_path, vid2_path : str
        Paths of the two candidate videos.

    Returns
    -------
    str or None
        The preferred path. If only one video yields statistics, that video
        is returned; ``None`` is returned only when neither does.
    """
    vid1_stats = calculate_video_stats(vid1_path)
    vid2_stats = calculate_video_stats(vid2_path)

    # A video without statistics cannot be preferred over one that has them
    if vid1_stats is None and vid2_stats is None:
        return None
    if vid2_stats is None:
        return vid1_path
    if vid1_stats is None:
        return vid2_path

    frame_variance_diff = abs(vid1_stats[1] - vid2_stats[1])
    video_variance_diff = abs(vid1_stats[0] - vid2_stats[0])

    # ">=" makes the within-frame metric the tie-breaker, so equal
    # differences no longer fall through and return None implicitly.
    if frame_variance_diff >= video_variance_diff:
        return vid1_path if vid1_stats[1] >= vid2_stats[1] else vid2_path

    # Across-frame spread dominates: prefer the steadier (lower) video
    return vid1_path if vid1_stats[0] < vid2_stats[0] else vid2_path


In [15]:
from tqdm.auto import tqdm

def _composite_scores(video_stats):
    """Score each video of one duplicate group; higher is better.

    ``video_stats`` maps a video path to ``(video_stddev, frame_stddev)``
    as returned by ``calculate_video_stats`` and must not be empty. Both
    metrics are min-max normalised within the group and combined with equal
    weights: a higher frame_stddev raises the score, a higher video_stddev
    lowers it.
    """
    frame_vars = [stats[1] for stats in video_stats.values()]
    video_vars = [stats[0] for stats in video_stats.values()]

    # Normalise to the 0-1 range within this group
    frame_var_min, frame_var_max = min(frame_vars), max(frame_vars)
    video_var_min, video_var_max = min(video_vars), max(video_vars)

    # Avoid division by zero when every video shares the same value
    frame_var_range = frame_var_max - frame_var_min if frame_var_max != frame_var_min else 1
    video_var_range = video_var_max - video_var_min if video_var_max != video_var_min else 1

    scores = {}
    for path, stats in video_stats.items():
        # Frame spread: higher is better
        norm_frame_var = (stats[1] - frame_var_min) / frame_var_range
        # Video spread: lower is better, so invert
        norm_video_var = 1 - (stats[0] - video_var_min) / video_var_range
        # Equal weighting (0.5 each)
        scores[path] = 0.5 * norm_frame_var + 0.5 * norm_video_var
    return scores


df_duplicates = pd.read_csv('data_tables/MFT_data_duplicate_scan_labels.csv')

# Group by Patient ID, Scan No, and Scan Label to find all duplicate groups
grouped = df_duplicates.groupby(['Patient ID', 'Scan No', 'Scan Label'])

preference_columns = [
    'Patient ID', 'Scan No', 'Scan Label', 'num_duplicates',
    'all_videos', 'frame_stddev', 'video_stddev', 'preferred',
]
preferences_list = []

# Process each group of duplicates
for group_key, group_df in tqdm(grouped, desc="Processing duplicate groups"):
    # Skip groups with only 1 video (not duplicates)
    if len(group_df) < 2:
        continue

    # Get all video paths in this duplicate group
    video_paths = group_df['File Path'].tolist()

    # Calculate stats for all videos in the group, keeping only usable ones:
    # None means the video failed, NaN would poison min/max normalisation.
    video_stats = {}
    for video_path in video_paths:
        stats = calculate_video_stats(video_path)
        if stats is not None and np.all(np.isfinite(stats)):
            video_stats[video_path] = stats

    # Nothing to rank if every video in the group failed
    if not video_stats:
        print(f"Warning: No usable videos in duplicate group {group_key}; skipping.")
        continue

    # Select the best video from the group
    video_scores = _composite_scores(video_stats)
    best_video = max(video_scores, key=video_scores.get)

    # Record all videos in this group with the preferred one. The stddev
    # lists stay aligned with all_videos; failed videos are recorded as NaN.
    missing = (float('nan'), float('nan'))
    preferences_list.append({
        'Patient ID': group_key[0],
        'Scan No': group_key[1],
        'Scan Label': group_key[2],
        'num_duplicates': len(video_paths),
        'all_videos': video_paths,
        'frame_stddev': [video_stats.get(vp, missing)[1] for vp in video_paths],
        'video_stddev': [video_stats.get(vp, missing)[0] for vp in video_paths],
        'preferred': best_video
    })

# Explicit columns keep the frame well-formed even when no group was recorded
df_preferences = pd.DataFrame(preferences_list, columns=preference_columns)
df_preferences['frame_stddev'] = df_preferences['frame_stddev'].apply(lambda x: [str(val) for val in x])
df_preferences['video_stddev'] = df_preferences['video_stddev'].apply(lambda x: [str(val) for val in x])


Processing duplicate groups:   0%|          | 0/311 [00:00<?, ?it/s]



In [16]:
# Plain-text preview of the preference table; pandas elides the middle rows
# and wraps wide columns onto continuation blocks.
print(df_preferences)

     Patient ID Scan No Scan Label  num_duplicates  \
0             1   LUS_1        LPA               2   
1             1   LUS_1        LPB               2   
2             1   LUS_1        RPB               2   
3             1   LUS_2        LAA               2   
4             1   LUS_2        LAX               2   
..          ...     ...        ...             ...   
306          34   LUS_2        RPA               2   
307          34   LUS_3        LAB               2   
308          35   LUS_1        LAX               2   
309          35   LUS_1        RAA               2   
310          35   LUS_2        Rax               2   

                                            all_videos  \
0    [/cosma5/data/durham/dc-fras4/ultrasound/SLURP...   
1    [/cosma5/data/durham/dc-fras4/ultrasound/SLURP...   
2    [/cosma5/data/durham/dc-fras4/ultrasound/SLURP...   
3    [/cosma5/data/durham/dc-fras4/ultrasound/SLURP...   
4    [/cosma5/data/durham/dc-fras4/ultrasound/SLURP...   
.. 

In [17]:
# Write the per-group preference table to CSV without the DataFrame index.
df_preferences.to_csv('data_tables/MFT_duplicate_video_preferences.csv', index=False)