In [None]:
import numpy as np
import cv2
from glob import glob
import matplotlib.pyplot as plt
import os
from pathlib import Path

In [None]:
import cv2
import numpy as np
import os
from pathlib import Path

def calculate_similarity(frame1, frame2):
    """
    Score how different two BGR frames are using Mean Squared Error (MSE).

    Both frames are shrunk to 64x64 and converted to grayscale before the
    pixel-wise comparison, so the score reflects coarse luminance changes only.

    Returns:
        The MSE of the two downscaled grayscale images:
        - 0 means the downscaled grayscale images are identical
        - larger values mean the frames are less similar
    """
    thumbnail_size = (64, 64)

    # Shrink both frames first: a common, small size keeps the comparison cheap
    thumbnails = [cv2.resize(image, thumbnail_size) for image in (frame1, frame2)]

    # Drop color information, then promote to float so the subtraction
    # below cannot wrap around in uint8
    gray_a, gray_b = (
        cv2.cvtColor(thumb, cv2.COLOR_BGR2GRAY).astype("float")
        for thumb in thumbnails
    )

    # Mean of the squared per-pixel differences
    squared_diff = (gray_a - gray_b) ** 2
    pixel_count = float(gray_a.shape[0] * gray_a.shape[1])

    return np.sum(squared_diff) / pixel_count

def extract_frames_with_dedup(video_folder, output_base_folder, similarity_threshold=50):
    """
    Extract frames from every video in a folder, skipping near-duplicate frames.

    For each video, a subfolder named after the video file's stem is created
    under ``output_base_folder``. The first frame is always saved; every later
    frame is saved only if its MSE score (see ``calculate_similarity``) against
    the most recently *saved* frame exceeds ``similarity_threshold``. Frames are
    written as ``<video_name>_<index:06d>.jpg``.

    Args:
        video_folder (str): Path to the folder containing videos
                            (.mp4, .avi, .mov, .mkv are picked up)
        output_base_folder (str): Base folder to save the frames
        similarity_threshold (float): MSE score a frame must exceed to be kept.
                                      Lower values keep more frames; higher
                                      values discard more frames as duplicates.
    """
    Path(output_base_folder).mkdir(parents=True, exist_ok=True)

    video_extensions = ['.mp4', '.avi', '.mov', '.mkv']
    video_files = []
    for ext in video_extensions:
        # Sort each glob result so the processing order is reproducible
        # instead of depending on filesystem enumeration order
        video_files.extend(sorted(Path(video_folder).glob(f'*{ext}')))

    for video_path in video_files:
        video_name = video_path.stem
        current_output_folder = os.path.join(output_base_folder, video_name)
        Path(current_output_folder).mkdir(parents=True, exist_ok=True)

        print(f"Processing video: {video_name}")

        cap = cv2.VideoCapture(str(video_path))

        frame_count = 0
        saved_count = 0
        previous_frame = None

        # try/finally guarantees the capture handle is released even if
        # decoding, comparison or writing raises part-way through the video
        try:
            # An unreadable/corrupt file would otherwise look like a valid
            # video with zero frames; report it explicitly and move on
            if not cap.isOpened():
                print(f"Cannot open video: {video_path}")
                continue

            while True:
                ret, frame = cap.read()

                if not ret:
                    break

                # The first frame is always kept; later frames are kept only
                # when they differ enough from the last frame that was saved
                is_new_frame = (
                    previous_frame is None
                    or calculate_similarity(frame, previous_frame) > similarity_threshold
                )

                if is_new_frame:
                    frame_filename = os.path.join(
                        current_output_folder,
                        f'{video_name}_{saved_count:06d}.jpg'
                    )
                    cv2.imwrite(frame_filename, frame)
                    previous_frame = frame
                    saved_count += 1

                frame_count += 1

                # Print progress
                if frame_count % 100 == 0:
                    print(f"Processed {frame_count} frames, saved {saved_count} frames for video {video_name}")
        finally:
            cap.release()

        print(f"Finished processing video {video_name}")
        print(f"Total frames: {frame_count}")
        print(f"Frames after duplicate filtering: {saved_count}")
        print(f"Removed {frame_count - saved_count} duplicate frames")
        print("-----------------------------------")

    print("Completed processing all videos!")


if __name__ == "__main__":
    # Destination root: one subfolder of frames is created per video
    output_folder = "./data/image_data"

    # Source directory that is scanned for video files
    video_folder = "./data/video_data/"

    # Extract de-duplicated frames using the default similarity threshold
    extract_frames_with_dedup(
        video_folder=video_folder,
        output_base_folder=output_folder,
    )


In [None]:
import cv2
import os
from pathlib import Path

def process_images_in_folder(
    input_folder,
    output_base_folder,
    cascade_path='./model/haarcascade_frontalface_default.xml',
    scale_factor=1.5,
    min_neighbors=5,
):
    """
    Detect faces in every image of each subfolder and save the cropped faces.

    Each subfolder of ``input_folder`` gets a matching subfolder under
    ``output_base_folder``. Every detected face is cropped from the original
    image and written as ``<folder_name>_<image_index>_<face_index>.jpg``.
    Errors on individual images are reported and skipped so one bad file does
    not abort the whole run.

    Args:
        input_folder (str): Folder containing subfolders with input images
        output_base_folder (str): Base folder to save processed images
        cascade_path (str): Path to the Haar cascade XML used for face detection
        scale_factor (float): ``scaleFactor`` passed to ``detectMultiScale``
        min_neighbors (int): ``minNeighbors`` passed to ``detectMultiScale``

    Raises:
        RuntimeError: If the Haar cascade cannot be loaded from ``cascade_path``
    """
    # Initialize Haar Cascade face detector
    haar = cv2.CascadeClassifier(cascade_path)
    # A missing/invalid XML yields an empty classifier rather than an error;
    # fail once here instead of failing on every single image later
    if haar.empty():
        raise RuntimeError(f'Cannot load Haar cascade from: {cascade_path}')

    # Create output folder if it doesn't exist
    Path(output_base_folder).mkdir(parents=True, exist_ok=True)

    image_extensions = ['.jpg', '.jpeg', '.png', '.bmp']

    # Sorted so folders are processed in a reproducible order
    input_subfolders = sorted(f for f in Path(input_folder).iterdir() if f.is_dir())

    # Process each subfolder
    for input_subfolder in input_subfolders:
        folder_name = input_subfolder.name

        # Create corresponding output folder
        output_subfolder = os.path.join(output_base_folder, folder_name)
        Path(output_subfolder).mkdir(parents=True, exist_ok=True)

        print(f"\nProcessing folder: {folder_name}")

        # Collect image files; each glob result is sorted so the image index
        # used in output file names is stable across runs
        image_files = []
        for ext in image_extensions:
            image_files.extend(sorted(input_subfolder.glob(f'*{ext}')))

        # Process each image
        for i, image_path in enumerate(image_files):
            try:
                img = cv2.imread(str(image_path))
                if img is None:
                    print(f'Cannot read image: {image_path}')
                    continue

                # imread returns BGR, so convert straight to grayscale
                # (no intermediate RGB copy is needed for detection)
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                faces_list = haar.detectMultiScale(gray, scale_factor, min_neighbors)

                if len(faces_list) == 0:
                    print(f'No face detected in image: {image_path}')
                    continue

                # Process each detected face
                for j, (x, y, w, h) in enumerate(faces_list):
                    # Crop face from the original (color) image
                    crop_img = img[y:y+h, x:x+w]

                    # Create output file name
                    output_filename = f'{folder_name}_{i}_{j}.jpg'
                    output_path = os.path.join(output_subfolder, output_filename)

                    # imwrite signals failure through its return value, so only
                    # report success when the file was actually written
                    if cv2.imwrite(output_path, crop_img):
                        print(f'Successfully processed: {output_filename}')
                    else:
                        print(f'Failed to save image: {output_path}')

            except Exception as e:
                # Best effort: report the problem and continue with the next image
                print(f'Error processing image {image_path}: {str(e)}')

        print(f"Completed processing folder {folder_name}")

    print("\nCompleted processing all folders!")


if __name__ == "__main__":
    # Frames extracted from the videos (one subfolder per video)
    input_folder = "./data/image_data"
    # Destination for the cropped faces (mirrors the input subfolders)
    output_folder = "./data/crop_image"

    # Keep the call inside the guard: the folder names above only exist when
    # this code runs as a script/notebook cell, not when it is imported
    process_images_in_folder(input_folder, output_folder)


In [None]:
# Count the cropped face images in each subfolder of root_folder and overall.
root_folder = './data/crop_image'

# File extensions counted as images (compared case-insensitively)
image_suffixes = ('.jpg', '.jpeg', '.png')

# scandir is used as a context manager so the directory handle is closed
# promptly; sorting makes the report order reproducible
with os.scandir(root_folder) as entries:
    subfolders = sorted(entry.path for entry in entries if entry.is_dir())

total_images = 0
for folder in subfolders:
    with os.scandir(folder) as entries:
        # Match on the real extension instead of a loose character-class glob,
        # which also accepted names such as '*.jng'; hidden files are skipped,
        # as a '*' glob would skip them
        image_paths = sorted(
            entry.path
            for entry in entries
            if entry.is_file()
            and not entry.name.startswith('.')
            and entry.name.lower().endswith(image_suffixes)
        )
    num_images = len(image_paths)
    total_images += num_images
    print(f"Folder '{os.path.basename(folder)}' contains {num_images} images.")
print(f"Total images in all subfolders: {total_images}")
