In [50]:
import pandas as pd
import os

# Transition df: now in frame_label.ipynb

In [51]:
def transition_dataframe(directory: str, just_num: bool = True, save: bool = False) -> pd.DataFrame:
    """
    Build a DataFrame listing the changepoint files found under a labelled-frames directory.

    Each of the "holding_transitions" and "not_holding_transitions" subdirectories that
    exists inside ``directory`` becomes one column; the sorted, non-hidden entries of that
    subdirectory become its rows.

    Args:
        directory (str): The path to the main directory containing "holding_transitions"
                        and "not_holding_transitions" subdirectories.
        just_num (bool): If True, keep only the integer found before the first underscore
                        of each file name instead of the whole file name.
        save (bool): If True, write the frame to "changepoints.csv" inside ``directory``.

    Returns:
        pd.DataFrame: One column per existing subdirectory. When the subdirectories hold a
        different number of files, the shorter column is padded with NaN at the bottom
        (an integer column becomes float in that case).

    Raises:
        ValueError: If ``just_num`` is True and a file name does not start with an integer.
    """
    # Subdirectories to look for; a missing one is simply left out of the result
    subdirectories = ("holding_transitions", "not_holding_transitions")

    columns = {}
    for subdirectory in subdirectories:
        subdirectory_path = os.path.join(directory, subdirectory)
        if not os.path.isdir(subdirectory_path):
            continue

        # Skip hidden entries such as ".DS_Store"
        files = [f for f in os.listdir(subdirectory_path) if not f.startswith('.')]
        if just_num:
            files = [int(f.split("_")[0]) for f in files]
        files.sort()

        # A Series (rather than a plain list) lets pandas align columns of different
        # lengths; a dict of unequal-length lists makes the DataFrame constructor raise.
        # The explicit object dtype for an empty column matches what a plain empty list gives.
        columns[subdirectory] = pd.Series(files, dtype=None if files else object)

    df = pd.DataFrame(columns)

    if save:
        df.to_csv(os.path.join(directory, "changepoints.csv"))
        print("Saved")
    return df

In [52]:
# Collect the changepoint frame numbers for the NR labels and write changepoints.csv next to them.
df = transition_dataframe(
    "/Users/NoahRipstein/Downloads/shafee group stuff/nr labels/all",
    just_num=True,
    save=True,
)

Saved


# visual inspection of video

In [37]:
import cv2
import numpy as np
from tqdm import tqdm

def overlay_changepoints(video_path: str, changepoints: np.ndarray, output_path: str, frame_rate: float = 29.97) -> None:
    """
    Write a copy of a video with a semi-transparent square that flips between red and green.

    The square starts red. Every time the current frame index (0-based) appears in
    ``changepoints`` the colour toggles, and the new colour is drawn from that frame on.
    The square is blended into the bottom-right area of each frame with weight 0.7.

    Args:
        video_path (str): Path to the input video file.
        changepoints (numpy array): Integers giving the frame indices at which the colour toggles.
        output_path (str): Path to save the output video file (written with the 'mp4v' codec).
        frame_rate (float, optional): Frame rate of the output video. Default is 29.97 fps.

    Returns:
        None. Prints an error and returns early if the video cannot be opened or has no
        readable frame; prints "Done!" after a successful run.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Unable to open video at {video_path}")
        return

    out = None
    try:
        ret, frame = cap.read()
        if not ret:
            # Without this guard `frame` is None and reading its shape fails
            print(f"Error: No frames could be read from {video_path}")
            return

        height, width, _ = frame.shape
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, frame_rate, (width, height))

        # Set membership is constant-time; `idx in ndarray` rescans the array on every frame
        toggle_frames = set(np.asarray(changepoints).ravel().tolist())

        # Square geometry depends only on the frame size, so compute it once
        square_size = int(0.15 * width)
        x_pos = width - square_size - int(0.1 * width)
        y_pos = height - square_size - int(0.1 * height)
        top_left = (x_pos, y_pos)
        bottom_right = (x_pos + square_size, y_pos + square_size)
        alpha = 0.7  # weight of the overlaid square in the blend

        idx = 0
        is_green = False

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))  # for progress bar
        # A non-positive count means the total is unknown; let tqdm run without one
        with tqdm(total=total_frames if total_frames > 0 else None) as progress_bar_tqdm:
            while ret:
                if idx in toggle_frames:
                    is_green = not is_green

                # Green or red (BGR format)
                square_color = (0, 255, 0) if is_green else (0, 0, 255)

                overlay = frame.copy()
                cv2.rectangle(overlay, top_left, bottom_right, square_color, -1)  # -1 fills the rectangle
                cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)  # blend in place into `frame`

                out.write(frame)
                idx += 1

                ret, frame = cap.read()
                progress_bar_tqdm.update(1)
    finally:
        # Release the capture and writer even if processing fails part-way.
        # No cv2.destroyAllWindows() here: this function never opens a window.
        cap.release()
        if out is not None:
            out.release()

    print("Done!")

# Make train test split

In [2]:
import os
import shutil
import random
from tqdm import tqdm

def img_classifier_train_test_split(dirs: list[str], split_size: float, output_dir: str = '.', seed: int = None) -> None:
    """
    Merge labelled images from several directories into one train/test folder structure.

    For every directory in ``dirs`` and for each category ('holding', 'not_holding'), the
    images of that category are shuffled and the first ``split_size`` fraction is copied to
    the train folder, the rest to the test folder. The split is therefore done per source
    directory and per category.

    Args:
        dirs (List[str]): List of directory paths. Each directory should contain 'holding' and 'not_holding' subdirectories.
        split_size (float): Fraction of images to be used for training (between 0 and 1).
        output_dir (str, optional): Directory to save the 'train_test_split' folder. Defaults to current directory.
        seed (int, optional): Seed for the global ``random`` generator to ensure reproducibility.

    Raises:
        ValueError: If ``split_size`` is not between 0 and 1.
        FileNotFoundError: If a source directory lacks one of the category subdirectories.

    The function creates the following directory structure:
    train_test_split
    - train
      -- holding
      -- not_holding
    - test
      -- holding
      -- not_holding

    Note:
        Files are copied under their original names into shared folders, and existing
        output folders are reused. Images with the same file name in different source
        directories overwrite one another, and files from an earlier run are left in place.
    """
    if not 0 <= split_size <= 1:
        raise ValueError(f"split_size must be between 0 and 1, got {split_size}")

    # Set the seed for reproducibility
    if seed is not None:
        random.seed(seed)

    # Paths for train and test directories
    train_path = os.path.join(output_dir, 'train_test_split', 'train')
    test_path = os.path.join(output_dir, 'train_test_split', 'test')
    categories = ('holding', 'not_holding')

    # Create the necessary directories
    for path in (train_path, test_path):
        for category in categories:
            os.makedirs(os.path.join(path, category), exist_ok=True)

    def split_and_copy_files(source_dir: str, train_dir: str, test_dir: str) -> None:
        """Shuffle the images in source_dir and copy them into train_dir / test_dir."""
        # os.listdir returns entries in arbitrary order, so sort first: otherwise the
        # same seed can produce a different split on another run or machine.
        # The extension filter also keeps hidden files out.
        pics = sorted(f for f in os.listdir(source_dir) if f.endswith((".jpeg", ".jpg", ".png")))
        random.shuffle(pics)
        train_size = int(len(pics) * split_size)

        # Last two path components, e.g. "sr1/holding", for the progress-bar label
        named_part = "/".join(os.path.normpath(source_dir).split(os.sep)[-2:])
        for i, pic in tqdm(enumerate(pics), total=len(pics), desc=f"Copying {named_part}"):
            target_dir = train_dir if i < train_size else test_dir
            shutil.copy(os.path.join(source_dir, pic), target_dir)

    # Process each directory in the list
    for source_root in dirs:
        for category in categories:
            split_and_copy_files(
                os.path.join(source_root, category),
                os.path.join(train_path, category),
                os.path.join(test_path, category),
            )

    print("Train-test split created successfully.")

In [3]:
# Root folder that holds one labelled-frames directory per recording
base = "/Users/NoahRipstein/Downloads/shafee group stuff/"

# Labelled-frame directories, grouped by source
sr = [
    base + "sr labels/sr1",
    base + "sr labels/sr2",
    base + "sr labels/sr3",
    base + "sr labels/sr4",
    base + "sr labels/sr_extra1",
    base + "sr labels/sr_extra2",
]
sv = [
    base + "sv labels/sv1_frames",
    base + "sv labels/sv2_frames",
    base + "sv labels/sv3_frames",
    base + "sv labels/sv4_frames",
    base + "sv labels/sv5_frames",
    base + "sv labels/sv_extra_frames",
]
nr = [base + "nr labels/all"]

# Merge all three sources into a single 80/20 split with a fixed seed
sr_sv_nr = sr + sv + nr
img_classifier_train_test_split(
    sr_sv_nr,
    split_size=0.8,
    output_dir="/Users/NoahRipstein/Downloads/shafee group stuff/sr_sv_nr_train",
    seed=42,
)

Copying sr1/holding: 100%|██████████| 7879/7879 [00:03<00:00, 2529.76it/s]
Copying sr1/not_holding: 100%|██████████| 7340/7340 [00:02<00:00, 2517.89it/s]
Copying sr2/holding: 100%|██████████| 2060/2060 [00:00<00:00, 2593.09it/s]
Copying sr2/not_holding: 100%|██████████| 1912/1912 [00:00<00:00, 2601.34it/s]
Copying sr3/holding: 100%|██████████| 6290/6290 [00:02<00:00, 2643.63it/s]
Copying sr3/not_holding: 100%|██████████| 5560/5560 [00:02<00:00, 2500.70it/s]
Copying sr4/holding: 100%|██████████| 1909/1909 [00:00<00:00, 2356.41it/s]
Copying sr4/not_holding: 100%|██████████| 1529/1529 [00:00<00:00, 2388.75it/s]
Copying sr_extra1/holding: 0it [00:00, ?it/s]
Copying sr_extra1/not_holding: 100%|██████████| 792/792 [00:00<00:00, 2274.40it/s]
Copying sr_extra2/holding: 100%|██████████| 1727/1727 [00:00<00:00, 2348.76it/s]
Copying sr_extra2/not_holding: 100%|██████████| 148/148 [00:00<00:00, 2292.41it/s]
Copying sv1_frames/holding: 100%|██████████| 7687/7687 [00:03<00:00, 2383.13it/s]
Copying s

Train-test split created successfully.





In [16]:
# Print the number of entries in each split/category folder, one count per line:
# train/holding, train/not_holding, test/holding, test/not_holding.
for counted_dir in (
    "/Users/NoahRipstein/Downloads/shafee group stuff/sr labels/train_test_split/train/holding",
    "/Users/NoahRipstein/Downloads/shafee group stuff/sr labels/train_test_split/train/not_holding",
    "/Users/NoahRipstein/Downloads/shafee group stuff/sr labels/train_test_split/test/holding",
    "/Users/NoahRipstein/Downloads/shafee group stuff/sr labels/train_test_split/test/not_holding",
):
    print(len(os.listdir(counted_dir)))

7951
7401
1988
1851


# RENAME TOOL

In [31]:
def rename_files(directory: str, suffix: str) -> None:
    """
    Rename and move JPEG files from a specified directory to an 'output_folder' in its parent directory.

    Used when frames have been extracted but have some suffix problem, like unclear naming
    or an extra underscore.

    For each file with a '.jpg' extension, the part of the name before the first underscore
    (or the whole stem when there is no underscore) is kept and the file is moved to
    'output_folder' as '<number>_<suffix>.jpg'. The 'output_folder' is created next to
    ``directory`` if it doesn't already exist. Other files are left untouched.

    All target names are worked out before anything is moved, so a name clash leaves the
    input directory unchanged.

    Args:
        directory (str): The path to the directory containing the files to be renamed and moved.
                         A trailing path separator is ignored.
        suffix (str): Text placed after the underscore in every new file name.

    Returns:
        None: The function doesn't return anything but renames and moves files.

    Raises:
        FileExistsError: If two files would receive the same new name, or a file with the
            new name is already present in 'output_folder'.

    Example:
        >>> rename_files("/path/to/directory", suffix="srextra2")
        This moves '/path/to/directory/12_anything.jpg' to '/path/to/output_folder/12_srextra2.jpg'.
    """
    # Without normalising, "a/b/" has dirname "a/b" and the output folder would land
    # inside the input directory instead of next to it
    directory = os.path.normpath(directory)
    output_folder = os.path.join(os.path.dirname(directory), "output_folder")
    os.makedirs(output_folder, exist_ok=True)

    # First pass: decide every move and detect clashes before touching any file
    planned = {}
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".jpg"):
            continue
        # Split the stem, not the full name, so "12.jpg" yields "12" rather than "12.jpg"
        number = os.path.splitext(filename)[0].split("_")[0]
        new_filename = f"{number}_{suffix}.jpg"
        dest = os.path.join(output_folder, new_filename)
        if new_filename in planned:
            raise FileExistsError(
                f"'{planned[new_filename]}' and '{filename}' would both be renamed to '{new_filename}'"
            )
        if os.path.exists(dest):
            raise FileExistsError(f"'{dest}' already exists; refusing to overwrite it")
        planned[new_filename] = filename

    # Second pass: rename and move the files to the output folder
    for new_filename, filename in planned.items():
        os.rename(os.path.join(directory, filename), os.path.join(output_folder, new_filename))


# Move every '.jpg' from the sv_extra frames directory into 'output_folder' in its parent
# directory, renaming each one to '<number>_svextra.jpg'.
rename_files(
    directory="/Users/NoahRipstein/Downloads/shafee group stuff/sv labels/sv_extra_frames/frames",
    suffix="svextra",
)

# Rename tool just for NR frames

In [40]:
def rename_files_nr(directory: str, suffix: str) -> None:
    """
    Rename and move NR frame JPEGs to an 'output_folder' in the parent directory.

    Unlike ``rename_files``, the frame number is read from the part of the file name
    *after* the first underscore (e.g. 'frame_0012.jpg' -> 12). Each '.jpg' file is moved
    to 'output_folder' as '<number>_<suffix>.jpg', with the number written without leading
    zeros. Other files are left untouched.

    All target names are worked out before anything is moved, so a malformed name or a
    name clash leaves the input directory unchanged.

    Args:
        directory (str): The path to the directory containing the files to be renamed and moved.
                         A trailing path separator is ignored.
        suffix (str): Text placed after the underscore in every new file name.

    Returns:
        None: The function doesn't return anything but renames and moves files.

    Raises:
        IndexError: If a '.jpg' file name contains no underscore.
        ValueError: If the part after the first underscore is not an integer.
        FileExistsError: If two files would receive the same new name, or a file with the
            new name is already present in 'output_folder'.
    """
    # Without normalising, "a/b/" has dirname "a/b" and the output folder would land
    # inside the input directory instead of next to it
    directory = os.path.normpath(directory)
    output_folder = os.path.join(os.path.dirname(directory), "output_folder")
    os.makedirs(output_folder, exist_ok=True)

    # First pass: decide every move and detect problems before touching any file
    planned = {}
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".jpg"):
            continue
        # Second underscore-separated piece, minus any extension, as an integer
        number = int(os.path.splitext(filename.split("_")[1])[0])
        new_filename = f"{number}_{suffix}.jpg"
        dest = os.path.join(output_folder, new_filename)
        if new_filename in planned:
            raise FileExistsError(
                f"'{planned[new_filename]}' and '{filename}' would both be renamed to '{new_filename}'"
            )
        if os.path.exists(dest):
            raise FileExistsError(f"'{dest}' already exists; refusing to overwrite it")
        planned[new_filename] = filename

    # Second pass: rename and move the files to the output folder
    for new_filename, filename in planned.items():
        os.rename(os.path.join(directory, filename), os.path.join(output_folder, new_filename))

# how to use:
# rename_files_nr("/Users/NoahRipstein/Downloads/shafee group stuff/nr_frames/holding", suffix="nr")

## add transition frames for NR

I want to make a python function according to the following specification:

I have a directory which contains folders called "holding" and "not_holding". Both folders contain PNGs with filenames of this format: "x_nr.png", where x is an integer. "not_holding" has 1_nr.png, 2_nr.png, 3_nr.png, ..., 22_nr.png; "holding" has 23_nr.png, 24_nr.png, 25_nr.png, ..., 498_nr.png; then "not_holding" has 499_nr.png, 500_nr.png. This pattern continues, going back and forth between the two folders. I want to identify all of the jumps in the numbering within a folder and copy the first file after each jump to a different folder. For example, in "not_holding" the numbers jump from 22 to 499, so 499_nr.png would be copied.

In [48]:
import os
import shutil
import re

def copy_transition_files(input_dir: str):
    """Make a transition-frame folder when there are labeled frames but no identified transition frames.

    This function scans a directory for JPEG files named exactly "x_nr.jpg", where x is an
    integer. The numbers are sorted, and the file that starts each run of consecutive
    numbers is selected: the lowest-numbered file, plus every file whose number is more
    than 1 above the previous one. The selected files are copied to an
    'output_folder_trns' directory created in the parent directory of ``input_dir``.

    Args:
        input_dir (str): A path to the directory containing the target JPEG files.
            A trailing path separator is ignored.

    Returns:
        None. Prints how many files were copied and where.

    Raises:
        FileNotFoundError: If the input directory does not exist.
        OSError: For issues related to file reading, writing, or permissions.

    Example:
        >>> copy_transition_files('/path/to/your/directory')
        This will copy files like '499_nr.jpg' from the specified directory to
        '/path/to/output_folder_trns', assuming '499_nr.jpg' is a file right
        after a numerical jump in the sequence.

    Note:
        - Files whose names do not match "x_nr.jpg" exactly are ignored.
        - An input directory without matching files copies nothing (0 files reported).
        - The output directory 'output_folder_trns' is created if it does not
          already exist in the parent directory of 'input_dir'.
    """
    # Without normalising, "a/b/" has dirname "a/b" and the output folder would land
    # inside the input directory instead of next to it
    input_dir = os.path.normpath(input_dir)

    # List the input first, so a wrong path fails before any folder is created
    entries = os.listdir(input_dir)

    output_dir = os.path.join(os.path.dirname(input_dir), 'output_folder_trns')
    os.makedirs(output_dir, exist_ok=True)

    # Regular expression to extract the number from filenames
    pattern = re.compile(r"(\d+)_nr\.jpg")

    # Map each frame number to the real file name. Matching the whole name keeps files
    # such as "abc12_nr.jpg" out, and copying by the recorded name (rather than a rebuilt
    # one) keeps zero-padded names such as "007_nr.jpg" working.
    name_by_number = {}
    for entry in entries:
        match = pattern.fullmatch(entry)
        if match:
            name_by_number.setdefault(int(match.group(1)), entry)

    numbers = sorted(name_by_number)

    # The first number and every number following a gap start a new run
    jump_numbers = [
        num for i, num in enumerate(numbers)
        if i == 0 or num - numbers[i - 1] > 1
    ]

    # Copy files with identified numbers to the output directory
    for num in jump_numbers:
        shutil.copy2(os.path.join(input_dir, name_by_number[num]), output_dir)

    print(f"Copied {len(jump_numbers)} files to {output_dir}")

# Copy the first frame of every consecutive run in the NR 'holding' folder
# into the sibling 'output_folder_trns' directory.
copy_transition_files(
    input_dir="/Users/NoahRipstein/Downloads/shafee group stuff/nr_frames/holding"
)

Copied 48 files to /Users/NoahRipstein/Downloads/shafee group stuff/nr_frames/output_folder_trns


# check corrupted images in train test

In [2]:
from PIL import Image
import os

# Root folders of the two splits that are scanned below for unreadable images.
train_dir = "/Users/NoahRipstein/Downloads/shafee group stuff/sr_sv_nr_train/train_test_split/train"
# Named "valid" here, but this path is the 'test' folder of the split.
valid_dir = "/Users/NoahRipstein/Downloads/shafee group stuff/sr_sv_nr_train/train_test_split/test"


def test_load_images(image_dir):
    """Walk ``image_dir`` recursively and return the paths of files Pillow cannot verify.

    Every file found under ``image_dir`` (at any depth) is opened with ``Image.open`` and
    checked with ``verify()``. Files for which either step raises ``IOError`` or
    ``SyntaxError`` are collected.

    Args:
        image_dir: Root directory to scan.

    Returns:
        list: Full paths of the files that failed, in the order they were visited.
    """
    bad_files = []
    for subdir, _dirs, files in os.walk(image_dir):
        for file in files:
            img_path = os.path.join(subdir, file)
            try:
                # The context manager closes the file even when verify() raises
                with Image.open(img_path) as img:
                    img.verify()  # Verify if it's an image
            except (IOError, SyntaxError):
                bad_files.append(img_path)  # Add the path of the bad file to the list
    return bad_files

# Example of how to use the function
# Scan both split folders, then report how many files failed and which ones.
bad_train_files, bad_valid_files = map(test_load_images, (train_dir, valid_dir))

print("Number of bad files in training set:", len(bad_train_files))
print("Bad files in training set:", bad_train_files)

print("Number of bad files in validation set:", len(bad_valid_files))
print("Bad files in validation set:", bad_valid_files)

Number of bad files in training set: 0
Bad files in training set: []
Number of bad files in validation set: 0
Bad files in validation set: []
