In [19]:
import os
import shutil
import pandas as pd

def copy_selected_files_by_metadata(folder_path, n_per_class=500, metadata_name='metadata.csv', new_folder_name='selected_files', random_state=38, verbose=True):
    """
    Copy a class-balanced random subset of the files listed in a metadata CSV into a new sub-folder.

    Rows whose 'Prompt' value contains 'tumor' or 'healthy' (case-insensitive) form the two
    classes. Up to n_per_class rows are sampled from each class, the two samples are shuffled
    together, and the result is written to '<new_folder>/metadata.csv' with the columns
    'file_name' and 'text'. Every listed file that exists in folder_path is then copied into
    the new folder.

    Problems (missing metadata file, unreadable CSV, missing columns, files that cannot be
    copied) are reported with print() rather than raised.

    Parameters:
        folder_path (str): Folder holding the metadata CSV and the files to copy from.
        n_per_class (int): Maximum number of files to keep per class. A class with fewer
            matching rows is kept in full and a warning is printed.
        metadata_name (str): Name of the metadata CSV inside folder_path. It must have a
            'Prompt' column; the first other column is taken as the file-name column.
        new_folder_name (str): Name of the destination folder, created inside folder_path.
        random_state (int): Seed used for both the per-class sampling and the final shuffle.
        verbose (bool): If True, print one line per copied, missing or failed file.
            The summary line is printed either way.

    Returns:
        None
    """
    metadata_file = os.path.join(folder_path, metadata_name)
    new_folder_path = os.path.join(folder_path, new_folder_name)

    # Check that the metadata file exists
    if not os.path.isfile(metadata_file):
        print(f"Error: {metadata_name} not found in {folder_path}")
        return

    # Read the metadata file
    try:
        df = pd.read_csv(metadata_file)
    except Exception as e:
        print(f"Error reading {metadata_name}: {e}")
        return

    # Validate the inputs before anything is created on disk
    if n_per_class < 0:
        print(f"Error: n_per_class must be non-negative, got {n_per_class}")
        return
    if 'Prompt' not in df.columns:
        print(f"Error: {metadata_name} has no 'Prompt' column (found: {list(df.columns)})")
        return
    other_columns = [col for col in df.columns if col != 'Prompt']
    if not other_columns:
        print(f"Error: {metadata_name} has no file-name column besides 'Prompt'")
        return
    # The file-name column is picked by position, not by name, so the header text is free.
    file_column = other_columns[0]

    # Filter rows containing 'tumor' and 'healthy'
    # NOTE(review): a prompt containing both words lands in both groups and could be
    # selected twice -- confirm the prompts are mutually exclusive.
    tumor_rows = df[df['Prompt'].str.contains('tumor', case=False, na=False)]
    healthy_rows = df[df['Prompt'].str.contains('healthy', case=False, na=False)]
    print(f"Found {len(tumor_rows)} tumor rows and {len(healthy_rows)} healthy rows")

    # DataFrame.sample raises if n exceeds the number of rows, so cap n per class.
    for label, rows in (('tumor', tumor_rows), ('healthy', healthy_rows)):
        if len(rows) < n_per_class:
            print(f"Warning: only {len(rows)} {label} rows available, fewer than the requested {n_per_class}; keeping all of them")
    tumor_rows = tumor_rows.sample(n=min(n_per_class, len(tumor_rows)), random_state=random_state).reset_index(drop=True)
    healthy_rows = healthy_rows.sample(n=min(n_per_class, len(healthy_rows)), random_state=random_state).reset_index(drop=True)
    print(f"Selected {len(tumor_rows)} tumor rows and {len(healthy_rows)} healthy rows")

    # Concatenate the selected tumor and healthy rows and shuffle them
    filtered_df = pd.concat([tumor_rows, healthy_rows])
    filtered_df = filtered_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Keep only the two relevant columns and rename them by name rather than by position,
    # so extra columns or a different column order cannot mislabel the data.
    filtered_df = filtered_df[[file_column, 'Prompt']].rename(columns={file_column: 'file_name', 'Prompt': 'text'})

    # Create the new folder if it doesn't exist
    os.makedirs(new_folder_path, exist_ok=True)

    # Write the filtered metadata to the new folder
    filtered_df.to_csv(os.path.join(new_folder_path, 'metadata.csv'), index=False, columns=['file_name', 'text'])
    print(f"Copied metadata.csv to {new_folder_path}")

    # Copy the files listed in the filtered metadata to the new folder
    copied = 0
    missing = []
    failed = []
    for file_name in filtered_df['file_name']:
        file_name = str(file_name)  # os.path.join rejects non-string values
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            missing.append(file_name)
            if verbose:
                print(f"Warning: file not found, skipped: {file_name}")
            continue
        try:
            shutil.copy(file_path, new_folder_path)
        except Exception as e:
            failed.append(file_name)
            print(f"Error copying {file_name}: {e}")
            continue
        copied += 1
        if verbose:
            print(f"Copied: {file_name}")

    print(f"Copied {copied} of {len(filtered_df)} files to {new_folder_path} ({len(missing)} missing, {len(failed)} failed)")


In [21]:
# Folder that holds the metadata CSV and the files to select from.
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine.
path = r'C:\Users\ances\Downloads\01-12_21h50m24s\selora_outputs\loras\full_output'
# Number of files to keep for each class.
n = 500
copy_selected_files_by_metadata(
    folder_path=path,
    n_per_class=n,
    metadata_name='metadata_image_generation.csv',
)

Found 1260 tumor rows and 840 healthy rows
Selected 500 tumor rows and 500 healthy rows
Copied metadata.csv to C:\Users\ances\Downloads\01-12_21h50m24s\selora_outputs\loras\full_output\selected_files
Copied: 1374.png
Copied: 1030.png
Copied: 1653.png
Copied: 1082.png
Copied: 1029.png
Copied: 1785.png
Copied: 1876.png
Copied: 1892.png
Copied: 1984.png
Copied: 1702.png
Copied: 1690.png
Copied: 59.png
Copied: 1889.png
Copied: 76.png
Copied: 591.png
Copied: 502.png
Copied: 95.png
Copied: 377.png
Copied: 1899.png
Copied: 700.png
Copied: 1739.png
Copied: 1179.png
Copied: 1077.png
Copied: 1539.png
Copied: 895.png
Copied: 981.png
Copied: 1159.png
Copied: 1839.png
Copied: 1333.png
Copied: 1848.png
Copied: 1730.png
Copied: 1494.png
Copied: 761.png
Copied: 959.png
Copied: 902.png
Copied: 1427.png
Copied: 1905.png
Copied: 1615.png
Copied: 1011.png
Copied: 1598.png
Copied: 18.png
Copied: 1020.png
Copied: 871.png
Copied: 92.png
Copied: 1658.png
Copied: 1086.png
Copied: 855.png
Copied: 1949.png
Copie