<a href="https://colab.research.google.com/github/neetushibu/IontheFold-Team6/blob/main/SplitFolder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The below script split the files within the folder into batches of 500 within the same subfolder.

In [2]:
import os
import shutil
from pathlib import Path

def split_files(source_folder, batch_size=500, output_directory=None, start_batch_num=1):
    source_path = Path(source_folder)
    if not source_path.exists():
        print(f"Error: Source folder '{source_folder}' not found.")
        return

    # List files and sort them to ensure consistent processing order
    files = sorted(list(source_path.iterdir()))

    # Determine the base directory for batch folders
    # If output_directory is None, use the current working directory (non-persistent in Colab)
    # Otherwise, use the specified output_directory
    base_output_path = Path(output_directory) if output_directory else Path(".")

    # Calculate the starting index in the files list
    start_index = (start_batch_num - 1) * batch_size

    # Check if the starting index is valid
    if start_index >= len(files):
        print(f"Error: Starting batch number {start_batch_num} is too high. No files remaining to process.")
        return

    print(f"Starting file splitting from index {start_index} (corresponding to batch {start_batch_num}).")

    for i in range(start_index, len(files), batch_size):
        # Calculate the current batch number based on the starting batch number
        current_batch_num = start_batch_num + (i - start_index) // batch_size
        batch_folder_name = f"batch_{current_batch_num:02d}"
        batch_folder_path = base_output_path / batch_folder_name

        # Use exist_ok=True and parents=True to create the directory and any necessary parent directories
        batch_folder_path.mkdir(exist_ok=True, parents=True)

        batch = files[i:i + batch_size]
        print(f"Processing batch {current_batch_num} with {len(batch)} files.")

        for file in batch:
            # Construct the full source and destination paths
            source_file_path = source_path / file.name
            destination_file_path = batch_folder_path / file.name

            # Ensure the source file still exists before attempting to move (in case of previous partial run)
            if source_file_path.exists():
                try:
                    shutil.move(str(source_file_path), str(destination_file_path))
                except FileExistsError:
                    print(f"Warning: File '{file.name}' already exists in '{batch_folder_path}'. Skipping move.")
                except Exception as e:
                    print(f"Error moving file '{file.name}': {e}")
            else:
                print(f"Warning: Source file '{file.name}' not found. Skipping.")


# Usage: Split files from the source folder into batch folders within the same source folder, starting from batch_01
source_folder_path = "/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_05/folder_2"
split_files(source_folder_path, batch_size=500, output_directory=source_folder_path, start_batch_num=1)

Starting file splitting from index 0 (corresponding to batch 1).
Processing batch 1 with 500 files.
Processing batch 2 with 500 files.
Processing batch 3 with 500 files.
Processing batch 4 with 500 files.
Processing batch 5 with 500 files.
Processing batch 6 with 500 files.
Processing batch 7 with 500 files.
Processing batch 8 with 500 files.
Processing batch 9 with 500 files.
Processing batch 10 with 500 files.


/content/batch_01

In [None]:
import os

folder_path = "/content/drive/MyDrive/IontheFold/downloads/70-80K/"
if os.path.exists(folder_path):
    files_in_folder = os.listdir(folder_path)
    if files_in_folder:
        print(f"Files in {folder_path}:")
        for file_name in files_in_folder:
            print(file_name)
    else:
        print(f"The folder {folder_path} is empty.")
else:
    print(f"The folder {folder_path} does not exist.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
6BPA.pdb.gz
6BPA-sf.cif.gz
6BPB.cif.gz
6BPB.pdb.gz
6BPB-sf.cif.gz
6BPC.cif.gz
6BPC.pdb.gz
6BPC-sf.cif.gz
6BPD.cif.gz
6BPD.pdb.gz
6BPD-sf.cif.gz
6BR8.cif.gz
6BR8.pdb.gz
6BR8-sf.cif.gz
6BR9.cif.gz
6BR9.pdb.gz
6BR9-sf.cif.gz
6BRS.cif.gz
6BRS.pdb.gz
6BRS-sf.cif.gz
6BWO.cif.gz
6BWO.pdb.gz
6BWO-sf.cif.gz
6BWQ.cif.gz
6BWQ.pdb.gz
6BWQ-sf.cif.gz
6BWR.cif.gz
6BWR.pdb.gz
6BWR-sf.cif.gz
6BZU.cif.gz
6BZU.pdb.gz
6BZU-sf.cif.gz
6BZV.cif.gz
6BZV.pdb.gz
6BZV-sf.cif.gz
6BZW.cif.gz
6BZW.pdb.gz
6BZW-sf.cif.gz
6BZY.cif.gz
6BZY.pdb.gz
6BZY-sf.cif.gz
6C80.cif.gz
6C80.pdb.gz
6C80-sf.cif.gz
6C93.cif.gz
6C93.pdb.gz
6C93-sf.cif.gz
6DGB-sf.cif.gz
6DGC.cif.gz
6DGC.pdb.gz
6DGC-sf.cif.gz
6DKU.cif.gz
6DKU.pdb.gz
6DKU-sf.cif.gz
6DNE.cif.gz
6DNE.pdb.gz
6DNE-sf.cif.gz
6DRW.cif.gz
6DRW.pdb.gz
6DRW-sf.cif.gz
6DUS.cif.gz
6DUS.pdb.gz
6DUS-sf.cif.gz
6DVV.cif.gz
6DVV.pdb.gz
6DVV-sf.cif.gz
6DXH.cif.gz
6DXH.pdb.gz
6DXH-sf.cif.gz
6DZS.cif.gz
6DZS.pdb.gz
6DZS-sf.cif

In [None]:
import shutil
import os

source_folder = "/content/batch_04"
destination_base_folder = "/content/drive/MyDrive/IontheFold/downloads/70-80K"

# Create the destination base folder if it doesn't exist
if not os.path.exists(destination_base_folder):
    os.makedirs(destination_base_folder)

# The destination path for the source folder
destination_folder_path = os.path.join(destination_base_folder, os.path.basename(source_folder))

# Check if the source folder exists
if not os.path.exists(source_folder):
    print(f"Error: Source folder '{source_folder}' not found.")
else:
    try:
        shutil.move(source_folder, destination_base_folder)
        print(f"Successfully moved folder '{source_folder}' to '{destination_base_folder}'")
    except Exception as e:
        print(f"An error occurred: {e}")

Successfully moved folder '/content/batch_04' to '/content/drive/MyDrive/IontheFold/downloads/70-80K'


I have modified the `split_files` function in the first code cell (`8Fae-LfmuCde`) to allow specifying a starting batch number.

Here are the key changes:

*   Added a `start_batch_num` parameter to the `split_files` function, defaulting to 1.
*   Calculated the `start_index` in the list of files based on the `start_batch_num` and `batch_size`.
*   Added a check to see if the `start_index` is valid (i.e., not beyond the total number of files).
*   Adjusted the loop to start from the calculated `start_index`.
*   Modified the batch folder naming to correctly number the batches starting from `start_batch_num`.
*   Added print statements to show the starting index and batch number being processed.
*   Updated the usage example at the bottom of the cell to call `split_files` with `start_batch_num=5` and the source folder `/content/drive/MyDrive/IontheFold/downloads/70-80K` as both the source and output directory.

You can now run the first code cell to split the remaining files into batch folders starting from `batch_05` within the `/content/drive/MyDrive/IontheFold/downloads/70-80K` folder.

# Task
Combine the content of every 10 subfolders (named batch_XX, starting from batch_05 up to batch_57) within the directory "/content/drive/MyDrive/IontheFold/downloads/70-80K" into new folders named sequentially (folder_1, folder_2, folder_3, etc.) within the same parent directory.

## List batch folders

### Subtask:
Get a list of all the existing batch folders in the source directory.


**Reasoning**:
List all entries in the source directory and filter for directories starting with "batch_".



In [None]:
import os

source_directory = "/content/drive/MyDrive/IontheFold/downloads/70-80K"

# Get all entries in the source directory
all_entries = os.listdir(source_directory)

# Filter for entries that are directories and start with "batch_"
batch_folders = sorted([
    entry for entry in all_entries
    if os.path.isdir(os.path.join(source_directory, entry)) and entry.startswith("batch_")
])

print(f"Found {len(batch_folders)} batch folders:")
for folder in batch_folders:
    print(folder)

Found 53 batch folders:
batch_05
batch_06
batch_07
batch_08
batch_09
batch_10
batch_11
batch_12
batch_13
batch_14
batch_15
batch_16
batch_17
batch_18
batch_19
batch_20
batch_21
batch_22
batch_23
batch_24
batch_25
batch_26
batch_27
batch_28
batch_29
batch_30
batch_31
batch_32
batch_33
batch_34
batch_35
batch_36
batch_37
batch_38
batch_39
batch_40
batch_41
batch_42
batch_43
batch_44
batch_45
batch_46
batch_47
batch_48
batch_49
batch_50
batch_51
batch_52
batch_53
batch_54
batch_55
batch_56
batch_57


In [None]:
# Group batch folders into sets of 10
batch_group_size = 10
grouped_batches = [
    batch_folders[i:i + batch_group_size]
    for i in range(0, len(batch_folders), batch_group_size)
]

print(f"Grouped {len(batch_folders)} folders into {len(grouped_batches)} groups of {batch_group_size}:")
for i, group in enumerate(grouped_batches):
    print(f"Group {i+1}: {group}")

Grouped 53 folders into 6 groups of 10:
Group 1: ['batch_05', 'batch_06', 'batch_07', 'batch_08', 'batch_09', 'batch_10', 'batch_11', 'batch_12', 'batch_13', 'batch_14']
Group 2: ['batch_15', 'batch_16', 'batch_17', 'batch_18', 'batch_19', 'batch_20', 'batch_21', 'batch_22', 'batch_23', 'batch_24']
Group 3: ['batch_25', 'batch_26', 'batch_27', 'batch_28', 'batch_29', 'batch_30', 'batch_31', 'batch_32', 'batch_33', 'batch_34']
Group 4: ['batch_35', 'batch_36', 'batch_37', 'batch_38', 'batch_39', 'batch_40', 'batch_41', 'batch_42', 'batch_43', 'batch_44']
Group 5: ['batch_45', 'batch_46', 'batch_47', 'batch_48', 'batch_49', 'batch_50', 'batch_51', 'batch_52', 'batch_53', 'batch_54']
Group 6: ['batch_55', 'batch_56', 'batch_57']


In [None]:
import os
import shutil

def merge_batch_group(source_directory, batch_list, destination_directory):
    """
    Merges the contents of a list of specified batch folders into a single destination directory.

    Args:
        source_directory (str): The path to the directory containing the batch folders.
        batch_list (list): A list of batch folder names (strings) to merge.
        destination_directory (str): The path to the directory where contents will be merged.
    """
    if not os.path.exists(source_directory):
        print(f"Error: Source directory '{source_directory}' not found.")
        return

    # Create the destination directory if it doesn't exist
    if not os.path.exists(destination_directory):
        os.makedirs(destination_directory)
        print(f"Created destination directory: '{destination_directory}'")

    print(f"Merging contents of batches {batch_list} into '{destination_directory}'...")

    for batch_name in batch_list:
        batch_path = os.path.join(source_directory, batch_name)

        if os.path.isdir(batch_path):
            print(f"Processing batch folder: '{batch_path}'")
            try:
                # Iterate through contents of the batch folder and move them
                for content_name in os.listdir(batch_path):
                    source_content_path = os.path.join(batch_path, content_name)
                    destination_content_path = os.path.join(destination_directory, content_name)

                    # Move the item
                    shutil.move(source_content_path, destination_content_path)
                    # print(f"Moved '{content_name}' to '{destination_directory}'") # Uncomment for verbose output
            except FileNotFoundError:
                 print(f"Warning: Content not found in '{batch_path}'. It might have already been moved.")
            except shutil.Error as e:
                print(f"Error moving content from '{batch_path}' to '{destination_directory}': {e}")
            except Exception as e:
                print(f"An unexpected error occurred while processing '{batch_path}': {e}")
        else:
            print(f"Warning: '{batch_path}' is not a directory or does not exist. Skipping.")


# --- Main merging loop ---
source_directory = "/content/drive/MyDrive/IontheFold/downloads/70-80K" # Directory containing the batch folders
output_base_directory = "/content/drive/MyDrive/IontheFold/downloads/70-80K" # Base directory for the new merged folders

for i, batch_group in enumerate(grouped_batches):
    # Create sequential folder names like folder_1, folder_2, etc.
    merged_folder_name = f"folder_{i+1}"
    merged_destination_path = os.path.join(output_base_directory, merged_folder_name)

    # Call the function to merge the current group of batches
    merge_batch_group(source_directory, batch_group, merged_destination_path)

print("\nMerging process complete.")

Merging contents of batches ['batch_05', 'batch_06', 'batch_07', 'batch_08', 'batch_09', 'batch_10', 'batch_11', 'batch_12', 'batch_13', 'batch_14'] into '/content/drive/MyDrive/IontheFold/downloads/70-80K/folder_1'...
Processing batch folder: '/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_05'
Processing batch folder: '/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_06'
Processing batch folder: '/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_07'
Processing batch folder: '/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_08'
Processing batch folder: '/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_09'
Processing batch folder: '/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_10'
Processing batch folder: '/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_11'
Processing batch folder: '/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_12'
Processing batch folder: '/content/drive/MyDrive/IontheFold/downloads/70-80K/batch_13