# Data preprocessing ([#2](https://github.com/molinari135/embryo-project/issues/2))

In [1]:
import re
import shutil

from pathlib import Path
from embryo_project.config import RAW_DATA_DIR

[32m2025-08-01 17:09:10.785[0m | [1mINFO    [0m | [36membryo_project.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Molinari\Desktop\embryo-project[0m


In [None]:
# Change this path with the dataset folder
DATASET_DIR = Path(r"C:/Users/Molinari/Desktop/Embrioni project/Dataset_campionato/Dataset_campionato - Copia")
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Automatically extract all D202X folders from DATASET_DIR
folder_pattern = re.compile(r'^D202[0-3]')
well_folder_pattern = re.compile(r'^D202[0-3].*WELL\d{1,2}')

found = 0
copied = 0
conflicts = 0

# Needed for those folders with long names
def to_long_path(path: Path) -> str:
    return r"\\?\{}".format(str(path.resolve()))

for folder in DATASET_DIR.rglob("*"):
    if folder.is_dir() and well_folder_pattern.match(folder.name):
        found += 1
        destination = RAW_DATA_DIR / folder.name

        if not destination.exists():
            shutil.copytree(to_long_path(folder), to_long_path(destination))
            copied += 1
        else:
            conflicts += 1

print(f"Total number of folders founded: {found}")
print(f"Copied folders: {copied}")
print(f"Conflicts (already copied): {conflicts}")


Total number of folders founded: 1166
Copied folders: 1150
Conflicts (already copied): 16


Every folder follows this pattern:

- `D[YEAR.MONTH.DAY]`
- `S[CODE]`
- `I[CODE]`
- `P` or `D`
- `WELL[CODE]`

Between every information, an underscore (`_`) or a whitespace (` `) can be found. Sometimes, folders have also an additional string `CAMPIONATO`, `CAMPIONATO_ok` `CAMPIONATO - Copia ([NUMBER])`.

Some examples:
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX_CAMPIONATO`
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX_CAMPIONATO_ok`
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX_CAMPIONATO - Copia (4)`
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX - CAMPIONATO`
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX`

In [None]:
# Rename folders according to a common pattern
pattern = r'^(.*?WELL\d{1,2})'

pattern_matched = 0
renamed_folders = 0

for folderpath in RAW_DATA_DIR.iterdir():
    if not folderpath.is_dir():
        continue

    relative_str = str(folderpath.relative_to(RAW_DATA_DIR))
    match = re.match(pattern, relative_str)
    
    if match:
        cleaned_name = match.group(1)
        new_path = folderpath.parent / cleaned_name

        if folderpath.name != cleaned_name:
            # Rename only if names differ
            if not new_path.exists():
                folderpath.rename(new_path)
                renamed_folders += 1
            else:
                print(f"Warning: {new_path} already exists.")
        
        pattern_matched += 1
    else:
        print(f"No match for {relative_str}")

print(f"Total folders: {len(list(RAW_DATA_DIR.iterdir()))}")
print(f"Matched: {pattern_matched}, Renamed: {renamed_folders}")

Total folders: 1151
Matched: 1150, Renamed: 932


In [None]:
# Delete empty folders
empty_deleted = 0

for folder in RAW_DATA_DIR.iterdir():
    if folder.is_dir() and not any(folder.iterdir()):
        folder.rmdir()
        empty_deleted += 1
        print(f"Deleted empty folder: {folder.name}")

print(f"Total deleted folders: {empty_deleted}")

Deleted empty folder: D2020.09.30_S00018_I4203_P_WELL01_CAMPIONATO - Copia (4)
Deleted empty folder: D2020.10.07_S00023_I4203_P_WELL16
Deleted empty folder: D2021.10.20_S00083_I4203_P_WELL12
Deleted empty folder: D2021.10.20_S00083_I4203_P_WELL13
Deleted empty folder: D2021.10.20_S00083_I4203_P_WELL14
Deleted empty folder: D2021.10.20_S00083_I4203_P_WELL15
Deleted empty folder: D2021.11.10_S00084_I4203_P_WELL13
Deleted empty folder: D2021.11.10_S00084_I4203_P_WELL14
Deleted empty folder: D2021.11.10_S00084_I4203_P_WELL15
Deleted empty folder: D2021.11.10_S00084_I4203_P_WELL16
Deleted empty folder: D2022.04.06_S00131_I4203_P_WELL01
Deleted empty folder: D2022.04.06_S00131_I4203_P_WELL01_CAMPIONATO - Copia
Deleted empty folder: D2022.04.06_S00131_I4203_P_WELL01_CAMPIONATO - Copia (2)
Deleted empty folder: D2022.04.06_S00131_I4203_P_WELL01_CAMPIONATO - Copia (3)
Deleted empty folder: D2022.04.06_S00131_I4203_P_WELL01_CAMPIONATO - Copia (4)
Deleted empty folder: D2022.04.06_S00131_I4203_P_

In [None]:
# Fix specific folder issue
old_to_new = {
    "D2022.03.02_S00116_I4203_P_WELL01": "D2022.03.02_S00116_I4203_P_WELL10",
    "D2022.03.02_S00116_I4203_P_WELL01_CAMPIONATO": "D2022.03.02_S00116_I4203_P_WELL01"
}

for old_name, new_name in old_to_new.items():
    old_path = RAW_DATA_DIR / old_name
    new_path = RAW_DATA_DIR / new_name

    if old_path.exists():
        if not new_path.exists():
            old_path.rename(new_path)
            print(f"Renamed: {old_name} -> {new_name}")
        else:
            print(f"Warning: {new_name} already exists.")
    else:
        print(f"Folder not found: {old_name}")

Renamed: D2022.03.02_S00116_I4203_P_WELL01 → D2022.03.02_S00116_I4203_P_WELL10
Renamed: D2022.03.02_S00116_I4203_P_WELL01_CAMPIONATO → D2022.03.02_S00116_I4203_P_WELL01


In [6]:
path = Path(RAW_DATA_DIR)
folders = [p for p in path.iterdir() if p.is_dir()]
print(f"Total number of folders: {len(folders)}")

Total number of folders: 1103
