# Data preprocessing ([#2](https://github.com/molinari135/embryo-project/issues/2))

In this notebook, we perform the following operations:

1. Extract D202X folders from data/raw directory into data/processed
2. Remove empty extracted folders in data/processed
3. Fix folder name issue in data/processed
4. Rename extracted folders in data/processed

In order to perform all these operations it is **required** do to put in data/raw directory the folders that will be processed.

In [1]:
import re, shutil

from pathlib import Path
from loguru import logger
from embryo_project.config import RAW_DATA_DIR, PROCESSED_DATA_DIR, INTERIM_DATA_DIR

[32m2025-08-05 09:43:55.847[0m | [1mINFO    [0m | [36membryo_project.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Molinari\Desktop\embryo-project[0m


In [2]:
# If needed, change folders path in embryo_project/config.py file
INTERIM_DATA_DIR.mkdir(parents=True, exist_ok=True)
# Remove everything from data/processed
# [shutil.rmtree(f) for f in PROCESSED_DATA_DIR.iterdir() if f.is_dir()]

In [3]:
bad_folder = RAW_DATA_DIR / Path("2022_CAMPIONATO/17.08.2022_CAMPIONATO/D2022.08.17_S00149_I4203_P_WELL01_CAMPIONATO")
work_dir = INTERIM_DATA_DIR / "tmp" # copy the folder in data/interim to not change the original files

work_dir.mkdir(parents=True, exist_ok=True)

tmp_folder = work_dir / (bad_folder.name + "_tmp")
shutil.copytree(bad_folder, tmp_folder)

for subfolder in tmp_folder.iterdir():
    if subfolder.is_dir():
        destination = INTERIM_DATA_DIR / subfolder.name
        logger.info(f"Moving {subfolder} -> {destination}")
        shutil.move(str(subfolder), str(destination))

shutil.rmtree(tmp_folder)

[32m2025-08-05 09:43:56.453[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mMoving C:\Users\Molinari\Desktop\embryo-project\data\interim\tmp\D2022.08.17_S00149_I4203_P_WELL01_CAMPIONATO_tmp\D2022.08.17_S00149_I4203_P_WELL01_CAMPIONATO -> C:\Users\Molinari\Desktop\embryo-project\data\interim\D2022.08.17_S00149_I4203_P_WELL01_CAMPIONATO[0m
[32m2025-08-05 09:43:56.455[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mMoving C:\Users\Molinari\Desktop\embryo-project\data\interim\tmp\D2022.08.17_S00149_I4203_P_WELL01_CAMPIONATO_tmp\D2022.08.17_S00149_I4203_P_WELL02_CAMPIONATO -> C:\Users\Molinari\Desktop\embryo-project\data\interim\D2022.08.17_S00149_I4203_P_WELL02_CAMPIONATO[0m
[32m2025-08-05 09:43:56.456[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mMoving C:\Users\Molinari\Desktop\embryo-project\data\interim\tmp\D2022.08.17_S00149_I4203_P_WELL01_CAMPIONATO_tmp\D2022.08.17_S00149_I4203_P_WELL0

In [4]:
# Automatically extract all D202X folders from DATASET_DIR
folder_pattern = re.compile(r'^D202[0-3]')
well_folder_pattern = re.compile(r'^D202[0-3].*WELL\d{1,2}')

found = 0
copied = 0
conflicts = 0

# Needed for those folders with long names
def to_long_path(path: Path) -> str:
    return r"\\?\{}".format(str(path.resolve()))

for folder in RAW_DATA_DIR.rglob("*"):
    if folder.is_dir() and well_folder_pattern.match(folder.name):
        found += 1
        destination = INTERIM_DATA_DIR / folder.name

        if not destination.exists():
            shutil.copytree(to_long_path(folder), to_long_path(destination))
            copied += 1
        else:
            conflicts += 1

logger.info(f"Total number of folders founded: {found}")
logger.success(f"Copied folders: {copied}")
logger.warning(f"Conflicts (already copied): {conflicts}")

[32m2025-08-05 09:44:12.144[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m24[0m - [1mTotal number of folders founded: 1166[0m
[32m2025-08-05 09:44:12.144[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [32m[1mCopied folders: 1134[0m


Every folder follows this pattern:

- `D[YEAR.MONTH.DAY]`
- `S[CODE]`
- `I[CODE]`
- `P` or `D`
- `WELL[CODE]`

Between every information, an underscore (`_`) or a whitespace (` `) can be found. Sometimes, folders have also an additional string `CAMPIONATO`, `CAMPIONATO_ok` `CAMPIONATO - Copia ([NUMBER])`.

Some examples:
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX_CAMPIONATO`
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX_CAMPIONATO_ok`
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX_CAMPIONATO - Copia (4)`
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX - CAMPIONATO`
- `DXXXX.XX.XX_SXXXXX_IXXXX_P_WELLXX`

In [5]:
# Delete empty folders
empty_deleted = 0

for folder in INTERIM_DATA_DIR.iterdir():
    if folder.is_dir() and not any(folder.iterdir()):
        folder.rmdir()
        empty_deleted += 1
        logger.info(f"Deleted empty folder: {folder.name}")

logger.success(f"Deleted folders: {empty_deleted}")

[32m2025-08-05 09:44:12.152[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mDeleted empty folder: D2020.09.30_S00018_I4203_P_WELL01_CAMPIONATO - Copia (4)[0m
[32m2025-08-05 09:44:12.161[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mDeleted empty folder: D2020.10.07_S00023_I4203_P_WELL16_CAMPIONATO[0m
[32m2025-08-05 09:44:12.202[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mDeleted empty folder: D2021.10.20_S00083_I4203_P_WELL12[0m
[32m2025-08-05 09:44:12.203[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mDeleted empty folder: D2021.10.20_S00083_I4203_P_WELL13[0m
[32m2025-08-05 09:44:12.203[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mDeleted empty folder: D2021.10.20_S00083_I4203_P_WELL14[0m
[32m2025-08-05 09:44:12.203[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mDeleted empty folder: D2

In [6]:
# Rename folders according to a common pattern
pattern = r'^(.*?WELL\d{1,2})'

pattern_matched = 0
renamed_folders = 0

for folderpath in INTERIM_DATA_DIR.iterdir():
    if not folderpath.is_dir():
        continue

    relative_str = str(folderpath.relative_to(INTERIM_DATA_DIR))
    match = re.match(pattern, relative_str)
    
    if match:
        cleaned_name = match.group(1)
        new_path = folderpath.parent / cleaned_name

        if folderpath.name != cleaned_name:
            # Rename only if names differ
            if not new_path.exists():
                folderpath.rename(new_path)
                renamed_folders += 1
            else:
                logger.warning(f"{new_path} already exists.")
        
        pattern_matched += 1
    else:
        logger.warning(f"No match for {relative_str}")

logger.info(f"Total folders: {len(list(INTERIM_DATA_DIR.iterdir()))}")
logger.success(f"Matched: {pattern_matched}, Renamed: {renamed_folders}")

[32m2025-08-05 09:44:12.703[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mTotal folders: 1119[0m
[32m2025-08-05 09:44:12.703[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [32m[1mMatched: 1118, Renamed: 919[0m


In [7]:
# Fix specific folder issue
old_to_new = {
    "D2022.03.02_S00116_I4203_P_WELL01": "D2022.03.02_S00116_I4203_P_WELL10",
    "D2022.03.02_S00116_I4203_P_WELL01_CAMPIONATO": "D2022.03.02_S00116_I4203_P_WELL01"
}

for old_name, new_name in old_to_new.items():
    old_path = INTERIM_DATA_DIR / old_name
    new_path = INTERIM_DATA_DIR / new_name

    if old_path.exists():
        if not new_path.exists():
            old_path.rename(new_path)
            logger.success(f"Renamed: {old_name} -> {new_name}")
        else:
            logger.warning(f"{new_name} already exists.")
    else:
        logger.warning(f"Folder not found: {old_name}")

[32m2025-08-05 09:44:12.709[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [32m[1mRenamed: D2022.03.02_S00116_I4203_P_WELL01 -> D2022.03.02_S00116_I4203_P_WELL10[0m
[32m2025-08-05 09:44:12.710[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [32m[1mRenamed: D2022.03.02_S00116_I4203_P_WELL01_CAMPIONATO -> D2022.03.02_S00116_I4203_P_WELL01[0m


In [8]:
# Fix folder names that ends with WELL1 instead of WELL01
for folder in INTERIM_DATA_DIR.rglob("*"):
    if folder.is_dir() and folder.name.endswith("WELL1"):
        new_name = folder.name.replace("WELL1", "WELL01")
        new_path = folder.parent / new_name
        logger.info(f"Renaming: {folder} -> {new_path}")
        folder.rename(new_path)

[32m2025-08-05 09:44:12.747[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mRenaming: C:\Users\Molinari\Desktop\embryo-project\data\interim\D2023.09.06_S00200_I4203_P_WELL1 -> C:\Users\Molinari\Desktop\embryo-project\data\interim\D2023.09.06_S00200_I4203_P_WELL01[0m


In [9]:
original_folders = [p for p in RAW_DATA_DIR.rglob('*') if p.is_dir()]
processed_folders = [p for p in INTERIM_DATA_DIR.iterdir() if p.is_dir()]

logger.info(f"Total number of original folders: {len(original_folders)}")
logger.info(f"Total number of processed folders: {len(processed_folders)}")
logger.info(f"{empty_deleted} out of {len(original_folders)-len(processed_folders)} folders were empty")

[32m2025-08-05 09:44:13.588[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mTotal number of original folders: 1208[0m
[32m2025-08-05 09:44:13.588[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mTotal number of processed folders: 1118[0m
[32m2025-08-05 09:44:13.588[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1m33 out of 90 folders were empty[0m
