# Image preprocessing ([#3](https://github.com/molinari135/embryo-project/issues/3))

In this notebook, we perform the following operations:

1. Rename all columns to English
2. Remove all rows that have `NaN` in the `folder` or `image` field
3. Remove columns that will not be used
4. Change the label using 0s and 1s instead of strings
5. Save the cleaned annotations as a `.tsv` file

In order to perform all these operations, it is **required** to put the Excel file, renamed as `annotations.xlsx`, in the `data/raw` directory and to run the previous notebooks first, following the numbering convention.

In [45]:
import pandas as pd

from pathlib import Path
from loguru import logger
from embryo_project.config import RAW_DATA_DIR, ANNOTATIONS_FILE

In [46]:
# Read the raw annotation spreadsheet from the raw-data directory and preview it.
file_path = RAW_DATA_DIR / "annotations.xlsx"
df = pd.read_excel(io=file_path)
df.head()

Unnamed: 0,#,Anno,Nome cartella,etichetta,"foto ""cruciale""",tot elementi,commenti
0,1.0,2020.0,D2020.10.07_S00020_I4203_P_WELL01,blastocisti no,D2020.10.07_S00020_I4203_P_WELL01_RUN361,765.0,
1,2.0,2020.0,D2020.10.07_S00020_I4203_P_WELL02,blastocisti no,D2020.10.07_S00020_I4203_P_WELL02_RUN104,765.0,
2,3.0,2020.0,D2020.10.07_S00020_I4203_P_WELL03,blastocisti no,D2020.10.07_S00020_I4203_P_WELL03_RUN296,765.0,
3,4.0,2020.0,D2020.10.07_S00020_I4203_P_WELL04,blastocisti no,D2020.10.07_S00020_I4203_P_WELL04_RUN198,765.0,
4,5.0,2020.0,D2020.10.07_S00020_I4203_P_WELL05,blastocisti no,D2020.10.07_S00020_I4203_P_WELL05_RUN156,765.0,bolla d'aria da D2020.10.07_S00020_I4203_P_WEL...


In [50]:
# Distinct raw values of the "etichetta" (label) column, shown before any cleaning
# so that spelling/case variants and missing entries are visible.
df["etichetta"].unique()

array(['blastocisti no', nan, 'blastocisti si', 'Blastocisti no',
       'Blastocisti si', 'blastocisti si '], dtype=object)

In [None]:
# Map the Italian spreadsheet headers to the English names used in the rest of
# the notebook. The key for "image" has to match the header exactly, including
# its embedded quotes and trailing space.
column_names = {
    "Anno": "year",
    "Nome cartella": "folder",
    "etichetta": "label",
    "foto \"cruciale\" ": "image",
    "tot elementi": "elements",
    "commenti": "comments"
}
df = df.rename(columns=column_names)

# `rename` silently skips keys that match no column, so a changed header would
# otherwise only show up later as an unrelated KeyError. Fail here instead.
missing_columns = set(column_names.values()) - set(df.columns)
assert not missing_columns, f"Columns missing after rename: {sorted(missing_columns)}"

# Row count before cleaning; a later cell uses it to report how many rows were removed.
original_rows = df.shape[0]
logger.info(f"This dataset has {original_rows} rows")
df.head()

[32m2025-08-04 16:43:45.801[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mThis dataset has 1071 rows[0m


Unnamed: 0,#,year,folder,label,image,elements,comments
0,1.0,2020.0,D2020.10.07_S00020_I4203_P_WELL01,blastocisti no,D2020.10.07_S00020_I4203_P_WELL01_RUN361,765.0,
1,2.0,2020.0,D2020.10.07_S00020_I4203_P_WELL02,blastocisti no,D2020.10.07_S00020_I4203_P_WELL02_RUN104,765.0,
2,3.0,2020.0,D2020.10.07_S00020_I4203_P_WELL03,blastocisti no,D2020.10.07_S00020_I4203_P_WELL03_RUN296,765.0,
3,4.0,2020.0,D2020.10.07_S00020_I4203_P_WELL04,blastocisti no,D2020.10.07_S00020_I4203_P_WELL04_RUN198,765.0,
4,5.0,2020.0,D2020.10.07_S00020_I4203_P_WELL05,blastocisti no,D2020.10.07_S00020_I4203_P_WELL05_RUN156,765.0,bolla d'aria da D2020.10.07_S00020_I4203_P_WEL...


In [None]:
# Discard every row that lacks a folder or an image reference
# (there are 4 images with NaN values), then report how many rows were removed.
required_fields = ['folder', 'image']
df = df.dropna(subset=required_fields)
cleaned_rows = len(df)
logger.success(f"Removed {original_rows - cleaned_rows} NaN rows")

[32m2025-08-04 16:43:45.810[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [32m[1mRemoved 77 NaN rows[0m


In [None]:
# Keep only the columns needed downstream; '#', 'year', 'elements' and
# 'comments' are discarded. Selecting the retained columns (instead of dropping
# in place) lets this cell be re-run without raising a KeyError, and `.copy()`
# gives an independent frame so later column assignments do not act on a view.
kept_columns = ['folder', 'label', 'image']
df = df[kept_columns].copy()
df.head()

Unnamed: 0,folder,label,image
0,D2020.10.07_S00020_I4203_P_WELL01,blastocisti no,D2020.10.07_S00020_I4203_P_WELL01_RUN361
1,D2020.10.07_S00020_I4203_P_WELL02,blastocisti no,D2020.10.07_S00020_I4203_P_WELL02_RUN104
2,D2020.10.07_S00020_I4203_P_WELL03,blastocisti no,D2020.10.07_S00020_I4203_P_WELL03_RUN296
3,D2020.10.07_S00020_I4203_P_WELL04,blastocisti no,D2020.10.07_S00020_I4203_P_WELL04_RUN198
4,D2020.10.07_S00020_I4203_P_WELL05,blastocisti no,D2020.10.07_S00020_I4203_P_WELL05_RUN156


In [None]:
# Canonical label spellings (after trimming and lower-casing) and their binary codes.
LABEL_CODES = {'blastocisti si': 1, 'blastocisti no': 0}


def _encode_label(value):
    """Return 1 or 0 for a recognised label string; return any other value unchanged.

    Matching ignores surrounding whitespace and letter case. Non-string values
    (e.g. NaN or an already-encoded integer) are passed through as they are.
    """
    if isinstance(value, str):
        return LABEL_CODES.get(value.strip().lower(), value)
    return value


df['label'] = df['label'].apply(_encode_label)

# Unrecognised spellings and missing labels are passed through unchanged, which
# would leave a mixed-type label column in the exported file. Surface them here
# rather than letting them slip through silently.
unencoded = df.loc[~df['label'].isin([0, 1]), 'label']
if not unencoded.empty:
    logger.warning(
        f"{len(unencoded)} rows have a label that is not 0/1: {unencoded.unique().tolist()}"
    )

df.head()

Unnamed: 0,folder,label,image
0,D2020.10.07_S00020_I4203_P_WELL01,0,D2020.10.07_S00020_I4203_P_WELL01_RUN361
1,D2020.10.07_S00020_I4203_P_WELL02,0,D2020.10.07_S00020_I4203_P_WELL02_RUN104
2,D2020.10.07_S00020_I4203_P_WELL03,0,D2020.10.07_S00020_I4203_P_WELL03_RUN296
3,D2020.10.07_S00020_I4203_P_WELL04,0,D2020.10.07_S00020_I4203_P_WELL04_RUN198
4,D2020.10.07_S00020_I4203_P_WELL05,0,D2020.10.07_S00020_I4203_P_WELL05_RUN156


In [None]:
# `to_csv` does not create missing directories, so make sure the destination
# folder exists before writing (no-op when it is already there).
Path(ANNOTATIONS_FILE).parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned annotations as tab-separated values, without the index column.
df.to_csv(ANNOTATIONS_FILE, sep="\t", index=False)
logger.success(f"The file has been saved in {ANNOTATIONS_FILE}")

[32m2025-08-04 16:43:45.830[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [32m[1mThe file has been saved in C:\Users\Molinari\Desktop\embryo-project\data\processed\annotations.tsv[0m
