# Create the corrected, detailed .csv file with relative paths


### Imports

In [1]:
import pandas as pd

# Mount Google Drive at /content/drive; the cells below read from and write to
# paths under this mount point, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Fix missing values in the file

In [2]:
# Folder on the mounted Drive that the resized images are written to further
# down (this variable itself is not referenced again in the cells shown here).
images_folder = '/content/drive/MyDrive/Deep Learing Project/images_plain'

# Read the per-image metadata table and display it.
image_data_csv = '/content/drive/MyDrive/Deep Learing Project/image_data.csv'
imageDataframe = pd.read_csv(image_data_csv)
imageDataframe

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
0,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
1,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
2,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
3,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
4,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
...,...,...,...,...
7904,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7905,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7906,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7907,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X


In [3]:
# List the rows whose 'Benign or Malignant' label is missing.
missing_label_mask = imageDataframe['Benign or Malignant'].isna()
imageDataframe[missing_label_mask]

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
2871,BreaKHis_v1/histology_slides/breast/malignant/...,,,
3228,BreaKHis_v1/histology_slides/breast/malignant/...,,,
4536,BreaKHis_v1/histology_slides/breast/malignant/...,,,


In [4]:
# Every row without a label lives under a .../malignant/ path (see the output
# of the check above), so the gaps are filled with 'Malignant'.
label_column = 'Benign or Malignant'
imageDataframe[label_column] = imageDataframe[label_column].fillna('Malignant')

In [5]:
# List the rows whose 'Cancer Type' is missing.
missing_type_mask = imageDataframe['Cancer Type'].isna()
imageDataframe[missing_type_mask]

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
2871,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,,
3093,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,,
3228,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,,
4536,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,,


In [6]:
# Fill every missing 'Cancer Type' with Mucinous Carcinoma ...
imageDataframe['Cancer Type'] = imageDataframe['Cancer Type'].fillna('Mucinous Carcinoma')

# ... then overwrite row 4536 with its correct value.
# The column is addressed by name rather than by position so the fix cannot
# silently land in a different column if the column order ever changes.
# 4536 is the index label shown by the missing-value check above; the frame
# comes straight from read_csv with its default index, so label == position.
imageDataframe.loc[4536, 'Cancer Type'] = 'Ductal Carcinoma'

In [7]:
# List the rows that have no 'Magnification' value.
missing_magnification_mask = imageDataframe['Magnification'].isna()
imageDataframe[missing_magnification_mask]

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
2871,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Mucinous Carcinoma,
3093,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Mucinous Carcinoma,
3228,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Mucinous Carcinoma,
4536,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Ductal Carcinoma,


In [8]:
# Fill in the missing magnification of each affected row.
# Keys are the index labels reported by the missing-value check above (the
# frame keeps read_csv's default index, so label == position); values are the
# magnification to store. The column is selected by name instead of a
# positional index so a change in column order cannot redirect the write.
magnification_fixes = {
    2871: '100X',
    3093: '200X',
    3228: '400X',
    4536: '40X',
}
for row_label, magnification in magnification_fixes.items():
    imageDataframe.loc[row_label, 'Magnification'] = magnification

In [9]:
# Count the remaining missing values per column; every count should be 0.
null_counts = imageDataframe.isnull().sum()
null_counts

Unnamed: 0,0
path_to_image,0
Benign or Malignant,0
Cancer Type,0
Magnification,0


### Add relative path to the files

In [10]:
# Take the last path component (the image file name) and store it in its own column.
imageDataframe["file_name"] = imageDataframe["path_to_image"].str.split("/").str[-1]

# 'rel_path' places every file directly under /tmp/. Note that despite the
# column name the resulting value is an absolute path ('/tmp/<file_name>').
# Vectorised string concatenation replaces the per-row apply/lambda.
imageDataframe['rel_path'] = '/tmp/' + imageDataframe['file_name']

imageDataframe

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification,file_name,rel_path
0,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-011.png,/tmp/SOB_B_A-14-22549AB-100-011.png
1,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-005.png,/tmp/SOB_B_A-14-22549AB-100-005.png
2,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-004.png,/tmp/SOB_B_A-14-22549AB-100-004.png
3,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-010.png,/tmp/SOB_B_A-14-22549AB-100-010.png
4,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-006.png,/tmp/SOB_B_A-14-22549AB-100-006.png
...,...,...,...,...,...,...
7904,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X,SOB_M_LC-14-12204-200-028.png,/tmp/SOB_M_LC-14-12204-200-028.png
7905,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X,SOB_M_LC-14-12204-200-029.png,/tmp/SOB_M_LC-14-12204-200-029.png
7906,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X,SOB_M_LC-14-12204-200-006.png,/tmp/SOB_M_LC-14-12204-200-006.png
7907,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X,SOB_M_LC-14-12204-200-039.png,/tmp/SOB_M_LC-14-12204-200-039.png


In [11]:
# Re-count missing values per column now that file_name and rel_path exist.
null_counts_with_paths = imageDataframe.isnull().sum()
null_counts_with_paths

Unnamed: 0,0
path_to_image,0
Benign or Malignant,0
Cancer Type,0
Magnification,0
file_name,0
rel_path,0


### Save .csv file

In [12]:
# Write the cleaned table to the delivery folder, leaving out the index column.
processed_csv_path = '/content/drive/MyDrive/Deep Learing Project/delivery/processed_image_data_final.csv'
imageDataframe.to_csv(processed_csv_path, index=False)

# Build file structure for running the model



In [18]:
from tqdm import tqdm
from PIL import Image
import os
import concurrent.futures

# Define input and output directories.
# input_dir is the project folder on the mounted Drive; each row's
# path_to_image is joined onto it to locate the source image.
input_dir = '/content/drive/MyDrive/Deep Learing Project/'
# output_dir resolves to the same folder as images_folder defined in the
# data-loading cell ('<project>/images_plain').
output_dir = os.path.join(input_dir, 'images_plain')
# Create the output folder; exist_ok=True means no error if it already exists.
os.makedirs(output_dir, exist_ok=True)

# Function to process and save a single image
def process_and_save_image(row):
    """Resize one source image to 256x256 and save it into ``output_dir``.

    Parameters
    ----------
    row
        One record of ``imageDataframe`` as yielded by ``itertuples``. Only
        ``row.path_to_image`` (joined onto ``input_dir``) and
        ``row.file_name`` (joined onto ``output_dir``) are read.

    Returns
    -------
    None
        The effect of the call is the resized file written to disk.

    Notes
    -----
    Processing is best-effort: an exception while opening, resizing or saving
    is caught so that one bad file does not abort the whole batch. The failure
    is reported through ``logging`` with the affected paths instead of being
    discarded, so missing outputs can be traced after the run.
    """
    import logging  # local import keeps this cell independent of the import cell

    input_path = os.path.join(input_dir, row.path_to_image)
    output_path = os.path.join(output_dir, row.file_name)
    try:
        with Image.open(input_path) as img:
            # resize() returns a new image; the output format is chosen by
            # save() from the extension of output_path.
            img.resize((256, 256)).save(output_path)
    except Exception as exc:
        # Keep going, but leave a trace of what failed and why.
        logging.getLogger(__name__).warning(
            "Could not process %s -> %s: %r", input_path, output_path, exc
        )

# Iterate the frame as lightweight tuples (index column left out).
image_rows = imageDataframe.itertuples(index=False)

# Hand every row to a thread pool; the work is dominated by file I/O.
with concurrent.futures.ThreadPoolExecutor() as pool:
    outcomes = pool.map(process_and_save_image, image_rows)
    # tqdm wraps the lazy result iterator to draw a progress bar; list()
    # drains it so the cell waits until every image has been handled.
    progress = tqdm(outcomes, total=len(imageDataframe))
    list(progress)

100%|██████████| 7909/7909 [1:35:21<00:00,  1.38it/s]
