In [8]:
import pandas as pd
import cv2
import os

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import PIL



Extract images from the file paths listed in the CSV


In [None]:

# Read the table of image records; its "filepath" column is consumed below.
train_csv_path = "../data/train.csv"  # Change to your actual file path
df = pd.read_csv(train_csv_path)

# Destination directory for the resized copies; created if it is not there yet.
output_folder = "../data/extracted_images"
os.makedirs(output_folder, exist_ok=True)

# Function to load and save images
def extract_images(df, output_folder, target_size=(150, 150)):
    """Resize every image listed in ``df["filepath"]`` and save it as a PNG.

    Each readable image is written to ``output_folder`` as
    ``image_<index>.png``, where ``<index>`` is the row's index label in
    ``df``. One status line is printed per row.

    Args:
        df: DataFrame with a ``"filepath"`` column of image paths.
        output_folder: Directory that receives the resized images. It is
            created if it does not exist.
        target_size: Size passed to ``cv2.resize``.

    Returns:
        None.
    """
    # Make the function self-sufficient: a missing destination directory would
    # otherwise make every write fail.
    os.makedirs(output_folder, exist_ok=True)

    for idx, filepath in df["filepath"].items():
        image = cv2.imread(filepath)  # None when the file cannot be read
        if image is None:
            print(f"Could not load: {filepath}")
            continue

        image = cv2.resize(image, target_size)
        save_path = os.path.join(output_folder, f"image_{idx}.png")
        # cv2.imwrite reports failure through its return value, so only claim
        # success when it says the file was written.
        if cv2.imwrite(save_path, image):
            print(f"Saved: {save_path}")
        else:
            print(f"Could not save: {save_path}")

# Run the extraction over every row of the loaded table.
extract_images(df=df, output_folder=output_folder)


In [4]:
import numpy as np

def preprocess_image(filepath, target_size=(128, 128)):
    """Read one image, resize it, and scale its pixel values by 1/255.

    Args:
        filepath: Path of the image file handed to ``cv2.imread``.
        target_size: Size passed to ``cv2.resize``.

    Returns:
        The resized image divided by 255.0, or ``None`` when ``cv2.imread``
        could not load the file (so callers can skip it).
    """
    raw = cv2.imread(filepath)
    if raw is not None:
        resized = cv2.resize(raw, target_size)
        # Dividing by 255.0 maps 8-bit pixel values onto [0, 1].
        return resized / 255.0
    return None



def get_image_data(path, x_label="filepath", y_label="label"):
    """Load preprocessed images and binary labels described by a CSV file.

    Args:
        path: CSV file to read with ``pd.read_csv``.
        x_label: Name of the column holding image file paths.
        y_label: Name of the column holding class labels.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` holds the result of
        ``preprocess_image`` for every row whose image could be loaded; ``y``
        holds 1 where the label equals ``"malignant"`` and 0 otherwise, in the
        same row order.

    Raises:
        KeyError: If ``x_label`` or ``y_label`` is not a column of the CSV.
    """
    df = pd.read_csv(path)
    print(df.columns)
    X = []
    y = []
    # Honour the caller-supplied column names; all other columns are ignored.
    for filepath, label in zip(df[x_label], df[y_label]):
        image = preprocess_image(filepath)
        if image is None:
            continue  # Skip rows whose image is missing or unreadable
        X.append(image)
        y.append(1 if label == "malignant" else 0)  # 0: benign, 1: malignant

    return np.array(X), np.array(y)


In [5]:
# Build the image/label arrays from the toy dataset and report what was loaded.
X, y = get_image_data("../data/toy_dataset.csv")
print(
    f"Loaded {len(X)} images and {len(y)} labels.",
    f"Image shape: {X.shape}, Label shape: {y.shape}",
    sep="\n",
)

Index(['Unnamed: 0', 'filepath', 'label', 'magnification', 'tumor_subtype'], dtype='object')
Loaded 10 images and 10 labels.
Image shape: (10, 128, 128, 3), Label shape: (10,)


Check that the file at each given file path exists

In [3]:
# Read the dataset table whose image paths are to be verified.
csv_path = "../data/toy_dataset.csv"  # Update with your actual CSV file path
df = pd.read_csv(csv_path)

# Flag each row with whether its path points at an existing regular file.
df["file_exists"] = df["filepath"].apply(os.path.isfile)

# Keep only the rows whose file is absent.
missing_files = df.loc[~df["file_exists"]]

if missing_files.empty:
    print("✅ All image files exist!")
else:
    print(f"❌ {len(missing_files)} missing image files detected!")
    print(missing_files[["filepath"]].head())  # Show first few missing files


✅ All image files exist!


Data Augmentation

In [11]:

train_df = pd.read_csv("../data/toy_dataset.csv")  # CSV containing file paths & labels

save_dir = "../data/augmented_images"
os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist


# Image data generator: random augmentation plus pixel rescaling
datagen = ImageDataGenerator(
    rotation_range=45,      # Random rotations of up to 45 degrees
    horizontal_flip=True,   # Randomly flip images horizontally
    rescale=1./255          # Multiply pixel values by 1/255
)


# Batch iterator over the images listed in the dataframe
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=None,         # None: x_col values are used as complete file paths
    x_col="filepath",       # Column containing image file paths
    y_col="label",          # Column with target labels (label or tumor_subtype)

    # target_size is not passed, so the generator's default image size applies.
    batch_size=32,          # 32 images per batch
    class_mode='binary',    # Two-class labels ('categorical' for multi-class)
    seed=42,                # Fixed seed so shuffling and augmentation are repeatable

    save_to_dir=save_dir,   # Save augmented images
    save_prefix='aug',      # Prefix for saved images
    save_format='png'       # Format of saved images
)





Found 10 validated image filenames belonging to 2 classes.


In [12]:
# Pull a single batch from the generator and summarise it.
# Loading images here requires PIL (Pillow); without it this cell raises
# "ImportError: Could not import PIL.Image".
batch_iterator = iter(train_generator)
batch = next(batch_iterator)
print(f"Batch shape: {batch[0].shape}, Labels: {batch[1]}")

ImportError: Could not import PIL.Image. The use of `load_img` requires PIL.