In [8]:
import pandas as pd
import cv2
import os
import numpy as np
import tensorflow as tf


In [2]:
import PIL
print(PIL.__version__)  # Check if it’s installed correctly

11.1.0


In [None]:

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import array_to_img

Extracting image for filepath.csv


In [None]:

# Load the CSV file
df = pd.read_csv("../data/train.csv")  # Change to your actual file path

# Define output folder to save images
output_folder = "../data/extracted_images"
os.makedirs(output_folder, exist_ok=True)

# Function to load and save images
def extract_images(df, output_folder, target_size=(150, 150)):
    for idx, row in df.iterrows():
        filepath = row["filepath"]
        image = cv2.imread(filepath)  # Read image
        if image is not None:
            image = cv2.resize(image, target_size)  # Resize
            save_path = os.path.join(output_folder, f"image_{idx}.png")
            cv2.imwrite(save_path, image)  # Save image
            print(f"Saved: {save_path}")
        else:
            print(f"Could not load: {filepath}")

# Extract and save images
extract_images(df, output_folder)


In [4]:
import numpy as np

def preprocess_image(filepath, target_size=(128, 128)):
    """Load and preprocess an image from the given file path."""
    image = cv2.imread(filepath)  # Read the image
    if image is None:
        return None  # Skip missing/corrupt images
    image = cv2.resize(image, target_size)  # Resize image
    image = image / 255.0  # Normalize pixel values to [0, 1]
    return image



def get_image_data(path, x_label="filepath", y_label="label"):
    df = pd.read_csv(path)
    print(df.columns)
    X = []
    y = []
    for filepath, label in df[['filepath', 'label']].values:  # Ignore magnification for now
        image = preprocess_image(filepath)
        if image is not None:
            X.append(image)
            y.append(1 if label == "malignant" else 0)  # Convert labels to binary (0: benign, 1: malignant)
    
    X = np.array(X)
    y = np.array(y)
    return X, y


In [5]:
X, y = get_image_data("../data/toy_dataset.csv")
print(f"Loaded {len(X)} images and {len(y)} labels.")
print(f"Image shape: {X.shape}, Label shape: {y.shape}")

Index(['Unnamed: 0', 'filepath', 'label', 'magnification', 'tumor_subtype'], dtype='object')
Loaded 10 images and 10 labels.
Image shape: (10, 128, 128, 3), Label shape: (10,)


Check if file exist with given file path

In [3]:
# Load the CSV file
csv_path = "../data/toy_dataset.csv"  # Update with your actual CSV file path
df = pd.read_csv(csv_path)

# Check if file paths exist
df["file_exists"] = df["filepath"].apply(os.path.isfile)

# Print missing files
missing_files = df[~df["file_exists"]]

if not missing_files.empty:
    print(f"❌ {len(missing_files)} missing image files detected!")
    print(missing_files[["filepath"]].head())  # Show first few missing files
else:
    print("✅ All image files exist!")


✅ All image files exist!


Data Augmentation

In [26]:

train_df = pd.read_csv("../data/toy_dataset.csv")  # CSV containing file paths & labels

save_dir = "../data/augmented_images"
os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist


# Create an image data generator with augmentation
datagen = ImageDataGenerator(
    rotation_range = 45,   # Rotate images up to 30 degrees
    horizontal_flip = True,   # Flip images horizontally
    rescale=1./255      #normalise pixel values
)


# Load images in batches
train_generator = datagen.flow_from_dataframe(
    dataframe = train_df,
    directory = "" ,  # Folder where images are stored
    x_col="filepath",  # Column containing image file paths
    y_col="label",  # Column with target labels (label or tumor_subtype)
    
    #target_size=(150, 150),  # resize image
    batch_size=32,  # 32 images per batch
    class_mode='binary',  #outcome ('categprical' for multiclass)

    save_to_dir=save_dir,      # Save augmented images
    save_prefix='aug',         # Prefix for saved images
    save_format='png'         # Format of saved images
)





Found 10 validated image filenames belonging to 2 classes.


In [27]:

# List to store new image metadata
augmented_data = []

# Process each image in the dataset
for index, row in train_df.iterrows():
    img_path = row["filepath"]
    
    # Load image
    try:
        image = load_img(img_path)  # Load original image
        image = img_to_array(image)  # Convert to array
        image = np.expand_dims(image, axis=0)  # Add batch dimension

        # Generate one augmented image
        batch = next(datagen.flow(image, batch_size=1))
        new_filename = f"aug_{index}.png"
        new_filepath = os.path.join(save_dir, new_filename)

        # Save the augmented image
        array_to_img(batch[0]).save(new_filepath)

        # Append new metadata row
        augmented_data.append([new_filepath] + row.tolist()[1:])  # Keep original metadata

    except Exception as e:
        print(f"Error processing {img_path}: {e}")

# Create new DataFrame with augmented data
augmented_df = pd.DataFrame(augmented_data, columns=train_df.columns)

# Save new CSV with augmented image paths & metadata
augmented_df.to_csv("../data/augmented_dataset.csv", index=False)

print(f"Augmented dataset saved to ../data/augmented_dataset.csv")

Augmented dataset saved to ../data/augmented_dataset.csv


Count the number of images in folder

In [28]:
folder_path = "../data/augmented_images"
image_count = len([f for f in os.listdir(folder_path) if f.endswith(('.png', '.jpg', '.jpeg'))])

print(f"Total images: {image_count}")

Total images: 10


In [24]:

try_data = pd.read_csv("../data/metadata.csv")
try_data.info

<bound method DataFrame.info of       Unnamed: 0                                           filepath  \
0              0  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
1              1  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
2              2  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
3              3  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
4              4  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
...          ...                                                ...   
7904        7904  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
7905        7905  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
7906        7906  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
7907        7907  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
7908        7908  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   

          label magnification      tumor_subtype  
0        benign          100X    tubular_adenoma  
1        beni

Unnamed: 0.1,Unnamed: 0,filepath,label,magnification,tumor_subtype
0,5445,../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...,malignant,200X,ductal_carcinoma
1,558,../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...,benign,200X,tubular_adenoma
2,5308,../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...,malignant,200X,ductal_carcinoma
3,4521,../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...,malignant,400X,ductal_carcinoma
4,7230,../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...,malignant,40X,ductal_carcinoma


In [16]:
try_data.info

<bound method DataFrame.info of       Unnamed: 0                                           filepath  \
0              0  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
1              1  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
2              2  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
3              3  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
4              4  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
...          ...                                                ...   
7904        7904  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
7905        7905  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
7906        7906  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
7907        7907  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   
7908        7908  ../data/versions/4/BreaKHis_v1/BreaKHis_v1/his...   

          label magnification      tumor_subtype  
0        benign          100X    tubular_adenoma  
1        beni

ResNet50


In [None]:
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dense, GlobalAveragePooling2D, Add
from tensorflow.keras import Model

#defined function to build convolution block

def conv_block(x, filters, kernel_size, strides, padding='same'):
    x = Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    return x

In [None]:
# defined identity_block and projection block

def identity_block(x, filters):
    shortcut = x
    x = conv_block(x, filters=filters, kernel_size=(1, 1), strides=(1, 1))
    x = conv_block(x, filters=filters, kernel_size=(3, 3), strides=(1, 1))
    x = Conv2D(filters=filters * 4, kernel_size=(1, 1))(x)
    x = BatchNormalization()(x)
    x = Add()([x, shortcut])
    x = Activation('relu')(x)
    return x

def projection_block(x, filters, strides):
    shortcut = x
    x = conv_block(x, filters=filters, kernel_size=(1, 1), strides=strides)
    x = conv_block(x, filters=filters, kernel_size=(3, 3), strides=(1, 1))
    x = Conv2D(filters=filters * 4, kernel_size=(1, 1))(x)
    x = BatchNormalization()(x)
    shortcut = Conv2D(filters=filters * 4, kernel_size=(1, 1), strides=strides)(shortcut)
    shortcut = BatchNormalization()(shortcut)
    x = Add()([x, shortcut])
    x = Activation('relu')(x)
    return x

In [None]:
def ResNet50(input_shape=(224, 224, 3), num_classes=1000):
    inputs = Input(shape=input_shape)
    
    # initial conv layer
    x = conv_block(inputs, filters=64, kernel_size=(7, 7), strides=(2, 2), padding='')
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)

    # conv block 1
    x = projection_block(x, filters=64, strides=(1, 1))
    x = identity_block(x, filters=64)
    x = identity_block(x, filters=64)

    # conv block 2
    x = projection_block(x, filters=128, strides=(2, 2))
    x = identity_block(x, filters=128)
    x = identity_block(x, filters=128)
    x = identity_block(x, filters=128)

    # conv block 3
    x = projection_block(x, filters=256, strides=(2, 2))
    x = identity_block(x, filters=256)
    x = identity_block(x, filters=256)
    x = identity_block(x, filters=256)
    x = identity_block(x, filters=256)
    x = identity_block(x, filters=256)

    # conv block 4
    x = projection_block(x, filters=512, strides=(2, 2))
    x = identity_block(x, filters=512)
    x = identity_block(x, filters=512)

    # global average pooling and dense layer
    x = GlobalAveragePooling2D()(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

In [None]:
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    def __init__(self, in_channels):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)

    def forward(self, x):
        residual = x  # Skip connection
        out = self.conv1(x)
        out = self.relu(out)
        out = self.conv2(out)
        out += residual  # Adding input back
        return self.relu(out)


In [1]:
import torch
import torch.nn as nn

In [None]:
class block(nn.Module):
    def __init__(self,in_channels,out_channels, identity_downsample =None, stride=1):
        super(block,self).__init__()
        self.expansion =4 
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1,stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=1,stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels*self.expansion, kernel_size=1,stride=1, padding=0)
        self.bn3 = nn.BatchNorm2d(out_channels*self.expansion)
        self.relu = nn.Relu()
        self.identity_downsample = identity_downsample

    def forward(self,x):
        identity = x
        x=self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x=self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x=self.conv3(x)
        x = self.bn3(x)

        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        x += identity
        x =self.relu(x)
        return x
       

class ResNet(nn.Module):  #[3,4,6,3] how many time we want to reuse the block in a list
    def __init__(self, block,layers, image_channels, num_classes):
        super(ResNet, self).__init__()
        self.in_channels = 64
        self.conv1 = nn.Conv2d(image_channels,64, kernel_size=7, stride=2,padding=3) #initial layer
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2,padding=1)

        #ResNet layers
        self.layer1 = self._make_layer(block,layers[0],out_channels=64, stride =1)
        self.layer2 = self._make_layer(block,layers[1],out_channels=128, stride =2)
        self.layer3 = self._make_layer(block,layers[2],out_channels=256, stride =2)
        self.layer4 = self._make_layer(block,layers[3],out_channels=512, stride =2)  #2048 out_channels at the end

        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc == nn.Linear(512*4, num_classes)

    def forward(self,x):
        x = self.conv1(x)
        x =self.bn1(x)
        x = self.rule(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x=self.avgpool(x)
        x = x.reshape(x.shape[0], -1)

        return x

    def _make_layer(self,block, num_residual_block, out_channels, stride):
        identity_downsample =None
        layers = []

        if stride !=1 or self.in_channels != out_channels * 4
            identity_downsample = nn.Sequential(nn.Conv2d(self.in_channels, out_channels*4, kernel_size=1, stride =stride),
                                                nn.BatchNorm2d(out_channels*4))
            
        layers.append(block(self.in_channels, out_channels, identity_downsample,stride))
        self.in_channels = self.out_channels*4 

        for i in range(num_residual_block -1 ):
            layers.append(block(self.in_channels, out_channels))  # input 256 -> 64 -> 256 again

        return nn.Sequential(layers)


def ResNet50(img_channels=3, num_classes=1000):
    return ResNet(block, [3,4,6,3], img_channels, num_classes)

def ResNet101(img_channels=3, num_classes=1000):
    return ResNet(block, [3,4,23,3], img_channels, num_classes)  #change the numbr of layers

def test():
    net = ResNet50
    x = torch.randn(2,3,224,224)
    y = net(x).to('cuda')
    print(y.shape)