In [4]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision
from torchvision import models
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
from tqdm import tqdm
import shutil
import glob
import cv2 as cv
import matplotlib.pyplot as plt

In [15]:
os.chdir("/rds/general/user/ft824/home/ML_BreakHis/scr")

In [4]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#download pretrained weights 
#model = ResNet50(weights='imagenet', include_top=False)


In [23]:
# get pretrained weight without last layer
resnet_weights_path = '/rds/general/user/ft824/home/.keras/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'

CHANNELS = 3
IMAGE_RESIZE = 224
NUM_CLASSES = 2 # change this to match your number of output classes
DENSE_LAYER_ACTIVATION = 'sigmoid'  # use 'softmax' for categorical classification
RESNET50_POOLING_AVERAGE = 'avg'  
OBJECTIVE_FUNCTION = 'categorical_crossentropy'

# Common accuracy metric for all outputs, but can use different metrics for different output
LOSS_METRICS = ['accuracy']

# EARLY_STOP_PATIENCE must be < NUM_EPOCHS
NUM_EPOCHS = 10
EARLY_STOP_PATIENCE = 3

# These steps value should be proper FACTOR of no.-of-images in train & valid folders respectively
STEPS_PER_EPOCH_TRAINING = 10
STEPS_PER_EPOCH_VALIDATION = 10

#BATCH_SIZE sould be FACTOR of no of img in train and validation
BATCH_SIZE_TRAINING = 32
BATCH_SIZE_VALIDATION = 16

In [6]:
# Build the model
model = Sequential()

# Add pre-trained ResNet50 as the base (without the top classifier layer)
model.add(ResNet50(
    include_top=False,
    pooling=RESNET50_POOLING_AVERAGE,
    weights=resnet_weights_path,
    input_shape=(224, 224, 3)  # or your image size
))

# Freeze the base model, not to train first layer
model.layers[0].trainable = False

# Add output layer for classification
model.add(Dense(NUM_CLASSES, activation=DENSE_LAYER_ACTIVATION))

In [7]:
model.summary()

In [8]:
from tensorflow.keras.optimizers import SGD

# Define optimizer
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)

# Compile model
model.compile(optimizer=sgd, loss=OBJECTIVE_FUNCTION, metrics=LOSS_METRICS)




In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import preprocess_input

# Load your CSV files
train_df = pd.read_csv('../data/augmented_train_dataset.csv')
#augment_train = pd.read_csv('../data/augmented_dataset.csv')
#train_df = pd.concat([train_df, augment_train], axis=0, ignore_index=True)

test_df = pd.read_csv('../data/new_test.csv')

image_size = IMAGE_RESIZE  # for ResNet50


# Define the ImageDataGenerator with preprocessing
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)


# Training generator
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='filepath',    # column with image file paths
    y_col='label',       # column with image labels
    target_size=(image_size, image_size),  # resizing to match ResNet50 input size
    batch_size=BATCH_SIZE_TRAINING,
    class_mode='categorical' # multi-class classification
)

# Test generator
test_generator = datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col='filepath',    # column with image file paths
    y_col='label',       # column with image labels
    target_size=(image_size, image_size),  # resizing to match ResNet50 input size
    batch_size=BATCH_SIZE_VALIDATION,
    class_mode='categorical'
)

Found 4458 validated image filenames belonging to 2 classes.
Found 1483 validated image filenames belonging to 2 classes.


In [21]:
(BATCH_SIZE_TRAINING, len(train_generator), BATCH_SIZE_VALIDATION, len(test_generator))

(32, 140, 16, 93)

In [None]:
##check for missign files
print(train_df['filepath'].head())
missing = train_df[~train_df['filepath'].apply(os.path.exists)]
print(f"Missing files: {len(missing)}")
print(missing.head())


0    ../../.cache/kagglehub/datasets/ambarish/break...
1    ../../.cache/kagglehub/datasets/ambarish/break...
2    ../../.cache/kagglehub/datasets/ambarish/break...
3    ../../.cache/kagglehub/datasets/ambarish/break...
4    ../../.cache/kagglehub/datasets/ambarish/break...
Name: filepath, dtype: object
Missing files: 0
Empty DataFrame
Columns: [filepath, label, magnification, tumor_subtype, Unnamed: 0]
Index: []


In [22]:
# Early stopping & checkpointing the best model in ../working dir & restoring that as our model for prediction
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint

cb_early_stopper = EarlyStopping(monitor = 'val_loss', patience = EARLY_STOP_PATIENCE)
cb_checkpointer = ModelCheckpoint(filepath = '../working/best.hdf5', monitor = 'val_loss', save_best_only = True, mode = 'auto')

In [None]:
from sklearn.model_selection import ParameterGrid

# Define grid
param_grid = {'epochs': [5, 10, 15], 'steps_per_epoch': [10, 20, 50]}

# Create all parameter combinations
grid = ParameterGrid(param_grid)

# Placeholder for best model tracking
best_model = None
lowest_val_loss = float('inf')
history_list = []

# Loop through all parameter combinations
for params in grid:
    print(f"Running with params: {params}")
    
    # Train your model here using params['epochs'], params['steps_per_epoch'], etc.
    # Example:
    # model = build_model()
    # history = model.fit(..., epochs=params['epochs'], steps_per_epoch=params['steps_per_epoch'], ...)
    
    # Placeholder for validation loss from this training
    val_loss = ...  # replace with actual value from history.history['val_loss'][-1]

    # Save history
    history_list.append({'params': params, 'val_loss': val_loss})
    
    # Track best model
    if val_loss < lowest_val_loss:
        lowest_val_loss = val_loss
        best_model = ...  # store model or weights

# Analyze history_list to understand trends


In [25]:
fit_history = model.fit(
        train_generator,
        steps_per_epoch=STEPS_PER_EPOCH_TRAINING,
        epochs = NUM_EPOCHS,
        validation_data=test_generator,
        validation_steps=STEPS_PER_EPOCH_VALIDATION,
        callbacks=[cb_checkpointer, cb_early_stopper]
)
model.load_weights("/working/best.hdf5")

  self._warn_if_super_not_called()


Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17s/step - accuracy: 0.5872 - loss: 2.4717 

ValueError: The following argument(s) are not supported: ['options']

In [6]:
# Check accuracy function
def check_accuracy(output,labels):
    _,predpos=output.max(1)
    num_samples=len(labels)
    num_correct=(predpos==labels).sum()
    return (num_correct/num_samples)*100


In [None]:
def save_checkpoint(state,filename='clahe.pth.tar'):
    print('Saving weights-->')
    torch.save(state,filename)

In [None]:
def load_checkpoint(filename):
    print('Loading weights-->')
    model.load_state_dict(checkpoint['state_dict'])
    optim.load_state_dict(checkpoint['optimizer'])

In [None]:
# Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32
shuffle_dataset = True
random_seed= 42
num_workers=2
learning_rate=0.001
print(device)
num_epochs=25
load_model=False

In [None]:
# Create resnet model, with respecitve transform
model = models.resnet50(pretrained=False)
model.fc=nn.Sequential(nn.Linear(2048,1024),
                      nn.LeakyReLU(),
                      nn.Linear(1024,512),
                      nn.LeakyReLU(),
                      nn.Linear(512,2))
print(model)

In [None]:
# Loss and Optimizer
criterion=nn.CrossEntropyLoss()
optim=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [None]:
if load_model:
    load_checkpoint(torch.load('weights.pth.tar'))

In [None]:
##Load data
#train, validation, test
train_loader = DataLoader(train_set, batch_size=batch_size,num_workers=num_workers, shuffle=True)
validation_loader = DataLoader(valid_set, batch_size=batch_size,num_workers=num_workers,shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size,num_workers=num_workers,shuffle=True)

In [None]:

# Put model on cuda 
model.to(device)
# Put the model on train mode
model.train()
print()

In [None]:
i,y=next(iter(train_loader))
i=i.to(device)
y=y.to(device)
y_pred=model(i)
print(y_pred.shape)

In [None]:
# Training loop for the model
min_loss=None
for epoch in range(num_epochs):
    losses=[]
    accuracies=[]
    loop= tqdm(enumerate(train_loader),total=len(train_loader),leave=False)
    for batch_idx, (data,labels) in loop:
        # Put data on cuda
        data=data.to(device)
        labels=labels.to(device).long()
        
        # Forward pass
        output=model(data)
        
        # Find out loss
        loss=criterion(output,labels)
        accuracy=check_accuracy(output,labels)
        losses.append(loss.detach().item())
        accuracies.append(accuracy.detach().item())
        
        optim.zero_grad()
        
        # Back prop
        loss.backward()
        
        # Step
        optim.step()
        
        # Update TQDM progress bar
        loop.set_description(f"Epoch [{epoch}/{num_epochs}] ")
        loop.set_postfix(loss=loss.detach().item(),accuracy=accuracy.detach().item())
        
    moving_loss=sum(losses)/len(losses)
    moving_accuracy=sum(accuracies)/len(accuracies)
    checkpoint={'state_dict': model.state_dict(),'optimizer': optim.state_dict()}
    # Save check point
    if min_loss==None:
        min_loss=moving_loss
        save_checkpoint(checkpoint)
    elif moving_loss<min_loss:
        min_loss=moving_loss
        save_checkpoint(checkpoint)
    print('Epoch {0} : Loss = {1} , Accuracy={2}'.format(epoch,moving_loss,moving_accuracy))

In [None]:
# Validation accuracy
correct=0
samples=0
for data,labels in validation_loader:
    data=data.to(device)
    labels=labels.to(device)
    # Forward pass
    y_pred=model(data)
    # Accuracy over entire dataset
    _,predpos=y_pred.max(1)
    samples+=len(labels)
    correct+=(predpos==labels).sum().detach().item()
print('Validation accuracy : ',(correct/samples)*100)


In [None]:

# Test accuracy
correct=0
samples=0
for data,labels in test_loader:
    data=data.to(device)
    labels=labels.to(device)
    # Forward pass
    y_pred=model(data)
    # Accuracy over entire dataset
    _,predpos=y_pred.max(1)
    samples+=len(labels)
    correct+=(predpos==labels).sum().detach().item()
print('Test accuracy : ',(correct/samples)*100)