In [None]:
import kagglehub
from os import listdir
# Fetch the latest version of the breast-histopathology dataset via kagglehub
# and remember where it ended up on disk.
path = kagglehub.dataset_download(
    "paultimothymooney/breast-histopathology-images"
)
print("Path to dataset files:", path)

# Image root inside the download; the trailing slash is kept because the value
# is handed on unchanged to get_train_data in a later cell.
base_path = f"{path}/IDC_regular_ps50_idx5/"
# Names of the entries directly under the image root.
folder = listdir(base_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/paultimothymooney/breast-histopathology-images?dataset_version_number=1...


100%|██████████| 3.10G/3.10G [00:37<00:00, 89.7MB/s]

Extracting files...





In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so project files on Drive are reachable
# through the local filesystem.
drive.mount('/content/drive')

# Project root on Drive; later cells build the checkpoint, model and history
# paths from it and pass it to get_train_data.
base_dir = '/content/drive/MyDrive/phase2'

In [None]:
from drive.MyDrive.phase2.utils.utils import *

# Training hyperparameters shared by the cells below.
# batch_size is passed to get_train_data; epochs, lr and weight_decay are used
# when the optimizer, scheduler and training loop are set up.
batch_size = 128
epochs = 50
lr = 0.001
weight_decay = 1e-4


# base_path: path to the main dataset
# base_dir: shared phase2 directory
# NOTE(review): get_train_data comes from the star import above; how it splits
# the data and what it does with base_dir is not visible here — confirm in utils.
train_dataloader, dev_dataloader, test_dataloader = get_train_data(folder, base_path, batch_size, base_dir)

In [None]:
import os
from google.colab import drive

# Make sure Google Drive is mounted before touching anything under it.
drive.mount('/content/drive')

# Directory whose contents are counted.
target_directory = '/content/drive/MyDrive/phase2/dataset/agmented'

# Walk the whole tree and add up the files found at every level.
file_count = 0
for _dirpath, _dirnames, filenames in os.walk(target_directory):
    file_count += len(filenames)

print(f"Total number of files in '{target_directory}': {file_count}")

In [None]:

# Registry of ResNet configurations keyed by model name. get_resnet() unpacks
# the selected entry as keyword arguments into ResNet(...).
networks = {
    'resnet18_light': dict(
        block=ResidualBlock,
        # NOTE(review): the meaning of each tuple field is defined by ResNet,
        # which is not visible in this notebook — confirm before editing.
        stage_args=[(32, 64, 2, False)],
        dropout=True,  # Enable dropout
        p=0.5,  # Dropout probability
    ),
}

def get_resnet(name):
    """Build the ResNet registered under ``name`` in ``networks``.

    Args:
        name: Key into the module-level ``networks`` registry.

    Returns:
        A ``ResNet`` constructed with that entry's keyword arguments.

    Raises:
        KeyError: If ``name`` is not a key of ``networks``.
    """
    config = networks[name]
    return ResNet(**config)


# Dtype aliases. They are not referenced in the visible cells; kept in case
# other cells rely on them.
to_float = torch.float
to_long = torch.long

# Use the GPU when the runtime has one, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Model and file paths
name = 'resnet18_light'
version = 10
checkpoint_path = f'{base_dir}/checkpoint/{name}_{version}_checkpoint.pth'
model_path = f'{base_dir}/models/{name}_{version}_checkpoint.pth'
history_path = f'{base_dir}/history/{name}_{version}.pth'

# Create the output folders up front: without this, a missing directory only
# surfaces when the first save is attempted, possibly after a full training run.
for _out_path in (checkpoint_path, model_path, history_path):
    os.makedirs(os.path.dirname(_out_path), exist_ok=True)

# Either restore model/optimizer/scheduler state from an existing checkpoint
# or build everything from scratch. Both branches define the same names:
# model, optimizer, scheduler, start_epoch, best_val_acc and the three
# history containers.
if os.path.exists(checkpoint_path):
    # Resume training from checkpoint
    print(f"Resuming training from checkpoint: {checkpoint_path}")
    # map_location lets a checkpoint written on a GPU runtime be restored on a
    # CPU-only runtime (and vice versa) instead of failing during load.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model = get_resnet(name).to(device)
    model.load_state_dict(checkpoint['model_state'])

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    optimizer.load_state_dict(checkpoint['optimizer_state'])

    # Same horizon as the fresh-start branch (was a hard-coded 50); a stored
    # scheduler state, when present, is loaded on top of it.
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    # .get() tolerates checkpoints that were written without a scheduler entry.
    scheduler_state = checkpoint.get('scheduler_state')
    if scheduler_state is not None:
        scheduler.load_state_dict(scheduler_state)

    start_epoch = checkpoint['epoch']
    best_val_acc = checkpoint['best_val_acc']
    train_metrics_history = checkpoint['train_history']
    val_metrics_history = checkpoint['val_history']
    lr_history = checkpoint['lr_history']

    print(f"Training will resume from epoch {start_epoch}.\n")

else:
    # Start new training
    print(f"Training new model: {name}\n")

    # Initialize model and optimizer
    model = get_resnet(name).to(device)
    model.apply(initialize_weights)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Define scheduler
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs)

    # Initialize metrics and state
    start_epoch = 0
    best_val_acc = 0.0
    train_metrics_history = {'loss': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    val_metrics_history = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    lr_history = []

# Train model
# The returned histories rebind train_metrics_history, val_metrics_history and
# lr_history, replacing whatever the resume/new-training branch set up.
# NOTE(review): start_epoch, best_val_acc and the histories loaded from the
# checkpoint are not passed in here. Unless train_model reloads them from
# checkpoint_path itself, a "resumed" run will not continue from the saved
# epoch — confirm against train_model in utils.
# NOTE(review): both a scheduler object and schedule=[20, 40] are supplied;
# how train_model combines them is not visible here — confirm.
train_metrics_history, val_metrics_history, lr_history = train_model(
    model, optimizer, train_dataloader, dev_dataloader,
    device=device, dtype=torch.float32, epochs=epochs,
    scheduler=scheduler, schedule=[20, 40], verbose=True,
    checkpoint_path=checkpoint_path,
    history_path=history_path
)

# Persist the trained weights once training has finished.
final_weights = model.state_dict()
torch.save(final_weights, model_path)
print(f"Final model saved at: {model_path}")

# Pickle the three history objects together as a single tuple.
history_bundle = (train_metrics_history, val_metrics_history, lr_history)
with open(history_path, 'wb') as history_file:
    pickle.dump(history_bundle, history_file)
print(f"Training history saved at: {history_path}")

# Score the model on the held-out test loader and report the four metrics.
test_metrics = calculate_metrics(test_dataloader, model, device=device)
test_accuracy, test_precision, test_recall, test_f1 = test_metrics
print(
    f"Test Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, "
    f"Recall: {test_recall:.4f}, F1 Score: {test_f1:.4f}"
)

# Visualise the run: train/validation metrics, the learning-rate trace, and
# the training loss (figures are produced in this order).
plot_all_metrics(train_metrics_history, val_metrics_history)
plot_learning_rate(lr_history)
training_loss = train_metrics_history['loss']
plot_loss(training_loss)