In [5]:
import torch
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
from torchvision import models
import os
import shutil
from sklearn.model_selection import train_test_split
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn as nn
import time
import copy


In [3]:
# One-time setup cell: builds the train/val folder tree and collects the image
# file names per class. Only needs to run while the images still sit loose in
# the data directory (i.e. before the folder structure has been populated).
main_dir = "/media/ist/Drive2/MANSOOR/Neuroimaging-Project/Breast_Cancer_Classification_Project"

# The loose images and the train/val tree share the same data folder
source_dir = f"{main_dir}/data"  # where the unsorted images are read from
base_dir = f"{main_dir}/data"  # root under which train/ and val/ are created

# Split roots
train_dir = os.path.join(base_dir, 'train')
val_dir = os.path.join(base_dir, 'val')

# One folder per class ('SUB_p' = positive, 'SUB_n' = negative) under each split
train_pos_dir = os.path.join(train_dir, 'SUB_p')
train_neg_dir = os.path.join(train_dir, 'SUB_n')
val_pos_dir = os.path.join(val_dir, 'SUB_p')
val_neg_dir = os.path.join(val_dir, 'SUB_n')

# Create the whole tree, parents first; folders that already exist are left alone
for _folder in (base_dir, train_dir, val_dir,
                train_pos_dir, train_neg_dir, val_pos_dir, val_neg_dir):
    os.makedirs(_folder, exist_ok=True)

# Regular files sitting directly in the source directory (sub-folders are skipped)
files = [name for name in os.listdir(source_dir)
         if os.path.isfile(os.path.join(source_dir, name))]

# Assign each file to a class by its file-name prefix
pos_files = [name for name in files if name.startswith('SUB_p')]
neg_files = [name for name in files if name.startswith('SUB_n')]

# Function to split data and move files
def split_and_move_files(files, train_dir, val_dir, test_size=0.2, src_dir=None):
    """Randomly split ``files`` into train/val subsets and move them into place.

    Args:
        files: File names (not full paths) located directly inside ``src_dir``.
        train_dir: Destination folder for the training subset.
        val_dir: Destination folder for the validation subset.
        test_size: Forwarded to ``train_test_split``; share of files that go
            to ``val_dir``.
        src_dir: Folder the files are moved out of. When ``None`` the
            module-level ``source_dir`` is used.

    Returns:
        None. The files are moved on disk as a side effect.

    An empty ``files`` sequence is a no-op. Without this guard
    ``train_test_split`` raises ``ValueError``, which is what happens when the
    setup cell is re-run after the images have already been moved.
    """
    # Nothing left to organise (e.g. the cell was already run once)
    if len(files) == 0:
        return

    if src_dir is None:
        src_dir = source_dir

    # Fixed random_state keeps the split reproducible across runs
    train_files, val_files = train_test_split(files, test_size=test_size, random_state=42)

    for destination, names in ((train_dir, train_files), (val_dir, val_files)):
        for name in names:
            shutil.move(os.path.join(src_dir, name), destination)

# Run the split once per class: (file names, train folder, val folder),
# positives first, then negatives
for _names, _train_out, _val_out in (
    (pos_files, train_pos_dir, val_pos_dir),
    (neg_files, train_neg_dir, val_neg_dir),
):
    split_and_move_files(_names, _train_out, _val_out)

print("Files have been organized into training and validation sets.")


Files have been organized into training and validation sets.


In [9]:

# Per-channel mean and std handed to Normalize in both pipelines
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]
_PHASES = ['train', 'val']

# Preprocessing per phase: random crop + horizontal flip for training,
# fixed resize + center crop for validation; both end with tensor
# conversion and normalization.
data_transforms = {}
data_transforms['train'] = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
])
data_transforms['val'] = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
])

# Build the dataset, its loader and its size for each phase; the class label
# of an image is taken from the sub-folder it sits in (ImageFolder convention).
data_dir = source_dir
image_datasets = {}
dataloaders = {}
dataset_sizes = {}
for _phase in _PHASES:
    _phase_dataset = datasets.ImageFolder(os.path.join(data_dir, _phase),
                                          data_transforms[_phase])
    image_datasets[_phase] = _phase_dataset
    dataloaders[_phase] = DataLoader(_phase_dataset, batch_size=4,
                                     shuffle=True, num_workers=4)
    dataset_sizes[_phase] = len(_phase_dataset)

# Class names as discovered from the training folder
class_names = image_datasets['train'].classes
print(class_names)

['SUB_n', 'SUB_p']


In [10]:
# Start from a ResNet-18 initialised with pretrained weights.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=`; left unchanged because the installed version is not visible here.
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features

# Replace the final fully connected layer so the network emits one score per
# dataset class. Sizing it from class_names (the ImageFolder classes of the
# training set) instead of a literal 2 keeps the head in sync with the data.
model.fc = nn.Linear(num_ftrs, len(class_names))


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /home/ist/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:04<00:00, 11.4MB/s]


In [11]:
# Loss: cross-entropy over the class scores produced by the network
criterion = nn.CrossEntropyLoss()

# SGD with momentum over every parameter of the model (nothing is frozen)
optimizer = optim.SGD(params=model.parameters(), lr=1e-3, momentum=0.9)

# Multiply the learning rate by 0.1 after every 7 scheduler steps
# (the training loop steps the scheduler once per epoch)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=7, gamma=0.1)


In [13]:
# model training/val loop
def train_model(model, criterion, optimizer, scheduler, num_epochs=25, loaders=None):
    """Train ``model`` and return it loaded with its best-validation weights.

    Every epoch runs a 'train' phase (gradient updates followed by one
    scheduler step) and then a 'val' phase (forward passes only). The weights
    from the first epoch that reaches the highest validation accuracy are
    restored before returning.

    Args:
        model: Network to optimise. Batches are moved to the device of its
            parameters before the forward pass.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of train+val epochs to run.
        loaders: Optional mapping with 'train' and 'val' keys whose values
            yield ``(inputs, labels)`` batches. Defaults to the module-level
            ``dataloaders``.

    Returns:
        The same ``model`` object, with the best validation weights loaded.
    """
    if loaders is None:
        loaders = dataloaders

    # Follow whatever device the model already lives on, so the loop works
    # unchanged on CPU and after a model.to('cuda') done by the caller.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and a validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # train(True) enables training mode, train(False) is eval mode
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            for inputs, labels in loaders[phase]:
                # No-op when the batch is already on the model's device
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Gradients are only needed (and only cleared) while training
                if is_train:
                    optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Accumulate plain Python numbers so the statistics stay valid
                # even when a phase yields no batches at all.
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += int(torch.sum(preds == labels).item())
                samples_seen += batch_size

            if is_train:
                scheduler.step()

            # Average over the samples actually processed; max(..., 1) avoids
            # a division by zero for an empty phase.
            denominator = max(samples_seen, 1)
            epoch_loss = running_loss / denominator
            epoch_acc = running_corrects / denominator

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Snapshot the weights whenever validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # '{:.4f}' (four decimals) matches the per-epoch lines; the previous
    # '{:4f}' only set a minimum field width.
    print('Best val Acc: {:.4f}'.format(best_acc))

    # Load best model weights
    model.load_state_dict(best_model_wts)
    return model

# Keep the returned model (best validation weights loaded). Assigning the
# result also stops the notebook from echoing the full network repr as the
# cell's output.
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=10)