# Options

In [None]:
# Paths
# Every location below is `root` immediately followed by a sub-path (no
# separator is inserted), so `root` is expected to end with a path separator.
root = 'path to the root folder'
images_folder = f'{root}subpath to the folder with the images'
save_path = f'{root}models/'
table_path = f'{root}subpath to multicare_multiplex.csv'
splits_path = f'{root}subpath to multicare_splits.csv'

# Settings
# Enables the oversampling step that runs before training (True or False).
use_oversampling = False
# Column of the label table to train on. Select one of:
#   imaging_type
#   imaging_type:endoscopy
#   imaging_type:pathology
#   imaging_type:pathology.other_staining
#   imaging_type:radiology(main|attribute_angiography)
#   imaging_type:radiology.ultrasound
submodel_type = 'imaging_type'

# Data setup

In [None]:
from fastai.vision.all import *
import albumentations
from DLOlympus.training.transforms import AlbumentationsTransform
from DLOlympus.training.utils import get_model
from DLOlympus.training.unbalanced import oversampled_epoch

In [5]:
# Hyperparameters
# Central training configuration. Later cells read these values through
# `hyperparameters[...]`, and the whole dict is also handed to `get_model`.

# Target image size in pixels (height, width)
h, w = 224, 224

hyperparameters = {
    # Name of the label-table column used as the target. The value
    # 'imaging_type:radiology(main|attribute_angiography)' additionally
    # switches the notebook to its two-target code path.
    'model_description': submodel_type,
    # Batch size passed to `block.dataloaders`
    'BS': 16,
    # Epoch count passed to `learn.fine_tune`
    'EPOCHS': 30,
    'IMG_SIZE': (h, w),      # (height, width)
    # Weight decay passed to `vision_learner`
    'WD': 0.0,
    # Augmentations; they are wrapped in `albumentations.Compose` and applied
    # as an item transform right after the resize step.
    'TRANSFORMS': [
        albumentations.HorizontalFlip(p=0.5),
        albumentations.VerticalFlip(p=0.5),
        albumentations.Rotate(p=0.5),
        albumentations.Sharpen(p=0.5),
        albumentations.ColorJitter(brightness=0.3, contrast=0.5, saturation=0.5, hue=0.0, p=0.5),
        albumentations.RGBShift(p=0.5),
        albumentations.GaussianBlur(p=0.5),
        albumentations.GaussNoise(p=0.5),
        # Always applied (p=1.0). NOTE(review): the positional arguments look like
        # (min_max_height, height, width) -- confirm against the installed
        # albumentations version, whose signature may differ.
        albumentations.RandomSizedCrop((int(0.75*h),h), h, w, p=1.0)
        ],
    # Architecture selection consumed by `get_model(hyperparameters)`
    'ARCH': 'resnet50',
    'ARCH_TYPE': 'torchvision',
    # Names of the loss / optimizer callables; they are looked up by name with
    # `getattr(sys.modules[__name__], ...)` when the learner is created.
    'LOSS_FUNC': 'LabelSmoothingCrossEntropyFlat',
    'OPT_FUNC': 'Adam',
    'USE_OVERSAMPLING': use_oversampling,
    # Seed passed to `set_seed` and to the dataloaders' RNG
    'SEED': 18,
}

# Metrics and callbacks
# `metrics` is only used by the single-target learner; the two-target learner
# builds its own metric list.
metrics = [accuracy, F1Score(average='macro')]
# Track the 'f1_score' metric for model saving (optimizer state included) and
# plot the loss curves during training.
callbacks = [SaveModelCallback(monitor='f1_score', with_opt=True), ShowGraphCallback]

In [6]:
import pandas as pd

def get_data(table_path, splits_path):
    """Load image paths, labels and validation flags for the selected sub-model.

    The label column is chosen by ``hyperparameters['model_description']`` and
    image paths are built under the module-level ``images_folder``. The split
    table is matched to the label table by row position.

    Parameters
    ----------
    table_path : path of the CSV holding a ``file`` column and the label columns.
    splits_path : path of the CSV holding an ``is_valid`` column.

    Returns
    -------
    tuple
        ``(image_files, labels, splits)``, each restricted to the rows whose
        label is a ``str``.
    """
    table = pd.read_csv(table_path)
    # Images are stored as <images_folder><first 4 chars>/<first 6 chars>/<file name>
    paths = np.array([
        f'{images_folder}{name[:4]}/{name[:6]}/{name}'
        for name in table['file'].values
    ])
    targets = table[hyperparameters['model_description']].values
    valid_flags = pd.read_csv(splits_path)['is_valid'].values
    # Boolean mask: rows without a string label (e.g. empty cells) are dropped
    keep = [type(target) is str for target in targets]
    return paths[keep], targets[keep], valid_flags[keep]

def create_df(image_files, labels, splits):
    """Assemble the training table from three parallel sequences.

    Returns a DataFrame with the columns ``file_path``, ``label`` and
    ``is_valid`` (in that order), filled from ``image_files``, ``labels`` and
    ``splits`` respectively.
    """
    frame = pd.DataFrame()
    # Columns are added one at a time, in the order listed here
    columns = (
        ('file_path', image_files),
        ('label', labels),
        ('is_valid', splits),   # binary train/validation flag
    )
    for column_name, values in columns:
        frame[column_name] = values
    return frame

In [None]:
# Dataframe
# Load paths / labels / split flags for the selected sub-model and gather them
# into the table (`file_path`, `label`, `is_valid`) consumed by the DataBlock.
image_files, labels, splits = get_data(table_path, splits_path)
df = create_df(image_files, labels, splits)

# Class distribution of the retained rows (displayed as the cell output)
df['label'].value_counts()

In [None]:
# Seed the random number generators before the data pipeline is built
set_seed(hyperparameters['SEED'], True)

# Item transforms shared by both DataBlock variants: squish every image to the
# configured size, then apply the composed albumentations pipeline.
item_tfms = [
    Resize(hyperparameters['IMG_SIZE'], method='squish'),
    AlbumentationsTransform(albumentations.Compose(hyperparameters['TRANSFORMS']))]

# Datablock
if hyperparameters['model_description'] == 'imaging_type:radiology(main|attribute_angiography)':
    # Two-target sub-model. The label cell is parsed from its string form:
    # the outer characters are stripped, the rest is split on ', ' and the
    # first/last character of each piece is removed.
    # NOTE(review): this presumes labels shaped like "['a', 'b']" -- confirm.
    def get_rad(x):
        """Return the first entry parsed from the row's `label` string."""
        return [i[1:-1] for i in str(ColReader('label')(x))[1:-1].split(', ')][0]
    def get_ang(x):
        """Return the second entry parsed from the row's `label` string."""
        return [i[1:-1] for i in str(ColReader('label')(x))[1:-1].split(', ')][1]
    block = DataBlock(
        blocks=(ImageBlock, CategoryBlock, CategoryBlock),
        n_inp=1,    # one input (the image), two category targets
        get_x=ColReader('file_path'),
        get_y=[get_rad, get_ang],
        splitter=ColSplitter(col='is_valid'),
        item_tfms=item_tfms)
else:
    # Single-target sub-model: the `label` column is used as-is
    block = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_x=ColReader('file_path'),
        get_y=ColReader('label'),
        splitter=ColSplitter(col='is_valid'),
        item_tfms=item_tfms)

# Dataloaders
dls = block.dataloaders(df, bs=hyperparameters['BS'], shuffle=True)
dls.rng.seed(hyperparameters['SEED'])

# Sanity check
num_classes = dls.c
classes = dls.vocab
print('Number of classes: ', num_classes)
print('Names of classes: ', classes)

In [None]:
# Show batch
# Visual check of up to 16 training items after the item transforms
dls.train.show_batch(max_n=16, figsize=(15,12))

In [None]:
# Show transforms
# Same call with `unique=True`; presumably this repeats a single item so that
# each panel shows a different random augmentation of it -- confirm in fastai docs
dls.train.show_batch(max_n=16, unique=True, figsize=(15,12))

In [12]:
# Learner
import functools, typing

if hyperparameters['model_description'] == 'imaging_type:radiology(main|attribute_angiography)':
    # Two-target sub-model: the network emits `dls.c.sum()` outputs; the first
    # `dls.c[0]` columns belong to the first target and the rest to the second.
    def rad_loss(inp, rad, ang):
        """Configured loss on the first `dls.c[0]` output columns vs. the first target."""
        return getattr(sys.modules[__name__], hyperparameters['LOSS_FUNC'])()(inp[:,:dls.c[0]], rad)
    def ang_loss(inp, rad, ang):
        """Configured loss on the remaining output columns vs. the second target."""
        return getattr(sys.modules[__name__], hyperparameters['LOSS_FUNC'])()(inp[:,dls.c[0]:], ang)
    def combined_loss(inp, rad, ang):
        """Sum of the two per-target losses."""
        return rad_loss(inp, rad, ang) + ang_loss(inp, rad, ang)

    from sklearn.metrics import f1_score
    # (first-target id, second-target id) -> single combined class id
    mapping = {
        (0, 0): 0,
        (0, 1): 1,
        (1, 0): 2,
        (1, 1): 3,
        (2, 0): 4,
        (2, 1): 5,
        (3, 0): 6,
        (3, 1): 7
    }
    def new_classes(x):
        """Map a 2-element tensor of class ids to one combined class id.

        Pairs that are absent from `mapping` fall back to the first element of `x`.
        """
        # Convert input tensor to a tuple of integers
        key = tuple(x.int().tolist())
        # Return the mapped value or default to the first element of x
        return mapping.get(key, int(x[0].item()))
    def multioutput2multiclass(probs, ground_truths_1, ground_truths_2):
        """Collapse two-target outputs and targets into combined class ids.

        `probs` holds the model outputs for both targets side by side (split at
        column `dls.c[0]`). Returns `(predictions, ground_truths)` as two float
        tensors of combined class ids, one entry per sample.
        """
        probs = probs.cpu()
        rad_probs = probs[:,:dls.c[0]]
        ang_probs = probs[:,dls.c[0]:]
        rad_truths, ang_truths = ground_truths_1.cpu(), ground_truths_2.cpu()
        new_preds = []
        new_gts = []
        for rp, ap, rt, at in zip(rad_probs, ang_probs, rad_truths, ang_truths):
            # Get the id of the top prediction
            rad_pred, ang_pred = rp.argmax(), ap.argmax()
            pred = new_classes(torch.stack((rad_pred, ang_pred)))
            gt = new_classes(torch.stack((rt, at)))
            new_preds.append(pred)
            new_gts.append(gt)
        return torch.Tensor(new_preds), torch.Tensor(new_gts)
    def new_accuracy(probs, ground_truths_1, ground_truths_2):
        """Fraction of samples whose combined predicted class equals the combined target."""
        predictions, ground_truths = multioutput2multiclass(probs, ground_truths_1, ground_truths_2)
        return (predictions == ground_truths).float().mean()
    def _accumulate(self, learn):
        """Replacement for `AccumMetric.accumulate` that stores combined class ids."""
        pred = learn.pred
        targ_1 = learn.y[0]
        targ_2 = learn.y[1]
        pred,targ_1,targ_2 = to_detach(pred),to_detach(targ_1),to_detach(targ_2)
        pred,targ= multioutput2multiclass(pred, targ_1, targ_2)
        self.preds.append(pred)
        self.targs.append(targ)
    # NOTE: this patches the class itself, so every AccumMetric created in this
    # kernel session accumulates through `_accumulate` from now on.
    AccumMetric.accumulate = _accumulate
    def NewF1Score():
        """Macro-averaged scikit-learn F1 wrapped as a fastai metric."""
        return skm_to_fastai(f1_score, average='macro')

    learn = vision_learner(dls,
                            get_model(hyperparameters),
                            n_out=dls.c.sum(),
                            normalize=True,
                            pretrained=True,
                            loss_func=combined_loss,
                            opt_func=getattr(sys.modules[__name__], hyperparameters['OPT_FUNC']),
                            metrics=[new_accuracy, NewF1Score()],
                            wd=hyperparameters['WD']).to_fp16()
    # Fix issue with pickling while calling learn.export
    learn.loss_func.__annotations__ = typing.get_type_hints(learn.loss_func, globalns=globals(), localns=locals())
    functools.update_wrapper(learn.loss_func, learn.loss_func)
else:
    # `loss_weights` is not assigned by any cell of this notebook. Use the value
    # if the user defined one, otherwise fall back to None (no class weighting)
    # instead of failing with a NameError on a fresh kernel.
    loss_weights = globals().get('loss_weights')
    learn = vision_learner(dls,
                            get_model(hyperparameters),
                            normalize=True,
                            pretrained=True,
                            loss_func=getattr(sys.modules[__name__], hyperparameters['LOSS_FUNC'])(weight=loss_weights),
                            opt_func=getattr(sys.modules[__name__], hyperparameters['OPT_FUNC']),
                            metrics=metrics,
                            wd=hyperparameters['WD']).to_fp16()
    # Fix issue with pickling while calling learn.export
    learn.loss_func.func.__annotations__ = typing.get_type_hints(learn.loss_func.func, globalns=globals(), localns=locals())
    functools.update_wrapper(learn.loss_func, learn.loss_func.func)

In [13]:
# Oversampling
if hyperparameters['USE_OVERSAMPLING']:
    # Weight every class by 1 / sqrt(number of training items with that label).
    # The dict is built straight from the counts Series so it does not depend on
    # the column name pandas gives to the `value_counts()` result.
    label_counts = learn.dls.items.label.value_counts()
    class_weights = {str(label): float(1 / np.sqrt(count)) for label, count in label_counts.items()}
    # Replace the training loader's index sampler with the oversampling version
    learn.dls.train.get_idxs = types.MethodType(partial(oversampled_epoch, class_weights=class_weights), learn.dls.train)

# Training

In [None]:
# Find LR
# Run the learning-rate finder; its plot is used to pick the value set in the next cell
learn.lr_find()

In [15]:
# Set LR
# Chosen manually; stored with the other hyperparameters and passed to
# `learn.fine_tune` as `base_lr`
hyperparameters['LR'] = 3e-3

In [None]:
# Train
# Fine-tune for the configured number of epochs with the model-saving and
# loss-graph callbacks attached
learn.fine_tune(hyperparameters['EPOCHS'], base_lr=hyperparameters['LR'], cbs=callbacks)

# Results and logs

In [None]:
# Persist the trained learner (exported pickle plus a saved checkpoint).
# These two paths add their own '/', while the CSV paths further down do not,
# so `save_path` is expected to end with a path separator.
learn.export(f'{save_path}/model.pkl')
learn.save(f'{save_path}/model')

# Training curves are written first; see the warning on the `get_preds` line
from DLOlympus.training.plots import plot_confusion_matrix, plot_losses, plot_metrics
_ = plot_losses(learn, save_path)
_ = plot_metrics(learn, save_path)
# Predictions on the validation set (ds_idx=1)
probs, ground_truths = learn.get_preds(ds_idx=1)        # DO NOT PREDICT BEFORE PLOTTING LOSSES AND METRICS
if hyperparameters['model_description'] == 'imaging_type:radiology(main|attribute_angiography)':
    # Two-target sub-model: collapse both outputs into combined class ids.
    # `multioutput2multiclass` is defined in the learner cell for this sub-model.
    # The hard-coded names are listed in combined-class-id order.
    # NOTE(review): confirm this order matches the vocab order of both targets.
    predictions, ground_truths = multioutput2multiclass(probs, ground_truths[0], ground_truths[1])
    _ = plot_confusion_matrix(ground_truths, predictions, ['ct + angiography', 'ct + not_angiography', 'mri + angiography', 'mri + not_angiography', 'ultrasound + angiography', 'ultrasound + not_angiography', 'x_ray + angiography', 'x_ray + not_angiography'], save_path)
else:
    # Single-target sub-model: the predicted class is the highest-scoring column
    predictions = np.argmax(probs, axis=1)
    _ = plot_confusion_matrix(ground_truths, predictions, learn.dls.vocab, save_path)
    # Per-item prediction tables are only produced for single-target sub-models
    from DLOlympus.training.tables import get_predictions_table
    train_table = get_predictions_table(learn, learn.dls.train)
    valid_table = get_predictions_table(learn, learn.dls.valid)
    train_table.to_csv(f'{save_path}train_table.csv', index=False)
    valid_table.to_csv(f'{save_path}valid_table.csv', index=False)

# Final metrics, computed without test-time augmentation
from DLOlympus.training.utils import get_metrics
results = get_metrics(learn, with_tta=False)