In [2]:
#@title Define if we are on Colab and mount drive { display-mode: "form" }
# Shared dictionary of run-wide settings; the following cells read and extend it.
run_params = {}
try:
  # Importing google.colab and mounting Drive is used as the "are we on Colab?" probe.
  from google.colab import drive
  drive.mount('/content/gdrive')
  run_params['IN_COLAB'] = True
except Exception:
  # Any ordinary failure (module missing, mount refused, ...) means "not on Colab".
  # `Exception` instead of a bare `except:` so that KeyboardInterrupt / SystemExit
  # still propagate instead of being silently turned into IN_COLAB = False.
  run_params['IN_COLAB'] = False

In [3]:
#@title (COLAB ONLY) Clone GitHub repo { display-mode: "form" }

# Runs only when the first cell set run_params['IN_COLAB'] to True: clone the
# project repository and change the working directory into the checkout
# (presumably so the `%run` notebooks and local imports below resolve — confirm).
if run_params['IN_COLAB']:
  !git clone https://github.com/lluissalord/radiology_ai.git

  %cd radiology_ai

In [4]:
#@title Setup environment and Colab general variables { display-mode: "form" }
# %%capture
# Executes the companion setup notebook with IPython's %run.
# NOTE(review): later cells read run_params keys that are never assigned in this
# notebook ('MODELS_FOLDER', 'PATH_PREFIX', 'RAW_PREPROCESS_FOLDER', 'DATA_FOLDER',
# 'KNEE_SVM_MODEL_PATH'); presumably they are defined by this setup notebook — confirm.
%run colab_pip_setup.ipynb

In [5]:
#@title Move images from Drive to temporary folder here to be able to train models { display-mode: "form" }
# %%capture
# Executes the companion notebook with IPython's %run; per the cell title it copies
# the images from Drive into a temporary local folder used for training.
%run move_raw_preprocess.ipynb

In [6]:
import os
import pandas as pd
import random
from tqdm import tqdm

from fastai.basics import *
from fastai.callback import *
from fastai.vision.all import *
from fastai.vision.widgets import *
from fastai.medical.imaging import *

In [7]:
from organize.dicom import *
from train.models import *
from preprocessing.transforms import *
# from preprocessing.dicom import *
from preprocessing.misc import *

from train.losses.APL_losses import *

In [8]:
# Tunable settings for this run, grouped by purpose. They are merged into the
# existing `run_params` dictionary so keys set by earlier cells are preserved.
run_params.update(
    # Image sizes: item-level resize, then random crop size / minimum crop scale
    TRAIN_RESIZE=384,
    RANDOM_RESIZE_CROP=256,
    RANDOM_MIN_SCALE=0.5,

    BATCH_SIZE=32,

    # N_TRAIN=None means "use every file"; N_SAMPLES_BIN may also be None
    N_TRAIN=None,
    N_SAMPLES_BIN=50,

    # Histogram clipping and its lower / upper cut values
    HIST_CLIPPING=True,
    HIST_CLIPPING_CUT_MIN=5.0,
    HIST_CLIPPING_CUT_MAX=99.0,

    # Optional preprocessing steps
    KNEE_LOCALIZER=False,
    CLAHE_SCALED=True,
    HIST_SCALED=False,
    HIST_SCALED_SELF=False,

    # Model persistence switches
    USE_SAVED_MODEL=False,
    SAVE_MODEL=False,

    # Architecture and the version number used in the saved file name
    MODEL=resnet18,
    MODEL_VERSION=0,
)

# Derived values: file name "<architecture>_v<version>.pkl" inside MODELS_FOLDER
run_params['MODEL_SAVE_NAME'] = f'{run_params["MODEL"].__name__}_v{run_params["MODEL_VERSION"]}.pkl'
run_params['MODEL_SAVE_PATH'] = os.path.join(run_params['MODELS_FOLDER'], run_params['MODEL_SAVE_NAME'])

In [None]:
# Metadata criteria for the 'ap' label. Each list enumerates the accepted values
# of the DICOM field named by its key; 'function' is an extra per-row predicate
# (here: Rows/Columns ratio of at least 0.83). How the criteria are combined is
# decided by `df_check_DICOM`, which consumes these dictionaries further below.
_ap_criteria = dict(
    Modality=['CR', 'DR', 'DX'],
    SeriesDescription=[
        'RODILLA AP', 'TIBIA AP DIRECTO', 'Rodilla AP', 'rodilla AP',
        'W098bDER Rodilla a.p.', 'T098aDER Rodilla a.p.', 'rodilla 1P AP',
        'xeonllo DEREITO AP', 'xeonllo ESQUERDO AP', 'X106aL Tibia AP',
    ],
    BodyPartExamined=['LOWER LIMB', 'KNEE', 'EXTREMITY'],
)
_ap_criteria['function'] = lambda row: row.Rows/row.Columns >= 0.83

# One entry per label; only 'ap' is active, the other definitions are kept
# commented out for reference.
all_check_DICOM_dict = {
    'ap': _ap_criteria,
    # 'lat': {
    #     'Modality': ['CR', 'DR', 'DX'],
    #     'SeriesDescription': ['RODILLA LAT', 'TIBIA LAT DIRECTO', 'RODILLA LAT EN CARGA', 'T Rodilla lat', 'rodilla LAT', 'rodilla  LAT', 'W098bDER Rodilla lat.', 'T098aDER Rodilla lat', 'rodilla 1P LAT', 'xeonllo DEREITO LAT', 'xeonllo ESQUERDO LAT', 'TOBILLO EN CARGA LAT', 'PIE LAT EN CARGA', 'rodilla LAT dcha', 'rodilla LAT izda'],
    #     'BodyPartExamined': ['LOWER LIMB', 'KNEE']
    # },
    # 'two': {
    #     'Modality': ['CR', 'DR', 'DX'],
    #     'SeriesDescription': ['RODILLAS AP', 'rodilla AP y LAT', 'ambas rodillas AP', 'ambas rodillas LAT', 'rodilla (telemando) AP y LAT', 'rodilla AP y LAT', 'Rodillas LAT', 'Rodilla AP y LAT', 'ambolos dous xeonllos AP', 'ambolos dous xeonllos LAT', 'rodilla seriada', 'Rodillas AP', 'Rodillas LAT'],
    #     'BodyPartExamined': ['LOWER LIMB', 'KNEE']
    # },
    # 'other': {
    #     'Modality': ['CR', 'DR', 'DX'],
    #     'BodyPartExamined': ['THORAX', 'UPPER LIMB', 'KNEE STANDING',
    #    'RIBS', 'HAND', 'HIP', 'PIE EN CARGA', 'FOOT', 'ANKLE',
    #    'ELBOW', 'PELVIS', 'LSPINE', 'CSPINE']
    # }
}

# Class names: every label defined above plus the catch-all 'other'
targets = [*all_check_DICOM_dict, 'other']

In [None]:
# Load the raw metadata table
metadata_raw_path = os.path.join(run_params['PATH_PREFIX'], 'metadata_raw.csv')
metadata_df = pd.read_csv(metadata_raw_path)


def _preprocessed_png_path(raw_fname):
    "Map a raw file name to '<RAW_PREPROCESS_FOLDER>/<basename>.png', always using '/' separators."
    png_name = os.path.basename(raw_fname) + '.png'
    png_path = os.path.normpath(os.path.join(run_params['RAW_PREPROCESS_FOLDER'], png_name))
    return png_path.replace(os.sep, '/')


# Re-point every `fname` to the PNG with the same base name in RAW_PREPROCESS_FOLDER
metadata_df['fname'] = metadata_df['fname'].apply(_preprocessed_png_path)

# centers_fnames = metadata_df[
#     (
#       metadata_df.InstitutionName.str.lower().str.contains('coslada').astype(bool)
#      | metadata_df.InstitutionName.str.lower().str.contains('cugat').astype(bool)
#   ) 
#   & (metadata_df.InstitutionName.notnull())
# ]

# Rows whose AccessionNumber starts with '885'. They are excluded from the
# training data further below (they belong to the centers used for prediction).
# Built as an explicit copy so that adding the 'check_center' flag does not
# assign into a slice of `metadata_df` (which raises SettingWithCopyWarning).
# The frame is indexed by `fname` while keeping `fname` as a column, because the
# later merges join on the index and the set operations read the column.
centers_fnames = (
    metadata_df[metadata_df.AccessionNumber.astype('str').str.startswith('885')]
    .copy()
    .set_index('fname', drop=False)
    .assign(check_center=True)
)

# Rows lacking SeriesDescription or BodyPartExamined; built the same way and
# flagged with the same 'check_center' marker column.
null_info = (
    metadata_df[metadata_df.SeriesDescription.isnull() | metadata_df.BodyPartExamined.isnull()]
    .copy()
    .set_index('fname', drop=False)
    .assign(check_center=True)
)

metadata_labels_path = os.path.join(run_params['PATH_PREFIX'], 'metadata_labels.csv')
metadata_labels = pd.read_csv(metadata_labels_path)

# Keep only the rows with an empty 'Prob' (treated as the reviewed ones) and
# index them by file name, stored in the CSV under 'Path'
reviewed_labels = (
    metadata_labels[metadata_labels['Prob'].isnull()]
    .rename({'Path': 'fname'}, axis=1)
    .set_index('fname')
)

# Column holding the prediction: 'Final_pred' when available, otherwise 'Pred'
pred_col = 'Final_pred' if 'Final_pred' in reviewed_labels.columns else 'Pred'

# Mapping label -> list of file names selected for that label
all_fnames = {}

for label, check_DICOM_dict in all_check_DICOM_dict.items():
    # Rows of the metadata that, according to the criteria dictionary, should be that label
    # (the exact matching rules live in `df_check_DICOM`, defined outside this notebook)
    match_df = df_check_DICOM(metadata_df, check_DICOM_dict)
    
    # Remove cases that have been reviewed and selected as DIFFERENT from the current label:
    # left-join the reviews that disagree with `label`; rows that found such a review get a
    # non-null `pred_col` and are dropped by the filter on the next line.
    match_df = match_df.merge(reviewed_labels[reviewed_labels[pred_col] != label], how='left', left_on='fname', right_index=True)
    match_df = match_df[match_df[pred_col].isnull()]

    # Add cases that have been reviewed and selected as EQUAL from the current label.
    # Reviewed rows are listed first, so on a duplicated `fname` the reviewed row is the one
    # kept by drop_duplicates (which keeps the first occurrence by default).
    match_df = pd.concat(
        [
            reviewed_labels[reviewed_labels[pred_col] == label].reset_index(),
            match_df
        ]
    ).drop_duplicates('fname').reset_index(drop=True)

    # Remove all cases which do not have relevant metadata.
    # Despite its name, `remove_match` is True for the rows to KEEP: after the left join,
    # 'check_center' is null exactly for the rows that are NOT present in `null_info`.
    remove_match = match_df.merge(null_info, how='left', left_on='fname', right_index=True)['check_center'].isnull()
    match_df = match_df[remove_match]

    # Remove all cases from the centers that will be used for prediction
    # (same keep-mask technique, this time against `centers_fnames`)
    remove_match = match_df.merge(centers_fnames, how='left', left_on='fname', right_index=True)['check_center'].isnull()
    match_df = match_df[remove_match]

    all_fnames[label] = L(list(match_df.fname))

# Set as raw filenames all the ones in the metadata DataFrame,
# then filter them to not include undesired files
raw_fnames = L(filter_fnames(L(list(metadata_df.fname)), metadata_raw_path))

# Everything that must NOT end up in 'other': the files held out for the centers,
# the files without relevant metadata and every file already assigned to a label.
# The held-out sets are built once (they do not depend on the label) and are
# applied even if no label has been collected yet.
excluded_fnames = set(centers_fnames.fname) | set(null_info.fname)
for labelled_fnames in all_fnames.values():
    excluded_fnames |= set(labelled_fnames)

# Label the rest of images as other (paths compared with '/' separators)
other_fnames = L(
    set(raw_fnames.map(lambda path: str(path).replace(os.sep, '/'))) - excluded_fnames
)

# Filter on the filenames to not include undesired files
other_fnames = filter_fnames(other_fnames, metadata_raw_path)
all_fnames['other'] = L(other_fnames)

# Select the corresponding part for training
if run_params['N_TRAIN'] is None:
    # No limit configured: use every raw file
    fnames = raw_fnames
else:
    # Draw N_TRAIN distinct files. `random.sample` picks WITHOUT replacement, so no
    # file appears twice (the previous `random.choices` samples with replacement and
    # could return duplicates). The size is capped at the number of available files
    # because `random.sample` raises ValueError for a larger request.
    population = list(raw_fnames)
    fnames = random.sample(population, k=min(run_params['N_TRAIN'], len(population)))

In [None]:
# Show the class names, then the number of files collected per entry of `all_fnames`
print(targets)
list(map(len, all_fnames.values()))

In [None]:
# Make sure there is no fname from centers (nor without relevant metadata) in the
# dataset used for training/validation: every label's file set must be disjoint
# from the held-out files
held_out_fnames = set(centers_fnames.fname) | set(null_info.fname)
check = [set(label_fnames).isdisjoint(held_out_fnames) for label_fnames in all_fnames.values()]
assert np.array(check).all()

In [None]:
# Oversampling is intentionally not used: it caused issues when relabelling data and, with only 2 labels, the classes are already reasonably balanced

# # Oversampling of all classes to meet the biggest one or reach max_n_times its own size
# max_samples = max([len(fnames) for _, fnames in all_fnames.items()])
# max_n_times = 4
# for label, fnames in all_fnames.items():
#     k = min(max_samples-len(fnames), max_n_times * len(fnames))
#     all_fnames[label] = all_fnames[label] + random.choices(all_fnames[label], k=k)

# print(targets)
# [len(all_fnames[label]) for label in all_fnames]

In [None]:
# Create DataFrame with the filenames and the corresponding label:
# one small frame per label (columns 'fname' and 'Target') ...
labels_concat = [
    pd.DataFrame(list(label_fnames), columns=['fname']).assign(Target=label_name)
    for label_name, label_fnames in all_fnames.items()
]

# ... stacked into a single frame indexed by `fname`, which is also kept as a column
labels_df = pd.concat(labels_concat).set_index('fname', drop=False)

In [15]:
# Item-level transforms, applied in list order. Each custom transform receives
# `np_input=True` only when another transform is already queued before it.
item_tfms = []

if run_params['HIST_CLIPPING']:
    item_tfms.append(
        XRayPreprocess(
            PIL_cls=PILImageBW,
            cut_min=run_params['HIST_CLIPPING_CUT_MIN'],
            cut_max=run_params['HIST_CLIPPING_CUT_MAX'],
            np_input=bool(item_tfms),
            np_output=True,
        )
    )

if not run_params['KNEE_LOCALIZER']:
    # Plain resize, padding with zeros
    item_tfms.append(Resize(run_params['TRAIN_RESIZE'], method=ResizeMethod.Pad, pad_mode=PadMode.Zeros))
else:
    item_tfms.append(
        KneeLocalizer(
            run_params['KNEE_SVM_MODEL_PATH'],
            PIL_cls=PILImageBW,
            resize=run_params['TRAIN_RESIZE'],
            np_input=bool(item_tfms),
            np_output=True,
        )
    )

# Batch-level transforms: flip, fastai's default augmentations (zero padding),
# random resized crop and normalisation, in this order
batch_tfms = [Flip()]
batch_tfms.extend(aug_transforms(pad_mode=PadMode.Zeros))
batch_tfms.append(
    RandomResizedCropGPU(run_params['RANDOM_RESIZE_CROP'], min_scale=run_params['RANDOM_MIN_SCALE'])
)
batch_tfms.append(Normalize())

In [16]:
# Histogram scaling on the fly: append ONE extra item transform, either CLAHE or
# histogram scaling, depending on the run parameters.
# NOTE: this cell appends to the `item_tfms` list created in the previous cell, so
# re-running it alone adds the transform a second time; re-run the previous cell first.

if run_params['CLAHE_SCALED']:
    item_tfms.append(CLAHE_Transform(PIL_cls=PILImageBW, grayscale=True, np_input=len(item_tfms) > 0, np_output=False))
elif run_params['HIST_SCALED']:
    if run_params['HIST_SCALED_SELF']:
        # No shared bins are passed to HistScaled in this mode
        bins = None
    else:
        # Bins are initialised from every file selected for any label.
        # (A leftover statement that built an unused `all_valid_raw_preprocess` series
        # from the undefined `unlabel_all_df` and a non-existent 'Raw_preprocess'
        # column was removed: it made this branch fail before reaching `init_bins`.)
        # bins = init_bins(fnames=L(list(final_df['Original'].values)), n_samples=100)
        bins = init_bins(
            L([fname for label_fnames in all_fnames.values() for fname in label_fnames]),
            n_samples=run_params['N_SAMPLES_BIN'],
            isDCM=False,
        )
    item_tfms.append(HistScaled(bins))

In [17]:
# Describe how a row of `labels_df` becomes a sample: grayscale image read from the
# 'fname' column, category read from the 'Target' column, random 80/20 split
labelled_images_block = DataBlock(
    blocks=(ImageBlock(PILImageBW), CategoryBlock),
    get_x=ColReader('fname'),
    get_y=ColReader('Target'),
    splitter=RandomSplitter(0.2),
    item_tfms=item_tfms,
    batch_tfms=batch_tfms,
)

# Build the train/valid DataLoaders and preview a batch
dls = labelled_images_block.dataloaders(
    labels_df,
    bs=run_params['BATCH_SIZE'],
    num_workers=0,
    shuffle_train=True,
    drop_last=True,
)
dls.show_batch(max_n=25, cmap=plt.cm.bone)

NameError: name 'labels_df' is not defined

In [None]:
# Loss function handed to the learner in the next cell. None is forwarded as-is
# (presumably fastai then picks its default loss for the data — confirm).
loss_func = None

# Alternative loss kept for experimentation:
# loss_func = NCEandRCE(1, 1, len(targets))

In [None]:
# Define the callbacks that will be used during training
cbs = [
        MixUp(),
        # partial(OverSamplingCallback),
        # ShowGraphCallback(),
        # Stop when valid_loss has not improved by at least 0.05 for 2 epochs
        EarlyStoppingCallback(monitor='valid_loss', min_delta=0.05, patience=2),
    ]

# Adapt metrics depending on the number of labels
if len(targets) == 2:
    average = 'binary'
    roc_auc = RocAucBinary()
else:
    average = 'macro'
    roc_auc = RocAuc()

f1_score = F1Score(average=average)
precision = Precision(average=average)
recall = Recall(average=average)

# The architecture comes from run_params['MODEL'] (instead of a hard-coded resnet18)
# so that the trained network always matches the architecture that MODEL_SAVE_NAME /
# MODEL_SAVE_PATH are derived from and that the "use saved model" cell re-creates.
learn = cnn_learner(
    dls,
    run_params['MODEL'],
    metrics=[
        error_rate,
        roc_auc,
        f1_score,
        precision,
        recall
    ],
    loss_func=loss_func,
    cbs=cbs,
    # Single input channel (the images are loaded as PILImageBW)
    config={'n_in': 1}
)

# Regularization by using float precision of 16 bits
# This helps to not overfit because is more difficult to "memorize" images, but enough to learn
learn = learn.to_fp16()

In [None]:
# Optionally replace the freshly created model/optimizer by the saved ones
if run_params['USE_SAVED_MODEL']:
    model_load = create_model(run_params['MODEL'], len(targets))
    opt_load = copy(learn.opt)

    # Make sure the models folder exists (no-op when it is already there)
    os.makedirs(run_params['MODELS_FOLDER'], exist_ok=True)

    # Load onto the current CUDA device when there is one; otherwise onto the CPU,
    # because `torch.cuda.current_device()` raises on hosts without CUDA.
    load_device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'
    load_model(file=run_params['MODEL_SAVE_PATH'], model=model_load, opt=opt_load, device=load_device)
    learn.model = model_load
    learn.opt = opt_load

In [None]:
# Run the learner's learning-rate finder (`lr_find`) before training
learn.lr_find()

In [None]:
%%time
# Train with `fine_tune`: `freeze_epochs=2`, then 1 epoch (first positional argument).
# 0.005 is the second positional argument (the base learning rate in fastai's
# `fine_tune` signature — confirm against the installed version).
learn.fine_tune(1, 0.005, freeze_epochs=2)

In [None]:
# Display up to 25 results of the trained learner (`max_n=25`)
learn.show_results(max_n=25)

In [None]:
# Optionally persist the trained model together with its optimizer
if run_params['SAVE_MODEL']:
    # Create the destination folder if needed. `exist_ok=True` replaces the separate
    # "exists?" check, so there is no window between checking and creating.
    os.makedirs(run_params['MODELS_FOLDER'], exist_ok=True)

    save_model(file=run_params['MODEL_SAVE_PATH'], model=learn.model, opt=learn.opt)

In [None]:
# Uses too much RAM: the session is not able to handle it
# interp = Interpretation.from_learner(learn)
# losses, idx = interp.top_losses()
# interp.plot_top_losses(25, figsize=(15,10))

In [None]:
# Select only the top K images with largest loss

from fastai.interpret import ClassificationInterpretation
# from fastai2_extensions.interpret.all import *
# from fastai_amalgam.interpret.all import *

k = 9           # number of samples to keep (None keeps all of them)
largest = True  # True: highest losses first
dls_idx = 1     # index of the DataLoader inside learn.dls to analyse

# Predictions, targets, decoded predictions and per-sample losses for that DataLoader
preds, targs, decoded, all_losses = learn.get_preds(dls_idx, with_loss=True, with_decoded=True)

# Values and positions of the k selected losses
n_top = len(all_losses) if k is None else k
losses, idx = all_losses.topk(n_top, largest=largest)

# Test DataLoader holding just the selected items, served as one single batch
eval_dl = learn.dls[dls_idx]
top_losses_dl = learn.dls.test_dl(eval_dl.items.iloc[idx])
top_losses_dl.bs = len(idx)

# Interpretation object restricted to the selected samples
# (alternative kept for reference:
#  *tuple(map(lambda x: x[idx], learn.get_preds(dls_idx, with_input=True, with_loss=True, with_decoded=True))))
interp = ClassificationInterpretation(
    eval_dl,
    inputs=first(top_losses_dl),
    preds=preds[idx],
    targs=targs[idx],
    decoded=decoded[idx],
    losses=losses,
)
interp.plot_top_losses(k=k, cmap=plt.cm.bone)

In [None]:
# Plot GradCAM for the top K images with largest loss

from fastai_amalgam.interpret.gradcam import gradcam

# `idx` holds the positions (within the analysed DataLoader's items) selected above
top_loss_items = learn.dls[dls_idx].items
for row_pos in idx.tolist():
    gcam = gradcam(
        learn,
        top_loss_items.iloc[row_pos]['fname'],
        labels=['ap', 'other'],
        show_original=True,
        cmap=plt.cm.bone,
    )
    display(gcam)
    print()

In [None]:
# Plot GradCAM for the first `max_plots` images whose Target is 'ap' in DataLoader 0
# (the selection uses the stored label only; predictions are not checked here)

from fastai_amalgam.interpret.gradcam import gradcam

dls_idx = 0
max_plots = 12

# Index values of the selected rows, used below with `.loc`
ap_rows = learn.dls[dls_idx].items['Target'] == 'ap'
label_idxs = learn.dls[dls_idx].items[ap_rows].index[:max_plots]

for i in label_idxs:
    image_fname = learn.dls[dls_idx].items.loc[i, 'fname']
    gcam = gradcam(learn, image_fname, labels=['ap', 'other'], show_original=True, cmap=plt.cm.bone)
    display(gcam)
    print()

In [12]:
# Collect every file found under <DATA_FOLDER>/AAA as a string path.
# NOTE: this rebinds `centers_fnames` (previously the '885' accession rows of the
# metadata) to a one-column frame listing these files, used for inference below.
aaa_folder = os.path.join(run_params["DATA_FOLDER"], "AAA")
dcm_aaa_files = list(map(str, get_files(aaa_folder)))
centers_fnames = pd.DataFrame(dcm_aaa_files, columns=["fname"])

In [18]:
# Generating DataLoader and select the paths that will be used for inference from centers_fnames
# NOTE(review): the recorded output of this cell is an UnidentifiedImageError for a
# '.dcm' file: the files listed in `centers_fnames` are opened through
# ImageBlock(PILImageBW). Presumably they must be converted to an image format first
# (or read with a DICOM-aware block) — confirm before relying on this cell.
inference_frame = pd.DataFrame(list(centers_fnames.fname), columns=['fname'])
centers_dls = DataBlock(
    blocks=(ImageBlock(PILImageBW)),
    get_x=ColReader('fname'),
    item_tfms=item_tfms,
).dataloaders(
    inference_frame,
    bs=run_params['BATCH_SIZE'],
    num_workers=0,
    shuffle_train=True,
    drop_last=True,
)

# Every path of both splits: train items first, then valid items
paths = [
    *centers_dls.train.items.iloc[:, 0].values,
    *centers_dls.valid.items.iloc[:, 0].values,
]

# Generate fake labels to be able to use show_results: 'ap', 'other', 'ap', ...
labels = [('ap', 'other')[position % 2] for position in range(len(paths))]

# Add DataSet from paths to the Test set of the learner
dl = learn.dls.test_dl(pd.DataFrame({'fname': paths, 'Target': labels}), with_labels=True)

UnidentifiedImageError: cannot identify image file 'sources\\AAA\\Anonymized - 00047768277\\Rodilla Ap Y Lat - Derecho\\X106aL Tibia AP - 2\\IM-0032-0001-0001.dcm'

In [None]:
# Display up to 25 results for the inference DataLoader `dl`. The targets shown are
# the alternating placeholder labels generated when `dl` was built, not real labels.
learn.show_results(dl=dl, max_n=25)

In [None]:
# Calculate predictions and probabilities (test-time augmentation on `dl`)
preds, _ = learn.tta(dl=dl)
# preds, _ = learn.get_preds(dl=dl)

# Per sample: the highest value along dimension 1 and the position where it occurs
best_per_sample = preds.max(dim=1)
max_probs, targs = best_per_sample.values, best_per_sample.indices

In [None]:
# No prior label is known for the inference files: use the placeholder string 'None'
labels = ['None'] * len(paths)

# Minimum probability required to accept a prediction that agrees ('Correct_label')
# or disagrees ('Wrong_label') with the prior label
class_threshold = {
    'Correct_label': 0.95,
    'Wrong_label': 0.95,
}

# Previously stored results; rows with an empty 'Prob' are the reviewed ones
metadata_labels_path = os.path.join(run_params['PATH_PREFIX'], 'metadata_labels_CENTERS.csv')
metadata_labels = pd.read_csv(metadata_labels_path)
reviewed_labels = (
    metadata_labels[metadata_labels['Prob'].isnull()]
    .rename({'Path': 'fname'}, axis=1)
    .set_index('fname')
)

# Column holding the reviewed decision. The CSV written at the end of this cell has
# no 'Final_pred' column (the reviewed decision is stored under 'Pred'), so fall
# back to 'Pred'. Previously a missing 'Final_pred' raised KeyError inside the
# try-block below, which silently treated every reviewed file as not reviewed.
reviewed_pred_col = 'Final_pred' if 'Final_pred' in reviewed_labels.columns else 'Pred'

data = {
    'Path': [],
    'Label': [],
    'Raw_pred': [],
    'Pred': [],
    'Prob': [],
}
to_be_reviewed = []
for label, prob, targ, path_str in tqdm(zip(labels, max_probs, targs, paths), total=len(labels)):
    path = Path(path_str)
    # Same normalisation that is written to the 'Path' column of the CSV
    norm_path = os.path.normpath(path).replace(os.sep, '/')
    raw_pred = targets[targ]

    # Check if already reviewed. The CSV stores normalised paths, so look the file up
    # by its normalised form as well (the raw string only matches when it is already
    # normalised, e.g. it never matches paths containing backslashes).
    review_key = next((key for key in (path_str, norm_path) if key in reviewed_labels.index), None)

    if review_key is not None:
        # Set current data from the review; an empty probability marks it as reviewed
        review = reviewed_labels.loc[review_key]
        pred = review[reviewed_pred_col]
        prob = np.nan  # `np.NaN` was removed in NumPy 2.0
    else:
        # Set prob and pred according to the thresholds
        prob = float(prob)

        # Agreement with the prior label uses 'Correct_label', disagreement 'Wrong_label'
        threshold_key = 'Correct_label' if label == raw_pred else 'Wrong_label'
        if prob >= class_threshold[threshold_key]:
            pred = raw_pred
        else:
            # Not confident enough: flag for manual review
            pred = 'Unsure_' + raw_pred + '_' + str(label)
            to_be_reviewed.append((path, raw_pred, label, prob))

    data['Path'].append(norm_path)
    data['Label'].append(label)
    data['Raw_pred'].append(raw_pred)
    data['Pred'].append(pred)
    data['Prob'].append(prob)

df = pd.DataFrame(data)
df.to_csv(metadata_labels_path, index=False)

In [None]:
# Mean and count of 'Prob' for every (Raw_pred, Pred) combination
pd.pivot_table(
    df,
    values=['Prob'],
    index=['Raw_pred', 'Pred'],
    aggfunc=['mean', 'count'],
)

In [None]:
def _open_thumb(fn, h, w):
    "Open image file `fn`, reduce it with `to_thumb(h, w)` and return it converted to RGBA"
    thumbnail = Image.open(fn).to_thumb(h, w)
    return thumbnail.convert('RGBA')

class ImagesCleanerDefaultPred(ImagesCleaner):
    "A widget that displays all images in `fns` along with a `Dropdown` with default value the prediction"

    def set_fns(self, fns, preds, labels, probs):
        "Show at most `max_n` images, each captioned 'pred/label/prob' and with its `Dropdown` preset to `pred`"
        self.fns = L(fns)[:self.max_n]
        # Thumbnails are opened sequentially (the parallel version is kept for reference)
        # ims = parallel(_open_thumb, self.fns, h=self.height, w=self.width, progress=False,
        #                n_workers=min(len(self.fns)//10,defaults.cpus))
        thumbs = [_open_thumb(fn, h=self.height, w=self.width) for fn in self.fns]

        cells = []
        for im, pred, label, prob in zip(thumbs, preds, labels, probs):
            caption = Label(f'{pred}/{label}/{prob:.4f}')
            picture = widget(im, height=f'{self.height}px')
            chooser = Dropdown(options=self.opts, layout={'width': 'max-content'}, value=pred)
            # The Dropdown goes last: `values` reads the last child of every cell
            cells.append(VBox([caption, picture, chooser]))
        self.widget.children = cells

    def values(self):
        "Current `Dropdown` value of every displayed image"
        return L(self.widget.children).itemgot(-1).attrgot('value')

In [None]:
# Rows shown for manual review. Exactly ONE selection is active; the others are
# kept commented out (previously the first one was computed and then immediately
# overwritten by the second, so it had no effect).

# # Check the unsure with lowest probability
# df_to_review = df[(~df['Pred'].isin(targets)) & (df['Prob'].notnull())].sort_values(['Raw_pred', 'Prob']).iloc[:100]

# Active: the 100 not-yet-reviewed rows predicted 'Unsure_other_None' with the LOWEST probability
df_to_review = df[(df['Pred'] == 'Unsure_other_None') & (df['Prob'].notnull())].sort_values('Prob', ascending=True).iloc[:100]

# # Check the AP cases which the model is totally sure and are also confirmed by metadata
# df_to_review = df[(df['Label'] == 'ap') & (df['Pred'] == 'ap') & (df['Prob'].notnull())].sort_values('Prob', ascending=False).iloc[:100]

# One widget entry per selected row; the dropdown default is the raw prediction
w = ImagesCleanerDefaultPred(targets,  max_n=len(df_to_review.index))
w.set_fns(
    list(df_to_review['Path']),
    list(df_to_review['Raw_pred']),
    # ['ap',] * len(df_to_review),
    list(df_to_review['Label']),
    list(df_to_review['Prob'])
)
w

In [None]:
# Number of reviewed entries whose selected value is 'ap'
sum(chosen == 'ap' for _, chosen in w.change())

In [None]:
# Start from the automatic prediction, then overwrite with the choices made in the widget
df['Final_pred'] = df['Pred']
for i, pred in w.change():
    # `i` is the position inside the reviewed subset; translate it to the row label of `df`
    idx = df_to_review.index[i]
    df.loc[idx, 'Final_pred'] = pred
    # An empty probability marks the row as manually reviewed
    df.loc[idx, 'Prob'] = np.nan

    # Update label image if required.
    # `labels_df` is indexed by string file names, so the row must be addressed with
    # the string path: a `Path` object never equals a string index entry, which meant
    # an existing row was never updated and a new Path-keyed row was added instead.
    fname = df.loc[idx, 'Path']
    if Path(fname).parent.name != pred:
        labels_df.loc[fname, 'Target'] = pred
        labels_df.loc[fname, 'fname'] = fname

df.to_csv(metadata_labels_path, index=False)