## Imports

In [None]:
import torch
import os
import json
import matplotlib.pyplot as plt

In [None]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'

In [None]:
%run ../iu_xray.py

In [None]:
REPORTS_DIR = os.path.join(DATASET_DIR, 'reports')

## Preprocess reports

Clean and tokenize

### Debug tokenize functions

In [None]:
%run ../preprocess/tokenize.py

In [None]:
text = """findings/pneumothorax """
text_to_tokens(text)

#### Check already clean reports

Look for errors

In [None]:
with open(os.path.join(REPORTS_DIR, 'reports.clean.v3.json'), 'r') as f:
    reports_dict = json.load(f)

In [None]:
def search_in_reports(target, reports=None):
    """Return the reports whose clean text matches a regular expression.

    Args:
        target: Regular expression (string or compiled pattern) searched
            for in each report's ``clean_text``.
        reports: Optional mapping of reports to search; only its values are
            used. Defaults to the notebook-level ``reports_dict``.

    Returns:
        A list with one dict per matching report, holding only the
        ``filename``, ``clean_text``, ``findings`` and ``impression`` keys.
        The number of matches is also printed.

    Raises:
        KeyError: If a matching report lacks one of the copied keys.
    """
    # Imported locally: the notebook only imports `re` in a later cell, so
    # this function must not depend on that cell having been run first.
    import re

    if reports is None:
        reports = reports_dict

    # Compile once instead of handing the raw pattern to `re` per report.
    pattern = re.compile(target)
    keys = ('filename', 'clean_text', 'findings', 'impression')
    found = [
        {k: r[k] for k in keys}
        for r in reports.values()
        if pattern.search(r['clean_text'])
    ]
    print('Found: ', len(found))
    return found

In [None]:
search_in_reports(r'\bexample\b')

### Run preprocess

- Tokenize reports, create json with clean reports and vocabularies
- Create sentences_with_chexpert_labels.csv (takes about 12min)
- Create sentences_with_organs.csv

In [None]:
%run -n ../preprocess/iu_xray.py

In [None]:
reports, tokens_appearances, errors = preprocess_iu_x_ray(
    'v5-3',
    [0],
    override=True,
    # impression_fallback=False,
    # concat_if=True,
    concat_fi=True,
)
len(reports), len(tokens_appearances)

In [None]:
df_sentences_chexpert, errors = create_sentences_with_organs()

In [None]:
%%time

df_sentences_chexpert = create_sentences_with_chexpert_labels()
len(df_sentences_chexpert)

#### v5-2 and v5-3

- The vocabulary simulates the one used in the Co-att paper
- Manually keep the 1000 top words, and override the vocab

In [None]:
%run ../vocab/__init__.py

In [None]:
total_appearances = sum(tokens_appearances.values())
len(tokens_appearances), total_appearances

In [None]:
t = [(k, v, v/total_appearances*100) for k, v in tokens_appearances.items()]
t = sorted(t, key=lambda x: x[1], reverse=True)
t[:10]

In [None]:
top_n = 1000
perc = sum(x[2] for x in t[:top_n])
TOP_N_WORDS = set(x[0] for x in t[:top_n])
print(f'Top {top_n:,} words cover {perc:.2f}% of the appearances')

In [None]:
vocab = load_vocab(REPORTS_DIR, 'v5-3')
len(vocab)

In [None]:
## Copied from _compute_vocab()
new_vocab = {
    PAD_TOKEN: PAD_IDX,
    START_TOKEN: START_IDX,
    END_TOKEN: END_IDX,
    UNKNOWN_TOKEN: UNKNOWN_IDX,
    END_OF_SENTENCE_TOKEN: END_OF_SENTENCE_IDX,
}

for token in vocab:
    if token not in TOP_N_WORDS:
        continue
    if token not in new_vocab:
        new_vocab[token] = len(new_vocab)
len(new_vocab), len(vocab)

In [None]:
_save_vocab(REPORTS_DIR, 'v5-3', new_vocab, 0)

### Check errors

#### Check in tokens

In [None]:
# Print every token containing the substring 'NUMBER' together with its
# appearance count. `tokens_appearances` is the token -> count mapping
# returned by preprocess_iu_x_ray() earlier in this notebook; the previous
# name (`tokens`) is not defined anywhere in the notebook.
for token, n_appears in tokens_appearances.items():
    if 'NUMBER' in token:
        print(token, n_appears)

#### Check in text

In [None]:
import re

In [None]:
# Regex patterns to look for in the clean text. Raw string: `\s` in a normal
# string literal is an invalid escape sequence and triggers a warning.
TARGET_TOKENS = [r'NUMBER[^\s]']

In [None]:
# Collect each report whose clean text matches at least one target pattern.
# `any` stops at the first hit, so a report that matches several patterns is
# listed once instead of once per matching pattern.
found = [
    report
    for report in reports.values()
    if any(re.search(token, report['clean_text']) for token in TARGET_TOKENS)
]

len(found)

## Rotate images

NOTE: the images are already rotated!
(This section only needs to be run once.)

In [None]:
info_fname = os.path.join(DATASET_DIR, 'info.json')
with open(info_fname, 'r') as f:
    info = json.load(f)
len(info)

In [None]:
info['marks']['rotated_left']

In [None]:
rotations = [
    ('left', -90),
    ('right', 90),
    ('bottom', 180),
]

In [None]:
# Rotate every image listed under info['marks']['rotated_<key>'] by the angle
# configured for that key in `rotations`.
for key, degrees in rotations:
    images_key = f'rotated_{key}'
    for image_name in info['marks'][images_key]:
        filepath = os.path.join(DATASET_DIR, 'images', image_name)
        # Open through a context manager so the file handle is released as
        # soon as the rotated copy has been produced.
        with Image.open(filepath) as original:
            img = original.rotate(degrees)
        # Saving stays disabled on purpose: the images on disk are already
        # rotated, and saving again would rotate them a second time.
        # img.save(filepath)

## Calculate image normalization

In [None]:
%run ../../utils/images.py

In [None]:
image_folder = os.path.join(DATASET_DIR, 'images')

In [None]:
dataset = IUXRayDataset('train')
len(dataset)

In [None]:
train_images = [
    i if i.endswith('.png') else f'{i}.png'
    for i in [r['image_name'] for r in dataset.reports]
]
len(train_images)

In [None]:
mean, std = compute_mean_std(ImageFolderIterator(image_folder, train_images), show=True)
mean, std

### Plot average image

In [None]:
from torchvision import transforms

In [None]:
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

In [None]:
# Accumulate the resized training images and divide by their count to get
# the average image. `train_images` is the list of training image names built
# above for compute_mean_std(); the previous name (`image_names`) is not
# defined anywhere in the notebook.
summed = torch.zeros(3, 256, 256)

for image_name in tqdm(train_images):
    fpath = os.path.join(image_folder, image_name)
    # Context manager closes the file once the tensor has been built.
    with Image.open(fpath) as img:
        image = transform(img.convert('RGB'))
    summed += image

summed /= len(train_images)

In [None]:
average_image = summed.mean(dim=0)
average_image.size()

In [None]:
plt.imshow(average_image, cmap='gray')

## Dataset labels

### Coatt labels

These labels come from: https://github.com/ZexinYan/Medical-Report-Generation

#### Load labels

In [None]:
import pandas as pd

In [None]:
%run ../common/constants.py

In [None]:
def load_labels(split):
    """Read the Co-att label file of one split and print its row count.

    Args:
        split: Prefix of the ``{split}_data.txt`` file located in
            ``DATASET_DIR/coatt-labels`` (the notebook passes 'train',
            'val' and 'test').

    Returns:
        The DataFrame parsed from the space-separated, header-less file,
        with columns named after ``COATT_LABELS``.
    """
    # NOTE(review): later cells read the image ids from the index, so the
    # file presumably has one more column than COATT_LABELS -- confirm.
    labels_path = os.path.join(DATASET_DIR, 'coatt-labels', f'{split}_data.txt')
    labels_df = pd.read_csv(labels_path, sep=' ', header=None, names=COATT_LABELS)
    print(len(labels_df))
    return labels_df

In [None]:
train_df = load_labels('train')
train_df.head()

In [None]:
val_df = load_labels('val')
val_df.head(2)

In [None]:
test_df = load_labels('test')
test_df.head(2)

In [None]:
train_images = set(train_df.index)
val_images = set(val_df.index)
test_images = set(test_df.index)

In [None]:
train_images.intersection(val_images), \
val_images.intersection(test_images), \
train_images.intersection(test_images)

In [None]:
master_df = pd.concat([train_df, val_df, test_df], axis=0)
print(len(master_df), len(train_df) + len(val_df) + len(test_df))
master_df = master_df.reset_index()
master_df = master_df.rename(columns={'index': 'image_id'})
master_df.head()

#### Merge with report filenames

In [None]:
import json

In [None]:
with open(os.path.join(REPORTS_DIR, 'reports.clean.v4.json')) as f:
    reports = list(json.load(f).values())
len(reports)

In [None]:
# Map every image id to the filename of the report that lists it. If an id
# shows up in several reports, the last report seen wins.
image_name_to_report_filename = {
    image['id']: report['filename']
    for report in reports
    for image in report['images']
}
len(image_name_to_report_filename)

In [None]:
set(master_df['image_id']) - set(image_name_to_report_filename.keys())

In [None]:
master_df['filename'] = [
    image_name_to_report_filename.get(image_id, '')
    for image_id in master_df['image_id']
]
master_df.head()

In [None]:
cols = ['image_id', 'filename', *COATT_LABELS]
master_df = master_df[cols]
master_df.head()

In [None]:
master_df.to_csv(os.path.join(DATASET_DIR, 'coatt-labels', 'metadata.csv'), index=False)

### MTI tags

In [None]:
from collections import Counter
import re
import pandas as pd

In [None]:
%run ../preprocess/iu_xray.py
%run ../common/constants.py

In [None]:
raw_reports = load_raw_reports()
len(raw_reports)

In [None]:
def _clean_tag(tag):
    tag = tag.lower()
    tag = re.sub(r'\W', ' ', tag)
    tag = re.sub(r'\s+', ' ', tag)
    return tag

In [None]:
# Count how often each cleaned automatic tag appears across all raw reports.
tag_counter = Counter(
    _clean_tag(raw_tag)
    for raw_report in raw_reports.values()
    for raw_tag in raw_report['tags_auto']
)
len(tag_counter)

In [None]:
sorted(tag_counter.items(), key=lambda x:x[1], reverse=True)[:10]

In [None]:
step = 4
tags = list(tag_counter)
for i in range(0, len(tags), step):
    print(', '.join(f"'{tag}'" for tag in tags[i:i+step]) + ',')

In [None]:
# Build a one-hot table with one row per raw report and one column per entry
# of IU_MTI_TAGS, plus a leading 'filename' column.
all_tags = []
all_reports = []
for report_id, report in raw_reports.items():
    # Cleaned automatic tags of this report, as a set for fast membership tests
    tags = set(
        _clean_tag(tag)
        for tag in report['tags_auto']
    )
    # 1 if the report carries the tag, 0 otherwise, in IU_MTI_TAGS order
    tags_onehot = [
        int(t in tags)
        for t in IU_MTI_TAGS
    ]

    all_tags.append(tags_onehot)
    all_reports.append(report_id)
tags_df = pd.DataFrame(all_tags, columns=IU_MTI_TAGS)
# The keys of raw_reports are stored as the 'filename' column
# NOTE(review): presumably those keys are the report filenames -- confirm.
tags_df['filename'] = all_reports
# Reorder so that 'filename' comes first
cols = ['filename'] + IU_MTI_TAGS
tags_df = tags_df[cols]
tags_df.head(2)

In [None]:
tags_df.to_csv(os.path.join(DATASET_DIR, 'mti-tags.csv'), index=False)

In [None]:
# TODO: reduce synonyms??
# syns = {
#     'atelectases': 'atelectasis',
#     'atheroscleroses': 'atherosclerosis',
#     'bronchiectases': 'bronchiectasis',
#     'histoplasmoma': 'histoplasmosis',
#     'histoplasmoses': 'histoplasmosis',
#     'humeral fractures': 'humeral fracture',
#     'tuberculoses': 'tuberculosis',
# }

## Test `IUXRayDataset` class

In [None]:
%run ../../utils/common.py

In [None]:
%run ../iu_xray.py

In [None]:
dataset = IUXRayDataset(
    dataset_type='test',
    # masks=True,
    # masks_version='v2',
    # frontal_only=True,
    image_size=(1024, 1024),
    # seg_multilabel=False,
    # labels='mti',
    images_version='16bit-1024p',
    # image_format='I;16',
    image_format='I',
)
len(dataset), len(dataset.word_to_idx)

In [None]:
item = dataset[100]
image = item.image
labels = item.labels
report = item.report
image.size(), labels.size(), len(report)

In [None]:
if isinstance(item.masks, torch.Tensor):
    print(item.masks.min(), item.masks.max(), item.masks.size())

In [None]:
n_rows = 2
n_cols = 3

plt.figure(figsize=(n_cols*5, n_rows*5))

plt.subplot(n_rows, n_cols, 1)
plt.title(item.image_fname)
plt.imshow(tensor_to_range01(image).permute(1, 2, 0))
# plt.axis('off')

if isinstance(item.masks, torch.Tensor) and item.masks.ndim == 3:
    for index, organ in enumerate(JSRT_ORGANS):
        mask = item.masks[index]

        plt.subplot(n_rows, n_cols, index + 2)
        plt.imshow(mask)
        plt.title(organ)
        plt.axis('off')

        min_value = mask.min().item()
        max_value = mask.max().item()
        print(organ, min_value, max_value)
elif isinstance(item.masks, torch.Tensor) and item.masks.ndim == 2:
    plt.subplot(n_rows, n_cols, 2)
    plt.imshow(item.masks)

In [None]:
image_copy = image.clone()

In [None]:
def stats(arr):
    """Print the type string, minimum and maximum of ``arr`` on one line."""
    summary = (arr.type(), arr.min(), arr.max())
    print(*summary)

In [None]:
stats(image_copy)
stats(image)

## Plot distributions

In [None]:
from collections import Counter

In [None]:
%run ../iu_xray.py
%run ../common/constants.py

In [None]:
train_dataset = IUXRayDataset('train')
val_dataset = IUXRayDataset('val')
len(train_dataset), len(val_dataset)

### Labels distribution

In [None]:
def plot_labels_distribution(dataset):
    """Draw a bar chart of per-disease label totals with pyplot.

    The label vectors of all reports in ``dataset.reports`` are added up,
    the diseases are sorted by decreasing total, and each bar is annotated
    with its count and its percentage relative to ``len(dataset)``.

    Args:
        dataset: Object exposing ``reports`` (dicts with a 'filename' key),
            ``labels_by_report`` (indexed by that filename),
            ``dataset_type`` and ``__len__``.
    """
    # assumes each labels_by_report entry is a tensor with one value per
    # CHEXPERT_DISEASES entry -- TODO confirm
    totals = torch.zeros(len(CHEXPERT_DISEASES))
    for report in dataset.reports:
        totals = totals + dataset.labels_by_report[report['filename']]
    counts = totals.tolist()

    tallest = max(counts)
    ranked = sorted(
        zip(CHEXPERT_DISEASES, counts),
        key=lambda pair: pair[1],
        reverse=True,
    )

    n_samples = len(dataset)
    plt.title(f'{dataset.dataset_type} (n={n_samples:,})', fontsize=20)
    names, heights = zip(*ranked)
    plt.bar(names, heights)
    plt.xticks(rotation=60, fontsize=15, ha='right')
    plt.ylabel('Amount of images', fontsize=18)
    # Leave headroom above the tallest bar for the text annotations
    plt.ylim(0, tallest * 1.15)

    y_padding = int(tallest * 0.03)
    for position, (_, raw_amount) in enumerate(ranked):
        amount = int(raw_amount)
        perc = amount / n_samples * 100
        plt.text(position, amount + y_padding, f'{amount:,}\n{perc:.0f}%', ha='center')

In [None]:
n_rows = 1
n_cols = 2
plt.figure(figsize=(15, 5))

plt.subplot(n_rows, n_cols, 1)
plot_labels_distribution(train_dataset)

plt.subplot(n_rows, n_cols, 2)
plot_labels_distribution(val_dataset)

### Report lengths distribution

In [None]:
def plot_n_words_distribution(dataset):
    """Plot a histogram of report lengths with pyplot.

    The length of a report is the number of entries in its 'tokens_idxs'
    list. The histogram uses 25 bins over the range 0-150.

    Args:
        dataset: Object exposing ``reports``, ``dataset_type`` and
            ``__len__``.
    """
    token_counts = [len(report['tokens_idxs']) for report in dataset.reports]
    header = f'Report-lengths ({dataset.dataset_type}, total={len(dataset):,})'
    plt.title(header)
    plt.ylabel('Amount of images')
    plt.xlabel('Number of words')
    plt.hist(token_counts, bins=25, range=(0, 150))

In [None]:
n_rows = 1
n_cols = 2
plt.figure(figsize=(15, 5))

plt.subplot(n_rows, n_cols, 1)
plot_n_words_distribution(train_dataset)

plt.subplot(n_rows, n_cols, 2)
plot_n_words_distribution(val_dataset)

### Frontal vs lateral distribution

In [None]:
def plot_view_position_distribution(dataset):
    """Draw a bar chart of how many reports fall in each view position.

    The '-left' and '-right' suffixes are removed from each position before
    counting. Bars are sorted by decreasing count and annotated with the
    count and its percentage relative to ``len(dataset)``.

    Args:
        dataset: Object exposing ``reports`` (dicts with a 'position' key),
            ``dataset_type`` and ``__len__``.
    """
    def _strip_side(position):
        for suffix in ('-left', '-right'):
            position = position.replace(suffix, '')
        return position

    view_counts = Counter(
        _strip_side(report['position']) for report in dataset.reports
    )

    plt.title(f'Frontal vs lateral ({dataset.dataset_type})', fontsize=20)
    plt.ylabel('Amount of images', fontsize=15)
    plt.xticks(fontsize=15)

    # most_common() lists (position, count) pairs by decreasing count
    keys, values = zip(*view_counts.most_common())
    plt.bar(keys, values, width=0.2)

    tallest = max(values)
    # Leave headroom above the tallest bar for the text annotations
    plt.ylim(0, tallest * 1.2)
    y_padding = tallest * 0.03
    n_samples = len(dataset)
    for index, value in enumerate(values):
        perc = value / n_samples * 100
        plt.text(index, value + y_padding, f'{value:,}\n{perc:.2f}%', ha='center')

In [None]:
n_rows = 1
n_cols = 2
plt.figure(figsize=(15, 5))

plt.subplot(n_rows, n_cols, 1)
plot_view_position_distribution(train_dataset)

plt.subplot(n_rows, n_cols, 2)
plot_view_position_distribution(val_dataset)

## Inspect different vocabs

In [None]:
%run ../vocab/__init__.py

In [None]:
vocab_full = load_vocab('iu_xray')
vocab_1 = load_vocab('iu_xray', 1)
vocab_10 = load_vocab('iu_xray', 10)
len(vocab_full), len(vocab_1), len(vocab_10)

In [None]:
assert set(vocab_10).issubset(vocab_1)
assert set(vocab_1).issubset(vocab_full)

In [None]:
out_of_vocab_1 = set(vocab_full) - set(vocab_1)
out_of_vocab_10 = set(vocab_1) - set(vocab_10)
len(out_of_vocab_1), len(out_of_vocab_10)

In [None]:
out_of_vocab_10

## Check no-findings vs labels==0

In [None]:
from collections import defaultdict

In [None]:
chexpert_path = os.path.join(REPORTS_DIR, 'reports_with_chexpert_labels.csv')
mirqi_path = os.path.join(REPORTS_DIR, 'reports_with_mirqi_labels.csv')

In [None]:
chexpert_df = pd.read_csv(chexpert_path, index_col=0)
chexpert_df.replace(-1, 1, inplace=True)
chexpert_df.replace(-2, 0, inplace=True)
chexpert_df.head()

In [None]:
mirqi_df = pd.read_csv(mirqi_path, index_col=0)
mirqi_df.drop(columns=['attributes-gen', 'MIRQI-r', 'MIRQI-p', 'MIRQI-f'], inplace=True)
mirqi_df.rename(columns={'attributes-gt': 'attributes'}, inplace=True)
mirqi_df.replace(-1, 1, inplace=True)
mirqi_df.replace(-2, 0, inplace=True)
mirqi_df.head()

In [None]:
base_columns = set(['filename', 'Reports', 'attributes'])
MIRQI_LABELS = [c for c in mirqi_df.columns if c not in base_columns]

In [None]:
len(chexpert_df), len(mirqi_df)

In [None]:
df = chexpert_df.merge(mirqi_df, on='filename', suffixes=['_chx', '_mirqi'])
print(len(df))
df.head()

In [None]:
# Group the CheXpert-labelled reports into (possibly overlapping) buckets to
# compare the 'No Finding' label against the remaining labels.
# Each bucket holds (row index, filename, report text) tuples.
reports_by_condition = defaultdict(set)

for index, row in chexpert_df.iterrows():
    filename = row['filename']
    report = row['Reports']
    labels = row[CHEXPERT_LABELS]

    tup = (index, filename, report)

    no_findings = labels['No Finding']

    # labels[1:-1] drops the first and last label; per the comment below these
    # are no-findings and support-devices.
    # NOTE(review): assumes CHEXPERT_LABELS starts with 'No Finding' and ends
    # with the support-devices label -- confirm.
    if no_findings == 1:
        reports_by_condition['no-findings-1'].add(tup)
        if any(l != 0 for l in labels[1:-1]):
            # Exclude no-findings and support-devices
            # 'No Finding' is set, yet some other label is non-zero
            reports_by_condition['inconsistent'].add(tup)
    else:
        # 'No Finding' is not set, yet every other label is zero
        if not any(l != 0 for l in labels[1:-1]):
            reports_by_condition['no-findings-absent'].add(tup)

    # No label at all (including the first and last) equals 1
    if all(l != 1 for l in labels):
        reports_by_condition['no-1s'].add(tup)

# Number of reports in each bucket
[(k, len(v)) for k, v in reports_by_condition.items()]

In [None]:
l = list(reports_by_condition['no-findings-absent'])
l[:5]

In [None]:
mirqi_df.loc[mirqi_df['filename'] == '256.xml'][MIRQI_LABELS]

In [None]:
l = list(reports_by_condition['no-1s'])
l[:10]

In [None]:
l = list(reports_by_condition['no-findings-1'])
l[:10]