## Imports

In [None]:
import torch
import os
import json
import matplotlib.pyplot as plt

In [None]:
import matplotlib
# Use a white figure background for every figure drawn in this notebook.
matplotlib.rcParams['figure.facecolor'] = 'white'

In [None]:
%run ../iu_xray.py

## Preprocess reports

Clean and tokenize

### Debug tokenize functions

In [None]:
%run ../preprocess/tokenize.py

In [None]:
# Sample multi-line report (numbered items, a stray colon, spaced periods)
# used to eyeball the tokenizer output.
text = """1. low lung volumes
2. exam limited on lateral: view by superimposed soft tissue and bony structures of the arm
3. lungs appear grossly clear . no evidence of pneumonia ."""
text_to_tokens(text)

In [None]:
# Debug input with repeated '.' tokens in the middle and at both ends.
remove_consecutive_dots(['.', '.', 'asdf', 'hello', '.', 'abc', '.', '.', 'c', '.'])

In [None]:
# Debug input mixing ordinals, a hyphenated range and an 'XXXX' placeholder.
text_to_tokens('1st and 3rd-5XXXX')

### Run preprocess

In [None]:
%run -n ../preprocess/iu_xray.py

In [None]:
# Run the preprocessing for version 'v3'.
# NOTE(review): override=True presumably overwrites previously saved output, and the
# meaning of [0, 1, 2, 5, 10] is defined inside preprocess_iu_x_ray — confirm there.
reports, tokens_appearances, errors = preprocess_iu_x_ray('v3', [0, 1, 2, 5, 10], override=True)
len(reports), len(tokens_appearances)

### Check errors

#### Check in tokens

In [None]:
# Print every token that still contains the 'NUMBER' placeholder, with its count.
# Iterates `tokens_appearances` (returned by preprocess_iu_x_ray above); the
# previously used name `tokens` is not defined anywhere in this notebook.
for token, n_appears in tokens_appearances.items():
    if 'NUMBER' in token:
        print(token, n_appears)

#### Check in text

In [None]:
import re

In [None]:
# Regex patterns searched for in the cleaned report text.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
# The pattern matches 'NUMBER' immediately followed by a non-whitespace character.
TARGET_TOKENS = [r'NUMBER[^\s]']

In [None]:
# Collect the reports whose clean text matches at least one target pattern.
# `any` keeps each report at most once, so len(found) counts reports rather
# than (report, pattern) hits when several patterns match the same text.
found = [
    report
    for report in reports.values()
    if any(re.search(pattern, report['clean_text']) for pattern in TARGET_TOKENS)
]

len(found)

## Rotate images

NOTE: the images are already rotated!!
(Run this section only once.)

In [None]:
# Load the dataset's info.json (lives directly under DATASET_DIR).
info_fname = os.path.join(DATASET_DIR, 'info.json')
with open(info_fname, 'r') as f:
    info = json.load(f)
len(info)

In [None]:
# Image names listed under the 'rotated_left' mark.
info['marks']['rotated_left']

In [None]:
# (mark suffix, degrees passed to Image.rotate) for each 'rotated_<suffix>' mark.
rotations = [('left', -90), ('right', 90), ('bottom', 180)]

In [None]:
# For each rotation mark, open every listed image and rotate it by the configured angle.
for key, degrees in rotations:
    # Marks are stored under 'rotated_left', 'rotated_right', 'rotated_bottom'.
    images_key = f'rotated_{key}'
    for image_name in info['marks'][images_key]:
        filepath = os.path.join(DATASET_DIR, 'images', image_name)
        img = Image.open(filepath).rotate(degrees)
        # Saving is disabled on purpose: the section note says the images are
        # already rotated, so writing again would rotate them a second time.
        # img.save(filepath)

## Calculate image normalization

In [None]:
%run ../../utils/images.py

In [None]:
# Folder holding the dataset images.
image_folder = os.path.join(DATASET_DIR, 'images')

In [None]:
# Train split; only its reports list is used below to collect image names.
dataset = IUXRayDataset('train')
len(dataset)

In [None]:
# File names of the train images, with a '.png' suffix appended when missing.
train_images = [
    name if name.endswith('.png') else f'{name}.png'
    for name in (entry['image_name'] for entry in dataset.reports)
]
len(train_images)

In [None]:
# Normalization statistics computed over the train images only.
mean, std = compute_mean_std(ImageFolderIterator(image_folder, train_images), show=True)
mean, std

### Plot average image

In [None]:
from torchvision import transforms

In [None]:
# Resize to a fixed 256x256 so images can be summed, then convert to a tensor.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

In [None]:
# Sum every train image (RGB, resized by `transform` to 256x256) and divide by
# the image count to get the per-pixel average.
# Iterates `train_images`, the list built for the mean/std computation; the
# previously used name `image_names` is never defined in this notebook.
summed = torch.zeros(3, 256, 256)

for image_name in tqdm(train_images):
    fpath = os.path.join(image_folder, image_name)
    # The context manager closes the file handle once the tensor is built.
    with Image.open(fpath) as img:
        image = transform(img.convert('RGB'))
    summed += image

summed /= len(train_images)

In [None]:
# Average over dim 0 (the 3 channels of `summed`) to get a single 256x256 image.
average_image = summed.mean(dim=0)
average_image.size()

In [None]:
# Show the channel-averaged image in grayscale.
plt.imshow(average_image, cmap='gray')

## Test `IUXRayDataset` class

In [None]:
import matplotlib.pyplot as plt
# from PIL import Image
import numpy as np
from torchvision import transforms

In [None]:
%run ../../utils/common.py

In [None]:
%run ../iu_xray.py

In [None]:
# Test split with masks enabled ('v1'), frontal images only, 512x512 images.
dataset = IUXRayDataset(
    dataset_type='test',
    masks=True,
    masks_version='v1',
    frontal_only=True,
    image_size=(512, 512),
)
len(dataset), len(dataset.word_to_idx)

In [None]:
# Pull one sample and show the sizes of its fields.
item = dataset[100]
image, labels, report = item.image, item.labels, item.report
(
    image.size(),
    labels.size(),
    len(report),
    # False when masks is not a tensor, otherwise its size.
    isinstance(item.masks, torch.Tensor) and item.masks.size(),
)

In [None]:
# Draw the sample image plus one panel per organ mask on a 2x3 grid,
# printing each mask's min/max value along the way.
n_rows, n_cols = 2, 3

plt.figure(figsize=(n_cols*5, n_rows*5))

# Panel 1: the input image, permuted to channel-last for imshow.
# tensor_to_range01 presumably rescales values to [0, 1] — confirm in utils.
plt.subplot(n_rows, n_cols, 1)
plt.imshow(tensor_to_range01(image).permute(1, 2, 0))
plt.title(item.image_fname)
plt.axis('off')

# Panels 2 onwards: one mask per organ, in JSRT_ORGANS order.
for index, organ in enumerate(JSRT_ORGANS):
    mask = item.masks[index] # , 100:200, 100:200

    plt.subplot(n_rows, n_cols, index + 2)
    plt.title(organ)
    plt.imshow(mask)
    plt.axis('off')

    print(organ, mask.min().item(), mask.max().item())

## Inspect tags

In [None]:
from collections import defaultdict

In [None]:
# Count how many reports carry each manual tag.
# `reports` is a dict earlier in this notebook (it is read through `.values()`),
# and iterating a dict directly yields its keys, so `report['tags_manual']`
# would fail. Iterate the values instead; a plain list still works.
report_items = reports.values() if isinstance(reports, dict) else reports

counter = defaultdict(int)
for report in report_items:
    for tag in report['tags_manual']:
        counter[tag] += 1

In [None]:
# Total number of reports, for comparison with the tag counts.
len(reports)

In [None]:
# (tag, count) pairs, most frequent first.
sorted(counter.items(), key=lambda pair: pair[1], reverse=True)

## Get sample reports

For the LatinX in AI workshop

In [None]:
import matplotlib.pyplot as plt

In [None]:
import numpy as np
from pycocoevalcap.bleu import bleu_scorer
from pycocoevalcap.rouge import rouge

In [None]:
%run ../common.py
%run ../iu_xray.py
%run ../../utils/nlp.py
%run ../../utils/__init__.py

In [None]:
# Fixed report text in the tokenized style used in this section (periods as separate tokens).
# NOTE(review): not referenced by any later cell in this section — confirm whether still needed.
CONSTANT_REPORT = """the heart is normal in size . the mediastinum is unremarkable . 
the lungs are clear .
there is no pneumothorax or pleural effusion . no focal airspace disease .
no pleural effusion or pneumothorax ."""

In [None]:
# Whole dataset ('all'), plus a reader built from its vocabulary; the reader's
# idx_to_text is used below to turn token indices back into text.
dataset = IUXRayDataset(dataset_type='all')
report_reader = ReportReader(dataset.get_vocab())
len(dataset)

In [None]:
idx = GT_IDX
item = dataset[idx]
image = arr_to_range(item.image.permute(1, 2, 0))
report_base = report_reader.idx_to_text(item.report)
plt.imshow(image)
plt.axis('off')
print(report_base)

In [None]:
GT_IDX = 7289

In [None]:
# Phrases that must ALL appear in a report for it to be selected.
target = [
    'the cardiac silhouette is enlarged',
    # 'the lungs are hyper',
    # 'the heart is',
]
# Phrases that must ALL be absent from a selected report.
not_target = [
    # 'the lungs are clear',
#     'the mediastinum is unremarkable',
#     'the mediastinum is stable',
#     'the mediastinum is normal',
#     'the mediastinum is within normal limits',
]

found = []
found_names = set()
for idx, entry in enumerate(dataset.reports):
    text = report_reader.idx_to_text(entry['tokens_idxs'])
    # Skip reports that miss a required phrase or contain an excluded one.
    if any(t not in text for t in target) or any(t in text for t in not_target):
        continue
    name = entry['filename']
    # Keep only the first matching entry per filename.
    if name not in found_names:
        found_names.add(name)
        found.append((idx, text))
len(found)

In [None]:
# Inspect one (dataset index, report text) match; raises IndexError with fewer than 6 matches.
found[5]

In [None]:
# Draft "generated" report; the commented lines below are alternative lung sentences.
# NOTE(review): 'enlarged.' has no space before the period, unlike the ' . ' convention
# used by the other report strings in this section — confirm whether that is intended.
gen = 'the heart is enlarged. the mediastinum is unremarkable . the lungs are hyperinflated with mildly coarsened interstitial markings . '
# the lungs are hyperexpanded
# the lungs are hyperinflated with mildly coarsened interstitial markings
# the lungs are hyperinflated with biapical pleural-parenchymal scarring and upward retraction of the xxxx

In [None]:
def measure_bleu_rouge(gen, gt):
    """Print BLEU-1..4, their mean, and ROUGE-L for one generated report.

    Args:
        gen: generated report text (the hypothesis).
        gt: ground-truth report text, used as the single reference.

    Returns:
        None; the scores are only printed.
    """
    bleu = bleu_scorer.BleuScorer(n=4)
    bleu += (gen, [gt])
    bleu_1_4, _ = bleu.compute_score()

    rouge_l = rouge.Rouge().calc_score([gen], [gt])

    print('BLEU 1-4: ', bleu_1_4)
    # Single BLEU figure: plain mean of the four n-gram scores.
    print('BLEU: ', np.mean(bleu_1_4))
    print('ROUGE-L: ', rouge_l)

In [None]:
# Two near-paraphrases, scored with report_1 as the generated text and report_2 as the reference.
report_1 = """the heart is normal in size . the mediastinum is unremarkable . 
the lungs are clear ."""
report_2 = """the heart is normal . the mediastinum is otherwise unremarkable . 
lungs are both clear ."""
measure_bleu_rouge(report_1, report_2)

In [None]:
# Text of the report stored for the GT_IDX sample.
report = report_reader.idx_to_text(dataset[GT_IDX].report)
report

In [None]:
# Hand-written ground-truth text, one sentence per line; the commented
# lines below are alternative phrasings.
gt = """the cardiac silhouette is enlarged .
the lungs are hyperexpanded with flattening of the bilateral hemidiaphragms .
no pneumothorax or pleural effusion ."""
# the lungs are hyperinflated with mildly coarsened interstitial markings .
# with flattening of the bilateral hemidiaphragms 

In [None]:
# Hand-written "generated" text: states normal findings where gt states abnormal ones.
gen = """the cardiac silhouette is normal in size .
the lungs are clear .
no pneumothorax or pleural effusion ."""

In [None]:
# Score the current gen against the current gt.
measure_bleu_rouge(gen, gt)

In [None]:
# Same comparison with the texts on a single line (spaces instead of newlines);
# this gen additionally ends its first sentence with 'and configuration'.
gt = "the cardiac silhouette is enlarged . the lungs are hyperexpanded with flattening of the bilateral hemidiaphragms . no pneumothorax or pleural effusion ."
gen = "the cardiac silhouette is normal in size and configuration . the lungs are clear . no pneumothorax or pleural effusion ."
measure_bleu_rouge(gen, gt)

In [None]:
# Re-score whatever gen / gt are currently bound to.
measure_bleu_rouge(gen, gt)

## Check no-findings vs labels==0

In [None]:
from collections import defaultdict

In [None]:
# CSV files with the reports and their CheXpert / MIRQI labels.
chexpert_path = os.path.join(REPORTS_DIR, 'reports_with_chexpert_labels.csv')
mirqi_path = os.path.join(REPORTS_DIR, 'reports_with_mirqi_labels.csv')

In [None]:
# Load the CheXpert-labelled reports and remap -1 -> 1 and -2 -> 0 across the frame
# (presumably uncertain -> positive and unmentioned -> negative — TODO confirm).
chexpert_df = (
    pd.read_csv(chexpert_path, index_col=0)
    .replace(-1, 1)
    .replace(-2, 0)
)
chexpert_df.head()

In [None]:
# Load the MIRQI-labelled reports, drop the generated-side and score columns,
# rename the ground-truth attributes column, and apply the same
# -1 -> 1 / -2 -> 0 remapping as for the CheXpert frame.
mirqi_df = (
    pd.read_csv(mirqi_path, index_col=0)
    .drop(columns=['attributes-gen', 'MIRQI-r', 'MIRQI-p', 'MIRQI-f'])
    .rename(columns={'attributes-gt': 'attributes'})
    .replace(-1, 1)
    .replace(-2, 0)
)
mirqi_df.head()

In [None]:
# Every column that is not one of the base columns is treated as a MIRQI label.
base_columns = {'filename', 'Reports', 'attributes'}
MIRQI_LABELS = [col for col in mirqi_df.columns if col not in base_columns]

In [None]:
# Row counts of both label tables, to compare before merging.
len(chexpert_df), len(mirqi_df)

In [None]:
# Join both label tables on filename (pandas' default inner join); columns
# present in both frames get the '_chx' / '_mirqi' suffixes.
df = pd.merge(
    chexpert_df,
    mirqi_df,
    on='filename',
    suffixes=['_chx', '_mirqi'],
)
print(len(df))
df.head()

In [None]:
# Bucket the CheXpert rows by how 'No Finding' relates to the other labels:
#   'no-findings-1'      No Finding == 1
#   'inconsistent'       No Finding == 1 but some other label is non-zero
#   'no-findings-absent' No Finding != 1 and every other label is zero
#   'no-1s'              no label at all equals 1
reports_by_condition = defaultdict(set)

for index, row in chexpert_df.iterrows():
    labels = row[CHEXPERT_LABELS]
    tup = (index, row['filename'], row['Reports'])

    # labels[1:-1] drops the first and last label
    # (original note: exclude no-findings and support-devices).
    other_findings = any(value != 0 for value in labels[1:-1])

    if labels['No Finding'] == 1:
        reports_by_condition['no-findings-1'].add(tup)
        if other_findings:
            reports_by_condition['inconsistent'].add(tup)
    elif not other_findings:
        reports_by_condition['no-findings-absent'].add(tup)

    if all(value != 1 for value in labels):
        reports_by_condition['no-1s'].add(tup)

[(name, len(group)) for name, group in reports_by_condition.items()]

In [None]:
# Peek at a few rows of the 'no-findings-absent' bucket (a set, so the order is arbitrary).
l = list(reports_by_condition['no-findings-absent'])
l[:5]

In [None]:
# MIRQI labels of one report, selecting rows and label columns in a single .loc call.
mirqi_df.loc[mirqi_df['filename'] == '256.xml', MIRQI_LABELS]

In [None]:
# Peek at a few rows of the 'no-1s' bucket (a set, so the order is arbitrary).
l = list(reports_by_condition['no-1s'])
l[:10]

In [None]:
# Peek at a few rows of the 'no-findings-1' bucket (a set, so the order is arbitrary).
l = list(reports_by_condition['no-findings-1'])
l[:10]