# Imports

In [None]:
import os
from collections import Counter

In [None]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)

In [None]:
import pandas as pd
pd.options.display.max_columns = None

In [None]:
%run ../utils/__init__.py
config_logging(logging.INFO)

In [None]:
%run ../datasets/common/constants.py

In [None]:
from medai.datasets.iu_xray import DATASET_DIR as IU_DIR

# Utils

In [None]:
ACTUAL_DISEASES = CHEXPERT_DISEASES[1:]
ACTUAL_DISEASES

In [None]:
def collect_for_disease(df, target_disease, remove_other=True, column='text'):
    """Group the texts of `df` by their label for `target_disease`.

    Args:
        df: DataFrame with one label column per disease plus a text column.
        target_disease: name of the label column to group by.
        remove_other: if True, keep only the rows where every other disease
            listed in ACTUAL_DISEASES is labeled -2.
        column: name of the column holding the text to collect.

    Returns:
        A Series indexed by each distinct label of `target_disease`; every
        value is the list of texts with that label, shortest text first.
    """
    only_df = df

    if remove_other:
        # Keep only sentences that do not mention other diseases
        other_diseases = list(ACTUAL_DISEASES)
        other_diseases.remove(target_disease)
        only_df = only_df.loc[(only_df[other_diseases] == -2).all(axis=1)]

    grouped = only_df.groupby(target_disease)[column].apply(
        lambda texts: sorted(texts, key=len),
    )
    # Series.iteritems() was removed in pandas 2.0; items() is the replacement
    print([(valuation, len(sentences)) for valuation, sentences in grouped.items()])

    return grouped

# Analyze sentence vs diseases

In [None]:
fpath = os.path.join(IU_DIR, 'reports', 'sentences_with_extra_info.csv')
df = pd.read_csv(fpath)
print(len(df))
df.head(2)

In [None]:
TOTAL_SENTENCES = len(df)
TOTAL_APPEARANCES = df['appearances'].sum()
TOTAL_SENTENCES, TOTAL_APPEARANCES

## Utils

In [None]:
def remove_non_covered_info(df):
    """Return the rows of `df` that are neither obfuscated nor time-related.

    Only rows whose 'obfuscated' and 'time' columns are both False are kept.
    A missing column is reported with a printed message and that filter is
    skipped. The input frame is not modified.
    """
    def _keep_rows_with_value(d, key, value):
        # Check the frame actually being filtered, not the enclosing `df`
        if key not in d.columns:
            print(f'Key not found in df: {key}')
            return d
        return d.loc[d[key] == value]

    for flag_column in ('obfuscated', 'time'):
        df = _keep_rows_with_value(df, flag_column, False)
    return df

In [None]:
def collect_sentences_for_disease(target_disease, remove_other=True, remove_useless_info=True):
    """Group the notebook-level `df` sentences by their `target_disease` label.

    When `remove_useless_info` is set, `df` is first passed through
    remove_non_covered_info. The grouping itself is delegated to
    collect_for_disease, reading the text from the 'sentence' column.
    """
    source_df = remove_non_covered_info(df) if remove_useless_info else df
    return collect_for_disease(source_df, target_disease, remove_other, column='sentence')

In [None]:
def mentions_any_term(sentence, terms, whole_word=False):
    """Return True if `sentence` mentions at least one of `terms`.

    Args:
        sentence: text to inspect.
        terms: iterable of strings to look for.
        whole_word: when False (default) a term matches anywhere in the
            sentence, so short terms also hit inside longer words
            (e.g. 'now' matches 'known'). When True the term must be
            delimited by word boundaries.

    Returns:
        bool, False when `terms` is empty.
    """
    if not whole_word:
        return any(term in sentence for term in terms)

    import re  # local import: only the whole-word mode needs it

    return any(
        re.search(r'\b' + re.escape(term) + r'\b', sentence) is not None
        for term in terms
    )

In [None]:
def new_column_mentions_any_term(sentences, terms):
    """Build a list with one mentions_any_term result per sentence, in order."""
    flags = []
    for sentence in sentences:
        flags.append(mentions_any_term(sentence, terms))
    return flags

In [None]:
def print_subdf_stats(subdf, name, full_df=None):
    """Print how many sentences and appearances of `full_df` fall in `subdf`.

    Args:
        subdf: subset to report on; needs an 'appearances' column.
        name: label printed as the header line.
        full_df: frame used as the 100% reference; needs an 'appearances'
            column. Defaults to the notebook-level `df`, looked up when the
            function is called.
    """
    if full_df is None:
        # Resolve at call time: a `full_df=df` default would stay bound to
        # whatever `df` was when this cell ran, even after `df` is reloaded.
        full_df = df

    total_sentences = len(full_df)
    total_appearances = full_df['appearances'].sum()

    n_sent = len(subdf)
    n_appear = subdf['appearances'].sum()

    # Guard the ratios so an empty reference frame reports 0% instead of failing
    perc_sent = n_sent / total_sentences * 100 if total_sentences else 0.0
    perc_appear = n_appear / total_appearances * 100 if total_appearances else 0.0
    print(f'{name}:')
    print(f'\tsentences={n_sent:,}/{total_sentences:,} ({perc_sent:.2f}%)')
    print(f'\tappearances={n_appear:,}/{total_appearances:,} ({perc_appear:.2f}%)')

## Annotate non-covered info

TODO: other non-covered info:

* Non-disease descriptive info: e.g. 'ap and lateral view of the chest .'

### Obfuscated sentences

Contain xxxx

In [None]:
def contains_obfuscated(sentence):
    """Tell whether `sentence` includes the 'xxxx' obfuscation marker."""
    marker = 'xxxx'
    return marker in sentence

In [None]:
# Flag every sentence that contains the obfuscation marker
df['obfuscated'] = list(map(contains_obfuscated, df['sentence']))
df.head()

In [None]:
obf_df = df.loc[df['obfuscated'] == True]
print_subdf_stats(obf_df, 'Obfuscated')

In [None]:
l = list(obf_df['sentence'])
l

### Time-related sentences

"shown again", "given history", etc

In [None]:
# Terms that mark a sentence as time-related (comparison with earlier exams,
# patient history, changes over time).
_TIME_MENTIONS = {
    # change / no change
    'unchanged', 'improved', 'no change', 'interval',
    'persistent', 'remain', 'stable',
    # references to earlier exams or history
    'given history', 'previous', 'prior',
    'with prior', 'prior exam', 'prior study',
    'consistent with prior', 'compared to prior', 'from the prior',
    'as before',
    # removed items
    'has been removed', 'have been removed',
    # temporal adverbs
    'now', 'again',
}

In [None]:
l = list(df[df['sentence'].str.contains('again')]['sentence'])
len(l), l

In [None]:
df['time'] = new_column_mentions_any_term(df['sentence'], _TIME_MENTIONS)
df.head()

In [None]:
time_df = df.loc[df['time'] == True]
print_subdf_stats(time_df, 'Time')

### Lung adjectives and extra information

In [None]:
# Term lists used to flag sentences that carry extra descriptive detail.
# Each list feeds one 'lung-*' column.

# Qualifiers describing how a finding looks
ADJECTIVES = [
    'mild', 'subsegmental', 'streaky', 'minimal',
    # 'decreased', 'increased',
    'scattered', 'calcified', 'discrete', 'poorly defined',
    'diffuse', 'patchy', 'vague', 'bandlike',
    'mildly', 'prominent',
    'subtle', 'asymmetric', 'strandy', 'shaped', 'rotated',
    'irregular', 'coarse', 'residual', 'maximal thickness',
    'thin', 'resolved', 'smooth',
]

# Where the finding is located
LOCATION = [
    'basilar', 'bibasilar', 'medial', 'bilateral', 'basal', 'bilaterally',
    'right', 'left', 'midlung', 'lung base', 'near the', 'upper lung',
    'lobe',
    'hilum', 'perihilar', 'cavitary', 'rib', 'periphery', 'lingular',
    'biapical', 'apical', 'apex', 'apices', 'interstitial', 'alveolar',
]

# How many findings
AMOUNTS = [
    'innumerable',
    'multiple',
    'three',
    'a few',
]

# How big the finding is
SIZE = [
    'NUMBER',
    'large',
    'small',
    'moderate sized',
    'width',
    'diameter',
]

# Comparisons between findings, e.g. right larger than left
COMPARISON = [
    'than',
]

In [None]:
sentences = df['sentence']

# One boolean column per kind of detail, added in this order
for column_name, column_terms in (
    ('lung-adj', ADJECTIVES),
    ('lung-loc', LOCATION),
    ('lung-amount', AMOUNTS),
    ('lung-size', SIZE),
    ('lung-compare', COMPARISON),
):
    df[column_name] = new_column_mentions_any_term(sentences, column_terms)
df.head()

In [None]:
cols = [c for c in df.columns if c.startswith('lung-')]
df['lung-any-detail'] = df[cols].any(axis=1)
df.head()

In [None]:
LUNG_DISEASES = CHEXPERT_DISEASES[3:-2]

In [None]:
d = df.loc[((df[LUNG_DISEASES] == 1) | (df[LUNG_DISEASES] == -1)).any(axis=1)]
df_detail = d.loc[(d['lung-any-detail'] == True)]
print_subdf_stats(df_detail, 'Lung details (out of positive sentences)', d)

### Save to file

With extra info

In [None]:
fpath = os.path.join(IU_DIR, 'reports', 'sentences_with_extra_info.csv')
df.to_csv(fpath, index=False)

## Heart

In [None]:
grouped = collect_sentences_for_disease('Enlarged Cardiomediastinum')

In [None]:
grouped[1]

In [None]:
grouped = collect_sentences_for_disease('Enlarged Cardiomediastinum')

In [None]:
grouped[1]

## Lungs

In [None]:
grouped = collect_sentences_for_disease('Lung Lesion')

In [None]:
[
    s
    for s in grouped[1]
    if mentions_any_term(s, AMOUNTS)
]

In [None]:
[s for s in grouped[1] if all(k not in s for k in ('right', 'left', 'apic',
                                                   'bilateral', 'bibasilar',
                                                   'interstitial', 'perihilar'))]

In [None]:
grouped = collect_sentences_for_disease('Pleural Other', True, True)

In [None]:
grouped[1]

## Others

Fracture and devices

In [None]:
grouped = collect_sentences_for_disease('Fracture')

In [None]:
grouped[1]

## Sentences with more than one disease

In [None]:
many_diseases_df = df.loc[((df[ACTUAL_DISEASES] == 1) | (df[ACTUAL_DISEASES] == -1)).sum(axis=1) > 1]
many_diseases_df = many_diseases_df.sort_values('sentence', key=lambda x: x.str.len())
print(len(many_diseases_df))
many_diseases_df.head(1)

In [None]:
print_subdf_stats(many_diseases_df, 'More than 1 disease')

In [None]:
l = list(many_diseases_df['sentence'])
l

## Sentences not covered by chexpert

### Including NF

In [None]:
not_covered_df = df.loc[(df[CHEXPERT_DISEASES] == -2).all(axis=1)]
not_covered_df = not_covered_df.sort_values('sentence', key=lambda x: x.str.len())
print(len(not_covered_df))
print_subdf_stats(not_covered_df, 'Non covered including NF')
not_covered_df.head(2)

In [None]:
l1 = list(not_covered_df['sentence'])
l1

### Not including NF

In [None]:
d = df.loc[(df[ACTUAL_DISEASES] == -2).all(axis=1)]
d = d.loc[d['No Finding'] != -2]
d = d.sort_values('sentence', key=lambda x: x.str.len())
print(len(d))
print_subdf_stats(d, 'Non covered diseases')
d.head(2)

In [None]:
Counter(d['No Finding'])

In [None]:
l2 = list(d['sentence'])
len(l2), l2

# Compare with expert-given-procedure

## Common stuff

In [None]:
fpath = os.path.join(IU_DIR, 'reports', 'sentences_with_extra_info.csv')
SENTENCES_DF = pd.read_csv(fpath)
SENTENCES_DF.rename(columns={'sentence': 'text'}, inplace=True)
SENTENCES_DF = SENTENCES_DF.sort_values('text', key=lambda x: x.str.len())

fpath = os.path.join(IU_DIR, 'reports', 'reports_with_chexpert_labels.csv')
REPORTS_DF = pd.read_csv(fpath, index_col=0)
REPORTS_DF.rename(columns={'Reports': 'text'}, inplace=True)
REPORTS_DF = REPORTS_DF.sort_values('text', key=lambda x: x.str.len())
REPORTS_DF.reset_index(drop=True, inplace=True)

len(SENTENCES_DF), len(REPORTS_DF)

## Inspect

### Get sentences with text

In [None]:
d = SENTENCES_DF
# d = d.loc[(d['text'].str.contains(r'atrial') & d['text'].str.contains('large'))]
d = d.loc[d['text'].str.contains(r'mass')]
len(d), d['appearances'].sum()

In [None]:
list(d['text'])

### Group sentences by chexpert disease eval

In [None]:
g = collect_for_disease(SENTENCES_DF, 'Lung Lesion')

In [None]:
g[1]

# Check sentence positions

Try to find the best position in the report at which to mention each disease

In [None]:
import json
from collections import defaultdict, Counter
from tqdm.auto import tqdm
import math

In [None]:
%run ../utils/nlp.py
%run ../datasets/common/constants.py

In [None]:
with open(os.path.join(IU_DIR, 'reports', 'reports.clean.v4.json'), 'r') as f:
    reports_raw = list(json.load(f).values())
reports = [r['clean_text'] for r in reports_raw]
len(reports)

In [None]:
fpath = os.path.join(IU_DIR, 'reports', 'sentences_with_extra_info.csv')
df = pd.read_csv(fpath)
# df.replace(-2, 0, inplace=True)
# df.replace(-1, 1, inplace=True)
print(len(df))
df.head(2)

In [None]:
# Map every sentence to its list of labels, ordered like CHEXPERT_DISEASES.
# The labels include negative values (-2 and -1 are used in this notebook), so
# a signed dtype is required: np.uint8 cannot represent them, and the later
# `value in (0, 1, -1)` check would then never see a -1.
label_rows = df[CHEXPERT_DISEASES].to_numpy().astype(np.int8).tolist()

# For a repeated sentence the last row wins, as with a row-by-row dict build
sentence_to_chexpert_labels = dict(zip(df['sentence'], label_rows))
len(sentence_to_chexpert_labels)

In [None]:
# Sentences missing from the labels table, kept for inspection
not_found = []
# (disease, sentence position within its report, label) triples
positions_and_disease = []

for report in reports:
    for sentence_position, sentence in enumerate(split_sentences_text(report)):
        chex = sentence_to_chexpert_labels.get(sentence)
        if chex is None:
            not_found.append(sentence)
            continue

        # Record only the labels 0, 1 and -1
        positions_and_disease.extend(
            (disease, sentence_position, value)
            for disease, value in zip(CHEXPERT_DISEASES, chex)
            if value in (0, 1, -1)
        )
len(positions_and_disease)

In [None]:
# Index the collected triples both ways: disease -> positions, position -> diseases
positions_by_disease = defaultdict(list)
diseases_by_position = defaultdict(list)
for disease_name, sentence_pos, _label in positions_and_disease:
    positions_by_disease[disease_name].append(sentence_pos)
    diseases_by_position[sentence_pos].append(disease_name)
len(diseases_by_position), len(positions_by_disease)

In [None]:
# One histogram of sentence positions per disease, laid out on a 4-column grid
n_cols = 4
n_subplots = len(CHEXPERT_DISEASES)
n_rows = math.ceil(n_subplots / n_cols)
plt.figure(figsize=(5 * n_cols, 5 * n_rows))

for subplot_number, disease in enumerate(CHEXPERT_DISEASES, start=1):
    plt.subplot(n_rows, n_cols, subplot_number)
    plt.title(disease)
    plt.hist(positions_by_disease[disease])
    plt.ylabel('Amount')
    plt.xlabel('Position')

In [None]:
shorten_name = {
    'Enlarged Cardiomediastinum': 'EC', 'Cardiomegaly': 'Ca',
    'Lung Lesion': 'LL', 'Lung Opacity': 'LO',
    'Edema': 'E', 'Consolidation': 'Co',
    'Pneumonia': 'Pm', 'Atelectasis': 'A', 'Pneumothorax': 'Pt',
    'Pleural Effusion': 'PE', 'Pleural Other': 'PO',
    'Fracture': 'F', 'Support Devices': 'SD',
}

In [None]:
# Bar plot per sentence position: how often each disease is mentioned there.
max_position = max(diseases_by_position.keys())
disease_to_index = {
    disease: index
    for index, disease in enumerate(CHEXPERT_DISEASES)
}

# The first entry of CHEXPERT_DISEASES ('No Finding') is left out of the plots
chosen_diseases = CHEXPERT_DISEASES[1:]
chosen_diseases_short = [shorten_name[d] for d in chosen_diseases]

# Positions run from 0 to max_position inclusive, one subplot each
n_subplots = max_position + 1
n_cols = 2
n_rows = math.ceil(n_subplots / n_cols)
plt.figure(figsize=(8*n_cols, 5*n_rows))

for position in range(n_subplots):
    # .get() avoids inserting empty entries into the defaultdict
    counts = Counter(diseases_by_position.get(position, []))

    # Count by disease name. An equal-width histogram over the disease indices
    # spans only the observed min..max, so its bins stop lining up with the
    # disease names whenever the first or last disease is absent at a position.
    amounts_and_disease_names = sorted(
        ((counts[d], shorten_name[d]) for d in chosen_diseases),
        reverse=True,
    )
    amounts, short_names = tuple(zip(*amounts_and_disease_names))

    plt.subplot(n_rows, n_cols, position + 1)
    plt.title(position, fontsize=20)
    plt.bar(short_names, amounts)
    plt.ylabel('Amount')
    plt.xticks(fontsize=16) # rotation=90

In [None]:
# Mean sentence position per disease, as (mean, disease) pairs sorted ascending
position_totals = Counter()
appearances_by_disease = Counter()
for disease, position, _label in positions_and_disease:
    position_totals[disease] += position
    appearances_by_disease[disease] += 1

avg_by_disease = sorted(
    (total / appearances_by_disease[disease], disease)
    for disease, total in position_totals.items()
)
avg_by_disease

In [None]:
_, diseases_ordered = tuple(zip(*avg_by_disease))
diseases_ordered

# Debug RG-templates model

In [None]:
import torch

In [None]:
%run ../datasets/vocab/__init__.py
%run ../utils/nlp.py

In [None]:
vocab = load_vocab('iu_xray')
len(vocab)

In [None]:
report_reader = ReportReader(vocab)

In [None]:
%run ../models/report_generation/templates/__init__.py

In [None]:
ORDER = (
        'Cardiomegaly',
        'Enlarged Cardiomediastinum',
        'Consolidation',
        'Lung Opacity',
        'Atelectasis',
        'Support Devices',
        'Pleural Effusion',
        'Pleural Other',
        'Pneumonia',
        'Pneumothorax',
        'Edema',
        'Lung Lesion',
        'Fracture',
    )

In [None]:
model = create_rg_template_model('chex-v1', ACTUAL_DISEASES, vocab, order=ORDER)
model

In [None]:
labels = torch.tensor([[1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1],
                       [1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0],
                      ]).long()
labels.size()

In [None]:
reports = model(labels)
[
    report_reader.idx_to_text(r)
    for r in reports
]

# Debug chexpert-labeler

Check that made-up sentences evaluate correctly with chexpert

In [None]:
%run ../metrics/report_generation/chexpert.py

In [None]:
sentences = [
#     'there are pulmonary nodules or mass identified',
#     'one or more airspace opacities can be seen',
#     'pulmonary edema is seen',
#     'there is focal consolidation',
#     'there is evidence of pneumonia',
#     'no atelectasis',
#     'pleural effusion is seen',
#     'pleural thickening is present',
#     'a fracture is identified',
    'a device is seen',
]

In [None]:
temp_df = pd.DataFrame(sentences, columns=['s'])

In [None]:
labels = apply_labeler_to_column(temp_df, 's')
labels