## Imports

In [None]:
import torch
import os
import json
import matplotlib.pyplot as plt

In [None]:
import matplotlib
# Use a white figure background for every matplotlib figure in this notebook.
matplotlib.rcParams['figure.facecolor'] = 'white'

In [None]:
# Execute the dataset script in this namespace. DATASET_DIR (used below) is not
# defined in any visible cell; presumably it comes from this script — TODO confirm.
%run ../iu_xray.py

In [None]:
# Directory holding the report files read and written by the cells below.
REPORTS_DIR = os.path.join(DATASET_DIR, 'reports')

## Apply chexpert labeler to reports

In [None]:
# `-n` keeps __name__ from being '__main__' while the script runs, so only its
# definitions are loaded. The `_apply_labeler_to_column` and `_concat_df_matrix`
# helpers used below presumably come from this script — TODO confirm.
%run -n ../../eval_report_generation_chexpert_labeler.py

In [None]:
# Load the cleaned reports. The JSON is a mapping; only its values (the report
# records) are kept.
fname = os.path.join(REPORTS_DIR, 'reports.clean.json')
with open(fname, 'r') as f:
    clean_reports = list(json.load(f).values())
len(clean_reports)

In [None]:
# Index the cleaned text of every report by its filename.
reports_by_filename = {
    r['filename']: r['clean_text']
    for r in clean_reports
}
len(reports_by_filename)

In [None]:
# Build a one-column frame with one row per report text.
# Passing the {filename: text} dict directly would make pandas treat every
# filename as a column name; `columns=['reports']` would then select none of
# them and the resulting frame would be empty.
df_reports = pd.DataFrame(list(reports_by_filename.values()), columns=['reports'])
print(len(df_reports))
df_reports.head()

In [None]:
%%time

# Run the labeler helper over the 'reports' column. Judging by the keyword
# names, -2 presumably marks labels left empty and -1 uncertain ones — TODO
# confirm against the helper's definition.
labels = _apply_labeler_to_column(df_reports, 'reports',
                                  fill_empty=-2, fill_uncertain=-1)
labels.shape

In [None]:
# Combine the reports frame with the label matrix via the helper
# (presumably one column per label — TODO confirm).
df_reports = _concat_df_matrix(df_reports, labels)
df_reports.head()

## Apply MIRQI labeler to reports

In [None]:
from collections import defaultdict

In [None]:
# Load the MIRQI evaluation script's definitions; `_apply_mirqi_to_df` used
# below presumably comes from it — TODO confirm.
%run -n ../../eval_report_generation_mirqi.py

In [None]:
# Read the reports CSV (first CSV column becomes the index) and keep only the
# report text and filename columns.
path = os.path.join(REPORTS_DIR, 'reports_with_chexpert_labels.csv')
df = pd.read_csv(path, index_col=0)
df = df[['Reports', 'filename']]
df.head()

In [None]:
%%time

# The same column is passed as both ground truth and generated text, so each
# report is processed against itself.
mirqi_df = _apply_mirqi_to_df(df, gt_col_name='Reports', gen_col_name='Reports')
mirqi_df.head()

In [None]:
# Integer code stored for each valuation string found in an attribute tuple.
valoration_to_int = dict(POSITIVE=1, NEGATIVE=0, UNCERTAIN=-1)

# Attribute tuples that could not be unpacked, grouped by reason.
WRONG_LEN_ATTRIBUTES = defaultdict(list)


def expand_attributes(row):
    """Add one entry to ``row`` per attribute found in ``row['attributes-gt']``.

    The attribute string is split on ``') ('``; each piece has its surrounding
    parentheses stripped and is split on ``'|'``. Pieces with exactly four
    fields ``(text, label, value, additional)`` set ``row[label]`` to the
    integer code of ``value``; any other piece is appended to
    ``WRONG_LEN_ATTRIBUTES['len-not-4']`` and skipped.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with an
            ``'attributes-gt'`` string. It is modified in place.

    Returns:
        The same ``row`` object.

    Raises:
        KeyError: if a value is not a key of ``valoration_to_int``.
    """
    for chunk in row['attributes-gt'].split(') ('):
        fields = chunk.strip('()').split('|')
        if len(fields) == 4:
            _text, label, value, _additional = fields
            row[label] = valoration_to_int[value]
        else:
            WRONG_LEN_ATTRIBUTES['len-not-4'].append(fields)

    return row

In [None]:
# Expand the ground-truth attribute string of every row into label columns.
mirqi_df = mirqi_df.apply(expand_attributes, axis=1)
mirqi_df.head()

In [None]:
# Put the base columns first, followed by every other column in its current order.
base_cols = ['filename', 'Reports', 'attributes-gt', 'attributes-gen', 'MIRQI-r', 'MIRQI-p', 'MIRQI-f']
columns = base_cols + [c for c in mirqi_df if c not in base_cols]
mirqi_df = mirqi_df[columns]
mirqi_df.head()

In [None]:
# Replace every NaN with -2, in place (this covers labels that were never set
# for a row).
# NOTE(review): `np` is not imported in any visible cell; presumably it is
# provided by one of the %run scripts — confirm.
mirqi_df.replace(np.nan, -2, inplace=True)
mirqi_df.head()

In [None]:
# Save the labelled reports (the index is written as the first CSV column).
path = os.path.join(REPORTS_DIR, 'reports_with_mirqi_labels.csv')
mirqi_df.to_csv(path)

## Split sentences

In [None]:
# Load the NLP utilities into this namespace (presumably the source of
# `clean_sentence`, used further down — TODO confirm).
%run ../../utils/nlp.py

In [None]:
from collections import Counter, defaultdict

In [None]:
def split_sentences(report, end_token='.'):
    """Split a whitespace-tokenized report into sentences.

    A sentence is a run of tokens up to and including ``end_token``. If the
    report does not finish with ``end_token``, one is appended so the last
    sentence is terminated like the others.

    Args:
        report: report text whose tokens (including the end token) are
            separated by whitespace.
        end_token: token that closes a sentence.

    Returns:
        List of sentences, each one its tokens joined by single spaces. An
        empty or whitespace-only report yields an empty list.
    """
    words = report.split()
    if not words:
        # Nothing to split; indexing words[-1] below would fail otherwise.
        return []

    if words[-1] != end_token:
        words.append(end_token)

    sentences = []
    current = []
    for word in words:
        current.append(word)
        if word == end_token:
            sentences.append(' '.join(current))
            current = []

    return sentences

In [None]:
# Load the v2 cleaned reports, both as the raw mapping and as a list of its values.
reports_fname = os.path.join(REPORTS_DIR, 'reports.clean.v2.json')
with open(reports_fname, 'r') as f:
    reports_as_dict = json.load(f)
    reports = list(reports_as_dict.values())
len(reports_as_dict), len(reports)

### Count appearances

In [None]:
# Map each distinct sentence to the filenames of the reports containing it.
# A filename is appended once per occurrence, so a sentence repeated inside one
# report contributes that filename several times.
sentence_counter = defaultdict(list)
for report in reports:
    for sentence in split_sentences(report['clean_text']):
        sentence_counter[sentence].append(report['filename'])
len(sentence_counter)

#### Check not-so-common sentences

In [None]:
# Rank sentences by how many times they appear, most frequent first.
# The dict values are lists of filenames, so the key has to be their length;
# comparing the lists themselves would order by filename, not by count.
l = list(sentence_counter.items())
l = sorted(l, key=lambda x: len(x[1]), reverse=True)
l[:20]

#### Check short sentences

In [None]:
# List (sentence, number of appearances) pairs, shortest sentence text first.
sorted([
    (sentence, len(appearances))
    for sentence, appearances in sentence_counter.items()
], key=lambda x: len(x[0]))

In [None]:
# Look up the reports containing one specific sentence.
# sentence_counter is a defaultdict, so a sentence that is absent gets added
# with an empty list by this lookup.
target_sentence = 'hand .'
appearances = sentence_counter[target_sentence]
appearances

In [None]:
# Show the full record(s) of the first report containing the target sentence
# (raises IndexError if the sentence has no appearances).
target_report = appearances[0]
[
    report
    for report in reports
    if report['filename'] == target_report
]

## Label sentences with chexpert labels

In [None]:
# Load the chexpert metric script's definitions; `apply_labeler_to_column`
# used below presumably comes from it — TODO confirm.
# NOTE(review): `_concat_df_matrix` (used below) is not defined in any visible
# cell; it presumably comes from one of the %run scripts — confirm which.
%run -n ../../metrics/report_generation/chexpert.py
# %run -n ../../eval_report_generation_chexpert_labeler.py

In [None]:
# One row per distinct sentence, with its number of appearances.
columns = ['sentences', 'appearances']
df_sentences = pd.DataFrame([
    (sentence, len(appearances))
    for sentence, appearances in sentence_counter.items()
], columns=columns)
print(len(df_sentences))
df_sentences.head()

In [None]:
%%time

# Label every sentence. Judging by the keyword names, -2 presumably marks
# labels left empty and -1 uncertain ones — TODO confirm.
labels = apply_labeler_to_column(df_sentences, 'sentences',
                                 fill_empty=-2, fill_uncertain=-1)
labels.shape

In [None]:
# Combine the sentences frame with the label matrix via the helper.
df_sentences = _concat_df_matrix(df_sentences, labels)
print(len(df_sentences))
df_sentences.head()

In [None]:
# Output path for the labelled sentences.
fpath = os.path.join(REPORTS_DIR, 'sentences_with_chexpert_labels.csv')

In [None]:
# Save without the index column.
df_sentences.to_csv(fpath, index=False)

## Load sentences for later use

In [None]:
import pandas as pd

In [None]:
# Same file written by the labelling section above.
fpath = os.path.join(REPORTS_DIR, 'sentences_with_chexpert_labels.csv')

In [None]:
# Reload the labelled sentences so the following sections can run without
# re-applying the labeler.
df_sentences = pd.read_csv(fpath)
df_sentences.head()

In [None]:
# Number of distinct sentences.
len(df_sentences)

## Count sentences' groups

### Count normal vs abnormal

In [None]:
# Split sentences by their 'No Finding' label: 1 is treated as normal, 0 as
# abnormal. Rows with any other value (if present) fall in neither group.
snt_normal = df_sentences[df_sentences['No Finding'] == 1]
snt_abnormal = df_sentences[df_sentences['No Finding'] == 0]
len(snt_normal), len(snt_abnormal)

In [None]:
# Total appearances of normal sentences.
snt_normal['appearances'].sum()

In [None]:
# Total appearances of abnormal sentences.
snt_abnormal['appearances'].sum()

### Number of abnormal sentences per report

In [None]:
# Look-up table from sentence text to its 'No Finding' label; if a sentence is
# repeated, the last row wins.
is_sentence_normal = {
    row['sentences']: row['No Finding']
    for _, row in df_sentences.iterrows()
}
len(is_sentence_normal)

In [None]:
# For every report, add up `1 - label` over its sentences, giving the number of
# abnormal sentences when every label is 0 or 1.
# NOTE(review): a label outside {0, 1} (e.g. -2) would add more than 1 —
# confirm 'No Finding' only takes those two values.
# A sentence missing from is_sentence_normal raises KeyError (plain dict).
res = []
for report in reports:
    number_of_abnormal = sum(
        1 - is_sentence_normal[sentence]
        for sentence in split_sentences(report['clean_text'])
    )
    res.append(number_of_abnormal)
len(res)

In [None]:
# Histogram of the per-report totals.
plt.hist(res)

### Top-K most common sentences

In [None]:
# Five most frequent sentences.
cols = ['sentences', 'appearances']
df = df_sentences[cols].sort_values('appearances', ascending=False).head(5)
df

### Plot sentences appearances distribution

In [None]:
# Number of distinct sentences.
len(df_sentences)

In [None]:
# Most frequent sentences with all their columns.
df_sentences.sort_values('appearances', ascending=False).head()

In [None]:
# Appearance counts as a plain list (this rebinds `appearances`, which earlier
# cells used for a list of filenames).
appearances = list(df_sentences['appearances'])

In [None]:
# Histogram of appearance counts; the y axis is logarithmic.
plt.hist(appearances, bins=30)
plt.yscale('log')

plt.title('Sentence appearances distribution')

plt.ylabel('Number of sentences')
plt.xlabel('Number of appearances')

In [None]:
# NOTE(review): `values` is not defined in any visible cell — confirm where it
# comes from, or whether this cell is a leftover.
plt.bar(list(range(len(values))), values)

## Collect synonyms

In [None]:
import json
from collections import Counter, defaultdict

In [None]:
# Load the vocab package's definitions; `load_synonyms` and `save_synonyms`
# presumably come from it — TODO confirm.
%run ../vocab/__init__.py

In [None]:
# Start from the synonyms already saved for this dataset...
SYNONYMS = load_synonyms('iu_xray')
len(SYNONYMS)

In [None]:
# ...or from scratch. Running this cell discards whatever the cell above loaded.
SYNONYMS = {}

In [None]:
# Sentences postponed during interactive processing.
FOR_LATER = set()

In [None]:
# Every sentence already present in SYNONYMS, as a representative (key) or as
# one of its synonyms (values).
SEEN_SENTENCES = set()
for representative, syns in SYNONYMS.items():
    SEEN_SENTENCES.add(representative)
    for s in syns:
        SEEN_SENTENCES.add(s)
len(SEEN_SENTENCES)

In [None]:
def is_number(s):
    """Return True when ``int(s)`` succeeds, False otherwise.

    Only conversion failures are treated as "not a number"; other exceptions
    (for example a KeyboardInterrupt at the prompt) are left to propagate.
    """
    try:
        int(s)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
def process_sentences(sentences, skip_later=True):
    """Interactively assign sentences to synonym groups.

    Each sentence is passed through ``clean_sentence`` and, unless it is
    already in ``SEEN_SENTENCES``, shown at an ``input`` prompt. Accepted
    answers:

    * an integer: add the sentence as a synonym of the representative listed
      under that index;
    * ``n``: register the sentence itself as a new representative;
    * ``l``: store the sentence in ``FOR_LATER`` and move on;
    * a comma-separated list: each item is either an index (as above) or
      ``n``, which asks for the text of a new representative;
    * ``b``: stop processing.

    Any other answer is reported and the same sentence is asked again.

    The module-level ``SYNONYMS``, ``SEEN_SENTENCES`` and ``FOR_LATER`` are
    modified in place.

    Args:
        sentences: sequence of sentences to review.
        skip_later: when False, sentences already in ``FOR_LATER`` are skipped
            without prompting; when True (default) they are prompted again.
            NOTE(review): the name suggests the opposite meaning — confirm
            the intended behavior before relying on it.

    Raises:
        ValueError: if an index does not correspond to a listed
            representative, or an item of a comma-separated answer is neither
            an integer nor ``n``.
        RuntimeError: if a listed representative is missing from ``SYNONYMS``
            (internal inconsistency).
    """
    index_to_repr = dict(enumerate(SYNONYMS))

    def _print_reprs():
        print('-'*20)
        for index, representative in index_to_repr.items():
            print(f'{index} - {representative}')

    def _add_new(sentence):
        if sentence in SYNONYMS:
            # Already a representative: keep its synonyms instead of resetting
            # them, and do not list it under a second index.
            SEEN_SENTENCES.add(sentence)
            return
        index_to_repr[len(SYNONYMS)] = sentence
        SYNONYMS[sentence] = []
        SEEN_SENTENCES.add(sentence)

    def _add_as_syn(sentence, option):
        option = int(option)

        if option not in index_to_repr:
            raise ValueError(f'No synonym found for option={option}')
        representative = index_to_repr[option]

        if representative not in SYNONYMS:
            # Internal error: the index map and SYNONYMS went out of sync.
            raise RuntimeError(f'representative {representative} not in SYNS')

        SYNONYMS[representative].append(sentence)
        SEEN_SENTENCES.add(sentence)

    _print_reprs()

    sentence_idx = 0
    while sentence_idx < len(sentences):
        sentence = clean_sentence(sentences[sentence_idx])

        if sentence in SEEN_SENTENCES or (not skip_later and sentence in FOR_LATER):
            sentence_idx += 1
            continue

        option = input(f'"{sentence}" --> ')

        if is_number(option):
            _add_as_syn(sentence, option)
            sentence_idx += 1
        elif option == 'l': # later
            FOR_LATER.add(sentence)
            sentence_idx += 1
        elif ',' in option: # split and allocate
            added_new = False
            for suboption in option.split(','):
                suboption = suboption.strip()
                if suboption == 'n':
                    new_sentence = input('\t\tInput new sentence: ')
                    _add_new(new_sentence)
                    added_new = True
                else:
                    _add_as_syn(sentence, suboption)

            if added_new:
                _print_reprs()

            sentence_idx += 1
        elif option == 'n': # new
            _add_new(sentence)
            _print_reprs()
            sentence_idx += 1
        elif option == 'b':
            print('Breaking')
            break
        else:
            # Index is not advanced, so the same sentence is asked again.
            print(f'Option not recognized: {option}')

In [None]:
# Collect sentences per 'Fracture' label value, take the group under key 0
# (presumably the label value 0 — TODO confirm) and review it interactively,
# shortest sentences first.
some_sentences = df_sentences.groupby('Fracture')['sentences'].apply(list)
some_sentences = sorted(some_sentences[0], key=lambda x: len(x))
process_sentences(some_sentences)

In [None]:
# Persist the synonyms collected so far.
save_synonyms('iu_xray', SYNONYMS)

In [None]:
# NOTE(review): bare name with no visible definition; evaluating it presumably
# raises NameError. Possibly a deliberate stop for "Run All", otherwise a
# leftover — confirm.
o

## Inspect Max amounts

### Max amount of words in a sentence

In [None]:
# (word count, sentence) pairs for every distinct sentence, longest first.
sorted([(len(s.split()), s) for s in sentence_counter], reverse=True)

### Max amount of words in a report

In [None]:
# Largest number of whitespace-separated tokens in a single report.
max(len(report['clean_text'].split()) for report in reports)

### Max amount of sentences

In [None]:
# Largest number of sentences in a single report.
max(len(split_sentences(report['clean_text'])) for report in reports)

## Sentences + organs

In [None]:
from collections import namedtuple

In [None]:
# Bundle of the three per-sentence labelling callables, so the two approaches
# below can be swapped as a unit.
OrganLabeler = namedtuple('OrganLabeler',
                          ['mentions_other', 'mentions_heart', 'mentions_lungs'])

In [None]:
df_sentences.head()

### Attempt 1: Use chexpert-labels

#### Heart rules

In [None]:
# Sentences grouped by each combination of the two heart-related labels.
grouped_heart = df_sentences.groupby([
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
])['sentences'].apply(list)
grouped_heart.head()

In [None]:
# Sentences with 'Enlarged Cardiomediastinum' == -2 and 'Cardiomegaly' == 0.
grouped_heart[(-2.0, 0)]

In [None]:
heart_col1, heart_col2 = 'Enlarged Cardiomediastinum', 'Cardiomegaly'


def mentions_heart(sample):
    """Return 1 if either heart-related label of ``sample`` differs from -2, else 0.

    Args:
        sample: mapping-like row holding both heart label columns.
    """
    both_empty = sample[heart_col1] == -2 and sample[heart_col2] == -2
    return 0 if both_empty else 1

In [None]:
# Spot-check the heart rule on one row (picked by position).
sample = df_sentences.iloc[30]
sample['sentences'], mentions_heart(sample)

#### Lungs rules

In [None]:
from collections import defaultdict
import re

In [None]:
# Label columns treated as lung-related by the cells below.
lungs_cols = [
    'Lung Lesion',
    'Lung Opacity',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
]

In [None]:
# Group sentences by whether any lung-related label is set (differs from -2).
# The mask is computed once for the whole frame instead of doing a `.loc`
# lookup per row inside the group-key function.
has_lung_label = (df_sentences[lungs_cols] != -2).any(axis=1)
grouped_lungs = df_sentences.groupby(has_lung_label)['sentences'].apply(list)
grouped_lungs.head()

In [None]:
# Sentences with at least one lung-related label set.
lungs_appear = grouped_lungs[True]
len(lungs_appear)

In [None]:
# Side keywords. Each pattern is anchored at the start of a word so that
# "right" is not found inside "upright" nor "left" inside "cleft"; suffixes
# ("bilaterally", "rightward") still match.
PATTERN_BOTH = re.compile(r'\b(?:both|bilateral)')
PATTERN_RIGHT = re.compile(r'\bright')
PATTERN_LEFT = re.compile(r'\bleft')

In [None]:
# Bucket name for every combination of (left, right, both) keyword hits.
_BUCKET_BY_FLAGS = {
    (True, True, True): 'all',
    (True, True, False): 'left-right',
    (True, False, True): 'both-left',
    (False, True, True): 'both-right',
    (False, False, True): 'both',
    (False, True, False): 'right',
    (True, False, False): 'left',
    (False, False, False): 'none',
}

# Sentences with lung labels, bucketed by the side keywords they contain.
by_lung = defaultdict(list)
for sentence in lungs_appear:
    flags = tuple(
        pattern.search(sentence) is not None
        for pattern in (PATTERN_LEFT, PATTERN_RIGHT, PATTERN_BOTH)
    )
    by_lung[_BUCKET_BY_FLAGS[flags]].append(sentence)

[(k, len(g)) for k, g in by_lung.items()]

In [None]:
# Sentences that matched only the "left" keyword.
by_lung['left']

In [None]:
def mentions_lungs(sample):
    """Return ``(left, right)`` 0/1 flags telling which lungs a sentence refers to.

    * ``(0, 0)`` when every column in ``lungs_cols`` equals -2.
    * ``(1, 1)`` when the sentence matches ``PATTERN_BOTH``, or matches neither
      ``PATTERN_LEFT`` nor ``PATTERN_RIGHT``.
    * Otherwise one flag per side keyword found.

    Args:
        sample: row holding the ``lungs_cols`` labels and a ``'sentences'`` text.
    """
    if not any(label != -2 for label in sample[lungs_cols]):
        return 0, 0

    text = sample['sentences']
    has_left = PATTERN_LEFT.search(text) is not None
    has_right = PATTERN_RIGHT.search(text) is not None

    # An explicit "both" keyword, or no side keyword at all, means both lungs.
    if PATTERN_BOTH.search(text) or not (has_left or has_right):
        return 1, 1

    return int(has_left), int(has_right)

In [None]:
# Example sentences for spot-checking the lungs rule; the number under each one
# is presumably its row position in df_sentences — TODO confirm.
# s = 'there is a 1 cm nodular opacity in the right costophrenic xxxx , increased since comparison examination .'
# 3523

# s =  'there is focal airspace disease in the right lung base concerning for pneumonia or aspiration .'
# 4777

s = 'left basilar opacity compatible pleural effusion and atelectasis .'
# 6058
df_sentences[df_sentences['sentences'] == s]

In [None]:
# Apply the lungs rule to the row at that position.
sample = df_sentences.iloc[6058]
sample['sentences'], mentions_lungs(sample)

#### Other rules (background, bones)

In [None]:
# Label columns treated as "other" (neither heart nor lungs) by the cells below.
other_cols = [
    'Fracture',
    'Support Devices',
]

In [None]:
# Group sentences by whether any "other" label is set (differs from -2).
# The mask is computed once for the whole frame instead of doing a `.loc`
# lookup per row inside the group-key function.
has_other_label = (df_sentences[other_cols] != -2).any(axis=1)
grouped_other = df_sentences.groupby(has_other_label)['sentences'].apply(list)
grouped_other.head()

In [None]:
# Sentences with / without any "other" label set.
other_present = grouped_other[True]
other_absent = grouped_other[False]
len(other_present), len(other_absent)

In [None]:
other_present

In [None]:
def mentions_other(sample):
    """Return 1 if any column in ``other_cols`` differs from -2 for ``sample``, else 0."""
    for label in sample[other_cols]:
        if label != -2:
            return 1
    return 0

In [None]:
# Find a known sentence to spot-check the "other" rule.
s = 'no displaced rib fracture visualized .'
# 250
df_sentences[df_sentences['sentences'] == s]

In [None]:
# Apply the "other" rule to the row at position 250.
sample = df_sentences.iloc[250]
sample['sentences'], mentions_other(sample)

#### Gather chexpert-label-based OrganLabeler

In [None]:
# Labeler built from the chexpert-label rules defined above.
chexpert_organ_labeler = OrganLabeler(
    mentions_other=mentions_other,
    mentions_heart=mentions_heart,
    mentions_lungs=mentions_lungs,
)

### Attempt 2: Regex-based

#### MIRQI phrases loader

In [None]:
import os
import re

In [None]:
# Absolute path of the MIRQI phrases directory, resolved relative to the
# current working directory.
MIRQI_DIR = os.path.abspath('../../../../software/MIRQI/predefined/phrases')

In [None]:
def load_phrases(label, mention=True):
    """Load the phrases listed for ``label`` under ``MIRQI_DIR``.

    Reads ``<MIRQI_DIR>/mention/<label>.txt`` (or ``unmention/`` when
    ``mention`` is False), one phrase per line. Surrounding whitespace is
    stripped and underscores are replaced by spaces.

    Blank lines are dropped: callers join the phrases with ``'|'`` into a
    regular expression, where an empty alternative would match every sentence.

    Args:
        label: file name (without extension) of the phrase list.
        mention: choose the ``mention`` (True) or ``unmention`` (False) folder.

    Returns:
        List of non-empty phrases, in file order.
    """
    subdir = 'mention' if mention else 'unmention'

    fname = os.path.join(MIRQI_DIR, subdir, f'{label}.txt')
    with open(fname, 'r') as f:
        phrases = [line.strip().replace('_', ' ') for line in f]

    return [phrase for phrase in phrases if phrase]

#### Heart regex

In [None]:
# Phrases whose presence marks a sentence as heart-related: the two MIRQI
# phrase lists plus hand-picked terms.
heart_mentions = load_phrases('cardiomegaly') + load_phrases('enlarged_cardiomediastinum')
# Edema heart related
heart_mentions += ['heart failure', 'chf', 'vascular congestion', 'vascular prominence']
# Others
# (' cardio' starts with a space, so it only matches after a space.)
heart_mentions += ['heart', 'aorta', 'aortic', ' cardio', 'mediastinal', 'mediastinum']
len(heart_mentions)

In [None]:
# One alternation over every heart phrase; the phrases are inserted as regex
# source, without escaping.
REGEX_HEART = re.compile('|'.join(heart_mentions))


def regex_mentions_heart(sample):
    """Return 1 if ``sample['sentences']`` contains any heart phrase, else 0."""
    found = REGEX_HEART.search(sample['sentences'])
    return 1 if found else 0

In [None]:
# Spot-check the heart regex on one row (picked by position).
sample = df_sentences.iloc[3233]
sample['sentences'], regex_mentions_heart(sample)

#### Lung regex

In [None]:
# Names of the phrase files treated as lung-related.
lung_diseases = ['airspace_disease', 'airspace_opacity', 'atelectasis', 'calcinosis',
                 'consolidation', 'emphysema',
                 'hypoinflation', 'lung_lesion',
                 'pleural_effusion', 'pleural_other',
                 'pneumonia', 'pneumothorax',
                ]

In [None]:
# All phrases of the lung-related files, followed by hand-picked terms.
lungs_mentions = [
    ph
    for disease in lung_diseases
    for ph in load_phrases(disease)
]
# Edema lung related 
lungs_mentions += [
    'edema', 'pulmonary congestion',
    'clear lung',
    'the lung',
    'pleural space',
    'pleural air collection',
]
len(lungs_mentions)

In [None]:
# Copied from attempt 1
# Each pattern is anchored at the start of a word so that "right" is not found
# inside "upright" nor "left" inside "cleft"; suffixes ("bilaterally",
# "rightward") still match.
PATTERN_BOTH = re.compile(r'\b(?:both|bilateral)')
PATTERN_RIGHT = re.compile(r'\bright')
PATTERN_LEFT = re.compile(r'\bleft')

In [None]:
# Patterns that mark a sentence as lung-related; the last one is the
# alternation over every phrase in lungs_mentions.
REGEXES_LUNGS = [
    re.compile(pattern)
    for pattern in (
        r'lungs?\s(are\s)?clear',
        r'(right|left) lung',
        r'\Alung',
        r'lungs? volume',
        r'pulmon\w*\s(vascul\w*)?',
        'expanded lungs?',
        '(right|left) (upper |lower )?lobe',
        '|'.join(lungs_mentions),
    )
]


def regex_mentions_lungs(sample):
    """Return ``(left, right)`` 0/1 flags telling which lungs a sentence refers to.

    * ``(0, 0)`` when no pattern in ``REGEXES_LUNGS`` matches the sentence.
    * ``(1, 1)`` when the sentence matches ``PATTERN_BOTH``, or matches neither
      ``PATTERN_LEFT`` nor ``PATTERN_RIGHT``.
    * Otherwise one flag per side keyword found.

    Args:
        sample: row with a ``'sentences'`` text.
    """
    text = sample['sentences']

    for pattern in REGEXES_LUNGS:
        if pattern.search(text):
            break
    else:
        # No lung-related pattern matched.
        return 0, 0

    has_left = PATTERN_LEFT.search(text) is not None
    has_right = PATTERN_RIGHT.search(text) is not None

    # An explicit "both" keyword, or no side keyword at all, means both lungs.
    if PATTERN_BOTH.search(text) or not (has_left or has_right):
        return 1, 1

    return int(has_left), int(has_right)

In [None]:
# Spot-check the lungs regex on one row (picked by position).
sample = df_sentences.iloc[1200]
sample['sentences'], regex_mentions_lungs(sample)

#### Other regex

In [None]:
# Names of the phrase files treated as "other" (neither heart nor lungs).
other_diseases = ['scoliosis', 'support_devices', 'fracture']

In [None]:
# All phrases of the "other" files, followed by hand-picked terms.
other_mentions = [ph for disease in other_diseases for ph in load_phrases(disease)]

other_mentions += ['bony', 'bone',
                   'spine', 'osseous', 'osseus', 'skeletal',
                   'spondylosis', 'trachea']

# Other support devices
other_mentions += ['ivc', 'clips']

len(other_mentions)

In [None]:
# One alternation over every "other" phrase; the phrases are inserted as regex
# source, without escaping.
REGEX_OTHER = re.compile('|'.join(other_mentions))


def regex_mentions_other(sample):
    """Return 1 if ``sample['sentences']`` contains any "other" phrase, else 0."""
    found = REGEX_OTHER.search(sample['sentences'])
    return 1 if found else 0

In [None]:
# Spot-check the "other" regex on one row (picked by position).
sample = df_sentences.iloc[1000]
sample['sentences'], regex_mentions_other(sample)

##### Other other!

Not used for now

TODO: keep reviewing these?

In [None]:
# Phrases from the 'other_finding' file. The next cell rebinds `phrases`, so
# this value is only kept if that cell is not run.
phrases = load_phrases('other_finding')

In [None]:
# Hand-written phrase list under review; the trailing comments note the organ
# each phrase was associated with.
phrases = [
    'blunt', # Lungs
    'elevation',  # hemidiaphragm elevation
    'bronchospasm', # None
    'asthma', # None
    'interstitial markings', # Lungs
    'plaque', # Lungs
    'osteophytosis', # None
    'aortic disease', # heart or lungs
    'bronchiolitis', # Lungs
    'thickening',
    'cephalization',
    'aspiration',
    'bullae',
    'contusion',
    'atherosclero',
    'osteopenia',
    'metastasis',
    'granuloma',
    'pneumomediastinum',
    'pneumoperitoneum',
    'osteodystrophy',
    'cuffing',
    'irregular lucency',
    'inflam',
    'fissure',
    'prominen',
    'kyphosis',
    'defib',
    'bullet',
    'reticula',
    'thoracentesis',
    'bronchitis',
    'volume loss',
    'deformity',
    'hemorrhage',
    'hematoma',
    'radiopaque',
    'aerophagia',
    'arthropathy',
    'tracheostomy',
]

In [None]:
# Pick one phrase by position and collect the sentences containing it.
# `str.contains` treats the phrase as a regular expression by default.
idx = 9
# regex_phrases = '|'.join(phrases)
regex_phrases = phrases[idx]
print(regex_phrases)
samples = list(df_sentences[df_sentences['sentences'].str.contains(regex_phrases)]['sentences'])
len(samples)

In [None]:
samples

#### Gather

In [None]:
# Labeler built from the regex rules defined above.
regex_organ_labeler = OrganLabeler(
    mentions_other=regex_mentions_other,
    mentions_heart=regex_mentions_heart,
    mentions_lungs=regex_mentions_lungs,
)

### Label all organs

In [None]:
# Load the organ definitions; the ORGAN_* constants and JSRT_ORGANS used below
# presumably come from these scripts — TODO confirm.
%run ../jsrt.py
%run ../common.py

In [None]:
# Display the organ identifiers used as column names below.
ORGAN_BACKGROUND, ORGAN_HEART, ORGAN_LEFT_LUNG, ORGAN_RIGHT_LUNG

In [None]:
# Choose which labeler find_organs uses; the regex-based one is active.
# ORGAN_LABELER = chexpert_organ_labeler
ORGAN_LABELER = regex_organ_labeler

In [None]:
# Names (index labels) of the samples for which no organ was identified.
WRONG_ONES = defaultdict(list)


def find_organs(sample):
    """Add one 0/1 column per organ to ``sample`` using ``ORGAN_LABELER``.

    If the labeler reports an "other" mention, every organ is set to 1.
    Otherwise the background is 0 and the heart and lung flags come from the
    labeler. When nothing at all is identified, the sample name is recorded in
    ``WRONG_ONES['all-empty']`` and every organ is set to 1.

    Args:
        sample: row with the fields the active labeler reads. It is modified
            in place.

    Returns:
        The same ``sample`` with the four organ entries set.
    """
    if ORGAN_LABELER.mentions_other(sample):
        flags = (1, 1, 1, 1)
    else:
        heart = ORGAN_LABELER.mentions_heart(sample)
        left_lung, right_lung = ORGAN_LABELER.mentions_lungs(sample)
        flags = (0, heart, right_lung, left_lung)

    if sum(flags) == 0:
        WRONG_ONES['all-empty'].append(sample.name)
        # If nothing is identified, set all to 1
        flags = (1, 1, 1, 1)

    organs = (ORGAN_BACKGROUND, ORGAN_HEART, ORGAN_RIGHT_LUNG, ORGAN_LEFT_LUNG)
    for organ, flag in zip(organs, flags):
        sample[organ] = flag

    return sample

In [None]:
%%time

# Label every sentence row; the result has the organ columns added.
df_sentences_2 = df_sentences.apply(find_organs, axis=1)
df_sentences_2.head()

####  Review empty ones

In [None]:
# Number of samples recorded under each problem category.
[(k, len(v)) for k, v in WRONG_ONES.items()]

In [None]:
# WRONG_ONES stores `sample.name`, i.e. index labels, so the rows are selected
# by label (`.loc`) rather than by position (`.iloc`). Both give the same rows
# while the index is the default 0..n-1 range.
a = WRONG_ONES['all-empty']
sample = df_sentences.loc[a]
len(sample)

In [None]:
# Appearances covered by the unidentified sentences vs. all sentences.
sample['appearances'].sum(), df_sentences['appearances'].sum()

In [None]:
# Most frequent unidentified sentences.
sample.sort_values('appearances', ascending=False).head(40)

In [None]:
# Text of one sentence, picked by position.
df_sentences.iloc[1056]['sentences']

In [None]:
# Keep only the sentence text and the organ columns listed in JSRT_ORGANS.
columns = ['sentences'] + JSRT_ORGANS
sentences_and_organs = df_sentences_2[columns]
sentences_and_organs.head()

In [None]:
# Save without the index column.
fpath = os.path.join(REPORTS_DIR, 'sentences_with_organs.csv')
sentences_and_organs.to_csv(fpath, index=False)