# Imports

In [None]:
import sys
import os

In [None]:
# Make the NegBio checkout importable: its location is read from the
# NEGBIO_PATH environment variable (KeyError if it is unset) and appended to
# sys.path only when it is not already there.
NEGBIO_PATH = os.environ['NEGBIO_PATH']
if NEGBIO_PATH not in sys.path:
    sys.path.append(NEGBIO_PATH)

In [None]:
import re
from collections import defaultdict, Counter, namedtuple
import json

In [None]:
# Execute the CheXpert labeler script inside this notebook's namespace.
# `-n` keeps __name__ from being set to '__main__', so a main-guard in the
# script does not fire.
# NOTE(review): names used below without a visible import (pd, ArgParser,
# Loader, Extractor, Classifier, Aggregator, CATEGORIES) are presumably
# brought in by this script -- confirm.
%run -n ../../../chexpert/chexpert-labeler/label.py

In [None]:
# Root folder of the IU X-ray dataset, read from the environment
# (KeyError if DATASET_DIR_IU_XRAY is unset).
IU_DIR = os.environ['DATASET_DIR_IU_XRAY']

# Check sentences' nouns

## Load sentences

In [None]:
# Load the per-sentence CSV and add a normalized copy of every sentence:
# lowercased, with the literal 'xxxx' token replaced by a space and runs of
# whitespace collapsed to single spaces.
# NOTE(review): `pd` is not imported in any visible cell -- presumably it
# comes from the %run of label.py; confirm.
fpath = os.path.join(IU_DIR, 'reports', 'sentences_with_extra_info.csv')
SENTENCES_DF = pd.read_csv(fpath)
SENTENCES_DF['clean_sentence'] = [
    ' '.join(s.lower().replace('xxxx', ' ').split())
    for s in SENTENCES_DF['sentence']
]
SENTENCES_DF.head(1)

In [None]:
# Plain list of the normalized sentences, in DataFrame row order.
sentences = list(SENTENCES_DF['clean_sentence'])
len(sentences)

In [None]:
# Lookup: normalized sentence -> value of its 'appearances' column.
# NOTE(review): if two rows normalize to the same clean_sentence, only one
# 'appearances' value survives in this dict -- confirm whether such rows
# should be summed instead.
SENTENCES_APPEARANCES = SENTENCES_DF.set_index('clean_sentence')['appearances'].to_dict()
len(SENTENCES_APPEARANCES)

## Load chexpert-stuff

In [None]:
# Build the labeler arguments from an explicit list rather than from the
# real command line. The reports path is left empty here; the sentences are
# handed to the loader directly in a later cell.
# NOTE(review): '--base-dir' is a hard-coded absolute path to one user's
# checkout -- it has to be adjusted on any other machine.
parser = ArgParser()
args = parser.parse_args([
    '--reports_path', '',
    '--base-dir', '/home/pdpino/chexpert/chexpert-labeler',
])

In [None]:
# Loader stage built from the parsed arguments; the sentences are fed to it
# further down through loader.load(...).
loader = Loader(args.reports_path, args.extract_impression)

In [None]:
# Extractor stage, configured with the mention / unmention phrase
# directories taken from the parsed arguments.
extractor = Extractor(args.mention_phrases_dir,
                      args.unmention_phrases_dir,
                      verbose=args.verbose)

In [None]:
# Classifier stage, configured with the pre-negation-uncertainty, negation
# and post-negation-uncertainty paths taken from the parsed arguments.
# NOTE(review): the meaning of `light=False` is defined by Classifier, which
# is not visible here.
classifier = Classifier(args.pre_negation_uncertainty_path,
                        args.negation_path,
                        args.post_negation_uncertainty_path,
                        verbose=args.verbose, light=False)

In [None]:
# Aggregator stage over CATEGORIES.
# NOTE(review): no later cell visible in this notebook uses `aggregator`.
aggregator = Aggregator(CATEGORIES,
                        verbose=args.verbose)

## Run chexpert stuff

In [None]:
%%time

# Hand the sentences to the loader. Swap in the commented line below to try
# the pipeline on the first 100 sentences only.
# sample_sentences = sentences[:100]
sample_sentences = sentences
loader.load(sample_sentences)

In [None]:
%%time

# Run the extractor stage over the documents held by the loader.
extractor.extract(loader.collection)

In [None]:
%%time

# Run the classifier stage over the same collection; the analysis cells
# below read each sentence's annotations and relations from it.
classifier.classify(loader.collection)

## Analyze output

In [None]:
def _repr_with_deps(header, deps):
    """Return `header`, followed by one tab-indented line per dependency."""
    if len(deps) == 0:
        return header
    return header + '\n' + ''.join(f'\t{dep}\n' for dep in deps)


class WordDetail(namedtuple('WordDetail', ['id', 'word', 'tag', 'deps'])):
    """A single token: annotation id, word, tag and its dependencies."""

    def __repr__(self):
        # "<word> (<tag>)", then the dependencies (if any) one per line.
        return _repr_with_deps(f'{self.word} ({self.tag})', self.deps)


# One relation seen from a noun: the other word, its tag, the dependency
# type of the relation and the role of the other word in it.
Dependency = namedtuple('Dependency', ['word', 'tag', 'dep_type', 'role'])


class NounDetail(namedtuple('NounDetail', ['words', 'deps'])):
    """A (possibly compound) noun: its words plus the collected dependencies."""

    def __repr__(self):
        # repr of the word list, then the dependencies (if any) one per line.
        return _repr_with_deps(repr(self.words), self.deps)

In [None]:
def iter_node_relations(relations, current_id):
    """Yield the relations that involve the node `current_id`.

    Every relation must hold exactly two nodes (checked with an assert).
    For each relation where one node's `refid` equals `current_id`, yields
    `(relation, other_refid, other_role)` describing the opposite node.
    Relations that do not touch `current_id` are skipped.
    """
    for relation in relations:
        assert len(relation.nodes) == 2, f'len={len(relation.nodes)}'
        node_a, node_b = relation.nodes

        # The first node is checked first, so a relation whose two nodes both
        # match is reported once, with the second node as the "other" side.
        if node_a.refid == current_id:
            yield relation, node_b.refid, node_b.role
        elif node_b.refid == current_id:
            yield relation, node_a.refid, node_a.role

In [None]:
# Lemmas that must NOT be treated as nouns, whatever their tag says.
_noun_FP = {
    'xxxx',
    'no',  # When like "no pneumothorax, or ..." --> set as NN
    'streaky', 'patchy', 'bibasilar',
    'or',
    # 'top',
}
# Lemmas that must be treated as nouns, whatever their tag says.
_noun_FN = {'cardiomegaly', 'mediastinal', 'fracture', 'dislocation'}


def _is_noun(lemma, tag):
    """Return True when the token should be handled as a noun.

    The forced-noun list wins, then the forced-not-noun list; any other
    lemma is a noun exactly when its tag is 'NN' or 'NNS'.
    """
    if lemma in _noun_FN:
        return True
    return lemma not in _noun_FP and tag in ('NN', 'NNS')

In [None]:
# Lemmas that always count as adjectives, whatever their tag says.
_adjective_FN = {
    'streaky', 'patchy', 'hyperdense', 'retrocardiac', 'bandlike', 'number',
}


def _is_adjective(lemma, tag, dep_type):
    """Return True when a word related to a noun should count as its adjective.

    Accepted: a forced-adjective lemma, a 'JJ' tag, an 'amod' dependency,
    or a verb form linked as subject ('nsubj' + 'VBD', 'nsubjpass' + 'VBN').
    """
    return (
        lemma in _adjective_FN
        or tag == 'JJ'
        or dep_type == 'amod'  # 'dobj' is left disabled
        or (dep_type, tag) in (('nsubj', 'VBD'), ('nsubjpass', 'VBN'))
    )

In [None]:
def _find_compounds_recursively_(id_to_details, relations, current_id, seen, found):
    """Collect `current_id` and every noun chained to it by 'compound' relations.

    Depth-first walk. `seen` (a set of ids) and `found` (a list of word
    details) are both mutated in place; nothing is returned. Ids already in
    `seen` are not visited again.
    """
    if current_id in seen:
        return
    seen.add(current_id)
    found.append(id_to_details[current_id])

    for relation, neighbour_id, _role in iter_node_relations(relations, current_id):
        neighbour = id_to_details[neighbour_id]
        if relation.infons['dependency'] != 'compound':
            continue
        if not _is_noun(neighbour.word, neighbour.tag):
            continue
        _find_compounds_recursively_(
            id_to_details, relations, neighbour_id, seen, found,
        )

In [None]:
# Output of this cell:
#   results  -> list of (sentence text, [NounDetail, ...]) per sentence
#   warnings -> sentences grouped by issue ('no-nouns': no noun was found)
results = []
warnings = defaultdict(list)

for document in loader.collection.documents:
    for passage in document.passages:
        for sentence in passage.sentences:
            # 1. Index every token by annotation id and remember the nouns
            id_to_details = {}
            noun_ids = []
            for annotation in sentence.annotations:
                lemma = annotation.infons['lemma']
                tag = annotation.infons['tag']
                id_to_details[annotation.id] = WordDetail(
                    id=annotation.id, word=lemma, tag=tag, deps=[],
                )
                if _is_noun(lemma, tag):
                    noun_ids.append(annotation.id)

            # 2. Merge nouns linked by 'compound' relations into one group;
            #    consumed_ids keeps a noun from starting a second group
            core_nouns = []
            consumed_ids = set()
            for noun_id in noun_ids:
                if noun_id not in consumed_ids:
                    words = []
                    _find_compounds_recursively_(
                        id_to_details, sentence.relations, noun_id, consumed_ids, words,
                    )
                    core_nouns.append(NounDetail(words=words, deps=[]))

            # 3. Attach adjective-like words related to any word of the group
            full_nouns = []
            for noun_details in core_nouns:
                dependencies = []
                for word_details in noun_details.words:
                    related = iter_node_relations(sentence.relations, word_details.id)
                    for relation, other_id, other_role in related:
                        other_details = id_to_details[other_id]
                        dep_type = relation.infons['dependency']
                        if not _is_adjective(other_details.word, other_details.tag, dep_type):
                            continue
                        dependencies.append(Dependency(
                            word=other_details.word,
                            tag=other_details.tag,
                            dep_type=dep_type,
                            role=other_role,
                        ))
                full_nouns.append(noun_details._replace(deps=dependencies))

            if full_nouns:
                results.append((sentence.text, full_nouns))
            else:
                warnings['no-nouns'].append(sentence)
len(warnings['no-nouns']), [s.text for s in warnings['no-nouns']]

In [None]:
# Peek at the first three (sentence text, nouns) entries.
results[:3]

In [None]:
#### TODOs:
# Cross-reference nouns vs RG-procedure: intersection? which are left out?
# Statistic: from all the sentences, what % use nouns from the procedure?
# Statistic: from all the reports, what % use nouns from the procedure?


### Some special cases:

## Noun groups that should be adjectives
# limit (+ normal, )
# change
# midline (when like "trachea is midline")
# lung base (when like "X ... in the lung base") (location)
# spine (when like "X in the spine")

In [None]:
# Entries of `results` whose sentence text matches the pattern.
[(txt, details) for txt, details in results if re.search(r'juxtahilar', txt)]

In [None]:
# `results` holds plain (sentence text, nouns) tuples, which have no `.text`
# attribute, so the filter is applied to the unpacked text element.
[(txt, details) for txt, details in results if 'juxtahilar' in txt]

## Clean and fix nouns

* Remove dependency and details, keep only words
* Manually fix noun issues

### Utils

In [None]:
class SentenceMeta(namedtuple('SentenceMeta', ['text', 'nouns'])):
    """A sentence text together with the nouns found in it."""

    def __repr__(self):
        # The text alone, or the text plus a tab-indented, comma-separated
        # second line listing the nouns.
        if not self.nouns:
            return f'{self.text}'
        listing = ', '.join(map(str, self.nouns))
        return f'{self.text}\n\t{listing}'


class NounMeta(namedtuple('NounMeta', ['words', 'adjectives'])):
    """A noun (one or more words) and the adjectives attached to it."""

    @property
    def noun(self):
        """The words joined by spaces in sorted order (word order is ignored)."""
        return ' '.join(sorted(self.words))

    def __repr__(self):
        # '"w1 w2"' in the stored word order, plus ' (adj1|adj2)' if any.
        quoted = '"{}"'.format(' '.join(self.words))
        if not self.adjectives:
            return quoted
        return quoted + ' ({})'.format('|'.join(self.adjectives))

In [None]:
def noun_to_hash(words):
    """Return an order-independent key for a noun.

    `words` is either a whitespace-separated string or an iterable of
    words; the key is the sorted words joined by commas.
    """
    tokens = words.split() if isinstance(words, str) else words
    return ','.join(sorted(tokens))

### Append adjectives to nouns

In [None]:
# Words to be treated as adjectives during the manual noun clean-up.
# NOTE(review): not referenced by any other cell visible in this notebook.
_FIX_ADJECTIVES = {
    'streaky', 'patchy', 'hyperdense', 'retrocardiac', 'bandlike', 'number',
    'paratracheal', 'juxtahilar', 'bibasilar', 'basilar', 'perihilar', 'suprahilar',
    'lobe', 'midlung', 'base',
}

In [None]:
# Adjectives that are really part of the noun and must be moved "up" into it.
# Key: the noun's words (space separated, order irrelevant).
# Value: one adjective, or a tuple of adjectives, to merge into that noun.
# Every key must appear once only: a repeated key in a dict literal silently
# keeps just the last value, so alternatives for one noun go in a tuple.
_FIX_AMOD_UP_RAW = {
    'airspace disease': 'focal',
    'airspace opacity': 'focal',
    'aortic': 'calcification',
    'blunting': 'costophrenic',
    'calcification': ('aortic', 'vascular'),
    'consolidation': 'focal',
    'contour': 'mediastinal',
    'edema': 'pulmonary',
    'effusion': 'pleural',
    'hernia': 'hiatal',
    'joint': 'acromioclavicular',
    'marking': 'interstitial',
    'mediastinal': 'contour',
    'silhouette': ('cardiomediastinal', 'cardiac'),
    'space': 'pleural',
    'spine': 'thoracic',
    'structure': ('bony', 'osseous'),
    'sulcus': 'costophrenic',
    'tubing': 'shunt',
    'vasculature': 'pulmonary',
    'vascularity': 'pulmonary',
    'view': 'lateral',
}
# Re-key by the order-independent noun hash and normalize values to sets.
_FIX_AMOD_UP = {}
for noun, amod in _FIX_AMOD_UP_RAW.items():
    noun_hash = noun_to_hash(noun)
    if isinstance(amod, str):
        amod = (amod,)
    amod = set(amod)
    _FIX_AMOD_UP[noun_hash] = amod
del _FIX_AMOD_UP_RAW
len(_FIX_AMOD_UP)

### Remove adjectives from noun

In [None]:
# Words wrongly merged into a noun that must be moved "down" to adjectives.
# Key: the noun's words (space separated, order irrelevant).
# Value: one word, or a tuple of words, to strip from the noun; an empty
# tuple means "known group, nothing to strip".
_FIX_AMOD_DOWN_RAW = {
    'basilar opacity': 'basilar',
    'catheter subclavian': 'subclavian',
    'density number': 'number',
    'granuloma midlung': 'midlung',
    'midline sternotomy': 'midline',
    'number opacity': 'number',
    'opacity perihilar': 'perihilar',
    'round opacity': 'round',

    'atelectasis base': 'base',
    'atelectasis basilar': 'basilar',
    'atelectasis basilar subsegmental': ('basilar', 'subsegmental'),
    'atelectasis bronchovascular crowding': ('bronchovascular', 'crowding'),
    'atelectasis subsegmental': 'subsegmental',
    'atelectasis lung': 'lung',
    'atelectasis lobe': 'lobe',
    'atelectasis passive': 'passive',
    'atelectasis base subsegmental': ('base', 'subsegmental'),
    'atelectasis base lung': ('base', 'lung'),
    'atelectasis base opacity': 'base',
    'atelectasis perihilar': 'perihilar',
    'atelectasis discoid': 'discoid',
    'atelectasis basis lung': ('basis', 'lung'),
    'atelectasis fissure subsegmental': ('subsegmental', 'fissure'),
    'atelectasis lobe middle': ('lobe', 'middle'),
    'airspace atelectasis opacity': (),
    'atelectasis infiltrate lobe': ('infiltrate', 'lobe'),
}
# Re-key by the order-independent noun hash and normalize values to sets.
_FIX_AMOD_DOWN = {}
for noun, amod in _FIX_AMOD_DOWN_RAW.items():
    noun_hash = noun_to_hash(noun)
    if isinstance(amod, str):
        # An empty string means "nothing to strip": it must not become {''},
        # which would later be looked up as a word of the noun.
        amod = (amod,) if amod else ()
    amod = set(amod)
    _FIX_AMOD_DOWN[noun_hash] = amod
del _FIX_AMOD_DOWN_RAW
len(_FIX_AMOD_DOWN)

### Finish preprocessing nouns

In [None]:
# Reduce every NounDetail to plain words + adjectives (NounMeta), applying
# the manual fix tables. Both fixes are looked up with the hash of the
# ORIGINAL words, i.e. before any adjective is merged into the noun.
sentences_meta = []
for text, nouns in results:
    nouns_meta = []
    for compound_noun in nouns:
        words = [w.word for w in compound_noun.words]
        adjectives = [d.word for d in compound_noun.deps]
        noun_hash = noun_to_hash(words)

        # 1. Check for adjectives to carry up (they become part of the noun).
        #    Adjectives are de-duplicated while keeping first-seen order, so
        #    the result does not depend on set iteration order.
        amod_to_raise = _FIX_AMOD_UP.get(noun_hash)
        if amod_to_raise:
            unique_adjectives = list(dict.fromkeys(adjectives))
            adjectives_to_raise = [a for a in unique_adjectives if a in amod_to_raise]
            words = adjectives_to_raise + words
            adjectives = [a for a in unique_adjectives if a not in amod_to_raise]

        # 2. Check for adjectives to carry down (noun words that are really
        #    adjectives). Sorted for a stable order; an entry that is not
        #    among the words is skipped instead of raising ValueError.
        for amod in sorted(_FIX_AMOD_DOWN.get(noun_hash, ())):
            if amod not in words:
                continue
            words.remove(amod)
            if amod not in adjectives:
                adjectives.append(amod)

        nouns_meta.append(NounMeta(words, adjectives))

    sentences_meta.append(SentenceMeta(text=text, nouns=nouns_meta))

In [None]:
# Noun spellings folded into a canonical form before counting. Keys are
# compared against NounMeta.noun, i.e. the noun's words sorted and joined
# by spaces.
SYNONYMS = {
    'pneumothorace': 'pneumothorax',
    'cardio silhouette': 'cardiac silhouette',
}

In [None]:
# Per canonical noun: every adjective seen with it, and how often it appears
# (each sentence contributes its own 'appearances' value).
NOUN_ADJECTIVES = defaultdict(list)
NOUN_APPEARANCES = Counter()
_warnings = defaultdict(list)

for meta in sentences_meta:
    try:
        sentence_appearances = SENTENCES_APPEARANCES[meta.text]
    except KeyError:
        # Sentence text not found among the normalized CSV sentences.
        _warnings['no-match'].append(meta)
        continue

    for noun in meta.nouns:
        canonical = noun.noun
        noun_str = SYNONYMS.get(canonical, canonical)
        NOUN_APPEARANCES[noun_str] += sentence_appearances
        NOUN_ADJECTIVES[noun_str].extend(noun.adjectives)

len(NOUN_ADJECTIVES), len(NOUN_APPEARANCES)

In [None]:
# Sentences whose text could not be found in SENTENCES_APPEARANCES.
len(_warnings['no-match']), _warnings['no-match']

In [None]:
# Ad-hoc lookup: normalized sentences starting with 'question edema'.
[s for s in SENTENCES_APPEARANCES if s.startswith('question edema')]

In [None]:
# Nouns ranked from most to least frequent (ties keep insertion order).
NOUN_APPEARANCES.most_common()

In [None]:
# problematic nouns:
# widening, redemonstration, "a", "or", collecting, 

### Categorize nouns

(disease/organ/etc)

In [None]:
# Load the noun -> category mapping stored in categories.json.
# NOTE(review): the commented line below is presumably the manual fallback
# for a first run, when the JSON file does not exist yet -- confirm.
# SAVED_CATEGORY = dict()
with open(os.path.join(IU_DIR, 'reports', 'nouns', 'categories.json'), 'r') as f:
    SAVED_CATEGORY = json.load(f)
# Nouns flagged with the 'w' option in ask_for_categories(); none of the
# visible cells writes this list to disk.
WRONG_NOUN = []

In [None]:
def ask_for_categories():
    """Interactively assign a category to every noun that has none yet.

    Nouns are visited from most to least frequent, skipping those already in
    SAVED_CATEGORY or WRONG_NOUN. For each noun the user types a shortcut or
    a full category name:

    * 'q' / 'quit' stops the whole session,
    * 'w' records the noun in WRONG_NOUN,
    * a recognized option stores the category in SAVED_CATEGORY,
    * anything else prints a message and asks again for the same noun.
    """
    # Accepted input -> category name.
    option_to_category = {
        'd': 'disease', 'disease': 'disease',
        'gd': 'general-disease', 'general-disease': 'general-disease',
        'dev': 'device', 'device': 'device',
        'o': 'organ', 'organ': 'organ',
        'go': 'general-organ', 'general-organ': 'general-organ',
        'n': 'normal', 'normal': 'normal',
        'p': 'projection', 'proj': 'projection', 'projection': 'projection',
        's': 'sus', 'sus': 'sus',
        'surg': 'surgery', 'surgery': 'surgery',
    }

    ranked = sorted(NOUN_APPEARANCES.items(), key=lambda pair: pair[1], reverse=True)
    length = len(ranked)

    for index, (noun, appearances) in enumerate(ranked):
        if noun in SAVED_CATEGORY or noun in WRONG_NOUN:
            continue

        # Keep asking until this noun gets a valid answer.
        while True:
            option = input(f'({index}/{length}) {noun} ({appearances})')

            if option in ('quit', 'q'):
                return
            if option == 'w':
                WRONG_NOUN.append(noun)
                break

            category = option_to_category.get(option)
            if category is None:
                print('Option not recognized')
                continue

            SAVED_CATEGORY[noun] = category
            break

In [None]:
# Start the interactive labeling session (answer 'q' to stop early).
ask_for_categories()

In [None]:
# Write the (possibly extended) noun -> category mapping back to the same
# categories.json it was loaded from.
with open(os.path.join(IU_DIR, 'reports', 'nouns', 'categories.json'), 'w') as f:
    json.dump(SAVED_CATEGORY, f, indent=2)

### Plot distribution

In [None]:
# Counted nouns whose text contains 'opacity'.
[k for k in NOUN_APPEARANCES if 'opacity' in k]

In [None]:
## TODO: Save to JSON
# JSON-friendly view of sentences_meta: one dict per sentence, holding its
# text and, for each noun, the sorted-word noun string and its adjectives.
# For now the structure is only displayed, not written anywhere.
[{
    'text': m.text,
    'nouns': [
        {'noun': n.noun, 'adjectives': n.adjectives} for n in m.nouns
    ]
} for m in sentences_meta]