# Import

In [None]:
import os
from collections import Counter

In [None]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)

In [None]:
import pandas as pd
pd.options.display.max_columns = None

In [None]:
%run ../../utils/__init__.py
config_logging(logging.INFO)

In [None]:
%run ../../datasets/common/constants.py

In [None]:
from medai.datasets.iu_xray import DATASET_DIR as IU_DIR
from medai.datasets.mimic_cxr import DATASET_DIR as MIMIC_DIR

In [None]:
# All CheXpert labels except the first one
# (presumably the first entry is the "No Finding" label — TODO confirm against constants.py)
ACTUAL_DISEASES = CHEXPERT_DISEASES[1:]

# Utils

In [None]:
import re

In [None]:
def mentions_any_term(sentence, terms):
    """Tell whether at least one pattern in `terms` is found in `sentence`.

    Every term is handed as-is to `re.search`, so it is interpreted as a
    regular expression (not as a literal string) and may match anywhere in
    the sentence.
    """
    for pattern in terms:
        if re.search(pattern, sentence) is not None:
            return True
    return False

In [None]:
def array_mentions_any_term(sentences, terms):
    """Apply `mentions_any_term` to each sentence.

    Returns a list of booleans, one per item in `sentences` (same order),
    telling whether that sentence matches at least one of the `terms`.
    """
    return [mentions_any_term(text, terms) for text in sentences]

In [None]:
def print_subdf_stats(subdf, name, full_df):
    """Print how much of `full_df` is covered by `subdf`.

    Two ratios are reported, each as absolute counts and as a percentage:
    the number of rows (sentences), and the sum of the `appearances` column.

    Args:
        subdf: subset of rows to report on; needs an `appearances` column.
        name: label printed as the header of the report.
        full_df: reference DataFrame; needs an `appearances` column.

    A zero denominator (e.g. an empty `full_df`) is reported as 0.00%
    instead of raising ZeroDivisionError.
    """
    def _percentage(part, whole):
        # Guard the division: an empty reference DataFrame has zero totals
        return part / whole * 100 if whole else 0.0

    total_sentences = len(full_df)
    total_appearances = full_df['appearances'].sum()

    n_sent = len(subdf)
    n_appear = subdf['appearances'].sum()

    perc_sent = _percentage(n_sent, total_sentences)
    perc_appear = _percentage(n_appear, total_appearances)
    print(f'{name}:')
    print(f'\tsentences={n_sent:,}/{total_sentences:,} ({perc_sent:.2f}%)')
    print(f'\tappearances={n_appear:,}/{total_appearances:,} ({perc_appear:.2f}%)')

# Annotate out-of-reach info

TODO: other non-covered info:

* Non-disease descriptive info: e.g. 'ap and lateral view of the chest .'

## Load sentences

### Select IU version with XXXX

In [None]:
import json

In [None]:
%run ../../datasets/preprocess/common.py

In [None]:
# Load the cleaned v4 reports of each dataset and run the sentence splitter on them
# (presumably this writes the sentences.v4.csv read below — verify in preprocess/common.py)
for dataset_dir in (IU_DIR, MIMIC_DIR):
    reports_dir = os.path.join(dataset_dir, 'reports')
    reports_fpath = os.path.join(reports_dir, 'reports.clean.v4.json')
    with open(reports_fpath) as f:
        reports_dict = json.load(f)
    split_sentences_and_save_csv(reports_dir, reports_dict, suffix='v4')

In [None]:
# Dataset to annotate; swap the two lines below to switch to IU X-ray
# dataset_dir = IU_DIR
dataset_dir = MIMIC_DIR

fpath = os.path.join(dataset_dir, 'reports', 'sentences.v4.csv')
SENTENCES_DF = pd.read_csv(fpath)
# Number of rows, followed by the total of the `appearances` column
print(len(SENTENCES_DF))
print(SENTENCES_DF['appearances'].sum())
SENTENCES_DF.head(3)

## Annotate obfuscated sentences

Sentences that contain the obfuscation token `xxxx`.

In [None]:
def contains_obfuscated(sentence):
    """Return True if the (lower-case) obfuscation token appears in `sentence`."""
    token = 'xxxx'
    return token in sentence

In [None]:
# Flag every sentence that still carries the obfuscation token
SENTENCES_DF['obfuscated'] = list(map(contains_obfuscated, SENTENCES_DF['sentence']))
SENTENCES_DF.head(5)

In [None]:
# Keep only the rows flagged as obfuscated and report their share
obf_df = SENTENCES_DF[SENTENCES_DF['obfuscated'].eq(True)]
print_subdf_stats(obf_df, 'Obfuscated', SENTENCES_DF)

In [None]:
# Obfuscated sentences that mention the heart, shortest first
l = obf_df['sentence'].tolist()
sorted((a for a in l if 'heart' in a), key=len)

## Annotate time-related sentences

Sentences that compare against earlier studies or refer to the clinical history, e.g. "shown again", "given history", etc.

In [None]:
# Patterns that mark a sentence as time-related.
# Each entry is used as a regular expression by `mentions_any_term`.
# NOTE(review): since these are regexes, the '.' in 'compared to .' matches
# any character, not only a literal period — confirm this is intended.
_TIME_MENTIONS = {
    # Change (or lack of it) with respect to an earlier study
    'unchanged', 'improved', 'no change',
    'interval', 'persistent', 'remain', 'stable',
    'now', 'again', 'again noted', 'as before',
    'temporal development',
    # Explicit references to earlier studies
    'previous', 'prior', 'with prior', 'from the prior',
    'prior exam', 'prior study', 'consistent with prior',
    'comparison', 'compared to prior', 'compared to exams',
    'compared to ,', 'compared to .', # Edge cases, xxxx was removed
    # History and interventions
    'given history',
    'has been removed',
    'have been removed',
    'postoperative changes',
}

In [None]:
# Mark the sentences that match any of the time-related patterns
SENTENCES_DF['time'] = array_mentions_any_term(
    sentences=SENTENCES_DF['sentence'],
    terms=_TIME_MENTIONS,
)
SENTENCES_DF.head(3)

In [None]:
# Rows flagged as time-related, and their share of the dataset
time_df = SENTENCES_DF[SENTENCES_DF['time'].eq(True)]
print_subdf_stats(time_df, 'Time', SENTENCES_DF)

In [None]:
list(time_df['sentence'])

In [None]:
# Sanity check: sentences NOT flagged as time-related that still mention "change"
df = SENTENCES_DF[SENTENCES_DF['time'].eq(False)]
l = [s for s in df['sentence'] if mentions_any_term(s, ['change'])]
len(l), l[:20]

## Annotate technical sentences

e.g.: "technically limited study secondary to patient"

In [None]:
# Patterns (used as regexes) that mark a sentence as describing the study's
# technique or its limitations
_TECHNICAL_MENTIONS = {
    'limited', 'limitation',
    'technique', 'technical',
}

In [None]:
# Mark the sentences that match any of the technical patterns
SENTENCES_DF['technical'] = array_mentions_any_term(
    sentences=SENTENCES_DF['sentence'],
    terms=_TECHNICAL_MENTIONS,
)
SENTENCES_DF.head(3)

In [None]:
# Rows flagged as technical, and their share of the dataset
technical_df = SENTENCES_DF[SENTENCES_DF['technical'].eq(True)]
print_subdf_stats(technical_df, 'Technical', SENTENCES_DF)

In [None]:
list(technical_df['sentence'])

In [None]:
# Sanity check: sentences NOT flagged as technical that still mention technique/technical
df = SENTENCES_DF[SENTENCES_DF['technical'].eq(False)]
l = [s for s in df['sentence'] if mentions_any_term(s, ['technique', 'technical'])]
len(l), l[:20]

## Annotate follow-up

In [None]:
# Patterns (used as regexes) that mark a sentence as a follow-up note:
# recommendations, clinical correlation, further imaging, etc.
_FU_MENTIONS = {
    # Correlation / recommendation wording
    'correlate', 'correlation',
    'recommend',
    'follow.?up', 'followed up',
    'refer to',
    'radiographic assessment',
    'review at this time', 'ordered at the time',

    # "further ..." wording
    'evaluated further',
    r'further (investigation|imag|character|evaluat|clarification|concern|local|workup)',
    'discuss this case further',
    r'\b(ct|cta|contrast)\b.*further',
    r'further.*\b(ct|cta|contrast)\b',

    r'\bif (the|concern)', # condition, typically followed by a recommendation
    r'\b(ct|cta|ultrasound)',

    # Other:
    'the patient was xxxx transferred to the operating room',
    'repeat images were not obtained',
}

In [None]:
# Mark the sentences that match any of the follow-up patterns
SENTENCES_DF['followup'] = array_mentions_any_term(
    sentences=SENTENCES_DF['sentence'],
    terms=_FU_MENTIONS,
)
SENTENCES_DF.head(3)

In [None]:
# Rows flagged as follow-up, and their share of the dataset
fu_df = SENTENCES_DF[SENTENCES_DF['followup'].eq(True)]
print_subdf_stats(fu_df, 'Follow-up', SENTENCES_DF)

In [None]:
list(fu_df['sentence'])

In [None]:
# Sanity check: sentences NOT flagged as follow-up that still contain a word
# starting with "if".
# The pattern must be a raw string: in a plain string literal '\b' is the
# backspace character, not a regex word boundary, so nothing would match.
df = SENTENCES_DF.loc[SENTENCES_DF['followup'] == False]
l = [
    sentence
    for sentence in df['sentence']
    if mentions_any_term(sentence, [r'\bif'])
]
len(l), l[:20]

## Annotate comms with other people

In [None]:
# Patterns (used as regexes) that mark a sentence as a communication with
# other people
_COMM_MENTIONS = {
    # Who was involved
    'staff', 'radiologist', 'physician',
    # How it was communicated
    'telephone', r'notifi', 'communicated', 'contacted', 'paged',
}

In [None]:
# Mark the sentences that match any of the communication patterns
SENTENCES_DF['comm'] = array_mentions_any_term(
    sentences=SENTENCES_DF['sentence'],
    terms=_COMM_MENTIONS,
)
SENTENCES_DF.head(3)

In [None]:
# Rows flagged as communications, and their share of the dataset
comm_df = SENTENCES_DF[SENTENCES_DF['comm'].eq(True)]
print_subdf_stats(comm_df, 'Communications', SENTENCES_DF)

In [None]:
list(comm_df['sentence'])

In [None]:
# Sanity check: sentences NOT flagged as communications that still mention "paged"
# (shows the last 20 matches)
df = SENTENCES_DF[SENTENCES_DF['comm'].eq(False)]
l = [s for s in df['sentence'] if mentions_any_term(s, [r'paged'])]
len(l), l[-20:]

## Summarize all out-of-reach

In [None]:
# Every column other than the sentence text and its count is an annotation flag
cols = [
    c
    for c in SENTENCES_DF.columns
    if c != 'sentence' and c != 'appearances'
]
cols

In [None]:
# Report, for each annotation column, the share of sentences flagged with it
for col in cols:
    flagged = SENTENCES_DF[col].eq(True)
    subdf = SENTENCES_DF[flagged]
    print_subdf_stats(subdf, col, SENTENCES_DF)

## Collect any out-of-reach info

In [None]:
SENTENCES_DF.head(3)

In [None]:
# Sentences flagged by at least one annotation column
cols = [
    c
    for c in SENTENCES_DF.columns
    if c != 'sentence' and c != 'appearances'
]
any_df = SENTENCES_DF[SENTENCES_DF[cols].eq(True).any(axis='columns')]
print_subdf_stats(any_df, 'Any out-of-reach', SENTENCES_DF)

In [None]:
# Sentences with every annotation column set to False
clean_df = SENTENCES_DF[SENTENCES_DF[cols].eq(False).all(axis='columns')]
print(len(clean_df))
clean_df.head(3)

In [None]:
# Sanity check: "clean" sentences that still contain a word starting with "if"
l = [s for s in clean_df['sentence'] if mentions_any_term(s, [r'\bif'])]
len(l), l[:10]

In [None]:
sorted(list(clean_df['sentence']), key=lambda x: len(x))

In [None]:
list(any_df['sentence'])

In [None]:
# Look up sentences mentioning a pleural recess or reflection
list(filter(
    lambda s: re.search(r'pleural (recess|reflection)', s),
    SENTENCES_DF['sentence'],
))

# Save annotations to file

With extra info

In [None]:
# Destination of the annotated sentences.
# NOTE(review): this switches `dataset_dir` to IU, while the "Load sentences"
# cell above selects MIMIC — make sure SENTENCES_DF was loaded from the same
# dataset before saving.
dataset_dir = IU_DIR
fpath = os.path.join(
    dataset_dir,
    'reports',
    'sentences_with_extra_info.csv',
)

In [None]:
# Refuse to overwrite an existing annotations file. An explicit raise is used
# instead of `assert` so the guard still runs when Python is started with -O.
if os.path.isfile(fpath):
    raise FileExistsError(f'Annotations file already exists: {fpath}')
# SENTENCES_DF.to_csv(fpath, index=False)

In [None]:
# Reload the annotated sentences from disk (head() shows 5 rows by default)
SENTENCES_DF = pd.read_csv(fpath)
SENTENCES_DF.head()

In [None]:
# Count and preview the sentences flagged as follow-up
d = SENTENCES_DF[SENTENCES_DF['followup'].eq(True)]
print(len(d))
d.head(2)