# Imports

In [None]:
import os
from collections import Counter

In [None]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)

In [None]:
import pandas as pd
pd.options.display.max_columns = None

In [None]:
%run ../../utils/__init__.py
config_logging(logging.INFO)

In [None]:
%run ../../datasets/common/constants.py

In [None]:
from medai.datasets.iu_xray import DATASET_DIR as IU_DIR
from medai.datasets.mimic_cxr import DATASET_DIR as MIMIC_DIR

# Utils

In [None]:
# Every label in CHEXPERT_DISEASES except the first entry
# (presumably 'No Finding' — confirm against constants.py).
ACTUAL_DISEASES = CHEXPERT_DISEASES[1:]
ACTUAL_DISEASES

In [None]:
def collect_for_disease(df, target_disease, remove_other=True, column='text'):
    """Group the texts of ``df`` by their label value for ``target_disease``.

    Also prints, for each label value, how many texts were collected.

    Args:
        df: DataFrame with one label column per disease plus ``column``.
        target_disease: name of the label column to group by.
        remove_other: if True, keep only the rows where every other disease
            of ``ACTUAL_DISEASES`` is labeled -2 (i.e. not mentioned).
        column: name of the column holding the text to collect.

    Returns:
        A Series indexed by label value; each entry is the list of texts
        with that value, sorted by length (shortest first).

    Raises:
        ValueError: if ``remove_other`` is True and ``target_disease`` is not
            in ``ACTUAL_DISEASES``.
    """
    only_df = df

    if remove_other:
        # Keep only sentences that do not mention other diseases
        other_diseases = list(ACTUAL_DISEASES)
        other_diseases.remove(target_disease)
        only_df = only_df.loc[(only_df[other_diseases] == -2).all(axis=1)]

    grouped = only_df.groupby(target_disease)[column].apply(
        lambda texts: sorted(texts, key=len),
    )
    # Series.iteritems() was removed in pandas 2.0; items() is available in
    # both old and new versions.
    print([(valuation, len(sentences)) for valuation, sentences in grouped.items()])

    return grouped

In [None]:
# dataset_dir = IU_DIR
dataset_dir = MIMIC_DIR
# fpath = os.path.join(dataset_dir, 'reports', 'sentences.v4.csv')
# Load the per-sentence table with chexpert labels for the chosen dataset.
fpath = os.path.join(dataset_dir, 'reports', 'sentences_with_chexpert_labels.csv')
SENTENCES_DF = pd.read_csv(fpath)
SENTENCES_DF.head(1)

In [None]:
# Ad-hoc search: sentences containing a word that starts with "atelectasis"
# followed, later in the same sentence, by "consolidation".
d = SENTENCES_DF
d = d.loc[d['sentence'].str.contains(r'\batelectasis.*consolidation')]
print(len(d))
list(d['sentence'])

# Analyze sentence vs diseases

In [None]:
# Dataset-wide totals: number of rows and the sum of their 'appearances'.
TOTAL_SENTENCES = len(SENTENCES_DF)
TOTAL_APPEARANCES = SENTENCES_DF['appearances'].sum()
TOTAL_SENTENCES, TOTAL_APPEARANCES

In [None]:
# Lookup table: sentence text -> its 'appearances' value (read by count_adjectives).
APPEARANCES_BY_SENTENCE = SENTENCES_DF.set_index('sentence')['appearances'].to_dict()

## Utils

In [None]:
def remove_non_covered_info(df):
    """Keep only the rows whose 'obfuscated' and 'time' flags are False.

    The two flag columns are applied one after the other. A flag column that
    is missing from the frame is reported with a printed message and skipped.
    Returns the filtered frame.
    """
    kept_value = False
    for flag_column in ('obfuscated', 'time'):
        if flag_column in df.columns:
            df = df.loc[df[flag_column] == kept_value]
        else:
            print(f'Key not found in df: {flag_column}')
    return df

In [None]:
def collect_sentences_for_disease(target_disease, remove_other=True, remove_useless_info=True):
    """Group the sentences of ``SENTENCES_DF`` by their label for ``target_disease``.

    When ``remove_useless_info`` is True the frame is first passed through
    ``remove_non_covered_info``. The grouping itself (and the handling of
    ``remove_other``) is delegated to ``collect_for_disease`` on the
    'sentence' column.
    """
    if remove_useless_info:
        source_df = remove_non_covered_info(SENTENCES_DF)
    else:
        source_df = SENTENCES_DF

    return collect_for_disease(source_df, target_disease, remove_other, column='sentence')

In [None]:
def mentions_any_term(sentence, terms):
    """Return True if at least one of ``terms`` is contained in ``sentence``.

    Containment uses the ``in`` operator, so for strings this is a substring
    test. An empty ``terms`` yields False.
    """
    for term in terms:
        if term in sentence:
            return True
    return False

In [None]:
def array_mentions_any_term(sentences, terms):
    """Return one boolean per sentence: does it mention any of ``terms``?

    Each sentence is checked with ``mentions_any_term``; the result is a list
    in the same order as ``sentences``.
    """
    flags = []
    for sentence in sentences:
        flags.append(mentions_any_term(sentence, terms))
    return flags

In [None]:
def print_subdf_stats(subdf, name, full_df=None):
    """Print how much of ``full_df`` is covered by ``subdf``.

    Two ratios are printed: number of rows, and sum of the 'appearances'
    column, each as ``part/total (percentage)``.

    Args:
        subdf: DataFrame with an 'appearances' column (the subset).
        name: label printed as the header line.
        full_df: DataFrame with an 'appearances' column used as the total.
            Defaults to the current value of the global ``SENTENCES_DF``.
    """
    if full_df is None:
        # Resolve the default at call time: a default bound in the signature
        # is frozen when the function is defined and goes stale once the
        # notebook rebinds SENTENCES_DF.
        full_df = SENTENCES_DF

    def _percentage(part, total):
        # An empty reference frame yields 0% instead of a division by zero.
        return part / total * 100 if total else 0.0

    total_sentences = len(full_df)
    total_appearances = full_df['appearances'].sum()

    n_sent = len(subdf)
    n_appear = subdf['appearances'].sum()

    perc_sent = _percentage(n_sent, total_sentences)
    perc_appear = _percentage(n_appear, total_appearances)
    print(f'{name}:')
    print(f'\tsentences={n_sent:,}/{total_sentences:,} ({perc_sent:.2f}%)')
    print(f'\tappearances={n_appear:,}/{total_appearances:,} ({perc_appear:.2f}%)')

In [None]:
def count_adjectives(sentences, adjectives, exact=False):
    """Sum, per adjective, the appearances of the sentences that match it.

    Args:
        sentences: iterable of sentence strings; every matching sentence must
            be a key of ``APPEARANCES_BY_SENTENCE``.
        adjectives: iterable of terms. Each term is used as a regular
            expression as-is (it is not escaped); duplicates are ignored.
        exact: if True, each term is wrapped in ``\\b...\\b`` so that it only
            matches at word boundaries.

    Returns:
        List of ``(adjective, appearances)`` tuples sorted by appearances,
        highest first. Adjectives without any match are omitted.

    Raises:
        KeyError: if a matching sentence is not in ``APPEARANCES_BY_SENTENCE``.
    """
    # Local import: `re` is not imported anywhere visible in this file.
    import re

    appearances_by_sentence = APPEARANCES_BY_SENTENCE

    # De-duplicate while keeping the caller's order, so that adjectives tied
    # on appearances come out in a reproducible order (a set would not
    # guarantee that across runs).
    unique_adjectives = list(dict.fromkeys(adjectives))

    # Compile every pattern once instead of once per sentence.
    compiled = [
        (adjective, re.compile(r'\b{}\b'.format(adjective) if exact else adjective))
        for adjective in unique_adjectives
    ]

    adjectives_appearances = Counter()
    for sentence in sentences:
        for adjective, pattern in compiled:
            if pattern.search(sentence):
                adjectives_appearances[adjective] += appearances_by_sentence[sentence]

    return sorted(
        adjectives_appearances.items(),
        key=lambda item: item[1],
        reverse=True,
    )

## Annotate adjectives

TODO: move this to bottom?

### Lung adjectives and extra information

In [None]:
# Term lists, one per kind of extra detail, used below to build the
# 'lung-*' columns.

# Descriptive qualifiers.
ADJECTIVES = ['mild', 'subsegmental', 'streaky', 'minimal',
              # 'decreased', 'increased',
              'scattered', 'calcified', 'discrete', 'poorly defined',
              'diffuse', 'patchy', 'vague', 'bandlike', 'mildly', 'prominent',
              'subtle', 'asymmetric', 'strandy', 'shaped', 'rotated',
              'irregular', 'coarse', 'residual', 'maximal thickness',
              'thin', 'resolved', 'smooth',
             ]
# Anatomical location terms.
LOCATION = ['basilar', 'bibasilar', 'medial', 'bilateral', 'basal', 'bilaterally',
            'right', 'left', 'midlung', 'lung base', 'near the', 'upper lung', 'lobe',
            'hilum', 'perihilar', 'cavitary', 'rib', 'periphery', 'lingular',
            'biapical', 'apical', 'apex', 'apices', 'interstitial', 'alveolar',
           ]
# Quantity terms.
AMOUNTS = ['innumerable', 'multiple', 'three', 'a few']
# Size terms. NOTE(review): 'NUMBER' is presumably the placeholder token that
# replaces numeric values in the cleaned reports — confirm.
SIZE = ['NUMBER', 'large', 'small', 'moderate sized', 'width', 'diameter']
# Comparison terms.
COMPARISON = ['than', # e.g. right larger than left
             ]

In [None]:
# Flag, for every sentence, whether it mentions any term of each detail list.
# Uses array_mentions_any_term from the Utils section: the previously called
# new_column_mentions_any_term is not defined anywhere visible in this file.
sentences = df['sentence']
df['lung-adj'] = array_mentions_any_term(sentences, ADJECTIVES)
df['lung-loc'] = array_mentions_any_term(sentences, LOCATION)
df['lung-amount'] = array_mentions_any_term(sentences, AMOUNTS)
df['lung-size'] = array_mentions_any_term(sentences, SIZE)
df['lung-compare'] = array_mentions_any_term(sentences, COMPARISON)
df.head()

In [None]:
# Combine all 'lung-*' flags: True when the sentence carries any kind of detail.
# NOTE: on a re-run, 'lung-any-detail' itself also matches the 'lung-' prefix
# and is included in `cols`.
cols = [c for c in df.columns if c.startswith('lung-')]
df['lung-any-detail'] = df[cols].any(axis=1)
df.head()

In [None]:
# Slice of CHEXPERT_DISEASES treated here as the lung-related labels.
LUNG_DISEASES = CHEXPERT_DISEASES[3:-2]

In [None]:
# Sentences labeled 1 or -1 for at least one lung disease, and among those
# the ones flagged with extra detail; stats are relative to `d`.
d = df.loc[((df[LUNG_DISEASES] == 1) | (df[LUNG_DISEASES] == -1)).any(axis=1)]
df_detail = d.loc[(d['lung-any-detail'] == True)]
print_subdf_stats(df_detail, 'Lung details (out of positive sentences)', d)

## Heart

In [None]:
# Group all sentences by their 'Enlarged Cardiomediastinum' label, without
# filtering out other diseases (remove_other=False) or applying
# remove_non_covered_info (remove_useless_info=False).
grouped = collect_sentences_for_disease('Enlarged Cardiomediastinum', False, False)

In [None]:
# Candidate qualifiers to count in those sentences.
EC_ADJECTIVES = [
    'prominen', 'prominent', 'prominence',
    'mild', 'mildly',
    'slight', 'slightly',
    'moderate', 'moderately',
    'significantly',
    'bilaterally',
    'stable',
]

In [None]:
# Whole-word counts over the sentences labeled 1.
count_adjectives(grouped[1], EC_ADJECTIVES, exact=True)

In [None]:
# Sentences labeled 1 that contain 'cardiomediastinal'.
[s for s in grouped[1] if 'cardiomediastinal' in s]

In [None]:
# Same analysis for 'Cardiomegaly': group all sentences by label, unfiltered.
grouped = collect_sentences_for_disease('Cardiomegaly', False, False)

In [None]:
# Candidate qualifiers to count in those sentences.
CARDIOMEGALY_ADJECTIVES = [
    'mild', 'mildly',
    'slight', 'slightly',
    'moderate', 'moderately',
    'severe', 'severely',
    'borderline', 'minimal',
    'stable',
]

In [None]:
# Whole-word counts over the sentences labeled 1.
count_adjectives(grouped[1], CARDIOMEGALY_ADJECTIVES, exact=True)

## Lungs

In [None]:
# Candidate qualifiers for lung findings, grouped by kind. Also reused as the
# base of OTHER_ADJECTIVES.
LUNGS_ADJECTIVES = [
    # Size
    'small', 'large', 'borderline',
    # Amount
    'multiple',
    # Intensity
    'mild', 'mildly',
    'slight', 'slightly',
    'moderate', 'moderately',
    'severe', 'severely',
    'minimal',
    'stable',
    # Location:
    'right', 'left', 'apic', 'apical', 'biapical',
    'lobe',
    'upper', 'middle', 'lower', 'base', 'basal',
    'right upper lobe',
    'bilateral', 'bibasilar', 'basilar',
    'midlung', 'mid chest',
    'interstitial', 'perihilar',
    'lingular', 'cavitary', 'parahilar',
    # Other
    'calcified', 'noncalcified',
    'scattered', 'diffuse',
    'streaky', 'patchy',
]

In [None]:
# Group all sentences by their 'Lung Lesion' label, unfiltered.
grouped = collect_sentences_for_disease('Lung Lesion', False, False)

In [None]:
# grouped[1]

In [None]:
# Ten most frequent whole-word qualifiers among the sentences labeled 1.
count_adjectives(grouped[1], LUNGS_ADJECTIVES, exact=True)[:10]

In [None]:
# Sentences labeled 1 containing 'left', paired with their appearances.
[(s, APPEARANCES_BY_SENTENCE[s]) for s in grouped[1] if 'left' in s]

## Others

Fracture and devices

In [None]:
# Lung qualifiers plus a few extra terms for the fracture/devices sentences.
OTHER_ADJECTIVES = [
    *LUNGS_ADJECTIVES,
    'old',
    'atrium', 'quadrant', 'mid',
    'subclavian',
    'right sided',
]

In [None]:
# Group by 'Support Devices' using the function defaults (other diseases
# removed, remove_non_covered_info applied).
grouped = collect_sentences_for_disease('Support Devices')

In [None]:
# Ten most frequent whole-word qualifiers among the sentences labeled 1.
count_adjectives(grouped[1], OTHER_ADJECTIVES, exact=True)[:10]

In [None]:
# Sentences labeled 1 that contain 'right'.
[s for s in grouped[1] if 'right' in s]

## Sentences with best intra-BLEU

Chex-v2 templates

In [None]:
import numbers
from pycocoevalcap.bleu import bleu_scorer
from tqdm.auto import tqdm
from collections import defaultdict

In [None]:
def get_sentences_and_appearances_for(target_diseases, target_value, remove_other=True):
    """Select the sentences of ``SENTENCES_DF`` with given labels for some diseases.

    Args:
        target_diseases: one disease name or an iterable of names; every one
            of them must carry an accepted label.
        target_value: one numeric label or a collection of accepted labels.
        remove_other: if True, keep only rows where every disease of
            ``ACTUAL_DISEASES`` other than the targets is labeled -2.

    Returns:
        DataFrame with columns 'sentence', 'appearances' and the target
        diseases, sorted by 'appearances' (highest first), with a fresh index.
    """
    # Normalize scalar arguments into one-element tuples.
    wanted_diseases = (target_diseases,) if isinstance(target_diseases, str) else target_diseases
    wanted_values = (target_value,) if isinstance(target_value, numbers.Number) else target_value

    selected = SENTENCES_DF

    if remove_other:
        remaining = list(ACTUAL_DISEASES)
        for disease in wanted_diseases:
            remaining.remove(disease)
        no_other_mention = (selected[remaining] == -2).all(axis=1)
        selected = selected.loc[no_other_mention]

    # Apply one label filter per target disease.
    for disease in wanted_diseases:
        selected = selected.loc[selected[disease].isin(wanted_values)]

    output_columns = ['sentence', 'appearances'] + list(wanted_diseases)
    selected = selected.sort_values('appearances', ascending=False)[output_columns]

    return selected.reset_index(drop=True)
    # return list(only_df.set_index('sentence')['appearances'].items())

In [None]:
# Sentences labeled 0 or 1 for both heart-related labels (and -2 for every
# other disease, since remove_other defaults to True).
TARGET_DISEASE = ('Cardiomegaly', 'Enlarged Cardiomediastinum')
df = get_sentences_and_appearances_for(TARGET_DISEASE, (0,1))
df.head(2)

In [None]:
# df = df.append({
#     'sentence': '', # 'interval removal of catheter',
#     'appearances': 0,
#     'Support Devices': 0,
# }, ignore_index=True)

In [None]:
# Score every sentence of `df` against all sentences of `df` (itself
# included) with BLEU; the work is quadratic in the number of sentences.
sentences = list(df['sentence'])
more_columns = defaultdict(list)
# more_metadata = []
for sentence in tqdm(df['sentence']):
    # Fresh scorer per sentence: accumulates (hypothesis, [reference]) pairs.
    scorer = bleu_scorer.BleuScorer()
    
    for other in sentences:
        scorer += (sentence, [other])
        
    # NOTE(review): presumably the first returned item is the list of
    # BLEU-1..n scores — confirm against pycocoevalcap.
    bleus, _ = scorer.compute_score()
    for i, b in enumerate(bleus):
        more_columns[f'bleu{i+1}'].append(b)
    # NOTE(review): `np` is not imported in the visible part of this file;
    # presumably provided by a %run'd module — confirm.
    more_columns['bleu'].append(np.mean(bleus))
    # more_metadata.append(bleus + [bleu])

# Attach the per-n scores and their mean as new columns.
for col, values in more_columns.items():
    df[col] = values
    
# Best average BLEU first (ties broken by appearances); index by sentence.
df = df.sort_values(['bleu', 'appearances'], ascending=False)
df = df.set_index('sentence')
len(df)

In [None]:
# d = df.loc[df[TARGET_DISEASE] == 0]
# Top 40 rows of the BLEU-sorted table.
d = df
d.head(40)

In [None]:
# Look up one specific sentence (df is indexed by sentence at this point).
d = df
d.loc[d.index == 'heart and mediastinum within normal limits .']

In [None]:
# NOTE(review): this cell looks like a pasted fragment of a function body.
# `remove_other`, `only_df`, `target_value` and `target_disease` are not
# defined at top level anywhere visible in this file, so running it as-is
# would raise NameError; `dis1` is assigned but never used. Confirm whether
# it should be deleted or replaced by a call to
# get_sentences_and_appearances_for.
dis1 = 'Cardiomegaly'

if remove_other:
    other_diseases = list(ACTUAL_DISEASES)
    other_diseases.remove('Cardiomegaly')
    other_diseases.remove('Enlarged Cardiomediastinum')
    only_df = only_df.loc[(only_df[other_diseases] == -2).all(axis=1)]

# Keep rows where both heart labels take one of the accepted values.
only_df = only_df.loc[(
    only_df['Cardiomegaly'].isin(target_value) & only_df['Enlarged Cardiomediastinum'].isin(target_value)
)]
only_df = only_df.sort_values('appearances', ascending=False)
only_df = only_df[['sentence', 'appearances', target_disease]]

## Sentences with more than one disease

In [None]:
# Sentences labeled 1 or -1 for more than one disease, shortest first.
# NOTE(review): expects `df` to hold a 'sentence' column and all
# ACTUAL_DISEASES columns; confirm which earlier cell is meant to provide it.
many_diseases_df = df.loc[((df[ACTUAL_DISEASES] == 1) | (df[ACTUAL_DISEASES] == -1)).sum(axis=1) > 1]
many_diseases_df = many_diseases_df.sort_values('sentence', key=lambda x: x.str.len())
print(len(many_diseases_df))
many_diseases_df.head(1)

In [None]:
# Share of sentences/appearances, relative to the default full frame.
print_subdf_stats(many_diseases_df, 'More than 1 disease')

In [None]:
l = list(many_diseases_df['sentence'])
l

## Sentences not covered by chexpert

### Including NF

In [None]:
# Sentences where every label of CHEXPERT_DISEASES is -2, shortest first.
not_covered_df = df.loc[(df[CHEXPERT_DISEASES] == -2).all(axis=1)]
not_covered_df = not_covered_df.sort_values('sentence', key=lambda x: x.str.len())
print(len(not_covered_df))
print_subdf_stats(not_covered_df, 'Non covered including NF')
not_covered_df.head(2)

In [None]:
l1 = list(not_covered_df['sentence'])
l1

### Not including NF

In [None]:
# Sentences where every ACTUAL_DISEASES label is -2 but 'No Finding' is not,
# shortest first.
d = df.loc[(df[ACTUAL_DISEASES] == -2).all(axis=1)]
d = d.loc[d['No Finding'] != -2]
d = d.sort_values('sentence', key=lambda x: x.str.len())
print(len(d))
print_subdf_stats(d, 'Non covered diseases')
d.head(2)

In [None]:
# Distribution of the 'No Finding' label among those sentences.
Counter(d['No Finding'])

In [None]:
l2 = list(d['sentence'])
len(l2), l2

# Compare with expert-given-procedure

## Common stuff

In [None]:
# Reload SENTENCES_DF from the IU X-ray sentences file, exposing the sentence
# as a 'text' column and ordering rows by text length (shortest first).
# NOTE: this rebinds the global SENTENCES_DF read by earlier cells/functions.
fpath = os.path.join(IU_DIR, 'reports', 'sentences_with_extra_info.csv')
SENTENCES_DF = pd.read_csv(fpath)
SENTENCES_DF.rename(columns={'sentence': 'text'}, inplace=True)
SENTENCES_DF = SENTENCES_DF.sort_values('text', key=lambda x: x.str.len())

# Same treatment for whole reports: 'Reports' becomes 'text', sorted by
# length, with a fresh integer index.
fpath = os.path.join(IU_DIR, 'reports', 'reports_with_chexpert_labels.csv')
REPORTS_DF = pd.read_csv(fpath, index_col=0)
REPORTS_DF.rename(columns={'Reports': 'text'}, inplace=True)
REPORTS_DF = REPORTS_DF.sort_values('text', key=lambda x: x.str.len())
REPORTS_DF.reset_index(drop=True, inplace=True)

len(SENTENCES_DF), len(REPORTS_DF)

## Inspect

### Get sentences with text

In [None]:
# Ad-hoc search: sentences whose text matches the pattern 'mass', with the
# number of matches and their summed appearances.
d = SENTENCES_DF
# d = d.loc[(d['text'].str.contains(r'atrial') & d['text'].str.contains('large'))]
d = d.loc[d['text'].str.contains(r'mass')]
len(d), d['appearances'].sum()

In [None]:
list(d['text'])

### Group sentences by chexpert disease eval

In [None]:
# Group texts by their 'Lung Lesion' label (other diseases removed, default
# 'text' column).
g = collect_for_disease(SENTENCES_DF, 'Lung Lesion')

In [None]:
# Texts labeled 1.
g[1]

# Check sentence positions

Try to determine the best position in the report at which to mention each disease

In [None]:
import json
from collections import defaultdict, Counter
from tqdm.auto import tqdm
import math

In [None]:
%run ../../utils/nlp.py
%run ../../datasets/common/constants.py

In [None]:
# dataset_dir, version = IU_DIR, 'v4-1'
dataset_dir, version = MIMIC_DIR, 'v4-2'

In [None]:
chosen_split = 'test'

In [None]:
# Study ids that belong to the chosen split.
# NOTE(review): this path always uses MIMIC_DIR, even if dataset_dir is
# switched to IU_DIR above — confirm that is intended.
_fpath = os.path.join(MIMIC_DIR, 'master_metadata.v4-2-fixed.csv')
mimic_master_df = pd.read_csv(_fpath)
mimic_master_df = mimic_master_df.loc[mimic_master_df['split'] == chosen_split]
studies = set(mimic_master_df['study_id'])
len(studies)

In [None]:
# Clean text of the reports whose study belongs to the chosen split.
with open(os.path.join(dataset_dir, 'reports', f'reports.clean.{version}.json'), 'r') as f:
    reports_raw = list(json.load(f).values())
reports = [r['clean_text'] for r in reports_raw if r['study_id'] in studies]
len(reports)

In [None]:
# Per-sentence chexpert labels for the chosen dataset.
fpath = os.path.join(dataset_dir, 'reports', 'sentences_with_chexpert_labels.csv')
df = pd.read_csv(fpath)
# df.replace(-2, 0, inplace=True)
# df.replace(-1, 1, inplace=True)
print(len(df))
df.head(2)

In [None]:
# Map each sentence to its list of labels, ordered as CHEXPERT_DISEASES.
cols = ['sentence'] + CHEXPERT_DISEASES
sentence_to_chexpert_labels = df[cols].set_index('sentence',).transpose().to_dict('list')
len(sentence_to_chexpert_labels)

In [None]:
# Sanity check: labels of one known sentence, paired with disease names.
list(zip(CHEXPERT_DISEASES, sentence_to_chexpert_labels['the heart is enlarged .']))

In [None]:
# Walk every report sentence by sentence and record, for each disease labeled
# 0, 1 or -1 in that sentence, a (disease, sentence position, label) triple.
# Sentences missing from the label table are collected in `not_found`.
not_found = []
positions_and_disease = []

for report in reports:
    for sentence_position, sentence in enumerate(split_sentences_text(report)):
        if sentence not in sentence_to_chexpert_labels:
            not_found.append(sentence)
            continue
        chex = sentence_to_chexpert_labels[sentence]
        
        for disease, value in zip(CHEXPERT_DISEASES, chex):
            if value in (0, 1, -1):
                positions_and_disease.append((disease, sentence_position, value))
len(positions_and_disease), len(not_found)

In [None]:
# Index the triples both ways: positions per disease and diseases per position.
positions_by_disease = defaultdict(list)
diseases_by_position = defaultdict(list)
for disease, position, value in positions_and_disease:
    # Always true for the triples collected above (same value filter).
    if value in (0, -1, 1):
        positions_by_disease[disease].append(position)
        diseases_by_position[position].append(disease)
len(diseases_by_position), len(positions_by_disease)

In [None]:
# One histogram of sentence positions per disease, laid out on a 4-column grid.
n_subplots = len(CHEXPERT_DISEASES)
n_cols = 4
n_rows = math.ceil(n_subplots / n_cols)
plt.figure(figsize=(5*n_cols, 5*n_rows))

for index, disease in enumerate(CHEXPERT_DISEASES):
    positions = positions_by_disease[disease]
    
    plt.subplot(n_rows, n_cols, index+1)
    plt.title(disease)
    plt.hist(positions)
    plt.ylabel('Amount')
    plt.xlabel('Position')

In [None]:
# One bar chart per sentence position, showing how often each disease
# (excluding 'No Finding') is mentioned at that position, most frequent first.
max_position = max(diseases_by_position.keys())
disease_to_index = {
    disease: index
    for index, disease in enumerate(CHEXPERT_DISEASES)
}

# Plot exactly the positions that were observed. This keeps the subplot grid
# in sync with the number of charts, includes the last position, and avoids
# inserting empty entries into the defaultdict by indexing missing keys.
observed_positions = sorted(diseases_by_position)

n_subplots = len(observed_positions)
n_cols = 2
n_rows = math.ceil(n_subplots / n_cols)
plt.figure(figsize=(8*n_cols, 5*n_rows))

chosen_diseases = CHEXPERT_DISEASES[1:]
chosen_diseases_short = [ABN_SHORTCUTS[d] for d in chosen_diseases]

# One unit-wide bin per chosen disease index (1 .. len(CHEXPERT_DISEASES)-1).
# Fixed edges keep bin i aligned with chosen_diseases[i] even when some
# disease is absent at a position; an integer bin count would derive the
# edges from the data range instead.
# NOTE(review): assumes 'No Finding' is entry 0 of CHEXPERT_DISEASES — confirm.
bin_edges = np.arange(1, len(CHEXPERT_DISEASES) + 1)

for index, position in enumerate(observed_positions):
    diseases = [
        disease_to_index[d] for d in diseases_by_position[position]
        if d != 'No Finding'
    ]

    hist, _ = np.histogram(diseases, bins=bin_edges)
    # Sort bars by amount, highest first.
    amounts_and_disease_names = list(zip(hist, chosen_diseases_short))
    amounts_and_disease_names = sorted(amounts_and_disease_names, reverse=True)
    amounts, short_names = tuple(zip(*amounts_and_disease_names))

    plt.subplot(n_rows, n_cols, index+1)
    plt.title(position, fontsize=20)
    plt.bar(short_names, amounts)
    # plt.hist(diseases, rwidth=0.8, align='left', bins=13)
    plt.ylabel('Amount')
    plt.xticks(fontsize=16) # rotation=90

In [None]:
def get_avg_position_by_disease(values=(0, -1, 1)):
    """Average sentence position at which each disease is mentioned.

    Only the entries of ``positions_and_disease`` whose label is in
    ``values`` are taken into account.

    Returns:
        List of ``(average_position, disease)`` tuples in ascending order.
    """
    position_totals = {}
    occurrence_counts = {}
    for disease, position, value in positions_and_disease:
        if value not in values:
            continue
        position_totals[disease] = position_totals.get(disease, 0) + position
        occurrence_counts[disease] = occurrence_counts.get(disease, 0) + 1

    averages = [
        (total / occurrence_counts[disease], disease)
        for disease, total in position_totals.items()
    ]
    averages.sort()
    return averages

In [None]:
# Average position over all recorded labels (0, -1, 1).
get_avg_position_by_disease()

In [None]:
# Only labels -1 and 1.
get_avg_position_by_disease((-1, 1))

In [None]:
# Only label 0.
get_avg_position_by_disease((0, ))

In [None]:
# Diseases ordered by their average sentence position (earliest first).
# The averages are fetched here because `avg_by_disease` only exists as a
# local inside get_avg_position_by_disease; it is not defined at top level
# anywhere visible in this file.
avg_by_disease = get_avg_position_by_disease()
_, diseases_ordered = tuple(zip(*avg_by_disease))
diseases_ordered

# MIMIC sentences

In [None]:
from medai.datasets.mimic_cxr import DATASET_DIR as MIMIC_DIR

## Split in sentences

TODO: move this preprocessing to mimic notebook/preprocess?

In [None]:
import json
from collections import Counter
import pandas as pd

In [None]:
%run ../utils/nlp.py

In [None]:
# Load the cleaned MIMIC reports (a JSON object keyed by study).
fpath = os.path.join(MIMIC_DIR, 'reports', 'reports.clean.v4.json')
with open(fpath) as f:
    reports = json.load(f)
len(reports)

In [None]:
# Peek at a single report.
study = '53911762' # list(reports.keys())[0]
reports[study]

In [None]:
# Count in how many report sentences each distinct sentence appears.
sentences_appearances = Counter()

for report in reports.values():
    text = report['clean_text']
    for sentence in split_sentences_text(text):
        sentences_appearances[sentence] += 1
len(sentences_appearances)

In [None]:
df = pd.DataFrame(list(sentences_appearances.items()), columns=['sentence', 'appearances'])
print(len(df))
df.head()

In [None]:
# Persist the sentence/appearances table next to the reports.
df.to_csv(os.path.join(MIMIC_DIR, 'reports', 'sentences.csv'), index=False)

## Analyze sentences

In [None]:
# Reports with their chexpert labels.
reports_df = pd.read_csv(os.path.join(MIMIC_DIR, 'reports', 'reports_with_chexpert_labels.csv'))
print(len(reports_df))
# reports_df['n_sentences'] = [len(list(split_sentences_text(r))) for r in reports_df['Reports']]
reports_df.head(2)

In [None]:
# One (split, report_fpath) row per report file: first split seen for each path.
split_df = pd.read_csv(os.path.join(MIMIC_DIR, 'master_metadata.csv'))
split_df = split_df[['split', 'report_fpath']]
split_df = split_df.groupby('report_fpath').first().reset_index()
print(len(split_df))
split_df.head(2)

In [None]:
# Attach the split and keep only train reports. The column check makes the
# cell safe to re-run without merging twice.
if 'split' not in reports_df.columns:
    reports_df = reports_df.merge(split_df, left_on='filename', right_on='report_fpath', how='left')
    reports_df = reports_df.loc[reports_df['split'] == 'train']
print(len(reports_df))
reports_df.head(2)

In [None]:
# Count how many files share each distinct report text, most repeated first,
# and add the number of sentences of each report.
reports_appearances = reports_df.groupby('Reports')['filename'].apply(
    lambda x: len(x.values),
)
reports_appearances = reports_appearances.sort_values(ascending=False)
reports_appearances = reports_appearances.to_frame()
reports_appearances = reports_appearances.reset_index()
reports_appearances['n_sentences'] = [
    len(list(split_sentences_text(r)))
    for r in reports_appearances['Reports']
]
# The per-group count was stored under 'filename'; give it a meaningful name.
reports_appearances = reports_appearances.rename(columns={'filename': 'appearances'})
reports_appearances.head(2)

In [None]:
# Attach the chexpert labels of each report text (first row per text). The
# column check makes the cell safe to re-run without merging twice.
if 'No Finding' not in reports_appearances.columns:
    cols = ['Reports', *CHEXPERT_DISEASES]
    right = reports_df[cols].groupby('Reports').first()
    reports_appearances = reports_appearances.merge(right, on='Reports', how='left')
print(len(reports_appearances))
reports_appearances.head(2)

In [None]:
# Reports whose text matches the pattern 'focal'.
d = reports_appearances
d.loc[d['Reports'].str.contains('focal')]

In [None]:
reports_appearances.head(40)

In [None]:
# Do not truncate long text columns when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

In [None]:
# Reports with at least two diseases labeled 0.
d = reports_appearances
d = d.loc[((d[ACTUAL_DISEASES] == 0).sum(axis=1) >= 2)]
d.head(40)

### By sentence

In [None]:
# Count sentence appearances over the reports kept in reports_df, and build
# a table sorted by appearances (highest first).
sentences_appearances = Counter()

for report in reports_df['Reports']:
    for sentence in split_sentences_text(report):
        sentences_appearances[sentence] += 1
        
train_sentences_df = pd.DataFrame(
    list(sentences_appearances.items()), columns=['sentence', 'appearances'],
)
train_sentences_df = train_sentences_df.sort_values('appearances', ascending=False)
len(train_sentences_df)

In [None]:
train_sentences_df.head(20)

In [None]:
reports_appearances.head(20)

# Debug RG-templates model

In [None]:
import torch

In [None]:
%run ../datasets/vocab/__init__.py
%run ../utils/nlp.py

In [None]:
# Vocabulary of the IU X-ray dataset.
vocab = load_vocab('iu_xray')
len(vocab)

In [None]:
# Reader built on that vocabulary; used below to turn generated indices into text.
report_reader = ReportReader(vocab)

In [None]:
# %run ../models/report_generation/templates/__init__.py
%run ../models/report_generation/templates/models.py
%run ../models/report_generation/templates/chex_v1.py
%run ../models/report_generation/templates/chex_group.py

In [None]:
# Disease order passed to the template model as `order`.
ORDER = (
    'Cardiomegaly',
    'Enlarged Cardiomediastinum',
    'Consolidation',
    'Lung Opacity',
    'Atelectasis',
    'Support Devices',
    'Pleural Effusion',
    'Pleural Other',
    'Pneumonia',
    'Pneumothorax',
    'Edema',
    'Lung Lesion',
    'Fracture',
)

In [None]:
# Build the grouped template model directly from its parts instead of going
# through the factory call kept below for reference.
# model = create_rg_template_model('chex-v1-grouped', ACTUAL_DISEASES, vocab, order=ORDER)
model = GroupedTemplateRGModel(
    templates=TEMPLATES_CHEXPERT_v1,
    groups=GROUPS_v1,
    diseases=ACTUAL_DISEASES,
    vocab=vocab,
    order=ORDER,
)
model

In [None]:
# Two hand-written label vectors with 13 entries each.
# NOTE(review): presumably one entry per disease, in ACTUAL_DISEASES order — confirm.
labels = torch.tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
                       [1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0],
                      ]).long()
labels.size()

In [None]:
# Generate one report per label vector and decode it back to text.
reports = model(labels)
[
    report_reader.idx_to_text(r)
    for r in reports
]

# Test against chexpert-labeler

Check that the fixed (template) sentences are labeled as expected by the chexpert labeler

In [None]:
%run ../metrics/report_generation/chexpert.py

In [None]:
# Sentences to run through the labeler; earlier candidates are kept
# commented out for reference.
sentences = [
#     'there are pulmonary nodules or mass identified',
#     'one or more airspace opacities can be seen',
#     'pulmonary edema is seen',
#     'there is focal consolidation',
#     'there is evidence of pneumonia',
#     'no atelectasis',
#     'pleural effusion is seen',
#     'pleural thickening is present',
#     'a fracture is identified',
# 'a device is seen',
    # 'heart size is at the upper limits of normal',
    # 'the heart size is within normal limits'
    # 'acute , displaced rib fractures'
    'heart and mediastinum within normal limits',
    'the heart size and mediastinal silhouette are within normal limits',
]

In [None]:
# Label the sentences with the chexpert labeler and display the result.
labels = apply_labeler_to_column(sentences)
labels