# Imports

In [None]:
import os
from collections import Counter, defaultdict
import importlib

In [None]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)

In [None]:
import pandas as pd
pd.options.display.max_columns = None

In [None]:
%run ../../utils/__init__.py
config_logging(logging.INFO)

In [None]:
%run ../../datasets/common/constants.py

In [None]:
from medai.datasets import iu_xray, mimic_cxr
IU_DIR = iu_xray.DATASET_DIR
MIMIC_DIR = mimic_cxr.DATASET_DIR

# Load sentences

In [None]:
# Read the sentences CSV stored under IU_DIR/reports. Later cells use its
# 'sentence' and 'appearances' columns.
fpath = os.path.join(IU_DIR, 'reports', 'sentences_with_chexpert_labels.csv')
SENTENCES_DF = pd.read_csv(fpath)
# Preview the first three rows.
SENTENCES_DF.head(3)

## Word appearances

In [None]:
# For every whitespace-separated token, accumulate the 'appearances' value of
# the row it came from (added once per occurrence of the token in the sentence).
word_counter = Counter()
sentence_column = SENTENCES_DF['sentence']
appearances_column = SENTENCES_DF['appearances']
for sentence, appearances in zip(sentence_column, appearances_column):
    for word in sentence.split():
        word_counter[word] += appearances

# Number of distinct tokens seen.
len(word_counter)

In [None]:
# (word, count) pairs ordered from the highest count to the lowest; words with
# equal counts keep the order in which they were first counted.
word_appearances = word_counter.most_common()
word_appearances[:10]

In [None]:
word_appearances[:50]

## Grab keywords from TextRay

In [None]:
import re

In [None]:
%run ../../metrics/report_generation/abn_match/textray.py
%run ../../datasets/vocab/__init__.py

In [None]:
vocab = load_vocab(os.path.join(IU_DIR, 'reports'), 'v4')
len(vocab)

In [None]:
# Vocabulary words matched by at least one TextRay pattern; filled in by
# resolve_pattern() below.
keywords = set()

def resolve_pattern(pattern):
    """Add to `keywords` every vocab word that `pattern` matches.

    Args:
        pattern: either a regex string, searched anywhere inside each word of
            `vocab` with `re.search`, or an iterable of patterns (which may
            themselves be nested iterables) that are resolved recursively.

    Returns:
        None. The module-level `keywords` set is updated in place.
    """
    if not isinstance(pattern, str):
        # Container of patterns: descend into each entry.
        for sub_pattern in pattern:
            resolve_pattern(sub_pattern)
        return

    keywords.update(word for word in vocab if re.search(pattern, word))

# Only the pattern specs are needed here; the dictionary keys are not used.
for pattern_spec in _TEXTRAY_PATTERNS.values():
    resolve_pattern(pattern_spec)

len(keywords)

In [None]:
keywords

In [None]:
set(vocab) - keywords

## Embedding with keywords

In [None]:
import numpy as np

In [None]:
# Column index of each keyword in the one-hot embedding.
# The keywords are sorted before numbering because iteration order of a set of
# strings is not stable across interpreter sessions (string hashes are
# randomized), which would silently permute the embedding columns between runs.
keyword_to_index = {
    word: idx
    for idx, word in enumerate(sorted(keywords))
}

In [None]:
def compute_embeddings_onehot_keywords(sentences):
    """Build a binary keyword-presence matrix for `sentences`.

    Args:
        sentences: sized sequence of strings; each one is tokenized with
            `str.split()`.

    Returns:
        np.ndarray of shape (len(sentences), len(keywords)). Entry [i, j] is 1
        when sentence i contains the keyword that `keyword_to_index` maps to
        column j, and 0 otherwise. Words that are not keywords are ignored.
    """
    n_rows, n_cols = len(sentences), len(keywords)
    onehot = np.zeros((n_rows, n_cols))

    for row, sentence in enumerate(sentences):
        for token in sentence.split():
            if token in keyword_to_index:
                onehot[row, keyword_to_index[token]] = 1
    return onehot

In [None]:
# No visible cell defines `compute_embeddings`, so the previous call raised a
# NameError; this section builds the one-hot keyword embedding, so use
# compute_embeddings_onehot_keywords.
embeddings = compute_embeddings_onehot_keywords(list(SENTENCES_DF['sentence']))

In [None]:
len(keywords)

## Embedding with RadGlove

In [None]:
%run ../../models/report_generation/word_embedding.py

In [None]:
radglove = RadGlove()

In [None]:
def compute_embeddings_radglove(sentences):
    """Embed every sentence as the sum of the RadGlove vectors of its words.

    Args:
        sentences: iterable of strings; each one is tokenized with
            `str.split()`.

    Returns:
        np.ndarray built from one accumulated vector per sentence. Each
        accumulator starts as 100 zeros, so a sentence without words yields
        the zero vector.
    """
    def _sum_word_vectors(sentence):
        # The accumulator size is hard-coded to 100 -- presumably the RadGlove
        # vector size; confirm against the RadGlove model.
        total = np.zeros(100)
        for token in sentence.split():
            total += radglove[token].numpy()
        return total

    return np.array([_sum_word_vectors(sentence) for sentence in sentences])

In [None]:
embeddings = compute_embeddings_radglove(SENTENCES_DF['sentence'])
embeddings.shape

# Clustering

In [None]:
def group_by_cluster(cluster_instance, sentences):
    """Bucket sentences by the cluster label assigned to each of them.

    Args:
        cluster_instance: object exposing `labels_`, one label per sentence
            and in the same order as `sentences`.
        sentences: sized iterable of sentences.

    Returns:
        defaultdict(list) mapping each label to its sentences, in input order.

    Raises:
        AssertionError: if `sentences` and `labels_` differ in length.
    """
    labels = cluster_instance.labels_
    assert len(sentences) == len(labels)

    grouped = defaultdict(list)
    for label, sentence in zip(labels, sentences):
        grouped[label].append(sentence)
    return grouped

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

In [None]:
kmeans = KMeans(n_clusters=40, random_state=123)

In [None]:
%%time
kmeans.fit(embeddings)

In [None]:
clusters = group_by_cluster(kmeans, SENTENCES_DF['sentence'])

{k:len(v) for k, v in clusters.items()}

In [None]:
clusters[0]

In [None]:
%%time

dbscan = DBSCAN(eps=20) # , metric='manhattan'
dbscan.fit(embeddings)
len(dbscan.components_)

In [None]:
clusters = group_by_cluster(dbscan, SENTENCES_DF['sentence'])

len(clusters), len(clusters[-1])

In [None]:
clusters[-1]

In [None]:
# Sentences that received cluster label -1 in the previous DBSCAN run.
outsider_sentences = clusters[-1]
# No visible cell defines `compute_embeddings` (NameError). These sentences are
# re-clustered below with a Manhattan metric and eps=1, which fits the binary
# keyword embedding, so compute_embeddings_onehot_keywords is used.
emb2 = compute_embeddings_onehot_keywords(outsider_sentences)
emb2.shape

In [None]:
%%time

dbscan = DBSCAN(eps=1, metric='manhattan')
dbscan.fit(emb2)

In [None]:
clusters2 = group_by_cluster(dbscan, outsider_sentences)

len(clusters2), len(clusters2[-1])

In [None]:
clusters2[10]

In [None]:
sentences = list(SENTENCES_DF['sentence'])

In [None]:
[s for s in sentences if re.search(r'reflecting', s)]

In [None]:
%run ../../metrics/report_generation/chexpert.py

In [None]:
# Run the labeler on near-identical "no ... consolidations" phrasings, each
# with and without a trailing period, to compare the labels they receive.
# NOTE(review): the second entry ends in 'consolidations.' with no space before
# the period, while the other dotted variants use ' .' -- confirm whether that
# difference is intentional.
labels = apply_labeler_to_column([
    'no focal airspace consolidations',
    'no focal airspace consolidations.',
    'no consolidations',
    'no consolidations .',
    'no focal consolidations',
    'no focal consolidations .',
    'no airspace consolidations',
    'no airspace consolidations .',
], caller_id='testing-consolidations')
labels

# By vocab embedding

In [None]:
from tqdm import tqdm

## Word overlap

In [None]:
l = list(SENTENCES_DF['sentence'])
l[:10]

In [None]:
def word_overlap(sentence1, sentence2):
    """Return the Dice overlap between the word sets of two sentences.

    Both sentences are tokenized with `str.split()` and reduced to sets of
    distinct words, so numerator and denominator count the same thing and a
    sentence always scores 1.0 against itself, even when it repeats words.

    Args:
        sentence1: first sentence.
        sentence2: second sentence.

    Returns:
        2 * |A & B| / (|A| + |B|) as a float in [0, 1]; 0.0 when neither
        sentence contains any word.
    """
    words1 = set(sentence1.split())
    words2 = set(sentence2.split())

    total = len(words1) + len(words2)
    if total == 0:
        # Two empty sentences: avoid dividing by zero.
        return 0.0
    return 2 * len(words1 & words2) / total

In [None]:
a, b = l[0], l[2]
a, b, word_overlap(a, b)

In [None]:
def compute_overlap_matrix(sentences):
    """Compute the pairwise Dice word overlap between all sentences.

    Each sentence is reduced to the set of its whitespace-separated words; no
    keyword filtering is applied.

    Args:
        sentences: iterable of strings.

    Returns:
        Symmetric np.ndarray of shape (n, n). Off-diagonal entries hold
        2 * |A & B| / (|A| + |B| + 1e-5); the small constant keeps the
        division defined for two empty sentences. The diagonal is set to 1.
    """
    word_sets = [set(sentence.split()) for sentence in sentences]
    total = len(word_sets)
    matrix = np.zeros((total, total))

    for row in tqdm(range(total)):
        row_words = word_sets[row]
        matrix[row, row] = 1
        # Only the upper triangle is computed; each value is mirrored.
        for col in range(row + 1, total):
            col_words = word_sets[col]
            shared = len(row_words & col_words)
            score = 2 * shared / (len(row_words) + len(col_words) + 1e-5)
            matrix[row, col] = score
            matrix[col, row] = score

    return matrix

In [None]:
overlaps = compute_overlap_matrix(SENTENCES_DF['sentence'])
overlaps.shape

In [None]:
overlaps

In [None]:
# distances = 1 - overlaps
# Convert similarity to distance with 1/overlap - 1, so an overlap of 1 maps to
# distance 0. Zero overlaps are floored to a tiny positive value so that
# sentences sharing no word end up very far apart; replacing them with 1 (as
# was done before) gave those pairs distance 0, i.e. treated them as identical.
distances = 1 / np.maximum(overlaps, 1e-5) - 1

In [None]:
distances

## Hierarchical keywords

In [None]:
keywords

In [None]:
### WIP: selecting a "handful" of keywords for sentence clustering
# 'aorta',
# 'aortic',
# 'aorticopulmonary',
# 'aorto',
# 'arthritic',
# 'arthritis',
# 'atelectasis',
# 'atelectatic',
# 'blunted',
# 'blunting',
# 'bochdalek',
# 'bone',
# 'bony',
# 'bronchial',
# 'bronchiectatic',
# 'bronchopleural',
# 'bronchopulmonary',
# 'bronchovascular',
# 'calcific',
# 'calcification',
# 'calcifications',
# 'calcified',
# 'cardiac',
# 'cardiomediastinal',
# 'cardiomegaly',
# 'catheter',
# 'chf',
# 'cholecystectomy',
# 'clips',
# 'congestion',
# 'consolidated',
# 'consolidating',
# 'consolidation',
# 'consolidations',
# 'consolidative',
# 'contour',
# 'contours',
# 'costodiaphragmatic',
# 'costophrenic',
# 'cyst',
# 'cystic',
# 'degenerative',
# 'dextrocurvature',
# 'dextroscoliosis',
# 'diaphragm',
# 'diaphragmatic',
# 'diaphragms',
# 'ectasia',
# 'ectatic',
# 'edema',
# 'effusion',
# 'effusions',
# 'elevated',
# 'elevation',
# 'endotracheal',
# 'enlarged',
# 'enlargement',
# 'epigastric',
# 'epigastrium',
# 'esophagogastric',
# 'fibronodular',
# 'fibrotic',
# 'fissure',
# 'fissures',
# 'fluid',
# 'fracture',
# 'fractured',
# 'fractures',
# 'gastric',
# 'gastroesophageal',
# 'gastrostomy',
# 'granuloma',
# 'granulomas',
# 'granulomata',
# 'granulomatous',
# 'heart',
# 'hemidiaphragm',
# 'hemidiaphragms',
# 'hernia',
# 'hiatal',
# 'hiatus',
# 'hilar',
# 'histoplasmosis',
# 'hydropneumothorax',
# 'hyperexpanded',
# 'hyperexpansion',
# 'hyperinflated',
# 'hyperinflation',
# 'icd',
# 'infiltrate',
# 'infiltrates',
# 'infrahilar',
# 'interstitial',
# 'intervertebral',
# 'ivc',
# 'juxtahilar',
# 'kyphosis',
# 'kyphotic',
# 'large',
# 'largely',
# 'larger',
# 'largest',
# 'levocurvature',
# 'levoscoliosis',
# 'line',
# 'lines',
# 'liver',
# 'lymph',
# 'lymphadenopathy',
# 'lymphangitic',
# 'lymphoma',
# 'lymphoproliferative',
# 'mass',
# 'masses',
# 'masslike',
# 'mediastinal',
# 'mediastinum',
# 'mitral',
# 'morgagni',
# 'narrow',
# 'narrowed',
# 'narrowing',
# 'nasogastric',
# 'nodular',
# 'nodularity',
# 'nodule',
# 'nodules',
# 'noncalcified',
# 'nonenlarged',
# 'nonrib',
# 'opacification',
# 'opacities',
# 'opacity',
# 'orthopedic',
# 'osseous',
# 'osteoarthritis',
# 'osteopenia',
# 'osteophyte',
# 'osteophytes',
# 'osteoporosis',
# 'pacemaker',
# 'parahilar',
# 'paramediastinal',
# 'paratracheal',
# 'pattern',
# 'peribronchial',
# 'perihilar',
# 'peritracheal',
# 'picc',
# 'pleural',
# 'pneumothoraces',
# 'pneumothorax',
# 'portacatheter',
# 'postop',
# 'postoperative',
# 'pretracheal',
# 'prevertebral',
# 'prominence',
# 'prominent',
# 'prosthetic',
# 'pseudofissure',
# 'pulmonary',
# 'radiodensity',
# 'redistribution',
# 'reticulonodular',
# 'retrocardiac',
# 'retrohilar',
# 'revascularization',
# 'rib',
# 'ribs',
# 'scoliosis',
# 'scoliotic',
# 'silhouette',
# 'silhouettes',
# 'skeletal',
# 'soft',
# 'sternotomy',
# 'subdiaphragmatic',
# 'subpleural',
# 'suprahilar',
# 'svc',
# 'thickening',
# 'tissue',
# 'tissues',
# 'top',
# 'tortuosity',
# 'tortuous',
# 'trachea',
# 'tube',
# 'uncalcified',
# 'unchanged',
# 'unfolded',
# 'unfolding',
# 'unremarkable',
# 'upper',
# 'valve',
# 'vascular',
# 'vascularity',
# 'vertebrae',
# 'vertebral',
# 'widened',
# 'widening'

In [None]:
level1 = keywords
level0 = None # any

In [None]:
# Word filters used by compute_h_overlap_matrix, listed from the highest to the
# lowest weight. A sentence is reduced to the words contained in each level;
# `None` keeps every word.
levels = [
    keywords,
    None,
]
# The level at position `index` is weighted by base ** (max_exp - index), so
# with two levels the keyword overlap counts 10x and the all-words overlap 1x.
base = 10
max_exp = len(levels) - 1

In [None]:
def compute_h_overlap_matrix(sentences):
    """Compute a pairwise, level-weighted word overlap between all sentences.

    Every sentence is turned into one word set per entry of the module-level
    `levels` list (a level of `None` keeps all words, otherwise only words
    contained in the level are kept). For each pair of sentences the Dice
    overlap of the sets at each level is weighted by
    `base ** (max_exp - level_position)` and the weighted values are summed.

    Args:
        sentences: iterable of strings, tokenized with `str.split()`.

    Returns:
        Symmetric np.ndarray of shape (n, n) with the weighted overlaps off
        the diagonal. The diagonal is set to 1 regardless of the weights.
    """
    def _dice(words_a, words_b):
        # 1e-5 keeps the division defined when both sets are empty.
        shared = len(words_a & words_b)
        return 2 * shared / (len(words_a) + len(words_b) + 1e-5)

    def _split_by_level(sentence):
        tokens = sentence.split()
        return [
            set(tokens) if level is None else {tok for tok in tokens if tok in level}
            for level in levels
        ]

    per_level_sets = [_split_by_level(sentence) for sentence in sentences]
    total = len(per_level_sets)
    matrix = np.zeros((total, total))

    for row in tqdm(range(total)):
        row_levels = per_level_sets[row]
        matrix[row, row] = 1
        # Only the upper triangle is computed; each value is mirrored.
        for col in range(row + 1, total):
            col_levels = per_level_sets[col]
            score = sum(
                _dice(row_words, col_words) * base ** (max_exp - depth)
                for depth, (row_words, col_words) in enumerate(zip(row_levels, col_levels))
            )
            matrix[row, col] = score
            matrix[col, row] = score

    return matrix

In [None]:
l = list(SENTENCES_DF['sentence'])
# l = l[:100]
overlaps = compute_h_overlap_matrix(l)
overlaps.shape

In [None]:
overlaps

In [None]:
# Convert the weighted overlaps into distances with 1/overlap. Zero overlaps
# are floored to a tiny positive value so that sentences sharing no word are
# the farthest apart; replacing them with 1 (as was done before) placed them
# closer than pairs with a small but positive overlap.
distances = 1 / np.maximum(overlaps, 1e-5)
# A sentence is at distance 0 from itself.
np.fill_diagonal(distances, 0)
distances

## Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

In [None]:
%%time

dbscan = DBSCAN(eps=0.1, metric='precomputed', min_samples=2)
dbscan.fit(distances)

In [None]:
clusters = group_by_cluster(dbscan, SENTENCES_DF['sentence'])
len(clusters), len(clusters[-1])

In [None]:
clusters[2]

In [None]:
clusters.keys()