# Imports

In [None]:
import matplotlib
import matplotlib.pyplot as plt
# Notebook-wide figure defaults: white background and a wide 15x5 canvas.
matplotlib.rcParams.update({
    'figure.facecolor': 'white',
    'figure.figsize': (15, 5),
})

In [None]:
import pandas as pd
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
%run ../../utils/__init__.py
# Neither config_logging nor logging is defined in this notebook; both presumably
# come from the %run above (TODO confirm). Presumably sets the log level to INFO.
config_logging(logging.INFO)

# Load data

In [None]:
from medai.datasets import iu_xray, mimic_cxr
# Root directory of each dataset, as exposed by its medai dataset module.
IU_DIR, MIMIC_DIR = iu_xray.DATASET_DIR, mimic_cxr.DATASET_DIR

In [None]:
%run ../../datasets/vocab/__init__.py

In [None]:
# Load one vocabulary per dataset from its 'reports' folder. load_vocab is not
# defined in this notebook (presumably provided by the %run above); the second
# argument looks like a vocabulary version tag -- TODO confirm.
IU_VOCAB = load_vocab(os.path.join(IU_DIR, 'reports'), 'v4-1')
MIMIC_VOCAB = load_vocab(os.path.join(MIMIC_DIR, 'reports'), 'v4-2')
# Display both vocabulary sizes.
len(IU_VOCAB), len(MIMIC_VOCAB)

In [None]:
# Choose which dataset the rest of the notebook works on; swap the commented
# line to switch from IU to MIMIC.
dataset_dir, VOCAB = IU_DIR, IU_VOCAB
# dataset_dir, VOCAB = MIMIC_DIR, MIMIC_VOCAB

# Read the per-sentence CSV of the selected dataset and preview the first rows.
fpath = os.path.join(dataset_dir, 'reports', 'sentences_with_chexpert_labels.csv')
SENTENCES_DF = pd.read_csv(fpath)
SENTENCES_DF.head(3)

In [None]:
# Read the per-report CSV of the selected dataset and preview the first rows.
# Note: this overwrites the fpath used for the sentences file.
fpath = os.path.join(dataset_dir, 'reports', 'reports_with_chexpert_labels.csv')
REPORTS_DF = pd.read_csv(fpath)
REPORTS_DF.head(3)

# Topic modelling

In [None]:
import math

## Choose sentences/reports

In [None]:
# Candidate corpora as plain Python lists: one entry per sentence / per report.
sentences = SENTENCES_DF['sentence'].tolist()
reports = REPORTS_DF['Reports'].tolist()
len(sentences), len(reports)

In [None]:
# Break every sentence at its commas and keep each distinct, whitespace-stripped piece once.
subsentences = list({
    piece.strip()
    for full_sentence in sentences
    for piece in full_sentence.split(',')
})
len(subsentences)

In [None]:
# Select the corpus to model; exactly one of the three assignments should be active.
text = sentences
# text = subsentences
# text = reports
len(text)

## Text to vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF

In [None]:
%run ../../models/report_generation/word_embedding.py

In [None]:
# Does not work for now:
# (1) the embeddings contain negative values, which neither LDA nor NMF accepts
# (2) features do not represent words directly --> maybe map each feature to its
# closest word? It is unclear whether that would make sense.
class RadGloveVectorizer:
    """Vectorize tokenized texts by averaging their RadGlove word embeddings.

    Implements the small part of the scikit-learn vectorizer interface used in
    this notebook (``fit``, ``transform``, ``fit_transform`` and
    ``get_feature_names``) so it can be swapped in for ``TfidfVectorizer``.

    Args:
        vocabulary: Container of known words; words not in it are skipped.
            Required.
        stop_words: Optional iterable of words to ignore. ``None`` (the
            default) means no stop words.
        **kwargs: Accepted and ignored, so the same keyword dict can be passed
            to any of the vectorizers.

    Raises:
        ValueError: If ``vocabulary`` is None.
    """

    def __init__(self, vocabulary=None, stop_words=None, **kwargs):
        if vocabulary is None:
            raise ValueError('vocabulary is required')

        self.radglove = RadGlove()
        self.vocab = vocabulary
        # `set(None)` raises TypeError, so handle the default explicitly.
        self.stop_words = set(stop_words) if stop_words is not None else set()

    def fit(self, X, y=None):
        """Do nothing (the embeddings are pre-trained) and return ``self``."""
        return self

    def fit_transform(self, texts, y=None):
        """Return ``transform(texts)``; there is nothing to fit."""
        return self.transform(texts)

    def transform(self, texts):
        """Return one averaged embedding per text.

        Args:
            texts: Iterable of already-tokenized strings (tokens separated by
                whitespace).

        Returns:
            Array of shape ``(n_texts, self.radglove.dim)``. A text with no
            usable word (all stop words or out of vocabulary) maps to zeros.
        """
        texts = list(texts)  # allow any iterable, including generators
        vectors = np.zeros((len(texts), self.radglove.dim))

        # Convert each distinct word's embedding only once per call.
        word_vectors = {}

        for report_vector, text in zip(vectors, texts):
            n_words = 0

            for word in text.split():  # assume is tokenized
                if word in self.stop_words or word not in self.vocab:
                    continue

                vector = word_vectors.get(word)
                if vector is None:
                    vector = word_vectors[word] = self.radglove[word].numpy()

                # report_vector is a view on a row of `vectors`: update in place.
                report_vector += vector
                n_words += 1

            if n_words > 0:
                report_vector /= n_words

        return vectors

    def get_feature_names(self):
        """Return the embedding dimension indices, used as feature names."""
        return list(range(self.radglove.dim))

In [None]:
# Tokens dropped before vectorizing: common function words plus the period token.
STOP_WORDS = [
    'there',
    'the',
    'is',
    'are',
    'in',
    'on',
    '.',
    'of',
    'to',
    'a',
]

In [None]:
def tokenizer(text):
    """Split an already-tokenized text on whitespace and return its tokens."""
    return text.split()

In [None]:
# Shared keyword arguments for the vectorizers (RadGloveVectorizer ignores extras).
# token_pattern=None: the custom tokenizer replaces the regex pattern, and leaving
# the default pattern in place makes scikit-learn warn that it will not be used.
kwargs = {
    'vocabulary': VOCAB,
    'tokenizer': tokenizer,
    'token_pattern': None,
    'stop_words': STOP_WORDS,
}
# vectorizer = RadGloveVectorizer(**kwargs)
vectorizer = TfidfVectorizer(**kwargs)
# vectorizer = CountVectorizer(vocabulary=VOCAB, tokenizer=tokenizer, token_pattern=None)

In [None]:
%%time
# Vectorize the selected corpus and display the shape of the result.
vectors = vectorizer.fit_transform(text)
vectors.shape

In [None]:
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new method and fall back to the old one for
# older releases and for vectorizers that only implement the old name.
if hasattr(vectorizer, 'get_feature_names_out'):
    FEATURE_NAMES = list(vectorizer.get_feature_names_out())
else:
    FEATURE_NAMES = vectorizer.get_feature_names()
len(FEATURE_NAMES)

## Topic modelling

In [None]:
%%time
# Fit a 10-topic LDA model on the document vectors.
# No random_state is set, so results can differ between runs.
lda = LDA(n_components=10)
lda.fit(vectors)

In [None]:
# plot_topics is defined in a later cell of this notebook; run that cell before this one.
plot_topics(lda, 'LDA', yscale=8)

In [None]:
%%time
# Fit a 10-component NMF with the Kullback-Leibler loss; scikit-learn supports
# non-Frobenius losses only with the multiplicative-update ('mu') solver.
nmf = NMF(n_components=10, beta_loss='kullback-leibler', solver='mu')
nmf.fit(vectors)

In [None]:
# plot_topics is defined in a later cell of this notebook; run that cell before this one.
plot_topics(nmf, 'NMF')

In [None]:
def plot_topics(model, name, n_top_words=20, n_cols=5, yscale=8, feature_names=None):
    """Plot the highest-weighted features of every topic as horizontal bar charts.

    Args:
        model: Fitted model exposing ``components_`` with one row of feature
            weights per topic.
        name: Title drawn above the whole figure.
        n_top_words: Number of top-weighted features shown per topic.
        n_cols: Number of panels per row.
        yscale: Height given to each row of panels in the figure size.
        feature_names: Sequence mapping a feature index to its label. Defaults
            to the notebook-level ``FEATURE_NAMES``.
    """
    if feature_names is None:
        feature_names = FEATURE_NAMES

    components = model.components_
    n_topics = len(components)
    n_rows = math.ceil(n_topics / n_cols)

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 grid (where subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(n_cols * 5, n_rows * yscale),
        sharex=True,
        squeeze=False,
    )
    axes = axes.flatten()

    for topic_idx, (ax, topic) in enumerate(zip(axes, components)):
        # topic shape: n_words (features)
        # Indices of the n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]

        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax.barh(top_features, weights)
        ax.set_title(f"T{topic_idx + 1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()  # put the heaviest feature at the top
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    # Hide panels that have no topic. With sharex=True only the bottom row shows
    # x tick labels, so re-enable them on the panel directly above a hidden one.
    for unused_idx in range(n_topics, len(axes)):
        axes[unused_idx].set_visible(False)
        above_idx = unused_idx - n_cols
        if above_idx >= 0:
            axes[above_idx].tick_params(labelbottom=True)

    fig.suptitle(name, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()