In [None]:
import glob
import os
from pathlib import Path

# Remember where the kernel started the first time this cell runs. Without
# this anchor, every re-run of the cell would climb one more directory level
# and break the relative paths used further down.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

print(os.getcwd())
# Work from the parent of the starting directory; later cells use paths
# relative to it ('./phrases/...', './patterns/...', './data/...').
os.chdir(os.path.join(NOTEBOOK_START_DIR, '..'))
print(os.getcwd())

In [None]:
import pandas as pd

In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's source files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from src.ChexpertLabeler.loader import Loader
from src.ChexpertLabeler.stages import Aggregator, Classifier, Extractor

In [None]:
# Input location. The reports CSV is read from `root_loc + file_in`, and
# `file_in` is reused later to name the output file.
file_in = "sample_reports.csv"
root_loc = "./"

In [None]:
# Point the loader at the reports CSV, then read it in.
reports_csv = Path(root_loc + file_in)
loader = Loader(reports_path=reports_csv)  # extract_impression=True is not passed
loader.load()

In [None]:
# Display the first document of the loader's collection as a quick sanity
# check after `loader.load()`.
loader.collection.documents[0]

In [None]:
# loader.collection

In [None]:
# Extraction stage settings: one directory of "mention" phrases and one of
# "unmention" phrases, with verbose output switched on.
extractor_options = dict(
    mention_phrases_dir=Path('./phrases/mention/'),
    unmention_phrases_dir=Path('./phrases/unmention/'),
    verbose=True,
)
extractor = Extractor(**extractor_options)

In [None]:
# Run the extraction stage over the loaded collection.
# NOTE(review): the return value is discarded and later stages reuse
# `loader.collection`, so this presumably annotates the collection in place —
# confirm against Extractor.extract.
extractor.extract(loader.collection)

In [None]:
# Classification stage settings: three pattern files (pre-negation
# uncertainty, negation, post-negation uncertainty), with verbose output on.
classifier_options = {
    'pre_negation_uncertainty_path': './patterns/pre_negation_uncertainty.txt',
    'negation_path': './patterns/negation.txt',
    'post_negation_uncertainty_path': './patterns/post_negation_uncertainty.txt',
    'verbose': True,
}
classifier = Classifier(**classifier_options)

In [None]:
# Run the classification stage over the same collection.
# NOTE(review): the return value is discarded, so this presumably updates the
# collection in place — confirm against Classifier.classify.
classifier.classify(loader.collection)

In [None]:
# Label categories, in the order used for the columns of the aggregated
# label matrix further down.
CATEGORIES = [
    "No Finding",
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Lesion",
    "Lung Opacity",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

In [None]:
# Aggregation stage, configured with the category list and verbose output.
aggregator = Aggregator(CATEGORIES, verbose=True)

In [None]:
# Aggregate the classified collection into `labels`. It is indexed below as
# `labels[:, index]` with `index` running over CATEGORIES, i.e. it is used as
# a 2-D array with one column per category.
# NOTE(review): rows presumably correspond to reports, in load order — confirm.
labels = aggregator.aggregate(loader.collection)

In [None]:
# Same aggregation, but keeping the supporting sentences. The result is
# consumed below as a sequence of mappings (one per position) from category
# to an iterable of pairs, where element 0 is the value and element 1 the
# sentence.
labels_sents = aggregator.aggregate_with_sentence(loader.collection)

In [None]:
# Goal: a long-format table with one row per document, sentence and label, so that results can be grouped by sentence.

In [None]:
# Wide table: the report text followed by one column per category, each
# filled from the matching column of `labels`.
category_columns = {
    name: labels[:, position] for position, name in enumerate(CATEGORIES)
}
labeled_reports = pd.DataFrame({"Reports": loader.reports}).assign(**category_columns)

labeled_reports

In [None]:
%%time
# Flatten the per-report results into long format: one row per
# (report position, category, value/sentence pair).
#
# The rows are collected in a plain list and turned into a DataFrame once.
# The previous row-by-row DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call, which made the loop quadratic.
long_rows = [
    {'id': index, 'cat': ent, 'vals': val[0], 'sents': val[1]}
    for index, entry in enumerate(labels_sents)
    for ent, vals in entry.items()
    for val in vals
]

# Passing `columns` keeps the four expected columns (in this order) even when
# no rows were produced; column dtypes are inferred from the collected values.
df_long = pd.DataFrame(long_rows, columns=['id', 'cat', 'vals', 'sents'])

df_long.head()

In [None]:
# Frequency of each distinct entry in the `vals` column of the long table.
df_long.vals.value_counts()

In [None]:
# (rows, columns) of the long table.
df_long.shape

In [None]:
# Make sure the join key is an integer column before merging on it.
df_long = df_long.astype({'id': int})

In [None]:
# Re-read the headerless reports CSV; columns come out named 0, 1, ...
# reset_index() then exposes the row position as an explicit `index` column,
# which is used as the join key below.
reports_csv_path = root_loc + file_in
df_reports = pd.read_csv(reports_csv_path, header=None)
df_reports = df_reports.reset_index()

df_reports

In [None]:
# Attach the long-format rows to each report by row position. The left join
# keeps reports that produced no rows in `df_long`.
merged = df_reports.merge(df_long, how='left', left_on='index', right_on='id')

# Drop both join keys and give the two headerless CSV columns readable names.
df_out = merged.drop(columns=['id', 'index']).rename(
    columns={0: 'mimic_id', 1: 'report'}
)

df_out

In [None]:
# Write the merged table as 'sents_<input file name>' in the processed-data
# folder. DataFrame.to_csv does not create missing directories, so make sure
# the folder exists first instead of failing on a fresh checkout.
out_dir = './data/mimic_processed/'
os.makedirs(out_dir, exist_ok=True)
df_out.to_csv(out_dir + 'sents_' + file_in, index=False)