# Imports

In [1]:
import pandas as pd
import os
from collections import defaultdict

In [2]:
# %run ../utils/__init__.py
%run ../metrics/report_generation/labeler_correctness/cache.py

# Load Cache file

In [71]:
# Location of the sentence/label cache CSV. It is read right below and
# written back by the last cell of this notebook.
# NOTE(review): LABELER_CACHE_DIR is not defined in this notebook — presumably
# it is provided by the `%run` of cache.py above; confirm.
FPATH = os.path.join(LABELER_CACHE_DIR, 'sentences_chexpert.csv')

In [3]:
# Load the cached sentence/label table, report its row count and preview it.
df = pd.read_csv(FPATH)
print(df.shape[0])
df.head()

1937880


Unnamed: 0,sentences,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Lesion,Lung Opacity,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,apical placement proximal opacity convexity bl...,-2.0,-2.0,-2.0,-2.0,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
1,hilar postoperative cannot near curvature two ...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
2,hilar probable great detecting comparisons yea...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
3,possibly hemidiaphragm subcentimeter midclavic...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
4,decreased decreased eventration blunted dictat...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0


In [4]:
# Pull the raw sentence column into a plain Python list and compare the
# total number of sentences with the number of distinct ones.
sentences = df['sentences'].tolist()
(len(sentences), len(set(sentences)))

(1937880, 1937880)

In [5]:
# Peek at the first 20 raw sentences.
sentences[:20]

['apical placement proximal opacity convexity blunted contain marking years blunted blunted midclavicular questioned recurrence evolving clavicle clavicle clavicle clavicle .',
 'hilar postoperative cannot near curvature two cannot determined eventration calcification degenerate frontal cannot midclavicular aspect blunted evolving clavicle clavicle .',
 'hilar probable great detecting comparisons years blunted rightward degree exposure blunted geographic suggests midline questioned recurrence recurrence it blunted .',
 'possibly hemidiaphragm subcentimeter midclavicular radiating rightward resection both definitely blunted probable perihilar blunted were blunted blunted reaches inferior blunted .',
 'decreased decreased eventration blunted dictation dictation left-to-right suggests accounting closely blunted dictation cardio volume pericardial blunted evolving compartment clavicle .',
 'decreased aspect subcentimeter midclavicular radiating rightward resection both definitely blunted p

## Check empty sentences

In [6]:
# Count the sentences that have no tokens at all (empty or whitespace-only).
sum(not sentence.split() for sentence in sentences)

0

## Remove trailing dot

In [32]:
# Strip a trailing '.' token from every sentence and record which of the
# stripped sentences show up more than once.
# Counts default to 1, so the first repeat of a sentence is stored as 2.
repeated_sentences = defaultdict(lambda: 1)
reduced_sentences = set()

for raw_sentence in sentences:
    tokens = raw_sentence.split()
    # Drop only the final token, and only when it is a lone dot.
    stripped = ' '.join(tokens[:-1] if tokens[-1] == '.' else tokens)
    if stripped in reduced_sentences:
        repeated_sentences[stripped] += 1
    else:
        reduced_sentences.add(stripped)

(len(reduced_sentences), len(sentences))

(1729616, 1729625)

In [33]:
# Show the sentences that became identical once the trailing dot was removed,
# together with how many times each one occurs.
repeated_sentences

defaultdict(<function __main__.<lambda>()>,
            {'heart size is normal and lungs are clear': 2,
             'heart size normal . lungs clear': 2,
             'UNK vascular congestion and diffuse interstitial edema UNK stable cardiomegaly UNK low lung volumes': 2,
             'lungs are clear without focal consolidation , effusion , or pneumothorax . normal heart size . bony thorax and soft tissues grossly unremarkable': 2,
             'UNK low lung volumes UNK lungs appear clear UNK heart and pulmonary xxxx are normal UNK pleural spaces are clear': 2,
             'heart size normal and lungs are clear': 2,
             'heart size is normal the lungs are clear': 2,
             'heart size is normal . lungs are clear . no evidence of tuberculosis': 2,
             'no evidence of acute cardiopulmonary disease': 2})

## Remove repeated tokens

In [8]:
def remove_duplicated_tokens(tokens):
    """Collapse each run of consecutive equal tokens into a single token.

    Only adjacent repeats are dropped; a token that reappears later, after a
    different token, is kept.

    Args:
        tokens: indexable sequence of tokens (e.g. a list of strings).

    Returns:
        A new list with consecutive duplicates removed.
    """
    collapsed = []
    for position, current in enumerate(tokens):
        # The first token is always kept; later ones only if they differ
        # from their immediate predecessor.
        if position == 0 or current != tokens[position - 1]:
            collapsed.append(current)
    return collapsed

In [26]:
# Sanity check: only the adjacent 'there' pair is collapsed; the final,
# non-adjacent 'there' is kept.
remove_duplicated_tokens(['there', 'there', 'is', 'stable', 'there'])

['there', 'is', 'stable', 'there']

In [41]:
# Drop the 'END', ',', '.' and 'xxxx' tokens, collapse adjacent repeated
# tokens, and record which of the resulting sentences occur more than once.
# Counts default to 1, so the first repeat of a sentence is stored as 2.
repeated_sentences = defaultdict(lambda: 1)
reduced_sentences = set()

for raw_sentence in sentences:
    kept_tokens = remove_duplicated_tokens([
        token
        for token in raw_sentence.split()
        if token not in ('END', ',', '.', 'xxxx')
    ])
    reduced = ' '.join(kept_tokens)
    if reduced in reduced_sentences:
        repeated_sentences[reduced] += 1
    else:
        reduced_sentences.add(reduced)

(len(reduced_sentences), len(sentences))

(1483520, 1729625)

In [42]:
# Rank the repeated sentences from most to least frequent.
sorted(repeated_sentences.items(), key=lambda pair: pair[1], reverse=True)

[('without', 767),
 ('the', 618),
 ('from as', 587),
 ('of', 444),
 ('and', 426),
 ('', 416),
 ('are', 364),
 ('compatible without', 354),
 ('in', 340),
 ('are of', 302),
 ('from', 273),
 ('of the', 251),
 ('are in', 241),
 ('of in', 239),
 ('on', 235),
 ('the of', 218),
 ('or', 211),
 ('the right', 206),
 ('are the', 205),
 ('no', 200),
 ('without from', 198),
 ('no of', 190),
 ('pleural', 188),
 ('with', 185),
 ('is in', 185),
 ('is', 184),
 ('to without', 179),
 ('the are', 178),
 ('noted', 170),
 ('is of', 167),
 ('from without from', 161),
 ('no are', 153),
 ('to', 152),
 ('the of the', 151),
 ('and of', 151),
 ('and without', 151),
 ('and the', 148),
 ('and in', 146),
 ('in on', 143),
 ('of noted', 141),
 ('from without', 140),
 ('are of in', 139),
 ('the in', 138),
 ('the and', 137),
 ('no of the', 135),
 ('and noted', 133),
 ('are noted', 130),
 ('no the', 129),
 ('the left', 122),
 ('degenerative in', 122),
 ('and are', 120),
 ('and with', 120),
 ('the lung', 118),
 ('right', 

# Clean sentences

## Apply cleaning

In [60]:
%run ../metrics/report_generation/labeler_correctness/light_labeler.py

In [61]:
# Quick check of clean_sentence on a hand-made token list; the displayed
# result keeps only 'there' and 'asdf'.
# NOTE(review): clean_sentence is not defined in this notebook — presumably it
# is provided by the `%run` of light_labeler.py above; confirm.
clean_sentence('there - there / &lt  asdf UNK'.split())

['there', 'asdf']

In [62]:
# Clean every raw sentence (split on whitespace, clean the tokens, re-join
# with single spaces), then compare distinct vs. total counts.
clean_sentences = list(map(
    lambda raw: ' '.join(clean_sentence(raw.split())),
    df['sentences'],
))
(len(set(clean_sentences)), len(clean_sentences), len(df))

(1643585, 1937880, 1937880)

In [63]:
# Attach the cleaned text as a new column next to the raw sentences.
# NOTE: this adds the column to `df` in place.
df['clean_sentences'] = clean_sentences
df.head()

Unnamed: 0,sentences,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Lesion,Lung Opacity,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,clean_sentences
0,apical placement proximal opacity convexity bl...,-2.0,-2.0,-2.0,-2.0,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,apical placement proximal opacity convexity bl...
1,hilar postoperative cannot near curvature two ...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,hilar postoperative cannot near curvature two ...
2,hilar probable great detecting comparisons yea...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,hilar probable great detecting comparisons yea...
3,possibly hemidiaphragm subcentimeter midclavic...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,possibly hemidiaphragm subcentimeter midclavic...
4,decreased decreased eventration blunted dictat...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,decreased eventration blunted dictation left-t...


## Remove duplicated sentences

In [64]:
# Names of the columns whose name does not contain 'sentence', and how many
# of them there are.
cols = list(filter(lambda name: 'sentence' not in name, df.columns))
len(cols)

14

In [65]:
# Keep exactly one row per distinct cleaned sentence: the first row in which
# that cleaned sentence appears.
# drop_duplicates keeps that row intact, whereas GroupBy.first() takes the
# first non-null value of each column separately and can therefore stitch
# together values from different rows when some cells are NaN.
# The result is indexed and sorted by 'clean_sentences', i.e. the same layout
# a groupby on that column produces.
unique_df = (
    df
    .drop_duplicates(subset='clean_sentences', keep='first')
    .set_index('clean_sentences')
    .sort_index()
)
unique_df.head()

Unnamed: 0_level_0,sentences,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Lesion,Lung Opacity,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
clean_sentences,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,END .,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
NUMBER,NUMBER .,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
NUMBER a,NUMBER a .,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
NUMBER a NUMBER in NUMBER of,xxxx NUMBER a NUMBER in xxxx NUMBER of xxxx xx...,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
NUMBER a NUMBER in the cm in a cm there of in to NUMBER effusion,xxxx xxxx xxxx xxxx NUMBER a NUMBER xxxx in th...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,1.0,-2.0,-2.0,-2.0


In [66]:
# Number of rows left after de-duplication (one per distinct cleaned sentence).
len(unique_df)

1643585

In [77]:
# Turn the cleaned text back into a regular column, discard the raw text, and
# expose the cleaned text under the original 'sentences' column name.
unique_df = (
    unique_df
    .reset_index(drop=False)
    .drop(columns='sentences')
    .rename(columns={'clean_sentences': 'sentences'})
)
unique_df.head()

Unnamed: 0,sentences,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Lesion,Lung Opacity,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
1,NUMBER,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
2,NUMBER a,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
3,NUMBER a NUMBER in NUMBER of,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
4,NUMBER a NUMBER in the cm in a cm there of in ...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,1.0,-2.0,-2.0,-2.0


In [78]:
# Persist the de-duplicated table without writing the row index.
# NOTE(review): FPATH is the same file that was loaded into `df` at the top of
# this notebook, so this overwrites the input and a later run would start from
# already-cleaned data; confirm that is intended.
unique_df.to_csv(FPATH, index=False)