## VinDR
Get the label co-occurrence matrix to determine how the classes should be split. Ideally, labels that often occur together should go into the same group.

In [1]:
import seaborn as sns
import pandas as pd
# Load the per-image VinDr-CXR label table.
info = pd.read_pickle('data/vindr_cxr_labels.pkl')

# The 28 label columns to compare; their order fixes the row/column order of
# the displayed matrix.
cols = [
    'Aortic enlargement',
    'Atelectasis',
    'Calcification',
    'Cardiomegaly',
    'Clavicle fracture',
    'Consolidation',
    'Edema',
    'Emphysema',
    'Enlarged PA',
    'ILD',
    'Infiltration',
    'Lung Opacity',
    'Lung cavity',
    'Lung cyst',
    'Mediastinal shift',
    'Nodule/Mass',
    'Pleural effusion',
    'Pleural thickening',
    'Pneumothorax',
    'Pulmonary fibrosis',
    'Rib fracture',
    'Other lesion',
    'COPD',
    'Lung tumor',
    'Pneumonia',
    'Tuberculosis',
    'Other diseases',
    'No finding',
]

# Pairwise correlation between label columns, used as a co-occurrence proxy
# when deciding which classes belong in the same group.
label_columns = info[cols]
corr = label_columns.corr()

# Last expression of the cell: render the matrix as a heat-mapped table.
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Aortic enlargement,Atelectasis,Calcification,Cardiomegaly,Clavicle fracture,Consolidation,Edema,Emphysema,Enlarged PA,ILD,Infiltration,Lung Opacity,Lung cavity,Lung cyst,Mediastinal shift,Nodule/Mass,Pleural effusion,Pleural thickening,Pneumothorax,Pulmonary fibrosis,Rib fracture,Other lesion,COPD,Lung tumor,Pneumonia,Tuberculosis,Other diseases,No finding
Aortic enlargement,1.0,0.041825,0.176993,0.643071,0.063068,0.076466,0.019219,0.068356,0.121737,0.132257,0.139395,0.277982,0.022222,0.021297,0.057272,0.208834,0.218146,0.426645,0.027066,0.29488,0.102392,0.285935,0.031201,0.102468,0.181835,0.110629,0.746344,-0.721954
Atelectasis,0.041825,1.0,0.090288,0.022137,-0.004985,0.181268,-0.003336,0.031522,-0.000562,0.081453,0.102755,0.152923,0.063802,0.025479,0.345062,0.081855,0.213052,0.146257,0.133346,0.183338,0.002849,0.130763,-0.005708,0.058784,0.201998,0.199055,0.155614,-0.190006
Calcification,0.176993,0.090288,1.0,0.127142,0.00704,0.035722,0.016924,0.004177,0.033849,0.112046,0.023064,0.081475,0.024899,0.03869,0.082517,0.225578,0.07101,0.190578,0.003263,0.201722,0.029224,0.178851,0.036431,0.051402,0.054143,0.167794,0.280659,-0.294477
Cardiomegaly,0.643071,0.022137,0.127142,1.0,0.054047,0.060075,0.053344,0.010971,0.161352,0.081153,0.074072,0.186953,0.006145,0.003201,0.014986,0.106824,0.13164,0.277585,0.002728,0.150736,0.06379,0.215645,-0.008717,0.039769,0.124184,0.004093,0.648057,-0.630447
Clavicle fracture,0.063068,-0.004985,0.00704,0.054047,1.0,0.002457,-0.00108,-0.002751,-0.003544,0.015248,0.014031,0.024295,-0.002323,-0.001773,-0.003922,0.008168,0.034842,0.040298,-0.003207,0.036775,0.089704,0.054473,-0.001848,-0.005828,0.028843,0.003328,0.061384,-0.061828
Consolidation,0.076466,0.181268,0.035722,0.060075,0.002457,1.0,0.02222,0.004729,0.018446,0.104832,0.258108,0.358308,0.108176,0.025285,0.113286,0.162976,0.272994,0.169977,0.117454,0.182531,0.040444,0.123224,0.000404,0.209971,0.564581,0.14798,0.165424,-0.246176
Edema,0.019219,-0.003336,0.016924,0.053344,-0.00108,0.02222,1.0,0.028502,0.092126,0.029003,0.049291,0.038251,-0.001555,0.045757,-0.002625,-0.006579,0.035278,0.009042,-0.002146,-0.002236,-0.002019,0.041509,0.043819,-0.0039,0.026549,0.003202,0.043146,-0.041378
Emphysema,0.068356,0.031522,0.004177,0.010971,-0.002751,0.004729,0.028502,1.0,0.021896,0.031857,0.042464,0.034519,0.038468,0.182017,0.035453,0.046794,0.078823,0.104537,0.02536,0.092669,0.027595,0.074372,0.494131,0.001541,0.031685,0.091839,0.100811,-0.100035
Enlarged PA,0.121737,-0.000562,0.033849,0.161352,-0.003544,0.018446,0.092126,0.021896,1.0,0.049591,0.036243,0.08035,0.01692,-0.003894,0.004509,0.055379,0.047062,0.062598,0.000957,0.049899,0.00187,0.12333,0.00977,0.031872,0.046441,0.014286,0.14158,-0.135777
ILD,0.132257,0.081453,0.112046,0.081153,0.015248,0.104832,0.029003,0.031857,0.049591,1.0,0.212535,0.117244,0.068498,0.054011,0.047832,0.120364,0.127996,0.159188,0.019561,0.199408,0.022598,0.128703,0.044507,0.073577,0.314989,0.155122,0.227839,-0.282867


## Meta Split
Split data into meta train/test/validation sets. Images that have any test or validation labels are excluded from the meta-train set.

In [None]:
import pandas as pd
import numpy as np

from utils.labels import VINDR_SPLIT

# --- Configuration ---------------------------------------------------------
SPLIT_NAMES = ['train', 'test', 'val']
# Images carrying labels from BOTH the test and val class groups are steered by
# these classes: validation first, then test.
VAL_PRIORITY = ['COPD']
TEST_PRIORITY = ['Edema', 'Lung cyst', 'Clavicle fracture', 'Lung cavity']
# Fixed seed so the random tie-break below yields the same split on every run.
SEED = 0
rng = np.random.default_rng(SEED)

info = pd.read_pickle('data/vindr_cxr_labels.pkl')

# For every row, flag which class groups contain at least one positive label.
group_flags = np.column_stack([
    (info[list(VINDR_SPLIT[name])].sum(axis=1) > 0).to_numpy()
    for name in SPLIT_NAMES
])

# image_id -> list of class groups ('train' / 'test' / 'val') the image has
# labels in. If an image_id occurs more than once, the last row wins.
allocations_vindr = {
    image_id: [name for name, hit in zip(SPLIT_NAMES, row) if hit]
    for image_id, row in zip(info['image_id'], group_flags)
}

# Priority-class lookups, taken from the first row of each image_id.
first_rows = info.drop_duplicates('image_id').set_index('image_id')
val_priority_hit = (first_rows[VAL_PRIORITY].sum(axis=1) > 0).to_dict()
test_priority_hit = (first_rows[TEST_PRIORITY].sum(axis=1) > 0).to_dict()

# Decide one meta split per image. Decisions are collected in a dict and
# written back in a single pass instead of re-scanning the frame per image.
split_by_image = {}
for image_id, groups in allocations_vindr.items():
    if len(groups) == 1:
        # Labels from a single group only: that group is the split.
        split = groups[0]
    elif 'val' in groups and 'test' not in groups:
        # train + val labels: val wins, keeping val classes out of meta-train.
        split = 'val'
    elif 'test' in groups and 'val' not in groups:
        # train + test labels: test wins for the same reason.
        split = 'test'
    elif val_priority_hit[image_id]:
        split = 'val'
    elif test_priority_hit[image_id]:
        split = 'test'
    else:
        # Remaining ambiguous images (test and val labels without a priority
        # class, or no positive label in any group): fair coin flip.
        split = 'test' if rng.random() > 0.5 else 'val'
    split_by_image[image_id] = split

info['meta_split'] = info['image_id'].map(split_by_image)

# Written under data/ because the query-set cells read
# 'data/vindr_cxr_split_labels.pkl'.
info.to_pickle('data/vindr_cxr_split_labels.pkl')

### Training Query Set

In [5]:
import pandas as pd
from utils.data import select_query_set, count_classes

# Class names that make up the VinDr meta-train group.
vindr_train_classes = VINDR_SPLIT['train']

# Keep only the images assigned to the meta-train split.
info = pd.read_pickle('data/vindr_cxr_split_labels.pkl')
info = info.loc[info['meta_split'] == 'train']

# Select the query set; the last argument (50) appears to be the number of
# query images drawn per class (14 classes -> 700 images in the recorded
# output) -- confirm against utils.data.select_query_set.
query_set, selected = select_query_set(info, vindr_train_classes, 50)

# Per-class label counts for the selected (query) images and for the
# remaining (support) images.
vindr_in_query = info['image_id'].isin(selected)
print(len(selected))
print('query', count_classes(info[vindr_in_query], vindr_train_classes))
print('support', count_classes(info[~vindr_in_query], vindr_train_classes))

700
query {'No finding': 61, 'Other diseases': 573, 'Aortic enlargement': 372, 'Cardiomegaly': 273, 'Pleural thickening': 328, 'Pulmonary fibrosis': 295, 'Lung Opacity': 248, 'Other lesion': 171, 'Pneumonia': 257, 'Pleural effusion': 200, 'Tuberculosis': 164, 'Infiltration': 186, 'ILD': 132, 'Consolidation': 108}
support {'No finding': 12569, 'Other diseases': 2611, 'Aortic enlargement': 1877, 'Cardiomegaly': 1640, 'Pleural thickening': 943, 'Pulmonary fibrosis': 696, 'Lung Opacity': 449, 'Other lesion': 401, 'Pneumonia': 347, 'Pleural effusion': 407, 'Tuberculosis': 265, 'Infiltration': 209, 'ILD': 217, 'Consolidation': 92}


In [2]:
import pandas as pd
from utils.data import select_query_set, count_classes
from utils.labels import VINDR_SPLIT

# Class names that make up the VinDr meta-train group.
vindr_train_classes = VINDR_SPLIT['train']

# Keep only the images assigned to the meta-train split.
info = pd.read_pickle('data/vindr_cxr_split_labels.pkl')
info = info.loc[info['meta_split'] == 'train']

# Same selection as the previous cell with a smaller budget; the last argument
# (40) appears to be the number of query images drawn per class (14 classes ->
# 560 images in the recorded output) -- confirm against
# utils.data.select_query_set.
query_set, selected = select_query_set(info, vindr_train_classes, 40)

# Per-class label counts for the selected (query) images and for the
# remaining (support) images.
vindr_in_query = info['image_id'].isin(selected)
print(len(selected))
print('query', count_classes(info[vindr_in_query], vindr_train_classes))
print('support', count_classes(info[~vindr_in_query], vindr_train_classes))

560
query {'No finding': 45, 'Other diseases': 461, 'Aortic enlargement': 287, 'Cardiomegaly': 216, 'Pleural thickening': 249, 'Pulmonary fibrosis': 240, 'Lung Opacity': 197, 'Other lesion': 135, 'Pneumonia': 199, 'Pleural effusion': 164, 'Tuberculosis': 127, 'Infiltration': 147, 'ILD': 97, 'Consolidation': 89}
support {'No finding': 12585, 'Other diseases': 2723, 'Aortic enlargement': 1962, 'Cardiomegaly': 1697, 'Pleural thickening': 1022, 'Pulmonary fibrosis': 751, 'Lung Opacity': 500, 'Other lesion': 437, 'Pneumonia': 405, 'Pleural effusion': 443, 'Tuberculosis': 302, 'Infiltration': 248, 'ILD': 252, 'Consolidation': 111}


In [3]:
import pickle
# Persist whichever VinDr training query set was selected most recently.
# NOTE(review): this is written to the working directory, whereas the label
# pickles are read from data/ -- confirm the intended location.
with open('vindr_train_query_set.pkl', mode='wb') as query_file:
    pickle.dump(query_set, query_file)

## MuReD

In [1]:
import seaborn as sns
import pandas as pd
# Load the per-image MuReD label table.
info = pd.read_pickle('data/mured_labels.pkl')

# The 20 label columns to compare; their order fixes the row/column order of
# the displayed matrix.
cols = [
    'DR', 'NORMAL', 'MH', 'ODC', 'TSLN',
    'ARMD', 'DN', 'MYA', 'BRVO', 'ODP',
    'CRVO', 'CNV', 'RS', 'ODE', 'LS',
    'CSR', 'HTR', 'ASR', 'CRS', 'OTHER',
]

# Pairwise correlation between label columns, used as a co-occurrence proxy
# when deciding which classes belong in the same group.
label_columns = info[cols]
corr = label_columns.corr()

# Last expression of the cell: render the matrix as a heat-mapped table.
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,DR,NORMAL,MH,ODC,TSLN,ARMD,DN,MYA,BRVO,ODP,CRVO,CNV,RS,ODE,LS,CSR,HTR,ASR,CRS,OTHER
DR,1.0,-0.288214,-0.105743,-0.100446,-0.025313,-0.149237,-0.130437,-0.093603,-0.097703,-0.005913,-0.085918,-0.089843,-0.088291,-0.087507,0.225728,-0.069206,-0.024749,-0.012513,-0.063089,-0.129543
NORMAL,-0.288214,1.0,-0.154357,-0.197156,-0.147831,-0.148848,-0.150868,-0.109881,-0.10328,-0.091132,-0.085694,-0.089609,-0.088062,-0.087279,-0.078206,-0.069026,-0.059338,-0.066042,-0.062925,-0.189567
MH,-0.105743,-0.154357,1.0,0.072943,0.133359,-0.060098,-0.04181,0.001628,-0.027943,-0.007687,-0.046014,-0.048116,-0.047286,-0.046865,-0.041994,-0.037064,-0.036538,-0.035462,-0.004358,-0.063196
ODC,-0.100446,-0.197156,0.072943,1.0,0.133238,-0.058691,-0.049849,0.073925,-0.025668,-0.045576,-0.058773,-0.061458,-0.042912,-0.05986,-0.004691,-0.025261,-0.046668,-0.045294,0.029307,-0.04802
TSLN,-0.025313,-0.147831,0.133359,0.133238,1.0,0.094882,0.010536,-0.056507,-0.005534,0.060123,-0.044069,-0.046082,-0.045286,-0.044884,-0.015469,-0.035497,-0.034993,-0.033963,-0.03236,-0.040732
ARMD,-0.149237,-0.148848,-0.060098,-0.058691,0.094882,1.0,-0.037686,0.157514,-0.044018,-0.036552,-0.044372,0.429104,-0.045598,-0.045193,-0.040495,-0.035742,0.07731,0.023729,-0.032582,-0.052664
DN,-0.130437,-0.150868,-0.04181,-0.049849,0.010536,-0.037686,1.0,-0.057668,-0.044852,0.015257,-0.044974,-0.025663,-0.046217,-0.045806,-0.004561,-0.036226,0.019915,0.008286,-0.033025,-0.016944
MYA,-0.093603,-0.109881,0.001628,0.073925,-0.056507,0.157514,-0.057668,1.0,-0.014683,0.020921,-0.032756,-0.034252,-0.033661,-0.033362,-0.013771,-0.026385,-0.02601,-0.025244,-0.024053,-0.060771
BRVO,-0.097703,-0.10328,-0.027943,-0.025668,-0.005534,-0.044018,-0.044852,-0.014683,1.0,-0.032742,-0.030788,-0.032195,-0.031639,-0.031358,-0.028098,-0.0248,-0.024447,0.036561,-0.001545,-0.055423
ODP,-0.005913,-0.091132,-0.007687,-0.045576,0.060123,-0.036552,0.015257,0.020921,-0.032742,1.0,-0.009576,-0.028408,-0.027917,-0.027669,0.10957,-0.000235,-0.021572,-0.020937,0.003732,0.039666


Split data into meta train/test/validation sets

In [12]:
import pandas as pd
import numpy as np

from utils.labels import MURED_SPLIT

SPLIT_NAMES = ['train', 'test', 'val']

info = pd.read_pickle('data/mured_labels.pkl')

# For every row, flag which class groups contain at least one positive label.
group_flags = np.column_stack([
    (info[list(MURED_SPLIT[name])].sum(axis=1) > 0).to_numpy()
    for name in SPLIT_NAMES
])

# image_id -> list of class groups ('train' / 'test' / 'val') the image has
# labels in. If an image_id occurs more than once, the last row wins.
allocations_mured = {
    image_id: [name for name, hit in zip(SPLIT_NAMES, row) if hit]
    for image_id, row in zip(info['image_id'], group_flags)
}

# Decide one meta split per image. Decisions are collected in a dict and
# written back in a single pass instead of re-scanning the frame per image.
split_by_image = {}
for image_id, groups in allocations_mured.items():
    if len(groups) == 1:
        # Labels from a single group only: that group is the split.
        split = groups[0]
    elif 'val' in groups and 'test' not in groups:
        # train + val labels: val wins, keeping val classes out of meta-train.
        split = 'val'
    else:
        # Every other combination (any mix involving test labels, or no
        # positive label in any group) goes to test.
        split = 'test'
    split_by_image[image_id] = split

info['meta_split'] = info['image_id'].map(split_by_image)

# Report how many images landed in each meta split.
for s in SPLIT_NAMES:
    print(s, len(info[info['meta_split'] == s]))

# Written under data/ because the query-set cell reads
# 'data/mured_split_labels.pkl'.
info.to_pickle('data/mured_split_labels.pkl')

train 1746
test 230
val 232


In [13]:
import pandas as pd
from utils.data import select_query_set, count_classes

# Class names that make up the MuReD meta-train group.
mured_train_classes = MURED_SPLIT['train']

# Keep only the images assigned to the meta-train split.
info = pd.read_pickle('data/mured_split_labels.pkl')
info = info.loc[info['meta_split'] == 'train']

# Select the query set; the last argument (10) appears to be the number of
# query images drawn per class (10 classes -> 100 images in the recorded
# output) -- confirm against utils.data.select_query_set.
query_set, selected = select_query_set(info, mured_train_classes, 10)

# Per-class label counts for the selected (query) images and for the
# remaining (support) images.
mured_in_query = info['image_id'].isin(selected)
print(len(selected))
print('query', count_classes(info[mured_in_query], mured_train_classes))
print('support', count_classes(info[~mured_in_query], mured_train_classes))

100
query {'DR': 17, 'NORMAL': 10, 'ODC': 22, 'OTHER': 17, 'MH': 13, 'DN': 14, 'ARMD': 20, 'TSLN': 17, 'MYA': 11, 'CNV': 13}
support {'DR': 417, 'NORMAL': 482, 'ODC': 218, 'OTHER': 205, 'MH': 147, 'DN': 133, 'ARMD': 125, 'TSLN': 123, 'MYA': 72, 'CNV': 37}


In [14]:
import pickle
# Persist the MuReD training query set selected in the previous cell.
# NOTE(review): this is written to the working directory, whereas the label
# pickles are read from data/ -- confirm the intended location.
with open('mured_train_query_set.pkl', mode='wb') as query_file:
    pickle.dump(query_set, query_file)