In [1]:
import pandas as pd
import numpy as np
import pydicom
import glob
from typing import List
from pathlib import Path
from wsl.locations import wsl_data_dir
from wsl.networks.medinet.utils import rle2mask
import matplotlib.pyplot as plt

In [2]:
# Load the annotation table; the first CSV column becomes the row index.
df = pd.read_csv('original.csv', index_col=0)
# Report how many annotation rows were read, then preview the first five.
print(df.shape[0])
df.head(5)

12954


Unnamed: 0,ImageId,EncodedPixels
420,1.2.276.0.7230010.3.1.4.8323329.3678.151787517...,194329 37 960 69 919 111 913 117 910 116 912 5...
9772,1.2.276.0.7230010.3.1.4.8323329.4200.151787518...,-1
1542,1.2.276.0.7230010.3.1.4.8323329.4862.151787518...,344265 2 1020 7 1015 9 1012 12 1010 14 1008 15...
7532,1.2.276.0.7230010.3.1.4.8323329.12313.15178752...,-1
5542,1.2.276.0.7230010.3.1.4.8323329.14214.15178752...,-1


In [3]:
# Collect every DICOM file three directory levels below <wsl_data_dir>/siim.
# glob makes no ordering guarantee (it depends on the filesystem), so sort the
# matches to keep files[0] and all downstream row orders stable across runs.
files = sorted(glob.glob(str(wsl_data_dir) + '/siim/*/*/*.dcm'))
# Fail with a readable message instead of a bare IndexError on files[0] below.
assert files, 'no .dcm files matched under ' + str(wsl_data_dir) + '/siim'
len(files), files[0]

(10355,
 '/data/2015P002510/Mehak/git_wsl/data/siim/1.2.276.0.7230010.3.1.2.8323329.11585.1517875233.741598/1.2.276.0.7230010.3.1.3.8323329.11585.1517875233.741597/1.2.276.0.7230010.3.1.4.8323329.11585.1517875233.741599.dcm')

In [4]:
# Column-wise records, one entry per DICOM file:
#   'Id'      -> '<grandparent dir>/<parent dir>/<file stem>'
#   'ImageId' -> file stem, used as the join key against df.ImageId
data = {'Id': [],
        'ImageId': []
       }
# Set to True to render each image with its annotated mask(s) added on top.
plot = False

for file in files:
    dcm_path = Path(file)
    image_id = dcm_path.stem
    # The two directories directly above the file; the Id is always '/'-joined,
    # independent of the OS path separator.
    grandparent_dir, parent_dir = dcm_path.parts[-3:-1]
    Id = '/'.join((grandparent_dir, parent_dir, image_id))

    if plot:
        ds = pydicom.dcmread(file)
        img = ds.pixel_array
        rows, cols = ds.Rows, ds.Columns

        # The mask is added to img below, so it must have img's shape; the
        # previous (cols, rows) allocation only worked for square images.
        # NOTE(review): assumes rle2mask(ep, cols, rows).T is image-shaped — confirm.
        mask = np.zeros(img.shape)
        eps = df.loc[df.ImageId == image_id, 'EncodedPixels'].to_list()
        for ep in eps:
            # '-1' marks an annotation row without a mask.
            if ep != '-1':
                mask += rle2mask(ep, cols, rows).T

        # Draw once, after all masks for this image have been accumulated.
        plt.figure(figsize=(8, 8))
        plt.imshow(img + mask, cmap=plt.cm.bone)
        plt.show()

    data['Id'].append(Id)
    data['ImageId'].append(image_id)

In [5]:
# One row per DICOM file, inner-joined with its annotation row(s) on ImageId.
# Files without an annotation are dropped; a file with several annotation rows
# yields several rows.
info = pd.DataFrame.from_dict(data)
info = pd.merge(info, df, on='ImageId', how='inner')
# Binary label: 0 when EncodedPixels is the '-1' marker, 1 otherwise.
# Vectorized so it also works when the merge result is empty.
info['Pneumothorax'] = (info['EncodedPixels'] != '-1').astype(int)
# The column is named 'Pneumothorax' (capital P); selecting 'pneumothorax'
# raised a KeyError.
info = info[['Id', 'Pneumothorax', 'EncodedPixels']]
display(info)
info.to_csv('info.csv', index=False)

Unnamed: 0,Id,Pneumothorax,EncodedPixels
0,1.2.276.0.7230010.3.1.2.8323329.11585.15178752...,0,-1
1,1.2.276.0.7230010.3.1.2.8323329.11586.15178752...,0,-1
2,1.2.276.0.7230010.3.1.2.8323329.11587.15178752...,0,-1
3,1.2.276.0.7230010.3.1.2.8323329.11588.15178752...,0,-1
4,1.2.276.0.7230010.3.1.2.8323329.11589.15178752...,0,-1
...,...,...,...
11082,1.2.276.0.7230010.3.1.2.8323329.7049.151787520...,0,-1
11083,1.2.276.0.7230010.3.1.2.8323329.705.1517875164...,1,338106 1 1019 4 1017 6 1016 8 1015 8 1014 9 10...
11084,1.2.276.0.7230010.3.1.2.8323329.7050.151787520...,0,-1
11085,1.2.276.0.7230010.3.1.2.8323329.7051.151787520...,0,-1


In [6]:
import random

# Fixed seed so that Restart & Run All reproduces the same split files.
SPLIT_SEED = 0
# Cut points into the shuffled id list:
#   [0, TRAIN_END) -> train, [TRAIN_END, VALID_END) -> valid, rest -> test.
# NOTE(review): these look like 80% / 90% of the 10355 globbed files rather
# than of the unique ids printed below — confirm the intended proportions.
TRAIN_END = 8284
VALID_END = 9320

# Unique ids, sorted before shuffling: iteration order of a set of strings can
# change between interpreter runs, which would defeat the seed.
patients = sorted(set(info['Id'].tolist()))
# A private Random instance keeps the global random state untouched.
random.Random(SPLIT_SEED).shuffle(patients)
print(len(patients))

splits = {'train': patients[:TRAIN_END],
          'valid': patients[TRAIN_END:VALID_END],
          'test': patients[VALID_END:]}

for key, value in splits.items():
    # All info rows whose Id belongs to this split.
    pdf = info[info.Id.isin(value)]
    display(pdf.head())
    # Keep only the Id column, renumbered from 0. An Id with several rows in
    # info is written several times.
    # NOTE(review): add .drop_duplicates() if each Id should appear once.
    pdf = pdf[['Id']].reset_index(drop=True)
    pdf.to_csv(key + '.csv', index=False)
    display(pdf.head())

10324


Unnamed: 0,Id,Pneumothorax,EncodedPixels
1,1.2.276.0.7230010.3.1.2.8323329.11586.15178752...,0,-1
2,1.2.276.0.7230010.3.1.2.8323329.11587.15178752...,0,-1
3,1.2.276.0.7230010.3.1.2.8323329.11588.15178752...,0,-1
4,1.2.276.0.7230010.3.1.2.8323329.11589.15178752...,0,-1
5,1.2.276.0.7230010.3.1.2.8323329.1159.151787516...,0,-1


Unnamed: 0,Id
0,1.2.276.0.7230010.3.1.2.8323329.11586.15178752...
1,1.2.276.0.7230010.3.1.2.8323329.11587.15178752...
2,1.2.276.0.7230010.3.1.2.8323329.11588.15178752...
3,1.2.276.0.7230010.3.1.2.8323329.11589.15178752...
4,1.2.276.0.7230010.3.1.2.8323329.1159.151787516...


Unnamed: 0,Id,Pneumothorax,EncodedPixels
17,1.2.276.0.7230010.3.1.2.8323329.1160.151787516...,0,-1
40,1.2.276.0.7230010.3.1.2.8323329.11616.15178752...,0,-1
65,1.2.276.0.7230010.3.1.2.8323329.11638.15178752...,0,-1
66,1.2.276.0.7230010.3.1.2.8323329.11639.15178752...,1,551082 3 1015 10 1011 14 1006 19 1001 24 997 2...
82,1.2.276.0.7230010.3.1.2.8323329.11653.15178752...,0,-1


Unnamed: 0,Id
0,1.2.276.0.7230010.3.1.2.8323329.1160.151787516...
1,1.2.276.0.7230010.3.1.2.8323329.11616.15178752...
2,1.2.276.0.7230010.3.1.2.8323329.11638.15178752...
3,1.2.276.0.7230010.3.1.2.8323329.11639.15178752...
4,1.2.276.0.7230010.3.1.2.8323329.11653.15178752...


Unnamed: 0,Id,Pneumothorax,EncodedPixels
0,1.2.276.0.7230010.3.1.2.8323329.11585.15178752...,0,-1
9,1.2.276.0.7230010.3.1.2.8323329.11593.15178752...,0,-1
35,1.2.276.0.7230010.3.1.2.8323329.11612.15178752...,0,-1
36,1.2.276.0.7230010.3.1.2.8323329.11613.15178752...,1,257413 10 1006 22 997 33 985 57 961 69 947 78 ...
37,1.2.276.0.7230010.3.1.2.8323329.11613.15178752...,1,340146 4 1015 11 1006 19 1002 22 997 27 995 29...


Unnamed: 0,Id
0,1.2.276.0.7230010.3.1.2.8323329.11585.15178752...
1,1.2.276.0.7230010.3.1.2.8323329.11593.15178752...
2,1.2.276.0.7230010.3.1.2.8323329.11612.15178752...
3,1.2.276.0.7230010.3.1.2.8323329.11613.15178752...
4,1.2.276.0.7230010.3.1.2.8323329.11613.15178752...
