# 0 IMPORT

In [1]:
import numpy as np
import pandas as pd

from fastai.basics import *
from fastai.callback.all import *
from fastai.vision.all import *
from fastai.medical.imaging import *
import pydicom

# 1 LOADING THE DATASET

In [2]:
train_image_path = "../input/siim-covid19-detection/train_image_level.csv"
sample_sub_path = "../input/siim-covid19-detection/sample_submission.csv"
train_study_path = "../input/siim-covid19-detection/train_study_level.csv"
print ("__study level csv__")
train_study_level = pd.read_csv(train_study_path)
print (train_study_level.head())

print ("__sample submission__")
sample_sub = pd.read_csv(sample_sub_path)
print (sample_sub.head())

print ("__train level csv__")
train_image_level = pd.read_csv(train_image_path)
print (train_image_level.head())

__study level csv__
                   id  Negative for Pneumonia  Typical Appearance  \
0  00086460a852_study                       0                   1   
1  000c9c05fd14_study                       0                   0   
2  00292f8c37bd_study                       1                   0   
3  005057b3f880_study                       1                   0   
4  0051d9b12e72_study                       0                   0   

   Indeterminate Appearance  Atypical Appearance  
0                         0                    0  
1                         0                    1  
2                         0                    0  
3                         0                    0  
4                         0                    1  
__sample submission__
                   id    PredictionString
0  00188a671292_study  negative 1 0 0 1 1
1  004bd59708be_study  negative 1 0 0 1 1
2  00508faccd39_study  negative 1 0 0 1 1
3  006486aa80b2_study  negative 1 0 0 1 1
4  00655178fdfc_study  nega

# 2 WORKING ON DATAFRAMES

In [3]:
#rename the column to merge the dataframes
train_study_level.rename(columns = {'id': 'StudyInstanceUID'}, inplace =True)
train_study_level[:1]

Unnamed: 0,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,00086460a852_study,0,1,0,0


In [4]:
#remove the _study in 'StudyInstanceUID'
train_study_level['StudyInstanceUID'] = train_study_level['StudyInstanceUID'].str.strip('_study')
train_study_level[:5]

Unnamed: 0,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,00086460a852,0,1,0,0
1,000c9c05fd14,0,0,0,1
2,00292f8c37b,1,0,0,0
3,005057b3f880,1,0,0,0
4,0051d9b12e72,0,0,0,1


In [5]:
#merging the two dataframes
train_df = train_image_level.merge(train_study_level)
train_df[:1]

Unnamed: 0,id,boxes,label,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 1026.65662, 'height': 1917.30292}, {'x': 2245.91208, 'y': 591.20528, 'width': 1094.66162, 'height': 1761.54944}]",opacity 1 789.28836 582.43035 1815.94498 2499.73327 opacity 1 2245.91208 591.20528 3340.5737 2352.75472,5776db0cec75,0,1,0,0


We then clean up the dataframe. We drop a few columns and make a single column as class label. We also drop `boxes` since we can use `label` to obtain coordinates for binding boxes.

In [6]:
train_df['id'] = train_df['id'].str.strip('_image')
train_df.loc[train_df['Negative for Pneumonia']==1, 'class_y'] = 'Negative'
train_df.loc[train_df['Typical Appearance']==1, 'class_y'] = 'Typical'
train_df.loc[train_df['Indeterminate Appearance']==1, 'class_y'] = 'Indeterminate'
train_df.loc[train_df['Atypical Appearance']==1, 'class_y'] = 'Atypical'
train_df.drop(['boxes', 'Negative for Pneumonia', 'Typical Appearance', 
             'Indeterminate Appearance', 'Atypical Appearance', 'StudyInstanceUID'], axis=1, inplace=True)
train_df[:5]

Unnamed: 0,id,label,class_y
0,000a312787f2,opacity 1 789.28836 582.43035 1815.94498 2499.73327 opacity 1 2245.91208 591.20528 3340.5737 2352.75472,Typical
1,0012ff7358bc,opacity 1 677.42216 197.97662 1545.21983 1197.75876 opacity 1 1792.69064 402.5525 2409.71798 1606.9105,Typical
2,001398f4ff4f,opacity 1 2729 2181.33331 3677.00012 2785.33331,Atypical
3,0022227f5adf,opacity 1 1857.2065 508.30565 2233.23384 907.83476,Indeterminate
4,0023f02ae886,none 1 0 0 1 1,Negative


In [7]:
train_df.class_y.value_counts()

Typical          2665
Negative         1507
Indeterminate     970
Atypical          432
Name: class_y, dtype: int64

In [8]:
#identifying number of boxes
num_of_boxes = []
for i in train_df.index:
    label_len = len(train_df.label[i].split(' '))
    num_box = label_len//6
    num_of_boxes.append(num_box)

In [9]:
train_df['num_of_boxes'] = num_of_boxes
train_df.head() 

Unnamed: 0,id,label,class_y,num_of_boxes
0,000a312787f2,opacity 1 789.28836 582.43035 1815.94498 2499.73327 opacity 1 2245.91208 591.20528 3340.5737 2352.75472,Typical,2
1,0012ff7358bc,opacity 1 677.42216 197.97662 1545.21983 1197.75876 opacity 1 1792.69064 402.5525 2409.71798 1606.9105,Typical,2
2,001398f4ff4f,opacity 1 2729 2181.33331 3677.00012 2785.33331,Atypical,1
3,0022227f5adf,opacity 1 1857.2065 508.30565 2233.23384 907.83476,Indeterminate,1
4,0023f02ae886,none 1 0 0 1 1,Negative,1


In [10]:
train_df.num_of_boxes.value_counts()

2    2750
1    2636
3     165
4      22
5       1
Name: num_of_boxes, dtype: int64

Now we extract co-ordinates of the binding boxes from `label`

In [11]:

label_COORD = []
for i in train_df.index:
    num_of_boxes = train_df.num_of_boxes[i]
    val = train_df.label[i].split(' ')
    if num_of_boxes == 1 : coord = val[2:6]
    if num_of_boxes == 2 : coord = val[2:6] + val [8:12]
    if num_of_boxes == 3 : coord = val[2:6] + val [8:12] + val [14:18]
    if num_of_boxes == 4 : coord = val[2:6] + val [8:12] + val [14:18] + val[20:24]
    if num_of_boxes == 5 : coord = val[2:6] + val [8:12] + val [14:18] + val[20:24] + val[26:30]
    label_COORD.append(coord)
     

In [12]:
train_df['label_COORD'] = label_COORD
del train_df['label']
train_df.head()

Unnamed: 0,id,class_y,num_of_boxes,label_COORD
0,000a312787f2,Typical,2,"[789.28836, 582.43035, 1815.94498, 2499.73327, 2245.91208, 591.20528, 3340.5737, 2352.75472]"
1,0012ff7358bc,Typical,2,"[677.42216, 197.97662, 1545.21983, 1197.75876, 1792.69064, 402.5525, 2409.71798, 1606.9105]"
2,001398f4ff4f,Atypical,1,"[2729, 2181.33331, 3677.00012, 2785.33331]"
3,0022227f5adf,Indeterminate,1,"[1857.2065, 508.30565, 2233.23384, 907.83476]"
4,0023f02ae886,Negative,1,"[0, 0, 1, 1]"


Rename the column `id` to `SOPInstanceUID` to merge the dataframe with meta data

In [13]:
train_df.rename(columns = {'id':'SOPInstanceUID'},inplace = True)
train_df[:1]

Unnamed: 0,SOPInstanceUID,class_y,num_of_boxes,label_COORD
0,000a312787f2,Typical,2,"[789.28836, 582.43035, 1815.94498, 2499.73327, 2245.91208, 591.20528, 3340.5737, 2352.75472]"


# 3 LOADING THE META DATA

[We then load the DICOM metadata that we have obtained](https://www.kaggle.com/slimshadymm/visualizing-dicoms) 

In [14]:
dicom_df = pd.read_pickle('../input/visualizing-dicoms/dicoms_df.pkl')
dicom_df.head()

Unnamed: 0,SpecificCharacterSet,ImageType,SOPClassUID,SOPInstanceUID,StudyDate,StudyTime,AccessionNumber,Modality,Unnamed: 9,PatientName,...,img_mean,img_std,img_pct_window,ImageType2,ImageType3,ImageType4,ImageType5,ImageType6,ImageType7,ImageType8
0,ISO_IR 100,ORIGINAL,71228e4340de,d8ba599611e5,837af68926b7,81c1a714dbda,c4aef4fc15a2,DX,GEIIS,"(1, d, f, 0, 3, 0, 4, 6, 2, 4, f, 5)",...,15830.900366,3066.972578,0.0,,,,,,,
1,ISO_IR 100,ORIGINAL,71228e4340de,29b23a11d1e4,f8a528b18394,55d5bf2dc87c,179cea9403ed,DX,,"(c, 3, 3, 9, 2, f, 0, 6, a, 4, 0, 1)",...,2271.539958,1024.988986,0.033598,,,,,,,
2,ISO_IR 100,ORIGINAL,03a65300fa41,8174f49500a5,6e5063be381d,a5847f6338e8,dc2b1c79543a,CR,GEIIS,"(e, 7, b, b, d, f, 4, b, 2, 4, 1, 2)",...,2199.463155,881.320778,0.01532,,,,,,,
3,ISO_IR 100,ORIGINAL,03a65300fa41,d54f6204b044,6e5063be381d,a5847f6338e8,dc2b1c79543a,CR,GEIIS,"(e, 7, b, b, d, f, 4, b, 2, 4, 1, 2)",...,2305.6004,871.53166,0.0042,,,,,,,
4,ISO_IR 100,ORIGINAL,71228e4340de,d51cadde8626,03e9532a93cc,b2d584ec0e2d,14a2a048836a,DX,,"(d, 4, 2, 6, 8, f, 2, 6, e, d, 0, 5)",...,1789.618069,1174.376957,0.204002,,,,,,,


In [15]:
dicom_merge = pd.merge(dicom_df,train_df, on = 'SOPInstanceUID')
dicom_merge[:1]

Unnamed: 0,SpecificCharacterSet,ImageType,SOPClassUID,SOPInstanceUID,StudyDate,StudyTime,AccessionNumber,Modality,Unnamed: 9,PatientName,...,ImageType2,ImageType3,ImageType4,ImageType5,ImageType6,ImageType7,ImageType8,class_y,num_of_boxes,label_COORD
0,ISO_IR 100,ORIGINAL,71228e4340de,d8ba599611e5,837af68926b7,81c1a714dbda,c4aef4fc15a2,DX,GEIIS,"(1, d, f, 0, 3, 0, 4, 6, 2, 4, f, 5)",...,,,,,,,,Negative,1,"[0, 0, 1, 1]"


Save the dataframe as `.csv` file. Before that, we check that the file path `fname` corresponds to the correct `SOPINstanceUID`

In [16]:
dicom_merge['fname'][100]

'../input/siim-covid19-detection/train/096eae67f478/4479c991df08/45f0608b7850.dcm'

In [17]:
dicom_merge['SOPInstanceUID'][100]

'45f0608b7850'

In [18]:
dicom_merge.to_csv('dicom_merge.csv', index = False)