Create annotations for the X-ray images in the text format expected by YOLOv3.

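YOLOv3 annotation format:
Row format (one row per image): image_file_path box1 box2 ... boxN
Box format: x_min,y_min,x_max,y_max,class_id (no spaces inside a box)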

path/to/img1.jpg 50,100,150,200,0 30,50,200,120,3

path/to/img2.jpg 120,300,250,600,2
...

In [1]:
import argparse
import io
import os
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Build the image-directory paths with os.path.join so the separator is right
# on every platform. The earlier 'input\stage_1_...' literals depended on the
# invalid escape sequence '\s' and only resolved correctly on Windows, while
# the rest of the notebook uses forward-slash paths.
train_path = os.path.join('input', 'stage_1_train_images')
val_path = os.path.join('input', 'stage_1_test_images')
# Expand a leading '~' if one is ever used in the paths above.
train_path = os.path.expanduser(train_path)
val_path = os.path.expanduser(val_path)
# Names of the entries in the training-image directory.
files = os.listdir(train_path)

In [3]:
# Load the training labels, using the patientId column as the DataFrame index.
train_labels = pd.read_csv('input/stage_1_train_labels.csv', index_col='patientId')
# Preview the first rows of the table.
train_labels.head()

Unnamed: 0_level_0,x,y,width,height,Target
patientId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0
00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0
00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0
003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0
00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1


In [4]:
#https://www.kaggle.com/peterchang77/exploratory-data-analysis

def parse_data(df):
    """
    Parse the label table (Pandas dataframe) into a nested dictionary
    keyed by patient id:

      parsed = {

        'patientId-00': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pneumonia,
            'boxes': list of box(es)
        },
        'patientId-01': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pneumonia,
            'boxes': list of box(es)
        }, ...

      }

    Each box is a list [y, x, height, width]. The label is taken from the
    first row seen for a patient, and boxes are collected only for
    patients whose label equals 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns 'x', 'y', 'width', 'height' and 'Target'.
        'patientId' may be either a regular column or the (named) index,
        e.g. as produced by ``pd.read_csv(..., index_col='patientId')``.

    Returns
    -------
    dict
        The nested dictionary described above; empty for an empty table.

    Raises
    ------
    KeyError
        If 'patientId' is neither a column nor an index level of ``df``.
    """
    # When the CSV was loaded with index_col='patientId' the id lives in the
    # index, not in the columns, and row['patientId'] would raise KeyError.
    # Move it back into a column (on a copy) so both layouts are accepted.
    if 'patientId' not in df.columns and 'patientId' in df.index.names:
        df = df.reset_index()

    parsed = {}
    for _, row in df.iterrows():
        pid = row['patientId']

        # --- Initialize patient entry into parsed on first sight of the id
        if pid not in parsed:
            parsed[pid] = {
                'dicom': 'input/stage_1_train_images/%s.dcm' % pid,
                'label': row['Target'],
                'boxes': []}

        # --- Add box if opacity is present, stored as [y, x, height, width]
        if parsed[pid]['label'] == 1:
            parsed[pid]['boxes'].append(
                [row['y'], row['x'], row['height'], row['width']])

    return parsed

In [5]:
# Display the labels table; a patient with several boxes appears on several rows.
train_labels

Unnamed: 0_level_0,x,y,width,height,Target
patientId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0
00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0
00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0
003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0
00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1
00436515-870c-4b36-a041-de91049b9ab4,562.0,152.0,256.0,453.0,1
00569f44-917d-4c86-a842-81832af98c30,,,,,0
006cec2e-6ce2-4549-bffa-eadfcd1e9970,,,,,0
00704310-78a8-4b38-8475-49f4573b2dbb,323.0,577.0,160.0,104.0,1
00704310-78a8-4b38-8475-49f4573b2dbb,695.0,575.0,162.0,137.0,1


In [None]:
# Build the per-patient dictionary (dicom path, label, boxes) from the labels table.
parsed = parse_data(train_labels)

In [None]:
# Load the labels CSV with patientId as the index; the following cells
# read the patient ids from train_labels.index.
train_labels = pd.read_csv('input/stage_1_train_labels.csv', index_col='patientId')
# Preview the first rows of the table.
train_labels.head()

In [None]:
# Number of distinct patientId values in the index of the labels table.
N = len(train_labels.index.unique())
print(N)

In [None]:
# Write one annotation row per patient that has at least one box:
#   <image path> x_min,y_min,x_max,y_max,0 x_min,y_min,x_max,y_max,0 ...
# Every box gets class id 0. Patients without boxes are not written.

# Computed once; the unique-id list does not change while the file is written.
patient_ids = train_labels.index.unique()

# The context manager closes the file even if a row raises part-way through.
with open('train1.txt', 'w') as output_file:
    for pid in patient_ids:
        boxes = parsed[pid]['boxes']
        if not boxes:
            continue

        # component 1: data_path + image_name
        part1 = os.path.join(train_path, pid + '.dcm')

        # component 2: box data. Boxes are stored as [y, x, height, width];
        # convert each to integer corner coordinates x_min,y_min,x_max,y_max.
        box_fields = []
        for y, x, height, width in boxes:
            corners = np.asarray([x, y, x + width, y + height]).astype(int)
            box_fields.append(','.join(map(str, corners.tolist())) + ',0')

        fullline = part1 + ' ' + ' '.join(box_fields)
        output_file.write(fullline + '\n')
        print(fullline)