The dataset used below consists of MRI scans of high-grade glioma (HGG) from the Multimodal Brain Tumor Segmentation Challenge (BRATS) 2015, as compiled by Larxel on Kaggle. Link: https://www.kaggle.com/andrewmvd/brain-tumor-segmentation-in-mri-brats-2015

# Install and import libraries

In [1]:
!pip install pascal-voc-writer
!pip install nibabel
!pip install pydicom
!pip install MedPy



In [2]:
import os
import h5py
import tarfile
import gzip
import shutil

from pascal_voc_writer import Writer
import nibabel as nib
import pydicom
import medpy
from medpy.io import load

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

from random import shuffle
import math
import re

# Mount drive

Run the cells below only when running on Google Colab.

In [None]:
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

In [None]:
#%cd /content/drive/My\ Drive/project_try_6/data

# Access data

Run the cell below only when running on a local machine.

In [3]:
%cd C:\Users\Gyan\Documents\data_processing\data

C:\Users\Gyan\Documents\data_processing\data


# Extract relevant slices from MHA files

In [4]:
# Output folders, relative to the current working directory: one for the
# exported scan slices and one for the matching segmentation-mask slices.
segments_save_folder = 'brats_hgg_segments_images'
scan_save_folder = 'brats_hgg_scans_images'

# Input folder that is listed for per-subject sub-folders. Despite its name,
# both the T1 scan and the segmentation volume are read from inside it.
segments_folder = 'HGG'

In [5]:
# File-name patterns used to pick the volumes to load from a subject folder.
# Raw strings keep `\.` as a regex escape instead of an invalid string escape.

# Matches names containing 'T1.' followed by optional digits; names such as
# 'T1c.' do not match because the dot must directly follow 'T1'.
t1_reg = re.compile(r'.*(T1\.[0-9]*).*')

# Matches any name containing the upper-case letters 'OT' (the segmentation file).
seg_reg = re.compile(r'.*(OT).*')

In [6]:
# For every subject folder: load the T1 scan and the segmentation volume, take
# the (up to) 20 tumour-bearing slices along the third axis with the largest
# non-zero scan area, drop slices whose tumour region is too small, and save
# each remaining scan/segmentation slice as a grayscale JPEG.
#
# `array` maps '<subject>-hgg-<slice index>' to the tuple
# (scan slice, segmentation slice, [xmin, ymin, xmax, ymax]).
#
# NOTE(review): `subject` keeps only the 2nd and 3rd '_'-separated parts of the
# folder name. Folders that share those parts produce identical keys and file
# names, so a later folder silently overwrites an earlier one (the run log
# shows the same subject id under several counters) - confirm this is intended.
array = {}

# Make sure the output folders exist before any image is written into them.
os.makedirs(scan_save_folder, exist_ok=True)
os.makedirs(segments_save_folder, exist_ok=True)

for count, folder in enumerate(os.listdir(segments_folder), start=1):
    print(count)  # progress indicator: 1-based position of the folder
    folder_path = os.path.join(segments_folder, folder)
    if not os.path.isdir(folder_path):
        # Stray files next to the subject folders cannot be listed.
        continue
    subject = '_'.join(folder.split('_')[1:3])

    # Locate and load the two volumes of interest by file name.
    scan = None
    seg = None
    for f in os.listdir(folder_path):
        file_path = os.path.join(folder_path, f)
        if t1_reg.match(f):
            scan, _ = load(file_path)
        if seg_reg.match(f):
            seg, _ = load(file_path)
    if scan is None or seg is None:
        # Without both volumes there is nothing to slice; skip instead of
        # failing on an empty placeholder array.
        print('Skipping {}: missing T1 scan or OT segmentation'.format(folder))
        continue

    # Indices along the third axis of every slice that contains tumour labels.
    slice_list = np.unique(np.nonzero(seg)[2])

    # Rank those slices by the number of non-zero scan pixels. The frame is
    # built in one go because DataFrame.append no longer exists in pandas 2.x.
    df = pd.DataFrame({
        'slice_index': slice_list,
        'area': [np.count_nonzero(scan[:, :, x]) for x in slice_list],
    })
    df.sort_values(by='area', inplace=True, ascending=False)
    # head() copes with subjects that have fewer than 20 tumour slices.
    roi = df.head(20)

    for i in roi['slice_index'].values:
        # Tight bounding box of the tumour mask in this slice; "x" is the
        # first array axis and "y" the second.
        # NOTE(review): plt.imsave draws the first array axis vertically, so
        # these x/y values may be swapped relative to the saved JPEG - verify
        # against whatever consumes the bounding boxes.
        xs, ys = np.nonzero(seg[:, :, i])
        xmin, xmax = xs.min(), xs.max()
        ymin, ymax = ys.min(), ys.max()

        # Keep only tumours spanning at least 20 pixels along both axes and
        # covering at least 400 pixels.
        if (ymax - ymin) < 20 or (xmax - xmin) < 20 or xs.shape[0] < 400:
            continue

        key = subject + '-hgg-{}'.format(i)
        print(key)
        array[key] = (scan[:, :, i], seg[:, :, i], [xmin, ymin, xmax, ymax])
        plt.imsave(os.path.join(scan_save_folder, key + '.jpg'), scan[:, :, i], cmap='gray')
        plt.imsave(os.path.join(segments_save_folder, key + '.jpg'), seg[:, :, i], cmap='gray')

1
2013_pat0001-hgg-71
2013_pat0001-hgg-73
2013_pat0001-hgg-74
2013_pat0001-hgg-72
2013_pat0001-hgg-75
2013_pat0001-hgg-69
2013_pat0001-hgg-70
2013_pat0001-hgg-76
2013_pat0001-hgg-77
2013_pat0001-hgg-68
2013_pat0001-hgg-78
2013_pat0001-hgg-67
2013_pat0001-hgg-79
2013_pat0001-hgg-80
2013_pat0001-hgg-81
2013_pat0001-hgg-66
2013_pat0001-hgg-82
2013_pat0001-hgg-83
2013_pat0001-hgg-65
2013_pat0001-hgg-84
2
2013_pat0002-hgg-72
2013_pat0002-hgg-73
2013_pat0002-hgg-74
2013_pat0002-hgg-75
2013_pat0002-hgg-76
2013_pat0002-hgg-77
2013_pat0002-hgg-78
2013_pat0002-hgg-79
2013_pat0002-hgg-80
2013_pat0002-hgg-81
2013_pat0002-hgg-82
2013_pat0002-hgg-83
2013_pat0002-hgg-84
2013_pat0002-hgg-85
2013_pat0002-hgg-86
3
2013_pat0003-hgg-76
2013_pat0003-hgg-75
2013_pat0003-hgg-77
2013_pat0003-hgg-78
2013_pat0003-hgg-79
2013_pat0003-hgg-74
2013_pat0003-hgg-73
2013_pat0003-hgg-80
2013_pat0003-hgg-72
2013_pat0003-hgg-81
2013_pat0003-hgg-71
2013_pat0003-hgg-70
2013_pat0003-hgg-82
2013_pat0003-hgg-69
2013_pat0003-h

tcia_pat113-hgg-91
tcia_pat113-hgg-75
tcia_pat113-hgg-92
tcia_pat113-hgg-74
tcia_pat113-hgg-93
24
tcia_pat117-hgg-73
tcia_pat117-hgg-74
tcia_pat117-hgg-75
tcia_pat117-hgg-72
tcia_pat117-hgg-76
tcia_pat117-hgg-77
tcia_pat117-hgg-71
tcia_pat117-hgg-78
tcia_pat117-hgg-79
tcia_pat117-hgg-70
tcia_pat117-hgg-80
tcia_pat117-hgg-69
tcia_pat117-hgg-81
tcia_pat117-hgg-82
tcia_pat117-hgg-68
tcia_pat117-hgg-67
tcia_pat117-hgg-83
tcia_pat117-hgg-84
tcia_pat117-hgg-66
tcia_pat117-hgg-85
25
tcia_pat118-hgg-80
tcia_pat118-hgg-79
tcia_pat118-hgg-81
tcia_pat118-hgg-82
tcia_pat118-hgg-83
tcia_pat118-hgg-84
tcia_pat118-hgg-85
tcia_pat118-hgg-78
tcia_pat118-hgg-86
tcia_pat118-hgg-77
tcia_pat118-hgg-87
tcia_pat118-hgg-88
tcia_pat118-hgg-89
tcia_pat118-hgg-76
tcia_pat118-hgg-90
tcia_pat118-hgg-75
tcia_pat118-hgg-91
tcia_pat118-hgg-74
tcia_pat118-hgg-92
tcia_pat118-hgg-73
26
tcia_pat120-hgg-71
tcia_pat120-hgg-70
tcia_pat120-hgg-69
tcia_pat120-hgg-68
tcia_pat120-hgg-67
tcia_pat120-hgg-66
tcia_pat120-hgg-65
tci

tcia_pat167-hgg-73
tcia_pat167-hgg-81
tcia_pat167-hgg-72
tcia_pat167-hgg-82
tcia_pat167-hgg-71
tcia_pat167-hgg-83
tcia_pat167-hgg-84
tcia_pat167-hgg-70
tcia_pat167-hgg-69
tcia_pat167-hgg-85
tcia_pat167-hgg-86
tcia_pat167-hgg-68
tcia_pat167-hgg-67
49
tcia_pat168-hgg-73
tcia_pat168-hgg-72
tcia_pat168-hgg-71
tcia_pat168-hgg-74
tcia_pat168-hgg-70
tcia_pat168-hgg-69
tcia_pat168-hgg-75
tcia_pat168-hgg-68
tcia_pat168-hgg-76
tcia_pat168-hgg-67
tcia_pat168-hgg-66
tcia_pat168-hgg-77
tcia_pat168-hgg-65
tcia_pat168-hgg-78
tcia_pat168-hgg-79
tcia_pat168-hgg-64
tcia_pat168-hgg-80
tcia_pat168-hgg-63
tcia_pat168-hgg-81
tcia_pat168-hgg-82
50
tcia_pat170-hgg-78
tcia_pat170-hgg-77
tcia_pat170-hgg-76
tcia_pat170-hgg-79
tcia_pat170-hgg-80
tcia_pat170-hgg-75
tcia_pat170-hgg-74
tcia_pat170-hgg-81
tcia_pat170-hgg-73
tcia_pat170-hgg-82
tcia_pat170-hgg-83
tcia_pat170-hgg-72
tcia_pat170-hgg-84
tcia_pat170-hgg-71
tcia_pat170-hgg-85
tcia_pat170-hgg-86
tcia_pat170-hgg-70
tcia_pat170-hgg-87
tcia_pat170-hgg-69
tcia_p

tcia_pat199-hgg-65
tcia_pat199-hgg-81
tcia_pat199-hgg-82
tcia_pat199-hgg-64
tcia_pat199-hgg-83
72
tcia_pat200-hgg-73
tcia_pat200-hgg-74
tcia_pat200-hgg-72
tcia_pat200-hgg-71
tcia_pat200-hgg-75
tcia_pat200-hgg-76
tcia_pat200-hgg-70
tcia_pat200-hgg-77
tcia_pat200-hgg-69
tcia_pat200-hgg-78
tcia_pat200-hgg-68
tcia_pat200-hgg-67
tcia_pat200-hgg-79
tcia_pat200-hgg-80
tcia_pat200-hgg-81
tcia_pat200-hgg-82
tcia_pat200-hgg-66
tcia_pat200-hgg-83
tcia_pat200-hgg-65
tcia_pat200-hgg-84
73
tcia_pat201-hgg-71
tcia_pat201-hgg-70
tcia_pat201-hgg-69
tcia_pat201-hgg-72
tcia_pat201-hgg-73
tcia_pat201-hgg-68
tcia_pat201-hgg-74
tcia_pat201-hgg-67
tcia_pat201-hgg-75
tcia_pat201-hgg-76
tcia_pat201-hgg-66
tcia_pat201-hgg-77
tcia_pat201-hgg-65
tcia_pat201-hgg-78
tcia_pat201-hgg-79
tcia_pat201-hgg-64
tcia_pat201-hgg-63
tcia_pat201-hgg-80
tcia_pat201-hgg-81
tcia_pat201-hgg-62
74
tcia_pat203-hgg-71
tcia_pat203-hgg-72
tcia_pat203-hgg-70
tcia_pat203-hgg-73
tcia_pat203-hgg-69
tcia_pat203-hgg-77
tcia_pat203-hgg-75
tci

tcia_pat258-hgg-79
tcia_pat258-hgg-70
tcia_pat258-hgg-69
tcia_pat258-hgg-68
tcia_pat258-hgg-80
tcia_pat258-hgg-67
tcia_pat258-hgg-66
tcia_pat258-hgg-65
tcia_pat258-hgg-81
tcia_pat258-hgg-64
tcia_pat258-hgg-82
tcia_pat258-hgg-63
97
tcia_pat260-hgg-76
tcia_pat260-hgg-75
tcia_pat260-hgg-74
tcia_pat260-hgg-73
tcia_pat260-hgg-77
tcia_pat260-hgg-78
tcia_pat260-hgg-79
tcia_pat260-hgg-72
tcia_pat260-hgg-80
tcia_pat260-hgg-81
tcia_pat260-hgg-82
tcia_pat260-hgg-83
tcia_pat260-hgg-84
tcia_pat260-hgg-85
tcia_pat260-hgg-86
98
tcia_pat260-hgg-76
tcia_pat260-hgg-75
tcia_pat260-hgg-74
tcia_pat260-hgg-77
tcia_pat260-hgg-78
tcia_pat260-hgg-79
tcia_pat260-hgg-80
tcia_pat260-hgg-81
tcia_pat260-hgg-82
tcia_pat260-hgg-83
tcia_pat260-hgg-84
tcia_pat260-hgg-85
tcia_pat260-hgg-86
99
tcia_pat260-hgg-76
tcia_pat260-hgg-75
tcia_pat260-hgg-77
tcia_pat260-hgg-78
tcia_pat260-hgg-79
tcia_pat260-hgg-80
tcia_pat260-hgg-81
tcia_pat260-hgg-82
tcia_pat260-hgg-83
tcia_pat260-hgg-84
tcia_pat260-hgg-85
tcia_pat260-hgg-86
100

tcia_pat309-hgg-69
tcia_pat309-hgg-79
tcia_pat309-hgg-80
tcia_pat309-hgg-68
tcia_pat309-hgg-81
tcia_pat309-hgg-82
tcia_pat309-hgg-67
tcia_pat309-hgg-83
tcia_pat309-hgg-84
tcia_pat309-hgg-66
tcia_pat309-hgg-65
122
tcia_pat309-hgg-73
tcia_pat309-hgg-74
tcia_pat309-hgg-75
tcia_pat309-hgg-72
tcia_pat309-hgg-76
tcia_pat309-hgg-71
tcia_pat309-hgg-77
tcia_pat309-hgg-78
tcia_pat309-hgg-70
tcia_pat309-hgg-69
tcia_pat309-hgg-79
tcia_pat309-hgg-80
tcia_pat309-hgg-68
tcia_pat309-hgg-81
tcia_pat309-hgg-82
tcia_pat309-hgg-67
tcia_pat309-hgg-83
tcia_pat309-hgg-84
tcia_pat309-hgg-66
tcia_pat309-hgg-65
123
tcia_pat314-hgg-75
tcia_pat314-hgg-73
tcia_pat314-hgg-74
tcia_pat314-hgg-76
tcia_pat314-hgg-72
tcia_pat314-hgg-71
tcia_pat314-hgg-77
tcia_pat314-hgg-78
tcia_pat314-hgg-70
tcia_pat314-hgg-79
tcia_pat314-hgg-69
tcia_pat314-hgg-80
tcia_pat314-hgg-81
tcia_pat314-hgg-68
tcia_pat314-hgg-82
tcia_pat314-hgg-67
tcia_pat314-hgg-83
tcia_pat314-hgg-84
tcia_pat314-hgg-66
tcia_pat314-hgg-85
124
tcia_pat314-hgg-75


tcia_pat404-hgg-90
tcia_pat404-hgg-89
tcia_pat404-hgg-82
tcia_pat404-hgg-91
tcia_pat404-hgg-81
tcia_pat404-hgg-92
tcia_pat404-hgg-93
tcia_pat404-hgg-94
tcia_pat404-hgg-80
tcia_pat404-hgg-79
tcia_pat404-hgg-95
tcia_pat404-hgg-96
tcia_pat404-hgg-78
tcia_pat404-hgg-77
181
tcia_pat406-hgg-67
tcia_pat406-hgg-71
tcia_pat406-hgg-72
tcia_pat406-hgg-68
tcia_pat406-hgg-73
tcia_pat406-hgg-69
tcia_pat406-hgg-74
tcia_pat406-hgg-70
tcia_pat406-hgg-66
tcia_pat406-hgg-65
tcia_pat406-hgg-75
tcia_pat406-hgg-76
tcia_pat406-hgg-64
tcia_pat406-hgg-63
tcia_pat406-hgg-77
tcia_pat406-hgg-78
tcia_pat406-hgg-62
182
tcia_pat409-hgg-79
tcia_pat409-hgg-80
tcia_pat409-hgg-78
tcia_pat409-hgg-81
tcia_pat409-hgg-82
tcia_pat409-hgg-77
tcia_pat409-hgg-83
tcia_pat409-hgg-76
tcia_pat409-hgg-84
tcia_pat409-hgg-75
tcia_pat409-hgg-85
tcia_pat409-hgg-74
tcia_pat409-hgg-86
tcia_pat409-hgg-87
tcia_pat409-hgg-72
tcia_pat409-hgg-88
tcia_pat409-hgg-71
tcia_pat409-hgg-89
183
tcia_pat411-hgg-67
tcia_pat411-hgg-66
tcia_pat411-hgg-65


tcia_pat444-hgg-66
tcia_pat444-hgg-65
tcia_pat444-hgg-64
tcia_pat444-hgg-63
tcia_pat444-hgg-62
204
tcia_pat447-hgg-77
tcia_pat447-hgg-79
tcia_pat447-hgg-78
tcia_pat447-hgg-80
tcia_pat447-hgg-81
tcia_pat447-hgg-82
tcia_pat447-hgg-76
tcia_pat447-hgg-75
tcia_pat447-hgg-83
tcia_pat447-hgg-84
tcia_pat447-hgg-74
tcia_pat447-hgg-73
tcia_pat447-hgg-85
tcia_pat447-hgg-86
tcia_pat447-hgg-87
tcia_pat447-hgg-72
tcia_pat447-hgg-71
tcia_pat447-hgg-88
tcia_pat447-hgg-89
tcia_pat447-hgg-90
205
tcia_pat447-hgg-77
tcia_pat447-hgg-79
tcia_pat447-hgg-78
tcia_pat447-hgg-80
tcia_pat447-hgg-81
tcia_pat447-hgg-82
tcia_pat447-hgg-76
tcia_pat447-hgg-75
tcia_pat447-hgg-83
tcia_pat447-hgg-84
tcia_pat447-hgg-74
tcia_pat447-hgg-73
tcia_pat447-hgg-85
tcia_pat447-hgg-86
tcia_pat447-hgg-87
tcia_pat447-hgg-72
tcia_pat447-hgg-71
tcia_pat447-hgg-88
tcia_pat447-hgg-89
tcia_pat447-hgg-90
206
tcia_pat447-hgg-77
tcia_pat447-hgg-79
tcia_pat447-hgg-78
tcia_pat447-hgg-80
tcia_pat447-hgg-81
tcia_pat447-hgg-82
tcia_pat447-hgg-76


# Save image filenames and bounding box coordinates to csv

In [7]:
# Write one CSV row per extracted slice: its key in `array` (the image file
# name without the '.jpg' extension) and its bounding box. Every value is
# stored as a string.
keys = list(array.keys())
columns = ['name', 'xmin', 'ymin', 'xmax', 'ymax']

# Collect all rows first and build the frame once; DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
records = []
for key in keys:
    # The bounding box is the last element of each stored tuple.
    xmin, ymin, xmax, ymax = array[key][-1]
    records.append({'name': key, 'xmin': str(xmin), 'ymin': str(ymin),
                    'xmax': str(xmax), 'ymax': str(ymax)})

# Passing `columns` keeps the header row even when no slice was extracted.
df_out = pd.DataFrame(records, columns=columns, dtype=str)
df_out.to_csv('brats_hgg_labels_and_bbox.csv', index=False)