The dataset used below consists of MRI scan images of low-grade glioma (LGG) from the Multimodal Brain Tumor Segmentation Challenge (BRATS) 2015, as compiled by Larxel on Kaggle. Link: https://www.kaggle.com/andrewmvd/brain-tumor-segmentation-in-mri-brats-2015

# Install and import libraries

In [4]:
!pip install pascal-voc-writer
!pip install nibabel
!pip install pydicom
!pip install MedPy

Collecting MedPy
  Using cached MedPy-0.4.0.tar.gz (151 kB)
Collecting SimpleITK>=1.1.0
  Using cached SimpleITK-1.2.4-cp37-cp37m-win_amd64.whl (28.0 MB)
Building wheels for collected packages: MedPy
  Building wheel for MedPy (setup.py): started
  Building wheel for MedPy (setup.py): finished with status 'done'
  Created wheel for MedPy: filename=MedPy-0.4.0-py3-none-any.whl size=214962 sha256=04a1a6bb5d7974c3f5fafa49eac3cb98f733357154894253212e8a30a20f4c88
  Stored in directory: c:\users\gyan\appdata\local\pip\cache\wheels\b0\57\3a\da1183f22a6afb42e11138daa6a759de233fd977a984333602
Successfully built MedPy
Installing collected packages: SimpleITK, MedPy
Successfully installed MedPy-0.4.0 SimpleITK-1.2.4


In [6]:
import os
import h5py
import tarfile
import gzip
import shutil

from pascal_voc_writer import Writer
import nibabel as nib
import pydicom
import medpy
from medpy.io import load

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

from random import shuffle
import math
import re

# Mount drive

Run the cells below only when working on Google Colab (uncomment them first).

In [None]:
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

In [None]:
#%cd /content/drive/My\ Drive/project_try_6/data

# Access data

Run the cell below only when working from a local drive.

In [7]:
# NOTE: machine-specific absolute path. Point this at the directory that
# contains the 'LGG' input folder; the relative paths used by the cells
# below are resolved against it.
%cd C:\Users\Gyan\Documents\data_processing\data

C:\Users\Gyan\Documents\data_processing\data


# Extract relevant slices from MHA files

In [26]:
# Input directory: each entry inside it is treated as one subject folder.
segments_folder = "LGG"

# Output directories for the exported JPEG slices (scan and segmentation mask).
scan_save_folder = "brats_lgg_scans_images"
segments_save_folder = "brats_lgg_segments_images"

In [27]:
# File-name patterns used to pick files out of each subject folder.
# Raw strings are used so that `\.` reaches the regex engine unchanged instead
# of being an invalid string escape (which triggers a Deprecation/SyntaxWarning).

# Matches names containing 'T1' immediately followed by a literal dot and
# optional digits; a name such as '...T1c.123' therefore does not match.
t1_reg = re.compile(r'.*(T1\.[0-9]*).*')
# Matches any name containing the substring 'OT' (used for the segmentation file).
seg_reg = re.compile(r'.*(OT).*')

In [29]:
# Slice-selection thresholds.
MAX_SLICES_PER_SUBJECT = 20   # keep at most this many candidate slices per subject
MIN_BBOX_SIDE = 20            # minimum bounding-box height/width (pixels)
MIN_TUMOR_PIXELS = 400        # minimum number of non-zero segmentation pixels

# plt.imsave does not create missing directories, so make sure they exist.
os.makedirs(scan_save_folder, exist_ok=True)
os.makedirs(segments_save_folder, exist_ok=True)

# Maps '<subject>-<slice index>' -> (scan slice, segmentation slice, [xmin, ymin, xmax, ymax]).
array = {}

for count, folder in enumerate(os.listdir(segments_folder), start=1):
    print(count)
    folder_path = os.path.join(segments_folder, folder)
    if not os.path.isdir(folder_path):
        # Stray files in the input folder cannot be listed as a subject folder.
        continue
    # Subject id is built from the 2nd and 3rd '_'-separated parts of the folder name.
    subject = '_'.join(folder.split('_')[1:3])

    # Locate and load the T1 volume and the segmentation volume for this subject.
    scan = None
    seg = None
    for f in os.listdir(folder_path):
        if t1_reg.match(f):
            scan, _ = load(os.path.join(folder_path, f))
        if seg_reg.match(f):
            seg, _ = load(os.path.join(folder_path, f))
    if scan is None or seg is None:
        # Without both volumes there is nothing to extract; skip instead of crashing.
        print('Skipping {}: T1 scan or segmentation file not found'.format(folder))
        continue

    # Indices along the third axis where the segmentation has any non-zero voxel.
    slice_list = np.unique(np.nonzero(seg)[2])

    # Rank those slices by the number of non-zero pixels in the *scan* slice.
    # The frame is built once from a list of records: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called inside a loop.
    records = [
        {'slice_index': int(x), 'area': int(np.count_nonzero(scan[:, :, x]))}
        for x in slice_list
    ]
    df = pd.DataFrame(records, columns=['slice_index', 'area'])
    # Stable sort keeps ties in ascending slice order; head() tolerates
    # subjects with fewer than MAX_SLICES_PER_SUBJECT candidate slices.
    roi = df.sort_values(by='area', ascending=False, kind='mergesort').head(MAX_SLICES_PER_SUBJECT)

    for i in roi['slice_index'].values:
        seg_slice = seg[:, :, i]
        # Row indices are used as y coordinates, column indices as x coordinates.
        ys, xs = np.nonzero(seg_slice)
        xmin, xmax = int(xs.min()), int(xs.max())
        ymin, ymax = int(ys.min()), int(ys.max())
        # Discard slices whose segmented region is too small to be useful.
        if (ymax - ymin) < MIN_BBOX_SIDE or (xmax - xmin) < MIN_BBOX_SIDE or ys.shape[0] < MIN_TUMOR_PIXELS:
            continue
        name = subject + '-{}'.format(i)
        print(name)
        array[name] = (scan[:, :, i], seg_slice, [xmin, ymin, xmax, ymax])
        plt.imsave(os.path.join(scan_save_folder, name + '.jpg'), scan[:, :, i], cmap='gray')
        plt.imsave(os.path.join(segments_save_folder, name + '.jpg'), seg_slice, cmap='gray')

1
2013_pat0001-76
2013_pat0001-77
2013_pat0001-75
2013_pat0001-79
2013_pat0001-74
2013_pat0001-78
2013_pat0001-73
2013_pat0001-80
2013_pat0001-72
2013_pat0001-71
2013_pat0001-70
2013_pat0001-69
2013_pat0001-68
2013_pat0001-67
2013_pat0001-66
2013_pat0001-65
2013_pat0001-64
2013_pat0001-63
2
2013_pat0002-78
2013_pat0002-79
2013_pat0002-77
2013_pat0002-76
2013_pat0002-75
2013_pat0002-80
2013_pat0002-74
2013_pat0002-81
2013_pat0002-73
2013_pat0002-82
2013_pat0002-83
2013_pat0002-72
2013_pat0002-71
2013_pat0002-84
2013_pat0002-70
2013_pat0002-85
2013_pat0002-69
2013_pat0002-86
2013_pat0002-68
2013_pat0002-87
3
2013_pat0004-76
2013_pat0004-77
2013_pat0004-70
2013_pat0004-74
2013_pat0004-75
2013_pat0004-72
2013_pat0004-73
2013_pat0004-71
2013_pat0004-78
2013_pat0004-69
2013_pat0004-79
2013_pat0004-80
2013_pat0004-68
2013_pat0004-81
2013_pat0004-67
2013_pat0004-82
2013_pat0004-66
2013_pat0004-83
2013_pat0004-84
2013_pat0004-65
4
2013_pat0006-102
2013_pat0006-103
2013_pat0006-104
2013_pat0006-

tcia_pat325-85
tcia_pat325-86
33
tcia_pat330-73
tcia_pat330-74
tcia_pat330-76
tcia_pat330-75
tcia_pat330-72
tcia_pat330-71
tcia_pat330-70
tcia_pat330-69
tcia_pat330-68
tcia_pat330-67
34
tcia_pat346-72
tcia_pat346-73
tcia_pat346-74
tcia_pat346-75
tcia_pat346-71
tcia_pat346-76
tcia_pat346-77
tcia_pat346-70
tcia_pat346-78
tcia_pat346-69
tcia_pat346-68
tcia_pat346-79
tcia_pat346-67
tcia_pat346-80
tcia_pat346-81
tcia_pat346-66
tcia_pat346-82
tcia_pat346-65
tcia_pat346-64
tcia_pat346-83
35
tcia_pat351-78
tcia_pat351-77
tcia_pat351-79
tcia_pat351-80
tcia_pat351-81
tcia_pat351-82
tcia_pat351-83
tcia_pat351-84
tcia_pat351-85
tcia_pat351-86
36
tcia_pat354-59
tcia_pat354-58
tcia_pat354-57
tcia_pat354-56
tcia_pat354-55
tcia_pat354-54
tcia_pat354-53
tcia_pat354-52
tcia_pat354-51
tcia_pat354-50
tcia_pat354-49
tcia_pat354-48
tcia_pat354-47
37
tcia_pat387-76
tcia_pat387-77
tcia_pat387-78
tcia_pat387-79
tcia_pat387-80
tcia_pat387-81
tcia_pat387-82
tcia_pat387-83
tcia_pat387-84
tcia_pat387-85
38
tcia_pa

# Save image filenames and bounding box coordinates to csv

In [30]:
# Write one CSV row per saved slice: its name and bounding-box corners.
# Rows are collected in a list and turned into a DataFrame once;
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
keys = list(array.keys())
bbox_columns = ['name', 'xmin', 'ymin', 'xmax', 'ymax']
bbox_records = []
for key in keys:
    # The bounding box is the last element of each stored tuple.
    xmin, ymin, xmax, ymax = array[key][-1]
    bbox_records.append({'name': key, 'xmin': str(xmin), 'ymin': str(ymin),
                         'xmax': str(xmax), 'ymax': str(ymax)})
# Passing `columns` keeps the header row even when no slice was selected.
df_out = pd.DataFrame(bbox_records, columns=bbox_columns, dtype=str)
df_out.to_csv('brats_lgg_labels_and_bbox.csv', index=False)