The dataset used below is from The Cancer Genome Atlas (TCGA) Glioblastoma Multiforme (GBM) collection, obtained from The Cancer Imaging Archive. Link: https://wiki.cancerimagingarchive.net/display/DOI/Segmentation+Labels+and+Radiomic+Features+for+the+Pre-operative+Scans+of+the+TCGA-GBM+collection

# Install and import libraries

In [1]:
!pip install pascal-voc-writer
!pip install nibabel
!pip install pydicom



In [48]:
import os
import h5py
import tarfile
import gzip
import shutil

from pascal_voc_writer import Writer
import nibabel as nib
import pydicom

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

from random import shuffle
import math
import re

# Mount drive

Run the cells below only when running on Google Colab.

In [None]:
# Colab only: uncomment both lines to mount Google Drive at /content/drive.
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

In [None]:
# Colab only: uncomment to change into the data folder on the mounted drive.
#%cd /content/drive/My\ Drive/data

# Access data

Run the cell below only when working from a local drive.

In [3]:
# NOTE(review): machine-specific absolute Windows path; change it to your own
# data directory before running. All folder names used later are relative to it.
%cd C:\Users\Gyan\Documents\data_processing\data

C:\Users\Gyan\Documents\data_processing\data


# Extract gzip files to get NIfTI files

In [40]:
# Input: root folder whose entries are listed (one sub-folder per entry) by the
# extraction loop below.
segments_folder = "Pre-operative_TCGA_GBM_NIfTI_and_Segmentations"

# Outputs: decompressed '<name>_T1.nii' scans and '<name>-Segmentation.nii'
# label volumes are written into these two folders.
segments_save_folder = "tcga_segments"
scan_save_folder = "tcga_scans"

# Not referenced by the cells shown here; presumably reserved for the
# contrast-enhanced (Gd) scans -- TODO confirm.
gd_save_folder = "tcga_gd_scans"

In [39]:
# File-name patterns used to pick files out of each subject folder.
# The dots are escaped so they match a literal '.', and '$' anchors the suffix
# at the end of the name (so e.g. 't1.nii.gz.bak' is not picked up).
# Group 1 still captures the matched suffix, as before.

# Native T1 scan: '..._t1.nii.gz' or '..._T1.nii.gz' (does not match 't1Gd').
t1_reg = re.compile(r'.*([Tt]1\.nii\.gz)$')
# Manually corrected GLISTRboost segmentation.
seg_reg = re.compile(r'.*(_GlistrBoost_ManuallyCorrected\.nii\.gz)$')
# Automatic GLISTRboost segmentation (used when no corrected one exists).
seg_reg2 = re.compile(r'.*(_GlistrBoost\.nii\.gz)$')

In [21]:
def _gunzip_to(src_path, dst_path):
    """Decompress the gzip file at ``src_path`` and write the bytes to ``dst_path``."""
    with gzip.open(src_path, 'rb') as f_in, open(dst_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


# open(..., 'wb') does not create missing directories, so make sure both
# destination folders exist before extracting anything.
os.makedirs(scan_save_folder, exist_ok=True)
os.makedirs(segments_save_folder, exist_ok=True)

# Only sub-directories are processed; a stray file in the root folder would
# otherwise make os.listdir() fail inside the loop.
subject_folders = sorted(
    entry for entry in os.listdir(segments_folder)
    if os.path.isdir(os.path.join(segments_folder, entry))
)

for count, folder in enumerate(subject_folders, start=1):
    print(count)  # progress: index of the folder being processed
    folder_path = os.path.join(segments_folder, folder)
    file_names = os.listdir(folder_path)

    # Decide up front whether a manually corrected segmentation exists, so the
    # result does not depend on the order in which os.listdir() returns files
    # and the automatic segmentation is never written just to be overwritten.
    has_segment = any(seg_reg.match(f) for f in file_names)

    for f in file_names:
        # Output files are named after the part of the name before the first '_'.
        name = f.split('_')[0]
        src_path = os.path.join(folder_path, f)
        if t1_reg.match(f):
            _gunzip_to(src_path, os.path.join(scan_save_folder, name + '_T1.nii'))
        # Prefer the manually corrected segmentation; fall back to the
        # automatic one only when no corrected file is present.
        if seg_reg.match(f) or (seg_reg2.match(f) and not has_segment):
            _gunzip_to(src_path, os.path.join(segments_save_folder, name + '-Segmentation.nii'))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102


In [22]:
# Sanity check: print the array shape of every extracted segmentation volume.
# img.shape is read from the image header, so the voxel data is not loaded
# (get_fdata() would read each full volume as float64 just to print its shape).
for f in sorted(os.listdir(segments_save_folder)):
    print(nib.load(os.path.join(segments_save_folder, f)).shape)

(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 240, 155)
(240, 24

# Obtain relevant slices and save as images

In [95]:
# Selection parameters (previously inline magic numbers).
TOP_SLICES_PER_SUBJECT = 20   # keep at most this many labelled slices per subject
MIN_BBOX_SIDE = 20            # minimum bounding-box extent along each axis, in pixels
MIN_LABEL_PIXELS = 400        # minimum number of non-zero label pixels in a slice
SEGMENT_SUFFIX = '-Segmentation.nii'

# plt.imsave() does not create missing directories.
os.makedirs('tcga_scans_images', exist_ok=True)
os.makedirs('tcga_segments_images', exist_ok=True)

# Maps '<subject>-<slice index>' -> (scan slice, label slice, [xmin, ymin, xmax, ymax]).
array = {}

for f in sorted(os.listdir(segments_save_folder)):
    if not f.endswith(SEGMENT_SUFFIX):
        continue  # ignore anything that is not an extracted segmentation
    subject = f[:-len(SEGMENT_SUFFIX)]
    scan_file_path = os.path.join(scan_save_folder, subject + '_T1.nii')
    segment = nib.load(os.path.join(segments_save_folder, f)).get_fdata()
    scan = nib.load(scan_file_path).get_fdata()

    # Indices along the third axis of every slice that contains a label.
    slice_list = np.unique(np.nonzero(segment)[2])

    # Rank the labelled slices by the number of non-zero pixels in the *scan*.
    # NOTE(review): the area is measured on the scan, not on the segmentation
    # mask -- confirm that this is the intended ranking criterion.
    records = [
        {'slice_index': int(x), 'area': int(np.count_nonzero(scan[:, :, x]))}
        for x in slice_list
    ]
    # Build the frame once (DataFrame.append was removed in pandas 2.0).
    df = pd.DataFrame(records, columns=['slice_index', 'area'])
    # head() tolerates subjects with fewer than TOP_SLICES_PER_SUBJECT labelled
    # slices, where positional indexing with np.arange(0, 20) raised IndexError.
    roi = df.sort_values(by='area', ascending=False).head(TOP_SLICES_PER_SUBJECT)

    for i in roi['slice_index'].tolist():
        label_slice = segment[:, :, i]
        xs, ys = np.nonzero(label_slice)
        # NOTE(review): "x" is array axis 0 and "y" is axis 1; plt.imsave draws
        # axis 0 vertically, so these may be swapped relative to image
        # coordinates -- verify against whatever consumes the CSV.
        xmin, xmax = int(xs.min()), int(xs.max())
        ymin, ymax = int(ys.min()), int(ys.max())
        # Skip slices whose labelled region is too small to be useful.
        if (ymax - ymin) < MIN_BBOX_SIDE or (xmax - xmin) < MIN_BBOX_SIDE or xs.size < MIN_LABEL_PIXELS:
            continue
        key = subject + '-{}'.format(i)
        print(key)
        array[key] = (scan[:, :, i], label_slice, [xmin, ymin, xmax, ymax])
        plt.imsave(os.path.join('tcga_scans_images', key + '.jpg'), scan[:, :, i], cmap='gray')
        plt.imsave(os.path.join('tcga_segments_images', key + '.jpg'), label_slice, cmap='gray')

TCGA-02-0006-63
TCGA-02-0006-62
TCGA-02-0006-61
TCGA-02-0006-60
TCGA-02-0006-59
TCGA-02-0006-58
TCGA-02-0006-57
TCGA-02-0009-87
TCGA-02-0009-85
TCGA-02-0009-86
TCGA-02-0009-88
TCGA-02-0009-84
TCGA-02-0009-89
TCGA-02-0009-83
TCGA-02-0009-90
TCGA-02-0009-91
TCGA-02-0009-92
TCGA-02-0009-82
TCGA-02-0009-81
TCGA-02-0009-93
TCGA-02-0009-80
TCGA-02-0009-79
TCGA-02-0009-94
TCGA-02-0009-78
TCGA-02-0009-95
TCGA-02-0009-96
TCGA-02-0011-80
TCGA-02-0011-79
TCGA-02-0011-81
TCGA-02-0011-78
TCGA-02-0011-82
TCGA-02-0011-77
TCGA-02-0011-83
TCGA-02-0011-76
TCGA-02-0011-84
TCGA-02-0011-85
TCGA-02-0011-75
TCGA-02-0011-86
TCGA-02-0011-74
TCGA-02-0011-87
TCGA-02-0011-88
TCGA-02-0011-73
TCGA-02-0011-89
TCGA-02-0011-90
TCGA-02-0011-72
TCGA-02-0011-91
TCGA-02-0027-80
TCGA-02-0027-79
TCGA-02-0027-76
TCGA-02-0027-78
TCGA-02-0027-77
TCGA-02-0027-81
TCGA-02-0027-75
TCGA-02-0027-82
TCGA-02-0027-74
TCGA-02-0027-73
TCGA-02-0027-83
TCGA-02-0027-84
TCGA-02-0027-85
TCGA-02-0027-86
TCGA-02-0027-87
TCGA-02-0033-74
TCGA-02-

TCGA-06-0145-89
TCGA-06-0145-74
TCGA-06-0145-90
TCGA-06-0145-73
TCGA-06-0145-91
TCGA-06-0145-72
TCGA-06-0149-82
TCGA-06-0149-81
TCGA-06-0149-83
TCGA-06-0149-80
TCGA-06-0149-84
TCGA-06-0149-79
TCGA-06-0149-85
TCGA-06-0149-78
TCGA-06-0149-77
TCGA-06-0149-86
TCGA-06-0149-76
TCGA-06-0149-87
TCGA-06-0149-75
TCGA-06-0149-88
TCGA-06-0149-74
TCGA-06-0149-73
TCGA-06-0149-89
TCGA-06-0149-90
TCGA-06-0149-72
TCGA-06-0149-71
TCGA-06-0154-86
TCGA-06-0154-87
TCGA-06-0154-88
TCGA-06-0154-89
TCGA-06-0154-90
TCGA-06-0154-91
TCGA-06-0154-92
TCGA-06-0154-93
TCGA-06-0154-94
TCGA-06-0154-95
TCGA-06-0154-96
TCGA-06-0154-97
TCGA-06-0154-98
TCGA-06-0154-99
TCGA-06-0154-100
TCGA-06-0154-101
TCGA-06-0154-102
TCGA-06-0158-91
TCGA-06-0158-87
TCGA-06-0158-90
TCGA-06-0158-89
TCGA-06-0158-88
TCGA-06-0158-86
TCGA-06-0158-85
TCGA-06-0158-84
TCGA-06-0158-83
TCGA-06-0158-82
TCGA-06-0158-81
TCGA-06-0162-80
TCGA-06-0162-79
TCGA-06-0162-81
TCGA-06-0162-82
TCGA-06-0162-83
TCGA-06-0162-84
TCGA-06-0162-85
TCGA-06-0162-78
TCGA-

TCGA-08-0385-79
TCGA-08-0385-78
TCGA-08-0385-80
TCGA-08-0385-77
TCGA-08-0385-75
TCGA-08-0385-76
TCGA-08-0385-81
TCGA-08-0385-73
TCGA-08-0385-74
TCGA-08-0385-71
TCGA-08-0385-72
TCGA-08-0385-82
TCGA-08-0385-83
TCGA-08-0385-70
TCGA-08-0385-69
TCGA-08-0385-84
TCGA-08-0385-85
TCGA-08-0385-86
TCGA-08-0385-68
TCGA-08-0385-67
TCGA-08-0389-81
TCGA-08-0389-80
TCGA-08-0389-79
TCGA-08-0389-78
TCGA-08-0389-77
TCGA-08-0389-82
TCGA-08-0389-83
TCGA-08-0389-84
TCGA-08-0389-85
TCGA-08-0389-76
TCGA-08-0389-86
TCGA-08-0389-75
TCGA-08-0389-87
TCGA-08-0389-74
TCGA-08-0389-73
TCGA-08-0389-72
TCGA-08-0389-88
TCGA-08-0389-71
TCGA-08-0389-70
TCGA-08-0389-69
TCGA-08-0390-72
TCGA-08-0390-71
TCGA-08-0390-73
TCGA-08-0390-74
TCGA-08-0390-75
TCGA-08-0390-70
TCGA-08-0390-76
TCGA-08-0390-69
TCGA-08-0390-77
TCGA-08-0390-78
TCGA-08-0390-68
TCGA-08-0390-79
TCGA-08-0390-80
TCGA-08-0390-67
TCGA-08-0390-81
TCGA-08-0390-66
TCGA-08-0390-82
TCGA-08-0390-65
TCGA-08-0390-83
TCGA-08-0390-64
TCGA-08-0392-79
TCGA-08-0392-80
TCGA-08-

# Save image filenames and bounding box coordinates to csv

In [101]:
# Write one CSV row per saved slice: the dictionary key is the image name
# (without extension) and the last element of each entry is the bounding box
# [xmin, ymin, xmax, ymax].
keys = list(array.keys())
columns = ['name', 'xmin', 'ymin', 'xmax', 'ymax']

# Collect the rows first and build the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
rows = []
for key in keys:
    xmin, ymin, xmax, ymax = array[key][-1]
    rows.append({'name': key, 'xmin': str(xmin), 'ymin': str(ymin), 'xmax': str(xmax), 'ymax': str(ymax)})

# Passing `columns` keeps the header row even when no slice was selected.
df_out = pd.DataFrame(rows, columns=columns, dtype=str)
df_out.to_csv('labels_and_bbox.csv', index=False)