In [None]:
import os
import glob
import SimpleITK as sitk
import numpy as np
import radiomics
import pandas as pd
from radiomics import featureextractor
from collections import Counter
import nibabel as nib
import nrrd
import matplotlib.pyplot as plt

First, we locate the relevant files. We need both the images and the segmentation maps, which were organized in advance into center-based and machine-based partitions.

In [None]:
# Root folder that holds one sub-folder per center-based / machine-based partition.
# NOTE(review): absolute local path; point this at your own copy of the data.
main_path = '/Users/koska/Downloads/wt_nb_nifty'


def _select_by_prefix(paths, prefix):
    """Return the sorted entries of ``paths`` whose file name starts with ``prefix``.

    The file name is taken with ``os.path.basename`` so the filter does not
    depend on how many directory levels precede the partition folders.
    """
    return [path for path in sorted(paths) if os.path.basename(path).startswith(prefix)]


# --- Center-based partitions -------------------------------------------------
hutf_wilms_path = os.path.join(main_path, 'hütf_wilms')
hutf_nb_path = os.path.join(main_path, 'hütf_nb')
ext_wilms_path = os.path.join(main_path, 'external_wilms')
ext_nb_path = os.path.join(main_path, 'external_nb')

# Every entry located two levels below a partition folder.
hutf_wilms_glob = glob.glob(hutf_wilms_path + '/*/*')
hutf_nb_glob = glob.glob(hutf_nb_path + '/*/*')
ext_wilms_glob = glob.glob(ext_wilms_path + '/*/*')
ext_nb_glob = glob.glob(ext_nb_path + '/*/*')

# Entries whose name starts with 'p' go to the image lists, those starting
# with 's' go to the segmentation lists.
hutf_wilms_im_list = _select_by_prefix(hutf_wilms_glob, 'p')
hutf_wilms_segm_list = _select_by_prefix(hutf_wilms_glob, 's')

hutf_nb_im_list = _select_by_prefix(hutf_nb_glob, 'p')
hutf_nb_segm_list = _select_by_prefix(hutf_nb_glob, 's')

ext_wilms_im_list = _select_by_prefix(ext_wilms_glob, 'p')
ext_wilms_segm_list = _select_by_prefix(ext_wilms_glob, 's')

ext_nb_im_list = _select_by_prefix(ext_nb_glob, 'p')
ext_nb_segm_list = _select_by_prefix(ext_nb_glob, 's')

# --- Machine-based partitions ------------------------------------------------
siemens_wilms_path = os.path.join(main_path, 'siemens_wilms')
siemens_nb_path = os.path.join(main_path, 'siemens_nb')
ge_wilms_path = os.path.join(main_path, 'ge_wilms')
ge_nb_path = os.path.join(main_path, 'ge_nb')
other_machine_wilms_path = os.path.join(main_path, 'other_machine_wilms')
other_machine_nb_path = os.path.join(main_path, 'other_machine_nb')

siemens_wilms_glob = glob.glob(siemens_wilms_path + '/*/*')
siemens_nb_glob = glob.glob(siemens_nb_path + '/*/*')
ge_wilms_glob = glob.glob(ge_wilms_path + '/*/*')
ge_nb_glob = glob.glob(ge_nb_path + '/*/*')
other_machine_wilms_glob = glob.glob(other_machine_wilms_path + '/*/*')
other_machine_nb_glob = glob.glob(other_machine_nb_path + '/*/*')

siemens_wilms_im_list = _select_by_prefix(siemens_wilms_glob, 'p')
# The misspelled name is kept because later cells refer to it; the correctly
# spelled alias points at the same list.
simens_wilms_segm_list = _select_by_prefix(siemens_wilms_glob, 's')
siemens_wilms_segm_list = simens_wilms_segm_list

siemens_nb_im_list = _select_by_prefix(siemens_nb_glob, 'p')
siemens_nb_segm_list = _select_by_prefix(siemens_nb_glob, 's')

ge_wilms_im_list = _select_by_prefix(ge_wilms_glob, 'p')
ge_wilms_segm_list = _select_by_prefix(ge_wilms_glob, 's')

ge_nb_im_list = _select_by_prefix(ge_nb_glob, 'p')
ge_nb_segm_list = _select_by_prefix(ge_nb_glob, 's')

other_machine_wilms_im_list = _select_by_prefix(other_machine_wilms_glob, 'p')
other_machine_wilms_segm_list = _select_by_prefix(other_machine_wilms_glob, 's')

other_machine_nb_im_list = _select_by_prefix(other_machine_nb_glob, 'p')
other_machine_nb_segm_list = _select_by_prefix(other_machine_nb_glob, 's')

We then pair each image path with its mask path so that radiomics features can be extracted from them.

In [None]:
def make_df(im_list, segm_list):
    """Pair image paths with segmentation paths in a two-column DataFrame.

    Parameters
    ----------
    im_list : sequence
        Image paths; they become the ``'im'`` column.
    segm_list : sequence
        Segmentation paths; they become the ``'segm'`` column.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``'im'`` and ``'segm'``, one row per pair, in
        the order the inputs were given.
    """
    columns = {
        'im': np.array(im_list),
        'segm': np.array(segm_list),
    }
    return pd.DataFrame(data=columns)

# Center-based partitions: one image/mask frame per (center, tumor type).
df_wilms_hutf = make_df(hutf_wilms_im_list, hutf_wilms_segm_list)
df_nb_hutf = make_df(hutf_nb_im_list, hutf_nb_segm_list)
df_wilms_ext = make_df(ext_wilms_im_list, ext_wilms_segm_list)
df_nb_ext = make_df(ext_nb_im_list, ext_nb_segm_list)

# Machine-based partitions: one image/mask frame per (machine, tumor type).
df_wilms_siemens = make_df(siemens_wilms_im_list, simens_wilms_segm_list)
df_nb_siemens = make_df(siemens_nb_im_list, siemens_nb_segm_list)
df_wilms_ge = make_df(ge_wilms_im_list, ge_wilms_segm_list)
df_nb_ge = make_df(ge_nb_im_list, ge_nb_segm_list)
df_wilms_other_machine = make_df(
    other_machine_wilms_im_list, other_machine_wilms_segm_list
)
df_nb_other_machine = make_df(
    other_machine_nb_im_list, other_machine_nb_segm_list
)

Then we obtained the radiomics features. The feature extraction process requires a parameter file. This file should reside on your local machine, and its path must be passed to the extractor.

In [None]:
# Path of the YAML parameter file handed to the feature extractor.
# NOTE(review): absolute local path; adjust it to where the file lives on your machine.
params = '/Users/koska/Downloads/wt_nb_ct_parameters.yaml'
# Module-level extractor built from the parameter file; obtain_features() uses it for every case.
extractor = featureextractor.RadiomicsFeatureExtractor(params)

def obtain_features(main_df):
    """Extract radiomics features for every image/mask pair in ``main_df``.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Frame with an ``'im'`` column (image paths) and a ``'segm'`` column
        (mask paths), one row per case.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, and one column per extracted
        feature whose name starts with ``original_``, ``log`` or ``wavelet``
        (columns sorted by name). Values are passed through
        ``np.nan_to_num``, so NaN becomes 0 and infinities become large
        finite numbers. An empty frame is returned when ``main_df`` has no
        rows.
    """
    # Walk the two columns positionally so the lookup does not depend on the
    # frame having a default 0..n-1 index.
    results = []
    for image_path, mask_path in zip(main_df['im'], main_df['segm']):
        image = sitk.ReadImage(image_path)
        mask = sitk.ReadImage(mask_path)
        results.append(extractor.execute(image, mask))

    if not results:
        return pd.DataFrame()

    # Feature names come from the first case. Reading them from the second
    # case raised KeyError whenever the frame held a single row.
    wanted_prefixes = ("original_", "log", "wavelet")
    feature_names = sorted(
        key for key in results[0] if key.startswith(wanted_prefixes)
    )

    samples = np.zeros((len(results), len(feature_names)))
    for row, result in enumerate(results):
        samples[row, :] = [result[name] for name in feature_names]

    # May have NaNs
    samples = np.nan_to_num(samples)
    return pd.DataFrame(samples, columns=feature_names)

In [None]:
# Feature tables for the center-based partitions.
wilms_hutf_features = obtain_features(df_wilms_hutf)
nb_hutf_features = obtain_features(df_nb_hutf)
wilms_ext_features = obtain_features(df_wilms_ext)
nb_ext_features = obtain_features(df_nb_ext)

# Feature tables for the machine-based partitions.
wilms_siemens_features = obtain_features(df_wilms_siemens)
nb_siemens_features = obtain_features(df_nb_siemens)
wilms_ge_features = obtain_features(df_wilms_ge)
nb_ge_features = obtain_features(df_nb_ge)
wilms_other_machine_features = obtain_features(df_wilms_other_machine)
nb_other_machine_features = obtain_features(df_nb_other_machine)

To train the models, we need patient names that match the features and labels so that each case can be tracked. We prepared these names and labels and integrated them into the feature dataframes.

In [None]:
def get_names(im_list):
    """Return the parent-folder name of every path in ``im_list``.

    The name is the directory that directly contains each file. It is taken
    with ``os.path`` so the result does not depend on how many directory
    levels precede it.

    Parameters
    ----------
    im_list : iterable of str
        File paths.

    Returns
    -------
    numpy.ndarray
        One folder name per input path, in input order.
    """
    names = [os.path.basename(os.path.dirname(path)) for path in im_list]
    return np.array(names)

# Case names taken from the image paths, one array per partition.
wilms_hutf_names = get_names(hutf_wilms_im_list)
nb_hutf_names = get_names(hutf_nb_im_list)
wilms_ext_names = get_names(ext_wilms_im_list)
nb_ext_names = get_names(ext_nb_im_list)
wilms_siemens_names = get_names(siemens_wilms_im_list)
nb_siemens_names = get_names(siemens_nb_im_list)
wilms_ge_names = get_names(ge_wilms_im_list)
nb_ge_names = get_names(ge_nb_im_list)
wilms_other_machine_names = get_names(other_machine_wilms_im_list)
nb_other_machine_names = get_names(other_machine_nb_im_list)

# Class labels: every "wilms" partition is filled with 1, every "nb" partition with 0.
wilms_hutf_labels = np.ones(len(hutf_wilms_im_list))
nb_hutf_labels = np.zeros(len(hutf_nb_im_list))
wilms_ext_labels = np.ones(len(ext_wilms_im_list))
nb_ext_labels = np.zeros(len(ext_nb_im_list))
wilms_siemens_labels = np.ones(len(siemens_wilms_im_list))
nb_siemens_labels = np.zeros(len(siemens_nb_im_list))
wilms_ge_labels = np.ones(len(ge_wilms_im_list))
nb_ge_labels = np.zeros(len(ge_nb_im_list))
wilms_other_machine_labels = np.ones(len(other_machine_wilms_im_list))
nb_other_machine_labels = np.zeros(len(other_machine_nb_im_list))

# Attach the case name and the class label to every feature table (in place).
for _feature_table, _case_names, _case_labels in (
    (wilms_hutf_features, wilms_hutf_names, wilms_hutf_labels),
    (nb_hutf_features, nb_hutf_names, nb_hutf_labels),
    (wilms_ext_features, wilms_ext_names, wilms_ext_labels),
    (nb_ext_features, nb_ext_names, nb_ext_labels),
    (wilms_siemens_features, wilms_siemens_names, wilms_siemens_labels),
    (nb_siemens_features, nb_siemens_names, nb_siemens_labels),
    (wilms_ge_features, wilms_ge_names, wilms_ge_labels),
    (nb_ge_features, nb_ge_names, nb_ge_labels),
    (wilms_other_machine_features, wilms_other_machine_names, wilms_other_machine_labels),
    (nb_other_machine_features, nb_other_machine_names, nb_other_machine_labels),
):
    _feature_table['Patient code'] = _case_names
    _feature_table['label'] = _case_labels


# Stack the per-partition tables row-wise; the "nb" tables always come first.
hutf_complete_df = pd.concat([nb_hutf_features, wilms_hutf_features], axis=0)
external_complete_df = pd.concat([nb_ext_features, wilms_ext_features], axis=0)

# "anti_<machine>" frames hold every machine partition except the named one.
anti_siemens_complete_df = pd.concat(
    [
        nb_ge_features,
        nb_other_machine_features,
        wilms_ge_features,
        wilms_other_machine_features,
    ],
    axis=0,
)
anti_ge_complete_df = pd.concat(
    [
        nb_siemens_features,
        nb_other_machine_features,
        wilms_siemens_features,
        wilms_other_machine_features,
    ],
    axis=0,
)

siemens_complete_df = pd.concat([nb_siemens_features, wilms_siemens_features], axis=0)
ge_complete_df = pd.concat([nb_ge_features, wilms_ge_features], axis=0)

Finally, we stored these dataframes as CSV files for later use.

In [None]:
# Write every combined frame to a CSV file in the current working directory.
for _csv_name, _complete_df in (
    ('hutf_complete_df.csv', hutf_complete_df),
    ('external_complete_df.csv', external_complete_df),
    ('anti_siemens_complete_df.csv', anti_siemens_complete_df),
    ('anti_ge_complete_df.csv', anti_ge_complete_df),
    ('siemens_complete_df.csv', siemens_complete_df),
    ('ge_complete_df.csv', ge_complete_df),
):
    _complete_df.to_csv(_csv_name)

In [None]:
# Assess the volumes of the lesions from the segmentation masks.

In [None]:
def get_ct_volumes(path):
    """Return the volume covered by label 1 in an NRRD segmentation file.

    Parameters
    ----------
    path : str
        Path of the NRRD file to read.

    Returns
    -------
    float
        Number of voxels equal to 1, multiplied by the volume of one voxel
        and divided by 1000.
        NOTE(review): presumably converts mm^3 to cm^3 (mL); confirm the
        spatial unit of the files.
    """
    voxels, header = nrrd.read(path)

    # The voxel volume is the absolute determinant of the direction matrix.
    # Multiplying only its diagonal gives a negative volume when an axis is
    # flipped and a wrong one when the axes are oblique.
    directions = np.asarray(header['space directions'], dtype=float)
    voxel_volume = abs(np.linalg.det(directions))

    # Vectorized count of the voxels labelled 1.
    labelled_voxels = np.count_nonzero(voxels == 1)

    mask_vol = labelled_voxels * voxel_volume / 1000
    return mask_vol

In [None]:
# Volume of every segmentation mask, one list per cohort.
hütf_wt_volumes = [get_ct_volumes(mask_path) for mask_path in hutf_wilms_segm_list]
hütf_nb_volumes = [get_ct_volumes(mask_path) for mask_path in hutf_nb_segm_list]
ext_wt_volumes = [get_ct_volumes(mask_path) for mask_path in ext_wilms_segm_list]
ext_nb_volumes = [get_ct_volumes(mask_path) for mask_path in ext_nb_segm_list]

# Ascending copies of the volume lists.
s_nb_ext = sorted(ext_nb_volumes)
s_nb_hutf = sorted(hütf_nb_volumes)
s_wt_ext = sorted(ext_wt_volumes)
s_wt_hutf = sorted(hütf_wt_volumes)

# Bin edges 100, 200, ..., 1100.
volume_bins = list(range(100, 1200, 100))

plt.hist(s_nb_ext, bins=volume_bins)
# Alternative cohorts (swap in one of these to plot it instead):
#plt.hist(s_nb_hutf, bins=volume_bins)
#plt.hist(s_wt_ext, bins=volume_bins)
#plt.hist(s_wt_hutf, bins=volume_bins)
