In [1]:
import os
import cv2
import random
import json
import datetime
import numpy as np
import pandas as pd
import pydicom
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GroupKFold
import tensorflow as tf
print('tensorflow version:', tf.__version__)

tensorflow version: 2.3.1


In [2]:
######################################################
## reproducibility and notebook-wide defaults
######################################################

# One seed value is pushed into every random number generator used here.
seed = 2020
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Where the competition files live, and the number of grouped splits.
DATA_DIR = '../../input/osic-pulmonary-fibrosis-progression'
GROUP_SPLITS = 5

# Continuous columns that are min-max scaled before modelling.
SCALED_FEATURES = ['Percent', 'Age', 'relative_week', 'init_week_FVC']

# Tabular model inputs: the one-hot Sex / SmokingStatus columns followed by
# the scaled continuous columns, in that order.
# ('init_week_Percent' is deliberately left out of the inputs.)
TRAINING_FEATURES = [
    'Female',
    'Male',
    'Currently smokes',
    'Ex-smoker',
    'Never smoked',
    *SCALED_FEATURES,
]

# Image settings: square edge length in pixels, slices per patient, and the
# slice cutoff value.
IMG_SIZE = 224
IMG_SLICES = 12
CUTOFF = 2

# Training / inference batch settings.
EPOCHS = 30
BATCH_SIZE = 10
BATCH_PRED = 1

# Model identifier; it is the prefix of the saved per-fold weight files.
MODEL_NAME = 'dropout_variance'
MODEL_VERSION = 'v11b'
MODEL = f'{MODEL_NAME}_{MODEL_VERSION}_batch_{BATCH_SIZE}'

In [3]:
######################################################
## get files and split tabular data
######################################################

def _add_baseline_column(frame, column, new_name):
    '''
    Attach each patient's initial-week value of `column` as `new_name`.

    The baseline row is the first non-submission row whose `Weeks` equals the
    patient's `init_week`. Exactly one value per patient is kept, so the left
    merge can never duplicate rows.

    Parameters
    ----------
    frame : pd.DataFrame with 'Patient', 'Weeks', 'init_week', 'SPLIT' and `column`.
    column : name of the column to read the baseline value from.
    new_name : name of the column added to the result.

    Returns
    -------
    pd.DataFrame : `frame` left-merged with the per-patient baseline value
    (the merge resets the index to 0..n-1).
    '''
    is_baseline = (frame['SPLIT'] != 'submission') & (frame['Weeks'] == frame['init_week'])
    baseline = (
        frame.loc[is_baseline, ['Patient', column]]
        .drop_duplicates(subset='Patient')
        .rename(columns={column: new_name})
    )
    return frame.merge(baseline, on='Patient', how='left')


train = pd.read_csv(f'{DATA_DIR}/train.csv')
# keep=False removes every row of a repeated (Patient, Weeks) pair, not only the extras.
train = train.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)

test = pd.read_csv(f'{DATA_DIR}/test.csv')
subm = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# 'Patient_Week' is '<patient id>_<week>'; split it into its two parts.
patient_week_parts = subm['Patient_Week'].str.split('_')
subm['Patient'] = patient_week_parts.str[0]
subm['Weeks'] = patient_week_parts.str[-1].astype(int)

subm = subm[['Patient', 'Weeks', 'Confidence', 'Patient_Week']]
# Give every submission row the patient's test-row columns (all but 'Weeks').
subm = subm.merge(test.drop(columns='Weeks'), on='Patient')

train['SPLIT'] = 'train'
test['SPLIT'] = 'test'
subm['SPLIT'] = 'submission'

# pd.concat replaces the deprecated DataFrame.append; ignore_index gives the
# combined frame a unique index.
data = pd.concat([train, test, subm], ignore_index=True)

######################################################
## initial week and relative week augmentations
######################################################

# The initial week is the earliest measured week per patient; submission rows
# are masked out so their requested weeks do not count as measurements.
data['init_week'] = (
    data['Weeks']
    .where(data['SPLIT'] != 'submission')
    .groupby(data['Patient'])
    .transform('min')
)
data['relative_week'] = data['Weeks'] - data['init_week']

######################################################
## add initial fvc to all patients rows
######################################################

data = _add_baseline_column(data, 'FVC', 'init_week_FVC')

######################################################
## scale the continuous variables
## and dummies of categories
######################################################

# NOTE(review): the scaler is fit on train + test + submission rows together,
# so the scaled values depend on which test rows are present. Confirm the
# saved models were trained with the same scaling.
min_max_scaler = MinMaxScaler()

data[SCALED_FEATURES] = min_max_scaler.fit_transform(data[SCALED_FEATURES])
# dtype is pinned so the dummy columns are numeric 0/1 on every pandas version.
data = pd.concat(
    [
        data,
        pd.get_dummies(data['Sex'], dtype='uint8'),
        pd.get_dummies(data['SmokingStatus'], dtype='uint8'),
    ],
    axis=1,
)

######################################################
## add initial percent to all patients rows
######################################################

# Taken after scaling, so this is the scaled baseline Percent.
data = _add_baseline_column(data, 'Percent', 'init_week_Percent')

missing_features = [col for col in TRAINING_FEATURES if col not in data.columns]
assert not missing_features, f'missing training features: {missing_features}'

######################################################
## separate for training, testing, submission
######################################################

# .copy() makes each split an independent frame, so later column assignments
# do not write into a view of `data`.
train = data.loc[data.SPLIT == 'train'].copy()
test = data.loc[data.SPLIT == 'test'].copy()
subm = data.loc[data.SPLIT == 'submission'].copy()

del data

In [4]:
subm.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,SPLIT,Confidence,Patient_Week,init_week,relative_week,init_week_FVC,Female,Male,Currently smokes,Ex-smoker,Never smoked,init_week_Percent
1540,ID00419637202311204720264,-12,3020,0.332421,0.615385,Male,Ex-smoker,submission,100.0,ID00419637202311204720264_-12,6.0,0.067901,0.3724,0,1,0,1,0,0.332421
1541,ID00419637202311204720264,-11,3020,0.332421,0.615385,Male,Ex-smoker,submission,100.0,ID00419637202311204720264_-11,6.0,0.074074,0.3724,0,1,0,1,0,0.332421
1542,ID00419637202311204720264,-10,3020,0.332421,0.615385,Male,Ex-smoker,submission,100.0,ID00419637202311204720264_-10,6.0,0.080247,0.3724,0,1,0,1,0,0.332421
1543,ID00419637202311204720264,-9,3020,0.332421,0.615385,Male,Ex-smoker,submission,100.0,ID00419637202311204720264_-9,6.0,0.08642,0.3724,0,1,0,1,0,0.332421
1544,ID00419637202311204720264,-8,3020,0.332421,0.615385,Male,Ex-smoker,submission,100.0,ID00419637202311204720264_-8,6.0,0.092593,0.3724,0,1,0,1,0,0.332421


In [5]:
#### image helpers
def get_img_seq(pat_id, slice_count, data_dir, folder, img_size):
    '''
    Load one patient's CT slices as a model-ready image sequence.

    Each slice is resized to (img_size, img_size), min-max normalised to
    [0, 1] on its own, and repeated over 3 channels.

    Parameters
    ----------
    pat_id : patient folder name under `{data_dir}/{folder}`.
    slice_count : number of slices to return.
    data_dir, folder : location of the patient folders.
    img_size : edge length of the square output images.

    Returns
    -------
    np.ndarray, float32, shape (slice_count, img_size, img_size, 3).
    '''
    slices = get_images(pat_id, slice_count, data_dir, folder)
    scans = get_pixels_hu(slices)

    images = []
    for img_idx in range(slice_count):
        ## resize images to be same shape
        img = cv2.resize(scans[img_idx], (img_size, img_size))

        ## normalize the image pixels
        # Work in float64 so the subtraction cannot overflow the int16 pixels.
        img = img.astype(np.float64)
        low = img.min()
        span = img.max() - low
        if span > 0:
            img = (img - low) / span
        else:
            # A constant slice would give 0/0 = NaN; use an all-zero image.
            img = np.zeros_like(img)

        # reshape for tensor: copy the single channel into 3 channels
        images.append(np.repeat(img[..., np.newaxis], 3, -1))

    return np.array(images).astype(np.float32)
    
def get_pixels_hu(scans):
    '''
    Stack DICOM slices into one int16 volume of Hounsfield units (HU).

    hu pixel is from
    https://www.raddq.com/dicom-processing-segmentation-visualization-in-python/

    Parameters
    ----------
    scans : sequence of objects exposing `pixel_array`, `RescaleSlope`
        and `RescaleIntercept` (e.g. pydicom datasets). All pixel arrays
        must share one shape.

    Returns
    -------
    np.ndarray, int16, shape (len(scans), rows, cols).
    '''
    # astype copies, so the callers' pixel arrays are never modified.
    image = np.stack([s.pixel_array for s in scans]).astype(np.int16)

    # Set outside-of-scan pixels (stored as -2000) to 0.
    # The intercept is usually -1024, so they end up at roughly the HU of air.
    image[image == -2000] = 0

    # Convert to Hounsfield units (HU). Rescale slope and intercept are
    # per-image attributes, so each slice uses its own values.
    for idx, scan in enumerate(scans):
        slope = scan.RescaleSlope
        intercept = scan.RescaleIntercept

        if slope != 1:
            image[idx] = (slope * image[idx].astype(np.float64)).astype(np.int16)

        image[idx] += np.int16(intercept)

    return image


def get_images(pat_id, slice_count, data_dir, folder, c_off=2):
    '''
    Read `slice_count` evenly spaced DICOM slices for one patient.

    The patient's files are ordered by their numeric file name and sampled at
    `slice_count + 2 * c_off` evenly spaced positions. The first and last
    `c_off` positions are then discarded. A folder with few files yields the
    same file more than once, so the result always has `slice_count` entries.

    Parameters
    ----------
    pat_id : patient folder name under `{data_dir}/{folder}`.
    slice_count : number of slices to return.
    data_dir, folder : location of the patient folders.
    c_off : number of sampled positions dropped at each end (default 2).

    Returns
    -------
    list of pydicom datasets, length `slice_count`.

    Raises
    ------
    ValueError : if the patient folder contains no files.
    '''
    path = os.path.join(data_dir, folder, str(pat_id))

    file_names = sorted(os.listdir(path), key=lambda x: int(os.path.splitext(x)[0]))
    if not file_names:
        raise ValueError(f'no DICOM files found in {path}')

    n_positions = slice_count + 2 * c_off
    idxs = [int(i * len(file_names) / n_positions) for i in range(n_positions)]

    # An explicit end index (rather than -c_off) keeps the slice correct for c_off == 0.
    # The result always has exactly slice_count entries, so no padding is needed.
    kept = idxs[c_off:n_positions - c_off]

    # dcmread is the current name of the deprecated pydicom.read_file alias.
    return [pydicom.dcmread(os.path.join(path, file_names[idx])) for idx in kept]

######################################################
## data generator, used to feed data to TensorFlow in batches
######################################################

class DataGen(tf.keras.utils.Sequence):
    '''
    Keras Sequence yielding ((X_img, X_tab), y) batches from a patient table.

    For every row of `df`, the patient's image sequence is loaded with
    `get_img_seq` and paired with the row's tabular features.

    Parameters
    ----------
    df : pd.DataFrame with a 'Patient' column, the `tab_features` columns and,
        for modes 'fit' / 'test', an 'FVC' column.
    tab_features : list of column names used as tabular inputs.
    data_dir : root directory of the image folders.
    batch_size : rows per batch; the last batch may be smaller.
    mode : 'fit' or 'test' return the 'FVC' column as y; 'predict' returns
        zeros as a placeholder y.
    shuffle : reshuffle the row order at construction and after every epoch.
    cutoff : kept for backward compatibility; stored but not used by this class.
    folder : sub-folder of `data_dir` holding the patient folders.
    slice_count : number of slices per patient.
    img_size : edge length of the square images.

    Raises
    ------
    ValueError : if `mode` is not one of 'fit', 'test', 'predict'.
    '''

    _MODES = ('fit', 'test', 'predict')

    def __init__(
        self, 
        df, 
        tab_features,
        data_dir,
        batch_size=8, 
        mode='fit', 
        shuffle=False, 
        cutoff=2,
        folder='train',
        slice_count=12, 
        img_size=224):

        super().__init__()

        # Fail early: an unknown mode would otherwise make __getitem__ return None.
        if mode not in self._MODES:
            raise ValueError(f'mode must be one of {self._MODES}, got {mode!r}')

        self.df = df
        self.data_dir = data_dir
        self.shuffle = shuffle
        self.mode = mode
        self.batch_size = batch_size
        self.cutoff = cutoff
        self.folder = folder
        self.img_size = img_size
        self.slice_count = slice_count
        self.tab_features = tab_features
        self.on_epoch_end()

    def __len__(self):
        '''Number of batches per epoch, counting a final partial batch.'''
        # ceil, not floor: otherwise the trailing rows would never be yielded
        # and predictions would come back shorter than `df`.
        return int(np.ceil(len(self.df) / self.batch_size))

    def on_epoch_end(self):
        '''Reset the row order, shuffling it when `shuffle` is set.'''
        self.indexes = np.arange(len(self.df))

        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):
        '''
        Build batch number `index`.

        Returns ((X_img, X_tab), y) where X_img is float32 with shape
        (batch, slice_count, img_size, img_size, 3), X_tab holds the
        tabular feature values and y is float32 with shape (batch,).
        '''
        # Go through self.indexes so that shuffling actually changes the batches.
        rows = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]
        batch = self.df.iloc[rows]
        batch_size = len(batch)

        X_img = np.zeros((batch_size, self.slice_count, self.img_size, self.img_size, 3), dtype=np.float32)
        X_tab = batch[self.tab_features].values

        for i, pat_id in enumerate(batch['Patient'].values):
            X_img[i] = get_img_seq(pat_id, self.slice_count, self.data_dir, self.folder, self.img_size)

        if self.mode == 'predict':
            # Placeholder targets: there is no FVC to read in predict mode.
            y = np.zeros(batch_size, dtype=np.float32)
        else:
            y = np.array(batch['FVC'].values, dtype=np.float32)

        return (X_img, X_tab), y


In [8]:
######################################################
## prediction time
######################################################

predictions = {}
test_accuracy = {}
test_predictions = {}

# Number of saved fold models that are loaded (folds 0 .. N_PRED_FOLDS - 1).
# NOTE(review): GROUP_SPLITS is 5 but only three folds are used here - confirm.
N_PRED_FOLDS = 3


def _make_test_datagen(df, mode):
    '''Build an unshuffled DataGen over `df` that reads images from the 'test' folder.'''
    return DataGen(
        df=df,
        tab_features=TRAINING_FEATURES,
        data_dir=DATA_DIR,
        batch_size=BATCH_PRED,
        mode=mode,
        shuffle=False,
        folder='test',
        slice_count=IMG_SLICES,
        img_size=IMG_SIZE
    )


# The generators do not depend on the fold, so they are built once and reused.
sub_datagen = _make_test_datagen(subm, 'predict')
eval_datagen = _make_test_datagen(test, 'test')
eval_preds_datagen = _make_test_datagen(test, 'predict')

for i in range(N_PRED_FOLDS):

    fold = 'fold_' + str(i)

    model_file = f'{MODEL}_fold_{i}.h5'
    model = tf.keras.models.load_model(model_file)
    print('model loaded:', model_file)

    # Evaluate the model on the test data using `evaluate`.
    # The stored value is whatever `evaluate` returns for this model.
    print("Evaluate on test data")
    test_acc = model.evaluate(eval_datagen)
    print("test loss, test acc:", test_acc)
    test_accuracy[fold] = test_acc

    # predict the test vals and compare to actual
    print("Predictions on test data")
    test_preds = model.predict(eval_preds_datagen, verbose=1)
    print('Test preds shape:', test_preds.shape)
    print('Test preds sample:', test_preds[0])
    # Guard against a generator that silently drops trailing rows.
    assert len(test_preds) == len(test), 'expected one prediction per test row'
    test_predictions[fold + '_predicted'] = test_preds.flatten()
    test_predictions[fold + '_actual'] = test.FVC

    preds_subm = model.predict(sub_datagen, verbose=1)
    print('predictions shape:', preds_subm.shape)
    print('predictions sample:', preds_subm[0])
    assert len(preds_subm) == len(subm), 'expected one prediction per submission row'

    predictions[fold] = preds_subm.flatten()


model loaded: dropout_variance_v11b_batch_10_fold_0.h5
Evaluate on test data
test loss, test acc: 211.0601043701172
Predictions on test data
Test preds shape: (5, 1)
Test preds sample: [2834.957]
predictions shape: (730, 1)
predictions sample: [2861.0234]
model loaded: dropout_variance_v11b_batch_10_fold_1.h5
Evaluate on test data
test loss, test acc: 221.6521453857422
Predictions on test data
Test preds shape: (5, 1)
Test preds sample: [2908.4402]
predictions shape: (730, 1)
predictions sample: [2917.7158]
model loaded: dropout_variance_v11b_batch_10_fold_2.h5
Evaluate on test data
test loss, test acc: 210.0867156982422
Predictions on test data
Test preds shape: (5, 1)
Test preds sample: [2821.4285]
predictions shape: (730, 1)
predictions sample: [2845.0442]


In [9]:
# Per-fold value returned by model.evaluate on the test rows
# (filled in by the prediction loop, keyed 'fold_<i>').
test_accuracy

{'fold_0': 211.0601043701172,
 'fold_1': 221.6521453857422,
 'fold_2': 210.0867156982422}

In [10]:
# One table with each fold's predicted and actual FVC for the test rows;
# preview the first five rows.
test_predictions_df = pd.DataFrame(data=test_predictions)
test_predictions_df.head(n=5)

Unnamed: 0,fold_0_predicted,fold_0_actual,fold_1_predicted,fold_1_actual,fold_2_predicted,fold_2_actual
1535,2834.957031,3020,2908.440186,3020,2821.428467,3020
1536,2929.029785,2739,3002.82373,2739,2887.949463,2739
1537,2308.489746,1930,2501.883789,1930,2288.532471,1930
1538,3160.848633,3294,3185.474854,3294,3129.4646,3294
1539,2756.41333,2925,2872.531738,2925,2745.155273,2925


In [None]:
# `subm_preds` was not defined by any earlier cell, so this cell failed on a
# fresh kernel. It is rebuilt here from the per-fold submission predictions.
# NOTE(review): 'means' / 'std' are assumed to be the row-wise mean and sample
# standard deviation across folds - confirm against the original cell.
subm_preds = pd.DataFrame(predictions)
fold_columns = list(subm_preds.columns)
subm_preds['means'] = subm_preds[fold_columns].mean(axis=1)
subm_preds['std'] = subm_preds[fold_columns].std(axis=1)

# assign returns a new frame, which avoids writing into a slice of another frame.
subm = subm.assign(
    FVC=subm_preds['means'].to_numpy(),
    Confidence=subm_preds['std'].to_numpy(),
)