In [1]:
import os
import cv2
import random
import json
import datetime
import numpy as np
import pandas as pd
import pydicom
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import efficientnet.tfkeras as efn
print('tensorflow version:', tf.__version__)

tensorflow version: 2.3.1


In [2]:
## concept of this notebook was from:
# https://www.kaggle.com/khoongweihao/efficientnets-quantile-regression-inference

In [3]:
######################################################
## seed and defaults
######################################################

seed = 2020
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
    

DATA_DIR = '../../input/osic-pulmonary-fibrosis-progression'
TRAINING = 'train'
TEST = 'test'
TRAIN_FEATURES = [
    'Age',
    'Ex-smoker',
    'Never smoked',
    'Currently smokes',
    'Male',
    'Female'
]
GROUP_SPLITS = 20

IMG_SIZE = 512
DIMS = 1

EPOCHS = 30
BATCH_SIZE = 2
MODEL_NAME = 'linear_no_effent'
MODEL_VERSION = 'v1'
MODEL = MODEL_NAME + '_' + MODEL_VERSION + '_batch_' + str(BATCH_SIZE) + '_epochs_' + str(EPOCHS)
SAVE_BEST = True
LR = 0.1

In [4]:
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv') 
submission = pd.read_csv(DATA_DIR + '/sample_submission.csv') 

submission['Patient'] = submission['Patient_Week'].apply(lambda x: x.split('_')[0])
submission['Weeks'] = submission['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))

print('Train shape', train.shape)
print('Test shape', test.shape)
print('Submission shape', submission.shape)


Train shape (1549, 7)
Test shape (5, 7)
Submission shape (730, 5)


In [5]:
train_df = train.copy()
test_df = test.copy()

## scale age

min_max_scaler = MinMaxScaler()
min_max_scaler.fit(train_df[['Age']])

test_df['Age'] = min_max_scaler.transform(test_df[['Age']])

## expand categories

def add_col(df, col):
    df[col] = 0

smoking_cats = train_df.SmokingStatus.unique().tolist()
sex = train_df.Sex.unique().tolist()

[add_col(test_df, i) for i in smoking_cats]
[add_col(test_df, i) for i in sex]

for p in test_df.Patient.unique():
    sex_val = test_df.loc[test_df.Patient == p, 'Sex'].values[0]
    test_df.loc[test_df.Patient == p, sex_val] = 1
    
    smoke_stat_val = test_df.loc[test_df.Patient == p, 'SmokingStatus'].values[0]
    test_df.loc[test_df.Patient == p, smoke_stat_val] = 1

In [6]:
test_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Ex-smoker,Never smoked,Currently smokes,Male,Female
0,ID00419637202311204720264,6,3020,70.186855,0.615385,Male,Ex-smoker,1,0,0,1,0
1,ID00421637202311550012437,15,2739,82.045291,0.487179,Male,Ex-smoker,1,0,0,1,0
2,ID00422637202311677017371,6,1930,76.672493,0.615385,Male,Ex-smoker,1,0,0,1,0
3,ID00423637202312137826377,17,3294,79.258903,0.589744,Male,Ex-smoker,1,0,0,1,0
4,ID00426637202313170790466,0,2925,71.824968,0.615385,Male,Never smoked,0,1,0,1,0


In [7]:
def get_training_components(df, features):
    patients = df.Patient.unique().tolist()
    coefs = {}
    meta_data = {}

    for i, p in enumerate(patients):
        p_df = train.loc[train.Patient == p, :] 
        c = np.vstack([p_df.Weeks.values, np.ones(len(p_df.Weeks.values))]).T
        coef, _ = np.linalg.lstsq(c, p_df.FVC.values)[0]

        coefs[p] = coef
        meta_data[p] = df[features].values[0]
        
    return patients, coefs, meta_data

def get_img(path):
    d = pydicom.dcmread(path)
    return cv2.resize((d.pixel_array - d.RescaleIntercept) / (d.RescaleSlope * 1000), (512, 512))

def get_model(trained_model, seed, droput_rate=0.5, noise=0.2, img_size=512, img_dims=1, meta_size=6):
    
    inputs_images = tf.keras.layers.Input(shape=(img_size, img_size, img_dims))
    x_image = tf.keras.layers.Conv2D(32, (3, 3))(inputs_images)
    x_image = tf.keras.layers.Dense(16, activation='relu')(x_image)
    x_image = tf.keras.layers.MaxPooling2D((2, 2))(x_image)
    x_image = tf.keras.layers.Conv2D(64, (3, 3))(x_image)
    x_image = tf.keras.layers.Dense(32, activation='relu')(x_image)
    x_image = tf.keras.layers.MaxPooling2D((2, 2))(x_image)
    x_image = tf.keras.layers.Conv2D(64, (3, 3))(x_image)
    x_image = tf.keras.layers.Dense(4, activation='relu')(x_image)
    x_image = tf.keras.layers.MaxPooling2D((2, 2))(x_image)
    x_image = tf.keras.layers.GlobalAveragePooling2D()(x_image)
    
    
    # meta data
    inp_meta = tf.keras.layers.Input(shape=(meta_size,))
    x_meta = tf.keras.layers.GaussianNoise(noise)(inp_meta)
    
    # combine
    x = tf.keras.layers.Concatenate()([x_image, x_meta]) 
    x = tf.keras.layers.Dropout(droput_rate, seed=seed)(x) 
    x = tf.keras.layers.Dense(1, activation='linear')(x)
    
    model = tf.keras.Model([inputs_images, inp_meta] , x)
    model.load_weights(trained_model)

    return model


model = get_model(MODEL + '.h5', seed)

In [8]:
init_week, init_percent, prediction_means, init_week_prediction = {}, {}, {}, {}

_, _, meta_data = get_training_components(test_df, TRAIN_FEATURES)

for p in test_df.Patient.unique():

    ldir = os.listdir(f'{DATA_DIR}/{TEST}/{p}/')
    ct_slices = [] 
    init_meta_data = [] 
    
    for i in ldir:
        if int(i[:-4]) / len(ldir) < 1.1 and int(i[:-4]) / len(ldir) > -0.1:
            ct_slices.append(get_img(f'{DATA_DIR}/{TEST}/{p}/{i}')) 
            init_meta_data.append(meta_data[p]) 
        if len(ct_slices) <= 1:
            continue

    ## expand to (width, height, depth)
    ct_slices = np.expand_dims(ct_slices, axis=-1)
    init_meta_data = np.array(init_meta_data)

    ## predict the intial based on
    ## each slice in the scan set
    print('Predicting', p)
    predictions = model.predict([ct_slices, init_meta_data], verbose=1)
    print('Predicting done...')

    prediction_means[p] = np.mean(predictions)

    init_FVC = test_df.FVC[test_df.Patient == p].values
    init_week[p] = test_df.Weeks[test_df.Patient == p].values
    init_percent[p] = test_df.Percent[test_df.Patient == p].values 

    init_week_prediction[p] = init_FVC - prediction_means[p] * init_week[p]
    print(init_week_prediction[p])

for k in submission.Patient_Week.values:
    match = submission.Patient_Week == k
    p = submission.loc[match, 'Patient'].values[0]
    w = submission.loc[match, 'Weeks'].values[0]
    
    submission.loc[match, 'FVC'] = prediction_means[p] * w + init_week_prediction[p]
    submission.loc[match, 'Confidence'] = (
        init_percent[p] - prediction_means[p] * abs(init_week[p] - w))

  if __name__ == '__main__':


Predicting ID00419637202311204720264
Predicting done...
[3038.86692286]
Predicting ID00421637202311550012437
Predicting done...
[2786.16730714]
Predicting ID00422637202311677017371
Predicting done...
[1948.86692286]
Predicting ID00423637202312137826377
Predicting done...
[3347.45628548]
Predicting ID00426637202313170790466
Predicting done...
[2925.]


In [10]:
init_week_prediction

{'ID00419637202311204720264': array([3038.86692286]),
 'ID00421637202311550012437': array([2786.16730714]),
 'ID00422637202311677017371': array([1948.86692286]),
 'ID00423637202312137826377': array([3347.45628548]),
 'ID00426637202313170790466': array([2925.])}

In [None]:
submission = submission.sort_values(by=['Patient', 'Weeks'])
submission[['Patient_Week', 'FVC', 'Confidence']].head()

In [None]:
submission[['Patient_Week','FVC','Confidence']].to_csv('submission.csv', index=False)