In [None]:
import os
from os import listdir
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as Layers
import tensorflow.keras.models as Models

In [None]:
# Show the files shipped with the competition dataset (os.listdir already returns a list).
os.listdir("../input/osic-pulmonary-fibrosis-progression")

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
trainDataFrame = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
testDataFrame = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

In [None]:
# Shapes first, then the column overview of both tables.
print(trainDataFrame.shape)
print("---------------------------------------------------")
print(testDataFrame.shape)
print("---------------------------------------------------")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
trainDataFrame.info()
print("---------------------------------------------------")
testDataFrame.info()

In [None]:
# Number of non-null Patient entries versus number of distinct Patient values.
print(f"The total patient ids are {trainDataFrame['Patient'].notna().sum()}")
print(f"Number of unique ids are {trainDataFrame['Patient'].nunique()} ")

In [None]:
# Remove every (Patient, Weeks) combination that occurs more than once.
# keep=False discards all copies of such a pair, not only the repeats.
trainDataFrame = trainDataFrame.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)

In [None]:
## CHECK SUBMISSION FORMAT
submissionDataFrame = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")
print(f"The sample submission contains: {submissionDataFrame.shape[0]} rows and {submissionDataFrame.shape[1]} columns.")
# Split "<Patient>_<Weeks>" at the LAST underscore only, so a patient id that
# itself contains an underscore still yields exactly two columns.
# Weeks stays a string here.
submissionDataFrame[['Patient','Weeks']] = submissionDataFrame.Patient_Week.str.rsplit("_", n = 1, expand = True)
# Re-arrange the columns; the sample file's FVC column is dropped by this selection.
submissionDataFrame =  submissionDataFrame[['Patient','Weeks','Confidence', 'Patient_Week']]
# Attach every remaining test.csv column (all except Weeks) to each requested Patient_Week row.
submissionDataFrame = submissionDataFrame.merge(testDataFrame.drop('Weeks', axis = 1), on = "Patient")

In [None]:
# Tag every row with its origin (train/test) so the two sets can be separated
# again after the shared feature engineering.
trainDataFrame['Source'] = 'train'
submissionDataFrame['Source'] = 'test'
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# reset_index() without drop=True keeps the previous row labels in an 'index'
# column, exactly as the original reset_index(inplace=True) did.
dataFrame = pd.concat([trainDataFrame, submissionDataFrame]).reset_index()
dataFrame.head()

In [None]:
def get_week(df):
    """Return a copy of ``df`` with per-patient baseline week columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Patient`` and ``Weeks``; ``Weeks`` may hold
        integers or integer-like strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where ``Weeks`` is cast to int and two columns are added:
        ``min_week`` (smallest ``Weeks`` value of the row's patient) and
        ``baselined_week`` (``Weeks - min_week``). The input frame is not modified.
    """
    # work on a copy so the caller's frame stays untouched
    copy = df.copy()
    # ensure all Weeks values are ints and not accidentally stored as strings
    copy['Weeks'] = copy['Weeks'].astype(int)
    # The former NaN pre-fill of min_week for test rows was dead code: the
    # groupby assignment below overwrites the whole column for every row.
    copy['min_week'] = copy.groupby('Patient')['Weeks'].transform('min')
    copy['baselined_week'] = copy['Weeks'] - copy['min_week']

    return copy

In [None]:
def get_base_FVC(df):
    """Return a copy of ``df`` with each patient's baseline FVC in ``base_FVC``.

    The baseline is the ``FVC`` of the first row (in frame order) whose
    ``Weeks`` equals the row's ``min_week``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Patient``, ``Weeks``, ``min_week`` and ``FVC``.

    Returns
    -------
    pandas.DataFrame
        Left-merge of ``df`` with the per-patient baseline; patients without a
        baseline row get NaN. ``min_week`` is kept in the result (the original
        ``drop`` call discarded its return value and therefore never removed it).
    """
    copy = df.copy()
    # rows measured at the patient's first week
    at_baseline = copy.loc[copy.Weeks == copy.min_week, ['Patient', 'FVC']]
    # exactly one baseline row per patient: the first one encountered
    base = (
        at_baseline
        .drop_duplicates(subset = 'Patient', keep = 'first')
        .rename(columns = {'FVC': 'base_FVC'})
    )
    # broadcast the baseline value onto every row of the same patient
    return copy.merge(base, on = 'Patient', how = 'left')

In [None]:
dataFrame = get_week(dataFrame)
dataFrame = get_base_FVC(dataFrame)
dataFrame.head()

In [None]:
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
#from sklearn.compose import ColumnTransformer

# Column groups used by the preprocessing below:
# - not_to_transform_attribute: columns to leave as they are
#   (NOTE(review): not referenced anywhere else in the visible notebook)
# - transform_Attributes: numeric columns that get a min-max scaled '<name>_scld' copy
# - categoricalFeature: categorical columns that get one-hot encoded
not_to_transform_attribute = ['Patient', 'Weeks', 'min_week']
transform_Attributes = ['FVC', 'Percent', 'Age', 'baselined_week', 'base_FVC']
categoricalFeature = ['Sex', 'SmokingStatus']

In [None]:
def own_MinMaxColumnScaler(df, columns):
    """Add min-max scaled copies of numeric columns to ``df`` in place.

    For every name in ``columns`` a new column ``<name>_scld`` is written,
    computed as ``(X - X.min) / (X.max - X.min)`` so values lie in [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    columns : iterable of str
        Names of the numeric columns to scale.

    Notes
    -----
    A constant column has a zero range; instead of producing 0/0 = NaN it is
    scaled to 0.0 (missing values stay NaN).
    """
    for col in columns:
        col_min = df[col].min()
        col_range = df[col].max() - col_min
        shifted = df[col] - col_min
        if col_range == 0:
            # avoid 0/0; multiplying keeps float dtype and leaves NaN as NaN
            df[col + '_scld'] = shifted * 0.0
        else:
            df[col + '_scld'] = shifted / col_range

In [None]:
def own_OneHotColumnCreator(df, columns):
    """One-hot encode categorical columns of ``df`` in place.

    For every distinct non-missing value found in each of ``columns`` a new
    0/1 integer column named after that value is added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    columns : iterable of str
        Names of the categorical columns to encode.
    """
    # iterate the `columns` argument (the original read an undefined global)
    for col in columns:
        # dropna(): a missing category must not create a column labelled NaN
        for value in df[col].dropna().unique():
            df[value] = (df[col] == value).astype(int)

In [None]:
## APPLY DEFINED TRANSFORMATIONS
# Both helpers modify dataFrame in place: the scaler adds '<name>_scld' columns,
# the encoder adds one 0/1 column per category value.
own_MinMaxColumnScaler(dataFrame, transform_Attributes)
own_OneHotColumnCreator(dataFrame, categoricalFeature)

# To inspect the result, display e.g. dataFrame[dataFrame.Source != "train"].head()

In [None]:
# Split the engineered frame back into its train and test parts.
# .copy() makes both independent frames, so adding columns to them later does
# not write to a slice of dataFrame (SettingWithCopyWarning).
trainDataFrame = dataFrame.loc[dataFrame.Source == 'train'].copy()
Submission = dataFrame.loc[dataFrame.Source == 'test'].copy()

In [None]:
# Model inputs: scaled numeric features plus the one-hot columns created above.
featureList = ['baselined_week_scld', 'Percent_scld', 'Age_scld', 'base_FVC_scld', 'Male', 'Female', 'Ex-smoker', 'Never smoked', 'Currently smokes']
#EPOCHS = 1000
EPOCHS = 100
#BATCH_SIZE = 128
BATCH_SIZE = 200
## LOSS; set tradeoff btw. Pinball-loss and adding score
_lambda = 0.8 # 0.8 default
## Optimizers
# `lr=` and `decay=` are deprecated/rejected by current Keras optimizers.
# InverseTimeDecay with decay_steps=1 reproduces the legacy `decay` behaviour:
#   lr_t = 0.1 / (1 + 0.01 * step)
ADAM = tf.keras.optimizers.Adam(
    learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate = 0.1, decay_steps = 1, decay_rate = 0.01),
    beta_1 = 0.9,
    beta_2 = 0.999,
)
SGD = tf.keras.optimizers.SGD()
# choose ADAM or SGD
optimizer = ADAM

In [None]:
# Constants for the loss function, as float32 tensors:
# C1 (70) is the lower bound applied to sigma, C2 (1000) the cap applied to the
# absolute FVC error inside score().
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")

# define competition metric
def score(y_true, y_pred):
    """Calculate the competition metric (lower is better as written here).

    Parameters
    ----------
    y_true : tensor
        Targets; column 0 is read as the true FVC.
    y_pred : tensor
        Predictions with three columns; column 1 is used as the FVC estimate
        and ``column 2 - column 0`` as the uncertainty ``sigma``.

    Returns
    -------
    tensor
        Mean over the batch of
        ``sqrt(2) * min(|FVC_true - FVC_pred|, C2) / max(sigma, C1)
        + log(sqrt(2) * max(sigma, C1))``.
    """
    # The casts must be assigned: tf.dtypes.cast returns a new tensor, so the
    # original bare calls had no effect.
    y_true = tf.dtypes.cast(y_true, tf.float32)
    y_pred = tf.dtypes.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    # sigma is floored at C1 and the absolute error capped at C2
    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.abs(y_true[:, 0] - fvc_pred)
    delta = tf.minimum(delta, C2)
    sq2 = tf.sqrt( tf.dtypes.cast(2, dtype = tf.float32) )
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    return K.mean(metric)

# define pinball loss
def qloss(y_true, y_pred):
    """Pinball (quantile) loss averaged over all samples and the three quantiles.

    Each of the three prediction columns is penalised with
    ``max(q * e, (q - 1) * e)`` where ``e = y_true - y_pred`` and ``q`` is the
    quantile level of that column.
    """
    # IMPORTANT: quantile levels of the three outputs, feel free to change here!
    quantile_levels = [0.2, 0.50, 0.8]
    q = tf.constant(np.array([quantile_levels]), dtype = tf.float32)
    # relies on broadcasting y_true against the three prediction columns
    # (assumes y_true arrives with a single column -- TODO confirm)
    residual = y_true - y_pred
    return K.mean(tf.maximum(q * residual, (q-1) * residual))

# combine competition metric and pinball loss to a joint loss function
def mloss(_lambda):
    """Build a loss that blends qloss and score.

    ``_lambda`` weights the pinball loss and ``1 - _lambda`` the competition
    metric. Returns a function ``loss(y_true, y_pred)``.
    """
    def loss(y_true, y_pred):
        pinball_part = _lambda * qloss(y_true, y_pred)
        metric_part = (1 - _lambda) * score(y_true, y_pred)
        return pinball_part + metric_part
    return loss

In [None]:
def get_model():
    """Create and compile a fresh quantile-regression network.

    Architecture: input of ``len(featureList)`` features, two 128-unit ReLU
    layers with dropout (0.25 and 0.2), then two 3-unit ReLU heads combined as
    ``p1 + cumsum(p2)``, so the three outputs are non-decreasing from column 0
    to column 2.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with ``mloss(_lambda)`` as loss, ``score`` as metric and
        its own copy of the configured optimizer.
    """
    inp = Layers.Input((len(featureList),), name = "Patient")
    x = Layers.Dense(128, activation = "relu", name = "d1")(inp)
    x = Layers.Dropout(0.25)(x)
    x = Layers.Dense(128, activation = "relu", name = "d2")(x)
    x = Layers.Dropout(0.2)(x)
    # base prediction shared by the three output columns
    p1 = Layers.Dense(3, activation = "relu", name = "p1")(x)
    # non-negative increments; their cumulative sum is added to p1
    p2 = Layers.Dense(3, activation = "relu", name = "p2")(x)
    preds = Layers.Lambda(lambda x: x[0] + tf.cumsum(x[1], axis = 1),
                     name = "preds")([p1, p2])

    model = Models.Model(inp, preds, name = "NeuralNet")
    # Build a new optimizer from the global one's configuration. Sharing a
    # single instance between models carries its step counter (and therefore
    # any learning-rate decay) from one model to the next, and newer Keras
    # versions refuse to reuse an optimizer with different variables.
    model_optimizer = type(optimizer).from_config(optimizer.get_config())
    model.compile(loss = mloss(_lambda), optimizer = model_optimizer, metrics = [score])

    return model

In [None]:
# Build one model only to display its layer summary; the cross-validation loop
# calls get_model() itself for every fold.
neuralNet = get_model()
neuralNet.summary()

In [None]:
# Target: FVC of every training row as a float array.
y = trainDataFrame['FVC'].values.astype(float)
# Feature matrices for the training rows and the rows to be submitted.
X_train = trainDataFrame[featureList].values
X_test = Submission[featureList].values

# Prediction buffers: one row per sample, one column per model output.
train_preds = np.zeros((len(X_train), 3))
test_preds = np.zeros((len(X_test), 3))
print(y.dtype)

In [None]:
#NFOLD = 10
NFOLD = 5
kf = KFold(n_splits = NFOLD)

# Start from empty buffers so that re-running this cell does not add a second
# round of fold predictions onto test_preds (it is accumulated with += below).
train_preds = np.zeros((X_train.shape[0], 3))
test_preds = np.zeros((X_test.shape[0], 3))

for count, (train_idx, val_idx) in enumerate(kf.split(X_train), start = 1):
    print(f"FOLD {count}:")

    # create and fit model
    # (`workers` only concerns generator inputs and is rejected by Keras 3,
    # so it is not passed for these in-memory arrays)
    net = get_model()
    net.fit(
        X_train[train_idx], y[train_idx],
        batch_size = BATCH_SIZE,
        epochs = EPOCHS,
        verbose = 1,
        validation_data = (X_train[val_idx], y[val_idx]),
        shuffle = True,
    )

    # evaluate
    print("Train:", net.evaluate(X_train[train_idx], y[train_idx], verbose = 1, batch_size = BATCH_SIZE))
    print("Val:", net.evaluate(X_train[val_idx], y[val_idx], verbose = 0, batch_size = BATCH_SIZE))

    # out-of-fold predictions for the held-out training rows
    train_preds[val_idx] = net.predict(X_train[val_idx], batch_size = BATCH_SIZE, verbose = 0)

    # test predictions are averaged over the folds
    print("Predicting Test...")
    test_preds += net.predict(X_test, batch_size = BATCH_SIZE, verbose = 0) / NFOLD

In [None]:
# Spread between the third and first predicted column for every training row, and its mean.
sigma_uncertain = train_preds[:, 2] - train_preds[:, 0]
sigma_mean = sigma_uncertain.mean()
# Mean absolute error of the middle prediction column against the target.
sigma_opt = mean_absolute_error(y, train_preds[:, 1])
print(sigma_opt, sigma_mean)

In [None]:
# Preview the test-side rows before the predictions are attached.
Submission.head()

In [None]:
# Attach the fold-averaged predictions: FVC1 is the middle output column,
# Confidence1 the spread between the third and first output column.
# assign() returns a new frame, so no column is written onto a slice of
# another DataFrame (avoids SettingWithCopyWarning).
Submission = Submission.assign(
    FVC1 = test_preds[:, 1],
    Confidence1 = test_preds[:, 2] - test_preds[:, 0],
)

# get rid of unused data and show some non-empty data
submission = Submission[['Patient_Week','FVC','Confidence','FVC1','Confidence1']].copy()
submission.loc[~submission.FVC1.isnull()].head(10)

In [None]:
# Rows for which the model produced a prediction.
has_pred = submission['FVC1'].notna()

# Make both target columns float first: writing float predictions into integer
# columns through .loc relies on silent upcasting, which pandas has deprecated.
submission = submission.astype({'FVC': float, 'Confidence': float})

submission.loc[has_pred, 'FVC'] = submission.loc[has_pred, 'FVC1']

if sigma_mean < 70:
    # mean predicted spread is below 70: use the out-of-fold MAE for every row
    submission['Confidence'] = sigma_opt
else:
    submission.loc[has_pred, 'Confidence'] = submission.loc[has_pred, 'Confidence1']

In [None]:
# Preview the filled submission frame.
submission.head()

In [None]:
# Summary statistics of the numeric submission columns (transposed for readability).
submission.describe().T

In [None]:
org_test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

# For every Patient_Week that is actually present in test.csv, submit the
# recorded FVC with Confidence 70 instead of the model output.
# Vectorised lookup replaces the per-row loop that rebuilt the same mask twice.
known_fvc = pd.Series(
    org_test['FVC'].to_numpy(),
    index = org_test['Patient'] + '_' + org_test['Weeks'].astype(str),
)
# if a key occurred twice, the last row wins (same outcome as the former loop)
known_fvc = known_fvc[~known_fvc.index.duplicated(keep = 'last')]

is_known = submission['Patient_Week'].isin(known_fvc.index)
submission.loc[is_known, 'FVC'] = submission.loc[is_known, 'Patient_Week'].map(known_fvc).to_numpy()
submission.loc[is_known, 'Confidence'] = 70

In [None]:
# Write the three submission columns to the Kaggle working directory, without the index.
submission[["Patient_Week","FVC","Confidence"]].to_csv("/kaggle/working/submission.csv", index = False)