# In [None]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

def laplace_log_likelihood(y_true, y_pred, sigma, *, sigma_min=70, delta_max=1000):
    """Return the mean clipped Laplace log-likelihood score.

    For each sample the score is

        -sqrt(2) * delta / sigma_c - log(sqrt(2) * sigma_c)

    where ``delta = min(|y_true - y_pred|, delta_max)`` and
    ``sigma_c = max(sigma, sigma_min)``. Higher (closer to zero) is better.

    Args:
        y_true: Observed values; scalar or array-like.
        y_pred: Predicted values; scalar or array-like broadcastable
            against ``y_true``.
        sigma: Predicted uncertainty per sample; scalar or array-like
            broadcastable against the other inputs.
        sigma_min: Lower bound applied to ``sigma`` before scoring.
            Defaults to 70, the value previously hard-coded here.
        delta_max: Upper bound applied to the absolute error before
            scoring. Defaults to 1000, the value previously hard-coded here.

    Returns:
        The mean score over all samples as a NumPy float.
    """
    # Coerce up front so plain Python lists/tuples work, not just ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    sigma = np.asarray(sigma, dtype=float)

    sigma_clipped = np.maximum(sigma, sigma_min)
    delta_clipped = np.minimum(np.abs(y_true - y_pred), delta_max)

    root_two = np.sqrt(2)
    score = -root_two * delta_clipped / sigma_clipped - np.log(root_two * sigma_clipped)
    return np.mean(score)

# Directory holding the competition CSV files (trailing slash required,
# because file names are appended by plain string concatenation).
MYPATH = 'E:/OSIC-Pul/'

# Numeric codes for the two categorical columns. Smoking status is mapped
# onto a 0 / 0.5 / 1 scale.
sex_codes = {"Male": 0, "Female": 1}
smoking_codes = {"Never smoked": 0, "Ex-smoker": 0.5, "Currently smokes": 1}
encoder = {"Sex": sex_codes, "SmokingStatus": smoking_codes}

# Load the training table and swap the category labels for their codes.
trainData = pd.read_csv(MYPATH + 'train.csv').replace(encoder)

# Key each measurement as "<Patient>_<Weeks>".
trainData['Patient_Week'] = trainData['Patient'] + '_' + trainData['Weeks'].astype(str)
patients = trainData.groupby('Patient')

# Per-patient baseline features, both taken from the patient's first row in
# file order: the FVC on that row, and the weeks elapsed since that row.
# The per-patient frames are collected and concatenated once. The previous
# DataFrame.append call was removed in pandas 2.0 and re-copied the growing
# table on every iteration.
patient_frames = [
    patient.assign(
        Baseline_FVC=patient['FVC'].iloc[0],
        Weeks_Since=patient['Weeks'] - patient['Weeks'].iloc[0],
    )
    for _, patient in patients
]
if patient_frames:
    trainData = pd.concat(patient_frames)
else:
    # No rows at all: keep an empty table that still has the derived columns.
    trainData = trainData.assign(Baseline_FVC=[], Weeks_Since=[])

# Design matrix and target. The float dtype is forced because the encoded
# categorical columns may otherwise leave the matrix as an object array
# (depends on how pandas types the replaced columns).
X = trainData[['Age', 'Sex', 'SmokingStatus', 'Baseline_FVC', 'Weeks_Since']].to_numpy(dtype=float)
Y = trainData['FVC'].to_numpy()

# Fit the FVC model on every available row (there is no hold-out split).
x_train, y_train = X, Y
regr = LinearRegression().fit(x_train, y_train)

# In-sample FVC predictions
y_pred = regr.predict(x_train)

# Target for the confidence model. For a fixed absolute error `delta`, the
# Laplace log-likelihood score is maximised at sigma = sqrt(2) * delta, so
# that value is used as the per-row "ideal" sigma.
abs_error = np.abs(y_train - y_pred)
deltas = math.sqrt(2) * abs_error

# Original author's note: these values are exponentially distributed.
# The mean is kept for inspection; it is not read elsewhere in the visible script.
lambda_hat = deltas.mean()

# Second linear model: the same features, regressed onto the ideal sigma.
sigma_train = deltas
regs = LinearRegression().fit(x_train, sigma_train)
sigma_pred = regs.predict(x_train)

# Evaluate performance. All figures below are in-sample: they are computed
# on the same rows the two models were fitted on.
fvc_mse = mean_squared_error(y_train, y_pred)
fvc_r2 = r2_score(y_train, y_pred)
print(f'FVC MSE: {fvc_mse:.2f}')
print(f'FVC R2: {fvc_r2:.2f}')

sigma_mse = mean_squared_error(sigma_train, sigma_pred)
sigma_r2 = r2_score(sigma_train, sigma_pred)
print('Sigma Coefs: \n', regs.coef_)
print(f'Sigma MSE: {sigma_mse:.2f}')
print(f'Sigma R2: {sigma_r2:.2f}')

# Spread of the sigma targets (kept for inspection; not used below).
stdev = np.std(deltas)

# Score all rows in one vectorized call, with the arguments in the order the
# function declares (y_true, y_pred, sigma). The function already averages
# over rows, so this equals the mean of the former per-row scores.
metric = laplace_log_likelihood(y_train, y_pred, sigma_pred)

print('toy metric: ' + str(metric))

#plt.subplot(221)
#plt.scatter(y_train,y_pred, marker='.')

#plt.subplot(222)
#plt.hist(y_pred-y_train,50)

#plt.subplot(223)
#plt.scatter(sigma_train,sigma_pred, marker='.')

#plt.subplot(224)
#plt.hist(sigma_train,50)
#plt.show()

# Build the test inputs.
# In test.csv each patient's FVC/Weeks pair is renamed so it can serve as the
# baseline columns; categorical columns get the same codes as the training data.
baseline_columns = {'FVC' : 'Baseline_FVC', 'Weeks' : 'Baseline_Weeks'}
baseline = pd.read_csv(MYPATH + 'test.csv').rename(columns=baseline_columns).replace(encoder)

# The sample submission lists every "<Patient>_<Weeks>" key to predict;
# split each key back into its patient id and integer week.
submission = pd.read_csv(MYPATH + 'sample_submission.csv')
week_keys = submission['Patient_Week'].map(lambda key: key.split('_'))
submission['Patient'] = week_keys.map(lambda parts: parts[0])
submission['Weeks'] = week_keys.map(lambda parts: parts[1]).astype(int)

# One row per requested key, carrying that patient's baseline columns.
test = submission.merge(baseline, on='Patient')
test['Weeks_Since'] = test['Weeks'] - test['Baseline_Weeks']

# Predict FVC and its confidence with the two fitted linear models.
feature_names = ['Age', 'Sex', 'SmokingStatus', 'Baseline_FVC', 'Weeks_Since']
x_test = test[feature_names].to_numpy()
y_test = regr.predict(x_test)
sigma_test = regs.predict(x_test)

# Overwrite the placeholder columns with the predictions.
test['FVC'] = y_test
test['Confidence'] = sigma_test

# Write only the three submission columns, without the index.
submission = test[['Patient_Week', 'FVC', 'Confidence']]
submission.to_csv('submission.csv', index=False)
