In [1]:
# TODO: clamp log_sigma and the absolute FVC difference inside the loss
# TODO: normalize the input data (weeks, FVC and the tabular features)
# TODO: ...

In [2]:
%config Completer.use_jedi = False

import os
import platform
from collections import namedtuple
import time

# import tqdm
import pandas as pd
import numpy as np
import sparse

from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.nn.modules.loss import _Loss
from torch.optim.lr_scheduler import _LRScheduler
# from torchvision import transforms
# from torchsummary import summary
# from efficientnet_pytorch_3d import EfficientNet3D
from my_efficientnet_pytorch_3d import EfficientNet3D

# from utils import *


########################

# Detect Kaggle by markers that Kaggle itself provides rather than by the OS name:
# the previous check ('linux' in platform.platform()) flagged every Linux machine
# as Kaggle and pointed it at the placeholder PROCESSED_PATH below.
RUNNING_IN_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ or os.path.isdir('/kaggle/input')
IMAGE_PATH = "../input/osic-pulmonary-fibrosis-progression/" if RUNNING_IN_KAGGLE else 'data/'
PROCESSED_PATH = 'FIX IT!' if RUNNING_IN_KAGGLE else 'data/processed-data/'  # TODO: fix this line

# Default tensor dtype and compute device used by the model code in this notebook.
dtype = torch.float32
USE_GPU = True
# Fall back to the CPU when no CUDA device is available (or when USE_GPU is off).
device = torch.device('cuda:0' if USE_GPU and torch.cuda.is_available() else 'cpu')

In [3]:
class CTDataset(Dataset):
    """Per-patient dataset combining pre-processed CT data with tabular records.

    Every entry of ``root`` is treated as one patient folder that holds
    ``meta.npy``, ``masks.npz`` and ``images.npy``. The CSV at ``csv_path`` is
    read once in the constructor and supplies, per patient, the visit weeks,
    the FVC measurements and the demographic features.

    Indexing returns a ``ReturnValue`` named tuple with fields
    ``weeks``, ``fvcs``, ``features``, ``masks`` and ``images``.
    """

    _ReturnValue = namedtuple('ReturnValue', ['weeks', 'fvcs', 'features', 'masks', 'images'])

    # Meta keys whose most frequent value is copied into the feature vector as one scalar.
    _SCALAR_META_KEYS = frozenset({'SliceThickness', 'TableHeight', 'WindowCenter', 'WindowWidth'})

    # One-hot encoding of the SmokingStatus column; unknown statuses map to all zeros.
    _SMOKING_ONE_HOT = {
        'Ex-smoker': [1, 0, 0],
        'Never smoked': [0, 1, 0],
        'Currently smokes': [0, 0, 1],
    }

    def __init__(
            self, root, csv_path, train=True, test_size=0.25, random_state=42):
        """
        :param root: directory with one sub-entry per patient
        :param csv_path: CSV file with columns Patient, Weeks, FVC, Age, Sex, SmokingStatus
        :param train: if truthy, index the train split, otherwise the test split
        :param test_size: forwarded to ``train_test_split``; ``0`` puts every patient
            into the train split and leaves the test split empty
        :param random_state: forwarded to ``train_test_split``
        :raises ValueError: if ``root`` does not exist, or if a patient folder has
            no rows in the CSV
        """
        assert test_size is not None

        self.root = root
        self.train = train
        self.csv_path = csv_path
        self.test_size = test_size
        self.random_state = random_state

        if not os.path.exists(self.root):
            raise ValueError('Data is missing')

        # Sorted so that the split is reproducible regardless of listing order.
        self._patients = sorted(os.listdir(self.root))

        if self.test_size == 0:
            self._train_patients, self._test_patients = self._patients, []
        else:
            self._train_patients, self._test_patients = train_test_split(
                self._patients, test_size=self.test_size, random_state=random_state
            )

        table_data = pd.read_csv(self.csv_path)
        self._table_features = {
            patient: self._tabular_features(table_data, patient)
            for patient in self._patients
        }

    @staticmethod
    def _most_frequent(values, axis=None):
        """Return the most frequent entry of ``values``; ties go to the entry that sorts last."""
        unique_values, counts = np.unique(values, return_counts=True, axis=axis)
        # sorted() is stable, so among equal counts the last unique value wins.
        return sorted(zip(unique_values, counts), key=lambda item: item[1])[-1][0]

    @classmethod
    def _tabular_features(cls, table_data, patient):
        """Build ``(weeks, fvcs, [age, sex one-hot (2), smoking one-hot (3)])`` for one patient."""
        patient_data = table_data[table_data.Patient == patient]
        if patient_data.empty:
            # Previously this surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError(f'Patient {patient!r} has no rows in the CSV table')

        # Order the measurements chronologically, keeping weeks and FVCs aligned.
        measurements = sorted(
            zip(patient_data.Weeks.tolist(), patient_data.FVC.tolist()),
            key=lambda pair: pair[0]
        )
        all_weeks, all_fvcs = zip(*measurements)

        age = cls._most_frequent(patient_data.Age)
        sex = cls._most_frequent(patient_data.Sex)
        smoking_status = cls._most_frequent(patient_data.SmokingStatus)

        sex_one_hot = [0, 1] if sex == 'Female' else [1, 0]
        smoking_one_hot = cls._SMOKING_ONE_HOT.get(str(smoking_status), [0, 0, 0])

        return all_weeks, all_fvcs, [age] + sex_one_hot + smoking_one_hot

    def __getitem__(self, index):
        """
        Load one patient of the active split.

        Args:
            index (int): position inside the train split (``train`` truthy) or the test split
        Returns:
            ReturnValue: named tuple ``(weeks, fvcs, features, masks, images)``.
            ``features`` holds the meta-derived values, in the iteration order of
            the stored meta dict, followed by the tabular features.
        """
        patients = self._train_patients if self.train else self._test_patients
        patient = patients[index]
        base_path = os.path.join(self.root, patient)

        # NOTE(security): allow_pickle=True unpickles the file and can execute
        # arbitrary code, so only load meta.npy files from a trusted source.
        meta = np.load(os.path.join(base_path, 'meta.npy'), allow_pickle=True).tolist()
        masks = sparse.load_npz(os.path.join(base_path, 'masks.npz'))
        images = np.load(os.path.join(base_path, 'images.npy'))

        meta_processed = dict()
        for key, values in meta.items():
            # Only keys that end up in the feature vector are aggregated; all other
            # keys (e.g. SliceLocation, InstanceNumber, PatientPosition) are ignored.
            if key in self._SCALAR_META_KEYS:
                most_frequent = np.array(self._most_frequent(values, axis=0)).reshape(-1)
                meta_processed[key] = most_frequent[0]
            elif key == 'PixelSpacing':
                most_frequent = np.array(self._most_frequent(values, axis=0)).reshape(-1)
                # A single stored value is used for both axes.
                spacing_y = most_frequent[0] if len(most_frequent) == 1 else most_frequent[1]
                meta_processed['PixelSpacingX'] = most_frequent[0]
                meta_processed['PixelSpacingY'] = spacing_y

        all_weeks, all_fvcs, features = self._table_features[patient]
        features = list(meta_processed.values()) + features

        return CTDataset._ReturnValue(weeks=all_weeks, fvcs=all_fvcs, features=features, masks=masks, images=images)

    def __len__(self):
        """Number of patients in the active split."""
        return len(self._train_patients if self.train else self._test_patients)

    def __repr__(self):
        fmt_str = 'OSIC Pulmonary Fibrosis Progression Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        # Use truthiness, like __getitem__/__len__, so the reported split matches the indexed one.
        split_name = 'train' if self.train else 'test'
        fmt_str += '    Split: {}\n'.format(split_name)
        fmt_str += '    Root Location: {}\n'.format(self.root)
        return fmt_str


class LaplaceLoss(_Loss):
    """Mean negative log-likelihood of a Laplace distribution with std ``exp(log_sigma)``.

    Per element: ``sqrt(2) * |y_true - preds| / sigma + log_sigma + log(2) / 2``.
    ``log_sigma`` is clamped to ``[-5, 5]`` before use.
    """

    _SQRT_2 = float(np.sqrt(2))
    _HALF_LOG_2 = float(np.log(2) / 2)

    def forward(self, y_true, preds, log_sigma):
        """
        :param y_true: tensor of target values
        :param preds: tensor of predicted values, broadcastable with ``y_true``
        :param log_sigma: tensor with the predicted log standard deviation
        :return: scalar tensor with the mean loss
        """
        abs_diff = (y_true - preds).abs()
#         abs_diff.clamp_max_(1_000)
        # Out-of-place clamp: the in-place variant silently overwrote the caller's
        # tensor and raises on a leaf tensor that requires grad.
        log_sigma = log_sigma.clamp(-5, 5)  # -np.log(70), np.log(70)
        losses = self._SQRT_2 * abs_diff / log_sigma.exp() + log_sigma + self._HALF_LOG_2
        return losses.mean()


class SqueezeLayer(nn.Module):
    """Layer that removes every dimension of size 1 from its input.

    Note that this includes a leading batch dimension when the batch size is 1.
    """

    def forward(self, x):
        squeezed = torch.squeeze(x)
        return squeezed


class FeatureExtractor(nn.Module):
    """Adapter that exposes ``net.extract_features`` as a regular module.

    The input gets two leading singleton dimensions prepended before it is
    handed to the wrapped network.
    """

    def __init__(self, net):
        super().__init__()
        self.net = net

    def forward(self, x):
        # Indexing with None twice prepends two axes of size 1.
        expanded = x[None, None]
        return self.net.extract_features(expanded)


class OSICNet(nn.Module):
    """EfficientNet3D feature extractor followed by a small MLP head.

    ``forward`` masks and standardises one patient's CT volume, extracts a
    feature vector from it and runs the predictor head once per
    ``(week, FVC)`` pair of that patient. The head emits 5 numbers per pass.
    NOTE(review): an earlier draft of ``forward`` used them as four cubic
    polynomial coefficients in ``weeks`` plus ``log_sigma`` for ``LaplaceLoss``
    - confirm before relying on that layout.
    """

    # Constants used to standardise the masked volume.
    # NOTE(review): presumably the dataset-wide mean/std of masked volumes - confirm.
    _VOLUME_MEAN = -971.4692260919278
    _VOLUME_STD = 117.84143467421829
    # Value written to every voxel outside the lung mask.
    _BACKGROUND_VALUE = -1000

    def __init__(self, dtype, device, efficient_net_model_number, hidden_size, dropout_rate):
        """
        :param dtype: torch dtype of the input tensor built in ``forward``
        :param device: device the sub-modules and the input tensor are placed on
        :param efficient_net_model_number: N in ``efficientnet-bN``
        :param hidden_size: width of the two hidden layers of the predictor head
        :param dropout_rate: dropout probability used in the predictor head
        """
        super().__init__()

        self.dtype = dtype
        self.device = device

        self.CT_features_extractor = nn.Sequential(
            FeatureExtractor(
                EfficientNet3D.from_name(
                    f'efficientnet-b{efficient_net_model_number}', override_params={'num_classes': 1}, in_channels=1
                )
            ),
            nn.AdaptiveAvgPool3d(1),
            SqueezeLayer()
        )

        self.predictor = nn.Sequential(
            # 1280 input features come from the CT extractor only; tabular
            # features are not concatenated yet.
            nn.Linear(1280, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, 5)
        )

        self._initialize_weights()

        self.CT_features_extractor.to(self.device)
        self.predictor.to(self.device)

    def _initialize_weights(self):
        """Re-initialise every Conv3d, BatchNorm3d and Linear layer of the model."""
        for m in self.modules():
            if isinstance(m, torch.nn.Conv3d):
                # Normal init with variance 2 / (kernel volume * out_channels).
                n = m.kernel_size[0] * m.kernel_size[1] * m.kernel_size[2] * m.out_channels
                m.weight.data.normal_(0, np.sqrt(2. / n))
            elif isinstance(m, torch.nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def _standardized_lungs(self, data):
        """Return the masked, standardised CT volume of ``data`` as a tensor on ``self.device``."""
        masks = data.masks
        # Sparse masks (e.g. sparse.COO) are densified first so that torch.tensor
        # receives a plain ndarray instead of a sparse container.
        if hasattr(masks, 'todense'):
            masks = masks.todense()
        # Keep the image inside the mask and write the background value elsewhere.
        lungs = self._BACKGROUND_VALUE * (1.0 - masks) + masks * data.images
        lungs = (lungs - self._VOLUME_MEAN) / self._VOLUME_STD
        return torch.tensor(lungs, dtype=self.dtype, device=self.device)

    def forward(self, data):
        """
        :param data: object with ``weeks``, ``fvcs``, ``masks`` and ``images`` attributes
            (e.g. one item of ``CTDataset``)
        :return: list with one CPU tensor of predictor outputs per ``(week, FVC)`` pair
        """
        lungs_features = self.CT_features_extractor(self._standardized_lungs(data))

        # TODO: feed the tabular features (week, FVC, data.features) to the predictor as
        # well; until then every pass sees the same input. The head is still run once
        # per pair because dropout makes the passes differ in training mode.
        return [self.predictor(lungs_features).cpu() for _ in zip(data.weeks, data.fvcs)]


class LinearDecayLR(_LRScheduler):
    """Piecewise-linear learning-rate schedule shared by all parameter groups.

    The rate is ``start_lr`` before ``start_epoch``, ``stop_lr`` from
    ``stop_epoch`` on, and linearly interpolated in between.
    """

    def __init__(self, optimizer, start_epoch, stop_epoch, start_lr, stop_lr, last_epoch=-1):
        """
        :param optimizer: wrapped optimizer
        :param start_epoch: last epoch that still uses ``start_lr``
        :param stop_epoch: first epoch that uses ``stop_lr``
        :param start_lr: learning rate before the decay starts
        :param stop_lr: learning rate after the decay has finished
        :param last_epoch: index of the last epoch, forwarded to the base class
        """
        # The schedule parameters must exist before the base constructor runs,
        # because it performs an initial step that calls get_lr(). The base
        # constructor also stores ``optimizer`` and ``last_epoch`` itself.
        self.start_epoch = start_epoch
        self.stop_epoch = stop_epoch

        self.start_lr = start_lr
        self.stop_lr = stop_lr

        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> list:
        """Return the learning rate for ``self.last_epoch``, once per parameter group."""
        if self.last_epoch < self.start_epoch:
            new_lr = self.start_lr
        elif self.last_epoch >= self.stop_epoch:
            # ``>=`` (not ``>``) returns stop_lr exactly at stop_epoch and avoids a
            # division by zero when start_epoch == stop_epoch.
            new_lr = self.stop_lr
        else:
            new_lr = self.start_lr + (
                (self.stop_lr - self.start_lr) *
                (self.last_epoch - self.start_epoch) /
                (self.stop_epoch - self.start_epoch)
            )
        return [new_lr for _ in self.optimizer.param_groups]

In [None]:
# train_dataset = CTDataset(
#     f'{PROCESSED_PATH}/train',
#     f'{IMAGE_PATH}/train.csv',
#     train=True, test_size=0.25, random_state=42
# )

# test_dataset = CTDataset(
#     f'{PROCESSED_PATH}/train',
#     f'{IMAGE_PATH}/train.csv',
#     train=False, test_size=0.25, random_state=42
# )

In [4]:
# Dataset over every patient: test_size=0.0 keeps all of them in the train split.
dataset_all = CTDataset(
    root=f'{PROCESSED_PATH}/train',
    csv_path=f'{IMAGE_PATH}/train.csv',
    train=True,
    test_size=0.0,
    random_state=42,
)

In [7]:
# Smoke test: load the first patient. The displayed repr includes the `images`
# array and the sparse `masks` summary, so the cell output is long.
dataset_all[0]

ReturnValue(weeks=(-4, 5, 7, 9, 11, 17, 29, 41, 57), fvcs=(2315, 2214, 2061, 2144, 2069, 2101, 2000, 2064, 2057), features=[0.390625, 1.3046879768371582, 0.1953125, 130.0, -500.0, -1500.0, 79, 1, 0, 1, 0, 0], masks=<COO: shape=(192, 256, 256), dtype=bool, nnz=1535057, fill_value=False>, images=array([[[-1000., -1000., -1000., ..., -1000., -1000., -1000.],
        [-1000., -1000., -1000., ..., -1000., -1000., -1000.],
        [-1000., -1000., -1000., ..., -1000., -1000., -1000.],
        ...,
        [-1000., -1000., -1000., ..., -1000., -1000., -1000.],
        [-1000., -1000., -1000., ..., -1000., -1000., -1000.],
        [-1000., -1000., -1000., ..., -1000., -1000., -1000.]],

       [[-1000., -1000., -1000., ..., -1000., -1000., -1000.],
        [-1000., -1000., -1000., ..., -1000., -1000., -1000.],
        [-1000., -1000., -1000., ..., -1000., -1000., -1000.],
        ...,
        [-1000., -1000., -1000., ..., -1000., -1000., -1000.],
        [-1000., -1000., -1000., ..., -1000., -

In [8]:
# images = [-1000 * (1.0 - dataset_all[i].masks) + dataset_all[i].masks * dataset_all[i].images
#           for i in range(len(dataset_all))]

# Keep only the tabular part (weeks, fvcs, features) of every patient.
# Each `dataset_all[i]` access reads that patient's files from disk, so the item
# is fetched once per patient instead of once per field.
data = [
    (sample.weeks, sample.fvcs, sample.features)
    for sample in (dataset_all[i] for i in range(len(dataset_all)))
]

In [11]:
# Number of patients collected in `data` (one entry per item of `dataset_all`).
len(data)

176

In [13]:
# Inspect the first patient's (weeks, fvcs, features) tuple.
data[0]

((-4, 5, 7, 9, 11, 17, 29, 41, 57),
 (2315, 2214, 2061, 2144, 2069, 2101, 2000, 2064, 2057),
 [0.390625,
  1.3046879768371582,
  0.1953125,
  130.0,
  -500.0,
  -1500.0,
  79,
  1,
  0,
  1,
  0,
  0])

In [21]:
# Pool the visit weeks of all patients into one flat list, then report mean and std.
all_weeks = [week for entry in data for week in entry[0]]

weeks = np.array(all_weeks)
weeks.mean(), weeks.std()

(31.861846352485475, 23.240045178171002)

In [22]:
# Pool the FVC measurements of all patients into one flat list, then report mean and std.
all_FVCs = [fvc for entry in data for fvc in entry[1]]

FVCs = np.array(all_FVCs)
FVCs.mean(), FVCs.std()

(2690.479018721756, 832.5021066817238)

In [None]:
# Stack the per-patient feature vectors into one array with a row per patient.
other_ftrs = np.array([list(entry[2]) for entry in data])

In [35]:
# pd.DataFrame(other_ftrs).describe().loc[['mean', 'std']]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
mean,2.765619,1.423738,1.256083,133.76608,-523.857955,-1241.545455,67.261364,0.789773,0.210227,0.670455,0.278409,0.051136
std,2.354473,0.147985,0.993669,58.766154,192.607394,850.688735,7.088009,0.408632,0.408632,0.471389,0.449495,0.220904


In [36]:
# Per-feature mean across patients (axis 0 indexes patients).
other_ftrs.mean(axis=0)

array([ 2.76561876e+00,  1.42373805e+00,  1.25608294e+00,  1.33766080e+02,
       -5.23857955e+02, -1.24154545e+03,  6.72613636e+01,  7.89772727e-01,
        2.10227273e-01,  6.70454545e-01,  2.78409091e-01,  5.11363636e-02])

In [37]:
# Per-feature standard deviation across patients. NumPy defaults to the population
# std (ddof=0), whereas pandas' describe() reports the sample std (ddof=1).
other_ftrs.std(axis=0)

array([2.34777445e+00, 1.47563586e-01, 9.90841780e-01, 5.85989667e+01,
       1.92059435e+02, 8.48268563e+02, 7.06784382e+00, 4.07469958e-01,
       4.07469958e-01, 4.70048134e-01, 4.48215873e-01, 2.20275818e-01])

In [25]:
# Sanity check: (number of patients, number of features per patient).
other_ftrs.shape

(176, 12)

In [38]:
# Means in the order: week, FVC, then the per-patient features.
all_means = np.concatenate(([weeks.mean(), FVCs.mean()], other_ftrs.mean(axis=0)))
all_means

array([ 3.18618464e+01,  2.69047902e+03,  2.76561876e+00,  1.42373805e+00,
        1.25608294e+00,  1.33766080e+02, -5.23857955e+02, -1.24154545e+03,
        6.72613636e+01,  7.89772727e-01,  2.10227273e-01,  6.70454545e-01,
        2.78409091e-01,  5.11363636e-02])

In [39]:
# Standard deviations in the order: week, FVC, then the per-patient features.
all_stds = np.concatenate(([weeks.std(), FVCs.std()], other_ftrs.std(axis=0)))
all_stds

array([2.32400452e+01, 8.32502107e+02, 2.34777445e+00, 1.47563586e-01,
       9.90841780e-01, 5.85989667e+01, 1.92059435e+02, 8.48268563e+02,
       7.06784382e+00, 4.07469958e-01, 4.07469958e-01, 4.70048134e-01,
       4.48215873e-01, 2.20275818e-01])

In [4]:
# Global mean / std of the masked CT volumes, accumulated one patient at a time.
# The volumes are built here from `dataset_all` (the list `images` this cell used
# to iterate over was only defined in commented-out code), and the voxel count is
# taken from the data instead of being hard-coded.
def _masked_volumes(dataset):
    """Yield each patient's volume with every voxel outside the mask set to -1000."""
    for idx in range(len(dataset)):
        sample = dataset[idx]
        masks = sample.masks
        # Densify sparse masks so the arithmetic below yields a plain ndarray.
        if hasattr(masks, 'todense'):
            masks = masks.todense()
        yield -1000 * (1.0 - masks) + masks * sample.images


sum_image = 0.0
sum_sq_image = 0.0
N = 0
for image in _masked_volumes(dataset_all):
    sum_image += image.sum()
    sum_sq_image += (image ** 2).sum()
    N += image.size

if N == 0:
    raise ValueError('No voxels found: dataset_all is empty')

mean = sum_image / N

# Var[x] = E[x^2] - E[x]^2; clip at 0 to guard against round-off.
var = max(sum_sq_image / N - mean ** 2, 0.0)

std = var ** 0.5

mean, std