In [1]:
import sys

from pathlib import Path
from PIL import ImageDraw, ImageFont, Image
from matplotlib import patches, patheffects
import time
from random import randint
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,log_loss
from scipy.stats import ks_2samp

import pdb

import scipy as sp
from tqdm import tqdm, tqdm_notebook

import os
import glob

import torch
torch.cuda.current_device()

import torch.nn as nn
import torch.utils.data as D
import torch.nn.functional as F
import torch.utils as U

import torchvision
from torchvision import transforms as T
from torchvision import models as M

import matplotlib.pyplot as plt

# Project root and the working directory that holds the pickled indexes, metadata CSV and cached features.
# NOTE(review): these are absolute, machine-specific paths — adjust them before running on another machine.
PATH = Path('C:/StudioProjects/Hemorrhage')
PATH_WORK = Path('C:/StudioProjects/Hemorrhage/running')

import lightgbm as lgb
from collections import defaultdict, Counter
import random
import seaborn as sn
from eli5.permutation_importance import get_score_importances

# Let pandas display up to 100 columns before truncating wide frames.
pd.set_option("display.max_columns", 100)

# Names of the label columns that are read from the metadata as prediction targets.
all_ich = ['any','epidural','intraparenchymal','intraventricular','subarachnoid','subdural']

sys.path.insert(0, "C:\\fastai")
from fastai import *
from fastai.vision import *
from fastai.tabular import *
from fastprogress import *

In [26]:
# Debug switch: when True, RSNA_DataSet.__len__ and train_one() only use the first 1% of the series.
DATA_SMALL = True

In [2]:
# Names of the categorical and float covariate columns that are appended to the image features.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
with open(PATH_WORK/'covs', 'rb') as covs_file:  # context manager closes the handle
    cols_cat, cols_float = pickle.load(covs_file)

In [3]:
# Array of image ids plus the index arrays that pick the training / validation ids out of it.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
filename = PATH_WORK/'yuvals_indexes_file.pkl'
with open(filename, 'rb') as index_file:  # context manager closes the handle
    all_idx, train_ids, val_ids = pickle.load(index_file)

# Sorting keeps the rows of each series together and ordered by pos_idx; RSNA_DataSet relies on this order.
train_md = pd.read_csv(PATH_WORK/'train_md.csv').sort_values(['SeriesInstanceUID','pos_idx'])
# The image id is the second '_'-separated token of SOPInstanceUID.
train_md['img_id'] = train_md.SOPInstanceUID.str.split('_').apply(lambda parts: parts[1])

In [4]:
# Metadata rows of the training and validation images, selected through the pickled index arrays.
trn_data = train_md[train_md['img_id'].isin(all_idx[train_ids])]
val_data = train_md[train_md['img_id'].isin(all_idx[val_ids])]

In [5]:
# Sanity check: the series counts of the two splits add up to the overall series count.
assert (
    trn_data['SeriesInstanceUID'].nunique(dropna=False)
    + val_data['SeriesInstanceUID'].nunique(dropna=False)
    == train_md['SeriesInstanceUID'].nunique(dropna=False)
)

In [6]:
# Sanity check on patients; `>=` (not `==`) tolerates a patient showing up in both splits.
assert (
    trn_data['PatientID'].nunique(dropna=False)
    + val_data['PatientID'].nunique(dropna=False)
    >= train_md['PatientID'].nunique(dropna=False)
)

In [7]:
# One row per entry of all_idx (same order), annotated with its series and position within the series.
ids_df = (
    pd.DataFrame(all_idx, columns=['img_id'])
    .join(train_md.set_index('img_id')[['SeriesInstanceUID', 'pos_idx']], on='img_id')
)

# Expected number of distinct series in this data set.
assert ids_df['SeriesInstanceUID'].nunique(dropna=False) == 19530

# Pre-processing

In [2]:
# Pre-computed feature tensor; the next cell indexes it row-wise with masks built from ids_df,
# so its rows are expected to line up with ids_df.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
filename = PATH_WORK/'yuvals_model_Densenet161_3_vehrsion_basic_classifier_type_features_train_split_2.pkl'
with open(filename, 'rb') as feats_file:  # context manager closes the handle
    feats = pickle.load(feats_file)

In [74]:
# Split the feature tensor into one pickle per series so RSNA_DataSet can load a single series at a time.
feats_dir = PATH_WORK/'features/densenet161_v3'
feats_dir.mkdir(parents=True, exist_ok=True)  # make the cell runnable on a fresh checkout

# Row positions of every series, computed in a single pass instead of one full-column
# comparison per series. Positions are ascending, so row order within a series is unchanged.
series_rows = ids_df.groupby('SeriesInstanceUID', sort=False).indices

for series_id, rows in tqdm(series_rows.items()):
    feats_id = feats[torch.as_tensor(rows, dtype=torch.long)]
    with open(feats_dir/'{}'.format(series_id), 'wb') as out_file:  # close every file promptly
        pickle.dump(feats_id, out_file)

100%|████████████████████████████████████████████████████████████████████████████| 19530/19530 [25:29<00:00, 12.77it/s]


# Model

In [23]:
class RSNA_DataSet(D.Dataset):
    """Dataset that serves one series per item, zero-padded to a fixed number of slices.

    Each item is built from the per-series feature pickle stored under
    ``PATH_WORK/'features/densenet161_v3'`` plus the covariate columns
    ``cols_cat + cols_float`` of the metadata.

    Args:
        metadata: DataFrame with one row per image; rows of a series must already be
            ordered by ``pos_idx`` (the order check in ``__getitem__`` enforces this).
        ids_df: DataFrame with ``img_id``, ``SeriesInstanceUID`` and ``pos_idx`` columns
            whose per-series rows are in the same order as the rows of the stored features.
        mode: free-form tag stored on the instance; not used by the class itself.
    """

    # Fixed padded length of every item; TabularModel reshapes its output with the same number.
    MAX_SLICES = 60

    def __init__(self, metadata, ids_df, mode='train'):
        super(RSNA_DataSet, self).__init__()
        self.mode = mode
        self.series = metadata.SeriesInstanceUID.unique()
        self.metadata = metadata
        self.ids_df = ids_df

    def __getitem__(self, index):
        """Return ``(features, target)`` for the series at position ``index``.

        ``features`` has shape ``(n_image_features + n_covariates, MAX_SLICES)`` and
        ``target`` has shape ``(MAX_SLICES, len(all_ich))``; positions beyond the real
        number of slices are zero in both tensors.
        """
        series_id = self.series[index]
        df = self.metadata.loc[self.metadata.SeriesInstanceUID == series_id]
        n_slices = len(df)
        assert n_slices <= self.MAX_SLICES, \
            'series {} has {} slices, more than MAX_SLICES={}'.format(series_id, n_slices, self.MAX_SLICES)

        path = PATH_WORK/'features/densenet161_v3/{}'.format(series_id)
        # NOTE: unpickling can execute arbitrary code — only load files produced by this project.
        with open(path, 'rb') as feats_file:
            feats = pickle.load(feats_file)

        # Use the frame handed to the constructor rather than whatever global `ids_df` exists.
        ids_df_sub = self.ids_df.loc[self.ids_df.SeriesInstanceUID.values == series_id]

        if feats.shape[0] > n_slices:
            # The stored features contain repeated images: keep the first occurrence of each id.
            mask_dup = ~ids_df_sub.img_id.duplicated().values
            ids_df_sub = ids_df_sub.loc[mask_dup]
            feats = feats[torch.BoolTensor(mask_dup)]

        assert feats.shape[0] == n_slices
        assert len(ids_df_sub) == n_slices
        assert np.all(ids_df_sub.img_id.isin(df.img_id).values)
        # Re-order the feature rows by slice position and verify they now match the metadata rows.
        order = np.argsort(ids_df_sub.pos_idx.values)
        assert np.all(ids_df_sub.img_id.values[order] == df.img_id.values)
        feats = feats[torch.LongTensor(order)]

        # Append the covariates column-wise, then pad the slice axis with zero rows.
        feats = torch.cat([feats, torch.Tensor(df[cols_cat + cols_float].values)], dim=1)
        feats = torch.cat([feats, torch.zeros((self.MAX_SLICES - feats.shape[0], feats.shape[1]))], dim=0)

        target = torch.cat([torch.Tensor(df[all_ich].values),
                            torch.zeros((self.MAX_SLICES - n_slices, len(all_ich)))], dim=0)

        # Features are returned channels-first: (n_features, MAX_SLICES).
        return feats.transpose(1,0), target

    def __len__(self):
        # With DATA_SMALL set, only the first 1% of the series is exposed.
        return len(self.series) if not DATA_SMALL else int(0.01*len(self.series))

In [9]:
class _Loss(nn.Module):
    """Base class for losses; resolves the reduction mode from the new or the legacy arguments.

    Args:
        size_average, reduce: legacy flags; when either is given they are translated
            into a reduction string and `reduction` is ignored.
        reduction: reduction mode used when no legacy flag is given.
    """
    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        super(_Loss, self).__init__()
        if size_average is not None or reduce is not None:
            # Imported locally: the notebook's top-level imports do not provide `_Reduction`.
            from torch.nn import _reduction as _Reduction
            self.reduction = _Reduction.legacy_get_string(size_average, reduce)
        else:
            self.reduction = reduction

class BCEWithLogitsLoss(_Loss):
    """Binary cross-entropy on logits that tolerates an extra singleton axis on the input.

    Args:
        weight: optional rescaling weight passed to the functional loss.
        size_average, reduce: legacy reduction flags (see `_Loss`).
        reduction: reduction mode, 'mean' by default.
        pos_weight: optional weight of positive examples passed to the functional loss.
    """
    __constants__ = ['weight', 'pos_weight', 'reduction']

    def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean', pos_weight=None):
        super(BCEWithLogitsLoss, self).__init__(size_average, reduce, reduction)
        self.register_buffer('weight', weight)
        self.register_buffer('pos_weight', pos_weight)

    def forward(self, input, target):
        """Return the loss between the logits `input` and the targets `target`."""
        # Squeeze only when the shapes differ (e.g. logits (N, 1) vs. targets (N,)).
        # An unconditional squeeze would also drop a batch axis of size 1 and make
        # the shapes disagree for a batch that holds a single sample.
        if input.shape != target.shape:
            input = input.squeeze()
        return F.binary_cross_entropy_with_logits(input, target,
                                                  self.weight,
                                                  pos_weight=self.pos_weight,
                                                  reduction=self.reduction)

In [10]:
class FeatProduct(nn.Module):
    """Learned linear projection without bias, applied to the last axis of the input."""

    def __init__(self, in_feature, out_feature):
        super().__init__()
        self.in_feature, self.out_feature = in_feature, out_feature
        # Allocate the (out_feature, in_feature) matrix, then fill it with Xavier-uniform values.
        projection = torch.empty(out_feature, in_feature)
        self.weight = nn.Parameter(nn.init.xavier_uniform_(projection))

    def forward(self, x):
        # Multiplies the last axis of x by the transposed weight matrix; no bias is added.
        return F.linear(x, weight=self.weight)

In [13]:
class TabularModel(Module):
    """Series-level classifier over per-slice feature vectors.

    Every slice is batch-normalised and projected to `proj_sz` values; the projections of
    all `seq_len` slices are concatenated and passed through a fully connected stack whose
    output is reshaped to one row of predictions per slice.

    Args:
        n_cont: number of covariate columns appended to the image features.
        out_sz: flat output size of the last linear layer; must be a multiple of `seq_len`.
        layers: list with the hidden layer sizes of the fully connected stack.
        ps: dropout probabilities of the hidden layers (defaults to no dropout).
        emb_drop: accepted for signature compatibility; not used.
        use_bn: whether batch norm is used inside the stack (never before the first linear layer).
        bn_final: append a final batch-norm layer after the last linear layer.
        feat_sz: number of image features per slice.
        seq_len: number of (padded) slices per series.
        proj_sz: size of the per-slice projection.
    """
    def __init__(self, n_cont:int, out_sz:int, layers:Collection[int], ps:Collection[float]=None,
                 emb_drop:float=0., use_bn:bool=True, bn_final:bool=False, feat_sz=2208,
                 seq_len:int=60, proj_sz:int=20):
        super().__init__()
        if out_sz % seq_len != 0:
            raise ValueError('out_sz ({}) must be a multiple of seq_len ({})'.format(out_sz, seq_len))
        ps = ifnone(ps, [0]*len(layers))
        ps = listify(ps, layers)
        self.bn_cont = nn.BatchNorm1d(feat_sz + n_cont)
        self.n_cont = n_cont
        # Stored before get_sizes() is called, which derives the stack's input width from them.
        self.seq_len, self.proj_sz = seq_len, proj_sz
        sizes = self.get_sizes(layers, out_sz)
        actns = [nn.ReLU(inplace=True) for _ in range(len(sizes)-2)] + [None]
        layers = []
        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+ps,actns)):
            layers += bn_drop_lin(n_in, n_out, bn=use_bn and i!=0, p=dp, actn=act)
        if bn_final: layers.append(nn.BatchNorm1d(sizes[-1]))
        self.layers = nn.Sequential(*layers)
        self.feat_product = FeatProduct(feat_sz + n_cont, proj_sz)

    def get_sizes(self, layers, out_sz):
        "Layer widths of the fully connected stack: flattened projections, hidden sizes, output."
        # seq_len * proj_sz is 1200 with the default 60 slices x 20 projected values.
        return [self.seq_len * self.proj_sz] + layers + [out_sz]

    def forward(self, x) -> Tensor:
        # x is expected channels-first, (batch, feat_sz + n_cont, seq_len), as produced by RSNA_DataSet.
        x = self.bn_cont(x)
        x = x.transpose(1,2)                         # -> (batch, seq_len, features)
        x = self.feat_product(x)                     # -> (batch, seq_len, proj_sz)
        x = x.reshape(x.shape[0],-1)                 # -> (batch, seq_len * proj_sz)
        x = self.layers(x)                           # -> (batch, out_sz)
        x = x.reshape(x.shape[0], self.seq_len, -1)  # -> (batch, seq_len, out_sz // seq_len)
        return x

In [39]:
def train_one(weight = None):
    """Train the series-level model once and return its predictions on the validation set.

    Builds the datasets from the global `trn_data` / `val_data` / `ids_df`, fits a
    `TabularModel` for two epochs, and prints the log-loss and the correlation between
    the validation predictions and the (zero-padded) validation labels.

    Args:
        weight: accepted for backward compatibility; currently unused.

    Returns:
        Array with the validation predictions as returned by `Learner.get_preds`.
    """
    trn_ds = RSNA_DataSet(trn_data, ids_df, mode='train')
    val_ds = RSNA_DataSet(val_data, ids_df, mode='valid')
    data_bunch = DataBunch.create(train_ds=trn_ds, valid_ds=val_ds, bs=100, num_workers=0)

    tab_model = TabularModel(n_cont = len(cols_float) + len(cols_cat), out_sz=360,
                             layers=[500,200], ps=[0.5,0.5], bn_final=True)
    model = Learner(data_bunch, tab_model, path=PATH_WORK, loss_func=BCEWithLogitsLoss())
    model.fit(2, 1e-1, wd=5e-3)

    predictions = np.array(model.get_preds(ds_type=DatasetType.Valid)[0])

    # Take the evaluated series from the dataset itself, so targets and predictions always
    # cover the same series in the same order (including the DATA_SMALL truncation).
    val_series = val_ds.series[:len(val_ds)]
    # Zero-padded targets with the same per-series shape as the predictions.
    val_target = np.zeros((len(val_series),) + predictions.shape[1:])
    for i, series in enumerate(val_series):
        labels = val_data.loc[val_data.SeriesInstanceUID == series, all_ich].values
        val_target[i, :len(labels)] = labels

    # NOTE(review): `eps` is deprecated in recent scikit-learn releases — confirm against the installed version.
    print('Log-loss', log_loss(val_target.reshape(-1),predictions.reshape(-1),eps=1e-6))
    print('correlation', np.corrcoef(val_target.reshape(-1),predictions.reshape(-1))[0,1])

    return predictions

In [None]:
# Run one training pass; train_one() returns the validation predictions, which the notebook
# echoes because the call is the last expression of the cell.
train_one()

epoch,train_loss,valid_loss,time
