In [1]:
from pathlib import Path
import subprocess

PL_PATH = Path("/kaggle/input/pytorch-lightning")
subprocess.call(
    ["pip", "install", PL_PATH / "pytorch_lightning-0.8.1-py3-none-any.whl"]
)

0

In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, WeightedRandomSampler
from PIL import Image
import os, random
import pytorch_lightning.core.lightning as pl
from sklearn.preprocessing import StandardScaler
from torchvision import transforms, models
from pytorch_lightning.metrics.functional import auroc, accuracy
from sklearn.model_selection import StratifiedKFold
from pytorch_lightning import Trainer
from sklearn.metrics import roc_auc_score
from pytorch_lightning.callbacks import *
from tqdm.auto import tqdm

In [3]:
root_train = '../input/melanoma-external-malignant-256/train/train/'
root_test = '../input/melanoma-external-malignant-256/test/test/'

In [4]:
def encode_cat_variables(x, help_dict = None, N = 100):
    uniqs = sorted(np.unique(x).tolist())
    if help_dict is None: help_dict = {v:k+1 for k, v in enumerate(uniqs)}
    levels = len(help_dict.keys()) + 1
    x_t = np.array([help_dict.get(x_i, 0) for x_i in x])
    return x_t, help_dict, levels

In [5]:
df = pd.read_csv('../input/melanoma-external-malignant-256/train_concat.csv')
df_test = pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')

df['sex'] = df['sex'].fillna('missing')
df_test['sex'] = df_test['sex'].fillna('missing')

df['age_missing'] = df['age_approx'].isna()
df_test['age_missing'] = df_test['age_approx'].isna()

df['age_approx'] = df['age_approx'].fillna(50).astype(np.int8)
df_test['age_approx'] = df_test['age_approx'].fillna(50).astype(np.int8)

df['anatom_site_general_challenge'] = df['anatom_site_general_challenge'].fillna('missing')
df_test['anatom_site_general_challenge'] = df_test['anatom_site_general_challenge'].fillna('missing')

df['age_approx'] = (df['age_approx']-df['age_approx'].mean())/df['age_approx'].std()
df_test['age_approx'] = (df_test['age_approx']-df_test['age_approx'].mean())/df['age_approx'].std()

df['patient_id'] = df['patient_id'].str.split('_', expand=True)[0]
df_test['patient_id'] = df_test['patient_id'].str.split('_', expand=True)[0]

df['patient_id'] = df['patient_id'].fillna('missing')
df_test['patient_id'] = df_test['patient_id'].fillna('missing')

cat_cols = ['sex', 'anatom_site_general_challenge', 'age_missing', 'patient_id']
df['sex'], sex_dict, _ = encode_cat_variables(df['sex'])
df_test['sex'], sex_dict, _ = encode_cat_variables(df_test['sex'])

df['anatom_site_general_challenge'], anatomy_dict, _ = encode_cat_variables(df['anatom_site_general_challenge'])
df_test['anatom_site_general_challenge'], anatomy_dict, _ = encode_cat_variables(df_test['anatom_site_general_challenge'])

df['age_missing'], age_dict, _ = encode_cat_variables(df['age_missing'])
df_test['age_missing'], age_dict, _ = encode_cat_variables(df_test['age_missing'])

df['patient_id'], id_dict, _ = encode_cat_variables(df['patient_id'].astype('string'))
df_test['patient_id'], id_dict, _ = encode_cat_variables(df_test['patient_id'].astype('string'))

In [6]:
def open_img(f, root):
    return Image.open(os.path.join(root, f+'.jpg'))

In [7]:
tfms = transforms.Compose([
                          transforms.ToTensor(),
                          transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

In [8]:
emb_c = {col: df[col].nunique()  for col in cat_cols}
emb_szs = [(c, min(50, max(6, c**0.5))) for _,c in emb_c.items()]

In [9]:
class MelanomaDataset(Dataset):
    def __init__(self, df, tfms=tfms, train=True):
        self.img_id = df['image_name'].values
        self.cat_cols = df[cat_cols].values
        self.age = df['age_approx'].astype(np.float32)
        self.ys = df['target'] if train else None
        self.tfms = tfms
        self.train = train
        
    def __getitem__(self, idx):
        root = root_train if self.train else root_test
        img = self.tfms(open_img(self.img_id[idx], root))
        if self.train:
            return img, self.cat_cols[idx], self.age[idx], self.ys[idx]
        else:
            return img, self.cat_cols[idx], self.age[idx]
    
    def __len__(self):
        return len(self.img_id)

In [10]:
ds = MelanomaDataset(df)

train_ds, valid_ds = random_split(ds, [30118, 7530])

In [11]:
class AdaptiveConcatPool2d(nn.Module):
    def __init__(self, sz=1):
        super().__init__()
        self.output_size = sz
        self.ap = nn.AdaptiveAvgPool2d(sz)
        self.mp = nn.AdaptiveMaxPool2d(sz)
    def forward(self, x): return torch.cat([self.mp(x), self.ap(x)], 1)
    
class Mish(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        return x*(torch.tanh(F.softplus(x)))

In [12]:
def get_base():
    m = models.resnet34()
    m.avgpool = AdaptiveConcatPool2d()
    return m

class LabelSmoothingLoss(nn.Module):
    def __init__(self, smooth_factor=0.05):
        super().__init__()
        self.smooth_factor = smooth_factor

    def _smooth_labels(self, num_classes, target):
        target_one_hot = F.one_hot(target, num_classes).float()
        target_one_hot[target_one_hot == 1] = 1 - self.smooth_factor
        target_one_hot[target_one_hot == 0] = self.smooth_factor
        return target_one_hot

    def forward(self, input, target):
        logp = F.log_softmax(input, dim=1)
        target_one_hot = self._smooth_labels(input.size(1), target)
        return F.kl_div(logp, target_one_hot, reduction='sum')

loss_fn = LabelSmoothingLoss()

def calc_score(preds, y):
    return roc_auc_score(preds, y)

In [13]:
class Model(pl.LightningModule):
    def __init__(self, emb_szs):
        super().__init__()
        m = get_base()
        self.enc =  nn.Sequential(*list(m.children())[:-1], nn.Flatten())    
        nc = list(m.children())[-1].in_features
        self.head = nn.Sequential(nn.Linear(2*nc+25,512),Mish(),
                                    nn.BatchNorm1d(512), nn.Dropout(0.5),nn.Linear(512,2))
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        
    def forward(self, xb, x_cat, x_cont):
        x1 = [e(x_cat[:,i]-1) for i,e in enumerate(self.embs)]
        x1 = torch.cat(x1, 1)
        x_img = self.enc(xb)
        x = torch.cat([x1, x_cont.unsqueeze(1)], 1)
        x = torch.cat([x, x_img], 1)
        return self.head(x)
    
    def training_step(self, batch, batch_idx):
        xb, x_cat, x_cont, yb = batch
        y_hat = self(xb=xb, x_cat=x_cat, x_cont =x_cont)
        loss = loss_fn(y_hat, yb)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss}
    
    def validation_step(self, batch, batch_idx):
        xb, x_cat, x_cont, yb = batch
        y_hat = self(xb=xb, x_cat=x_cat, x_cont =x_cont)
        loss = loss_fn(y_hat, yb)
        return {'valid_loss': loss, 'yb': yb, 'predictions': y_hat}
    
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['valid_loss'] for x in outputs]).mean()
        yb = torch.cat([x['yb'] for x in outputs],0)
        y_hat = torch.cat([x['predictions'] for x in outputs],0)
        preds = torch.argmax(y_hat, dim=1)
        acc = accuracy(preds, yb, num_classes=2)
        score = calc_score(preds.detach().cpu().numpy(), yb.detach().cpu().numpy()) if preds.shape[0]>50 else 1
        print(f'accuracy: {acc}, score: {score}')
        return {'val_loss': avg_loss, 'score': accuracy}
    
    def configure_optimizers(self):
        opt = optim.Adam(self.parameters(), lr=0.0001)
        scheduler = optim.lr_scheduler.OneCycleLR(opt, max_lr = 5e-4, epochs=10, steps_per_epoch=104)
        return [opt], [scheduler]
    
    def train_dataloader(self):
        train_dl = DataLoader(train_ds, pin_memory=True, shuffle=True, batch_size=32, num_workers=4)
        return train_dl
    
    def val_dataloader(self):
        valid_dl = DataLoader(valid_ds, pin_memory=True, batch_size=64, num_workers=4)
        return valid_dl
    
    def test_dataloader(self):
        ds = MelanomaDataset(df, train=False)
        test_dl = DataLoader(ds, pin_memory=True, batch_size=64, num_workers=4)
        return test_dl
    
    def test_step(self, batch, batch_idx):
        xb, x_cat, x_cont = batch
        y_hat = self(xb=xb, x_cat= x_cat, x_cont=x_cont)
        return {'test_pred_b': y_hat}
    
    def test_epoch_end(self, outputs):
        ys = torch.cat([x['test_pred_b'] for x in outputs], 0)
        preds = torch.argmax(ys, dim=1)
        return preds

In [14]:
model = Model(emb_szs)

In [15]:
early_stopping = EarlyStopping(monitor = 'val_loss',
                              min_delta = 1e-4,
                              patience=3,
                              mode= 'min',
                              verbose=True)

checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(os.getcwd(), 'model_dir'),
#     save_top_k=True,
    verbose=True,
    monitor='val_loss',
    mode='min',
    prefix=''
)

In [16]:
trainer = Trainer(max_epochs=15, 
                  early_stop_callback = early_stopping,
                  gpus=1,
                  gradient_clip_val=1.0,
                  weights_save_path=os.getcwd(),
                  checkpoint_callback = checkpoint_callback,
                  num_sanity_val_steps=0
                 )

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


In [17]:
trainer.fit(model)


  | Name | Type       | Params
------------------------------------
0 | enc  | Sequential | 21 M  
1 | head | Sequential | 539 K 
2 | embs | ModuleList | 120   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00001: val_loss reached 12.34969 (best 12.34969), saving model to /kaggle/working/_ckpt_epoch_1.ckpt as top 1


accuracy: 0.968393087387085, score: 0.9301843804565458


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00003: val_loss reached 3.00059 (best 3.00059), saving model to /kaggle/working/_ckpt_epoch_3.ckpt as top 1


accuracy: 0.9837981462478638, score: 0.988358272025335


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00004: val_loss  was not in top 1


accuracy: 0.9745019674301147, score: 0.9708491803875549


In [18]:
trainer.test()

RuntimeError: Error(s) in loading state_dict for Model:
	Unexpected key(s) in state_dict: "embs.0.weight", "embs.1.weight", "embs.2.weight", "embs.3.weight". 