In [1]:
import sys
package_path = 'EfficientNet-PyTorch/'
sys.path.append(package_path)

from efficientnet_pytorch import EfficientNet
from radam import RAdam, PlainRAdam, AdamW
from am_softmax import AMSoftmaxLoss, AngleSimpleLinear

In [2]:
import os
import gc
import numpy as np 
import pandas as pd
from PIL import Image
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.utils.data as D
from torch.optim.lr_scheduler import ExponentialLR
from torchvision import models, transforms as T
import torch.nn.functional as F

from ignite.engine import Events, create_supervised_evaluator, create_supervised_trainer
from ignite.metrics import Loss, Accuracy
from ignite.contrib.handlers.tqdm_logger import ProgressBar
from ignite.handlers import  EarlyStopping, ModelCheckpoint

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Ask PyTorch to release cached GPU memory before any model is created.
torch.cuda.empty_cache()

In [3]:
# Run-wide settings; the cells below read them through config[...] lookups.
config = dict(
    # -- reproducibility / problem size
    SEED=42,                     # passed to seed_torch and train_test_split
    CLASSES=1108,                # num_classes handed to EffNet
    # -- data and hardware
    # NOTE(review): absolute, machine-specific path (and it ends with '/').
    PATH_DATA='/home/tienen/kaggle_dataset_drugs/',
    DEVICE='cuda',
    BATCH_SIZE=16,
    VAL_SIZE=0.05,               # test_size of the per-category train/val split
    # -- model / optimisation
    MODEL_NAME='EfficientNet_b2_AMSLoss_RAdam',  # also the checkpoint directory name
    USE_BN=True,
    USE_ANGULAR=True,
    LR=1e-4,
    LR_STR='1e-4',               # textual LR, only used when building file names
    TURN_OFF_ON_N_EPOCHS=0,      # not read by any cell visible in this notebook
)

# Epoch number embedded in the file name of the "all_exps" checkpoint that
# every per-category fine-tuning run starts from.
best_epoch = 11

In [4]:
def seed_torch(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and switches cuDNN to deterministic mode with the
    autotuner disabled, since the autotuner may select different kernels
    between runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    import os
    import random

    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of
    # the already-running interpreter cannot be changed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# Seed all RNGs once, up front, with the seed from the config cell.
seed_torch(config['SEED'])

## Phase 2: train on each category

In [6]:
class ImagesDS(D.Dataset):
    """Dataset yielding one multi-channel image tensor per row of ``df``.

    For each row, one PNG per requested channel is loaded from
    ``<img_dir>/<mode>/<experiment>/Plate<plate>/<well>_s<site>_w<channel>.png``
    and the per-channel tensors are concatenated along the first dimension.

    Args:
        df: DataFrame with ``experiment``, ``well`` and ``plate`` columns, plus
            ``sirna`` (read when ``mode == 'train'``) or ``id_code`` (read
            otherwise).
        img_dir: Root directory of the images.
        mode: Sub-directory name; ``'train'`` also selects the label as the
            second returned item.
        site: Site number used in the file name.
        channels: Channel numbers to load, in stacking order.
    """

    def __init__(self, df, img_dir, mode='train', site=1, channels=(1, 2, 3, 4, 5, 6)):
        self.records = df.to_records(index=False)
        # Private copy: a shared mutable default (or the caller's list) must
        # not be aliased across dataset instances.
        self.channels = list(channels)
        self.site = site
        self.mode = mode
        self.img_dir = img_dir
        self.len = df.shape[0]

    @staticmethod
    def _load_img_as_tensor(file_name):
        """Open ``file_name`` with PIL and convert it with ``ToTensor``."""
        with Image.open(file_name) as img:
            return T.ToTensor()(img)

    def _get_img_path(self, index, channel):
        """Build the '/'-joined image path for row ``index`` and ``channel``."""
        record = self.records[index]
        file_name = f'{record.well}_s{self.site}_w{channel}.png'
        return '/'.join([self.img_dir, self.mode, record.experiment,
                         f'Plate{record.plate}', file_name])

    def __getitem__(self, index):
        """Return ``(image, sirna)`` in train mode, else ``(image, id_code)``."""
        paths = [self._get_img_path(index, ch) for ch in self.channels]
        img = torch.cat([self._load_img_as_tensor(img_path) for img_path in paths])
        record = self.records[index]
        if self.mode == 'train':
            return img, int(record.sirna)
        return img, record.id_code

    def __len__(self):
        return self.len

In [7]:
class EffNet(nn.Module):
    """EfficientNet-b2 wrapper that accepts ``num_channels`` input planes.

    The pretrained stem convolution is replaced by a ``Sequential`` of a new
    stride-2 3x3 convolution followed by zero padding on the right and bottom.
    Each input plane of the new convolution is initialised with the mean of
    the pretrained kernel over its input channels.

    Args:
        num_classes: Number of output classes.
        num_channels: Number of input image channels.
        use_bn: If true, apply ``BatchNorm2d`` to the raw input first.
        use_angular: If true, replace the final ``_fc`` layer with
            ``AngleSimpleLinear``.
    """

    def __init__(self, num_classes=1000, num_channels=6, use_bn=False, use_angular=False):
        super().__init__()
        self.use_angular = use_angular
        self.use_bn = use_bn
        if self.use_bn:
            # Sized from num_channels (was hard-coded to 6).
            self.bn = nn.BatchNorm2d(num_channels)

        self.features = EfficientNet.from_pretrained('efficientnet-b2', num_classes=num_classes)

        trained_kernel = self.features._conv_stem.weight
        # Keep the Sequential(conv, pad) layout: its parameter names
        # ("_conv_stem.0.weight") are part of saved state dicts.
        new_conv = nn.Sequential(
            nn.Conv2d(num_channels, trained_kernel.shape[0],
                      kernel_size=(3, 3), stride=(2, 2), bias=False),
            nn.ZeroPad2d(padding=(0, 1, 0, 1)),
        )
        with torch.no_grad():
            channel_mean = torch.mean(trained_kernel, 1)
            # Replicate once per input channel (was hard-coded to 6 copies).
            new_conv[0].weight[:, :] = torch.stack([channel_mean] * num_channels, dim=1)
        self.features._conv_stem = new_conv

        if self.use_angular:
            # Read the head's input width from the layer being replaced
            # instead of hard-coding 1408.
            self.features._fc = AngleSimpleLinear(self.features._fc.in_features, num_classes)

    def forward(self, x):
        """Optionally batch-normalise ``x``, then run the backbone."""
        if self.use_bn:
            x = self.bn(x)
        return self.features(x)

In [8]:
# Read the train/test metadata tables from the configured data directory.
train_csv_path = config['PATH_DATA'] + '/train.csv'
test_csv_path = config['PATH_DATA'] + '/test.csv'
df = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

# 'category' is the part of the experiment name before the first '-'.
df['category'] = df['experiment'].map(lambda name: name.split('-')[0])
df_test['category'] = df_test['experiment'].map(lambda name: name.split('-')[0])

## Training

In [9]:
# Fine-tune one model per category, each starting from the same
# all-experiments checkpoint. Early stopping is driven by validation accuracy.
categories = df['category'].unique()
# ES BY ACCURACY
for category in categories:
    category_df = df[df['category'] == category]
    category_df_train, category_df_val = train_test_split(category_df,
                                                          test_size=config['VAL_SIZE'],
                                                          # stratify=category_df.sirna,
                                                          random_state=config['SEED'])

    print('\n' + '=' * 40)
    print("CURRENT CATEGORY:", category)
    print('-' * 40)

    # DATA
    # Built before the handlers so that `val_loader`, which the validation
    # handler below reads, exists by the time the handler is defined.
    ds_1 = ImagesDS(category_df_train, config['PATH_DATA'], site=1, mode='train')
    ds_2 = ImagesDS(category_df_train, config['PATH_DATA'], site=2, mode='train')
    ds = D.ConcatDataset([ds_1, ds_2])

    ds_val_1 = ImagesDS(category_df_val, config['PATH_DATA'], site=1, mode='train')
    ds_val_2 = ImagesDS(category_df_val, config['PATH_DATA'], site=2, mode='train')
    ds_val = D.ConcatDataset([ds_val_1, ds_val_2])

    train_loader = D.DataLoader(ds, batch_size=config['BATCH_SIZE'], shuffle=True, num_workers=4)
    # Validation metrics are averages, so shuffling brings nothing here.
    val_loader = D.DataLoader(ds_val, batch_size=config['BATCH_SIZE'], shuffle=False, num_workers=4)

    # LOAD MODEL
    model = EffNet(num_classes=config['CLASSES'], use_bn=config['USE_BN'], use_angular=config['USE_ANGULAR'])
    checkpoint = torch.load('{0}/all_exps_{0}_lr{1}_{2}.pth'.format(config['MODEL_NAME'],
                                                                    config['LR_STR'],
                                                                    best_epoch))
    model.load_state_dict(checkpoint)
    model.to(config['DEVICE'])
    model.train()

    criterion = AMSoftmaxLoss(margin_type='cos')
    optimizer = RAdam(model.parameters(), lr=config['LR'])
    metrics = {'loss': Loss(criterion), 'accuracy': Accuracy()}

    trainer = create_supervised_trainer(model, optimizer, criterion, device=config['DEVICE'])
    val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=config['DEVICE'])

    # HELPERS
    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_and_display_val_metrics(engine):
        """Run the validation pass and print its loss and accuracy."""
        val_metrics = val_evaluator.run(val_loader).metrics
        print("Validation Results - Epoch: {}  Average Loss: {:.4f} | Accuracy: {:.4f} "
              .format(engine.state.epoch,
                      val_metrics['loss'],
                      val_metrics['accuracy']))

    lr_scheduler = ExponentialLR(optimizer, gamma=0.95)

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_lr_scheduler(engine):
        """Decay the learning rate once per epoch and print the new value."""
        lr_scheduler.step()
        lr = float(optimizer.param_groups[0]['lr'])
        print("Learning rate: {}".format(lr))

    # Scored on validation accuracy; attached to the evaluator so it is
    # consulted after each validation pass.
    handler = EarlyStopping(patience=4, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer)
    val_evaluator.add_event_handler(Events.COMPLETED, handler)

    checkpoints = ModelCheckpoint(config['MODEL_NAME'], category,
                                  save_interval=1, n_saved=10, create_dir=True, require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoints,
                              {config['MODEL_NAME']+'_lr{}'.format(config['LR_STR']): model})

    pbar = ProgressBar(bar_format='')
    pbar.attach(trainer, output_transform=lambda x: {'loss': x})

    tb_logger = None
    if 'KAGGLE_WORKING_DIR' not in os.environ:  # If we are not on kaggle server
        # Explicit names instead of a star import; kept inside the branch so
        # the import is only attempted when TensorBoard logging is wanted.
        from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler
        tb_logger = TensorboardLogger("board/"+config['MODEL_NAME']+'_'+category)

        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", output_transform=lambda loss: {'loss': loss}),
                         event_name=Events.ITERATION_COMPLETED)

        tb_logger.attach(val_evaluator, log_handler=OutputHandler(tag="validation", metric_names=["accuracy", "loss"],
                         another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

    # TRAINING
    try:
        trainer.run(train_loader, max_epochs=15)
    finally:
        # Close the logger only after training has produced its events; it
        # used to be closed right after the handlers were attached.
        if tb_logger is not None:
            tb_logger.close()

    # Also drop the objects that were handed the model or the trainer, so
    # nothing keeps the finished model reachable before the cache is emptied.
    del model, trainer, val_evaluator, optimizer, lr_scheduler, handler, checkpoints, ds, ds_val
    gc.collect()
    torch.cuda.empty_cache()


CURRENT CATEGORY: HEPG2
----------------------------------------
Loaded pretrained weights for efficientnet-b2


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 1  Average Loss: 12.9923 | Accuracy: 0.8273 
Learning rate: 9.5e-05


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 2  Average Loss: 12.4485 | Accuracy: 0.8312 
Learning rate: 9.025e-05


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 3  Average Loss: 12.4475 | Accuracy: 0.8157 
Learning rate: 8.573749999999999e-05


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 4  Average Loss: 12.1683 | Accuracy: 0.8183 
Learning rate: 8.145062499999998e-05


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 5  Average Loss: 12.1028 | Accuracy: 0.7977 
Learning rate: 7.737809374999998e-05


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 6  Average Loss: 11.8014 | Accuracy: 0.8080 
Learning rate: 7.350918906249998e-05

CURRENT CATEGORY: HUVEC
----------------------------------------
Loaded pretrained weights for efficientnet-b2


HBox(children=(IntProgress(value=0, max=2101), HTML(value='')))

Validation Results - Epoch: 1  Average Loss: 8.8967 | Accuracy: 0.9418 
Learning rate: 9.5e-05


HBox(children=(IntProgress(value=0, max=2101), HTML(value='')))

Validation Results - Epoch: 2  Average Loss: 8.5562 | Accuracy: 0.9203 
Learning rate: 9.025e-05


HBox(children=(IntProgress(value=0, max=2101), HTML(value='')))

Validation Results - Epoch: 3  Average Loss: 8.0921 | Accuracy: 0.9254 
Learning rate: 8.573749999999999e-05


HBox(children=(IntProgress(value=0, max=2101), HTML(value='')))

Validation Results - Epoch: 4  Average Loss: 7.6242 | Accuracy: 0.9096 
Learning rate: 8.145062499999998e-05


HBox(children=(IntProgress(value=0, max=2101), HTML(value='')))

Validation Results - Epoch: 5  Average Loss: 7.4353 | Accuracy: 0.9062 
Learning rate: 7.737809374999998e-05

CURRENT CATEGORY: RPE
----------------------------------------
Loaded pretrained weights for efficientnet-b2


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 1  Average Loss: 11.8711 | Accuracy: 0.8750 
Learning rate: 9.5e-05


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 2  Average Loss: 11.6955 | Accuracy: 0.8621 
Learning rate: 9.025e-05


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 3  Average Loss: 11.4444 | Accuracy: 0.8582 
Learning rate: 8.573749999999999e-05


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 4  Average Loss: 11.2535 | Accuracy: 0.8531 
Learning rate: 8.145062499999998e-05


HBox(children=(IntProgress(value=0, max=921), HTML(value='')))

Validation Results - Epoch: 5  Average Loss: 11.2151 | Accuracy: 0.8428 
Learning rate: 7.737809374999998e-05

CURRENT CATEGORY: U2OS
----------------------------------------
Loaded pretrained weights for efficientnet-b2


HBox(children=(IntProgress(value=0, max=395), HTML(value='')))

Validation Results - Epoch: 1  Average Loss: 14.0247 | Accuracy: 0.7545 
Learning rate: 9.5e-05


HBox(children=(IntProgress(value=0, max=395), HTML(value='')))

Validation Results - Epoch: 2  Average Loss: 13.7670 | Accuracy: 0.7515 
Learning rate: 9.025e-05


HBox(children=(IntProgress(value=0, max=395), HTML(value='')))

Validation Results - Epoch: 3  Average Loss: 13.7171 | Accuracy: 0.7695 
Learning rate: 8.573749999999999e-05


HBox(children=(IntProgress(value=0, max=395), HTML(value='')))

Validation Results - Epoch: 4  Average Loss: 13.7177 | Accuracy: 0.7485 
Learning rate: 8.145062499999998e-05


HBox(children=(IntProgress(value=0, max=395), HTML(value='')))

Validation Results - Epoch: 5  Average Loss: 13.6553 | Accuracy: 0.7455 
Learning rate: 7.737809374999998e-05


HBox(children=(IntProgress(value=0, max=395), HTML(value='')))

Validation Results - Epoch: 6  Average Loss: 13.6034 | Accuracy: 0.7695 
Learning rate: 7.350918906249998e-05


HBox(children=(IntProgress(value=0, max=395), HTML(value='')))

Validation Results - Epoch: 7  Average Loss: 13.5678 | Accuracy: 0.7395 
Learning rate: 6.983372960937497e-05


## Prediction for test

In [9]:
# Epoch number in the file name of the per-category checkpoint to load.
# NOTE(review): the training output shown above stops after 5-7 epochs per
# category, so these numbers do not obviously come from that run - confirm.
best_epoch_by_category = {
    'HEPG2': 12,
    'HUVEC': 15,
    'RPE': 14,
    'U2OS': 17,
}

categories = df['category'].unique()
output_df = []         # one DataFrame of predicted labels per category
output_predicted = []  # one (rows, classes) score array per category

for category in categories:
    cat_test_df = df_test[df_test['category'] == category].copy()

    print('\n' + '=' * 40)
    print("CURRENT CATEGORY:", category)
    print('-' * 40)

    # LOAD MODEL
    model = EffNet(num_classes=config['CLASSES'], use_bn=config['USE_BN'], use_angular=config['USE_ANGULAR'])
    # A dict lookup fails loudly (KeyError) for an unlisted category instead
    # of silently reusing the epoch chosen for the previous one.
    best_epoch_on_category = best_epoch_by_category[category]

    checkpoint = torch.load('{0}/{2}_{0}_lr{1}_{3}.pth'.format(config['MODEL_NAME'], config['LR_STR'], category, best_epoch_on_category))
    model.load_state_dict(checkpoint)
    model.to(config['DEVICE'])
    model.eval()

    # DATA
    # One loader per site, both unshuffled, so they can be walked in lockstep.
    ds_test_1 = ImagesDS(cat_test_df, config['PATH_DATA'], site=1, mode='test')
    ds_test_2 = ImagesDS(cat_test_df, config['PATH_DATA'], site=2, mode='test')
    test_loader_1 = D.DataLoader(ds_test_1, batch_size=1, shuffle=False, num_workers=4)
    test_loader_2 = D.DataLoader(ds_test_2, batch_size=1, shuffle=False, num_workers=4)

    # PREDICTION
    with torch.no_grad():
        predicted = []
        # zip() has no length, so pass the total to get a real progress bar.
        for (x1, id1), (x2, id2) in tqdm_notebook(zip(test_loader_1, test_loader_2),
                                                  total=len(test_loader_1)):
            assert id1 == id2

            x1 = x1.to(config['DEVICE'])
            output1 = model(x1)

            x2 = x2.to(config['DEVICE'])
            output2 = model(x2)

            # Average the model outputs of the two sites.
            result = 0.5*(output1 + output2)
            predicted.append(result.cpu().numpy())

    # Concatenate along the batch axis: the result stays 2-D even when a
    # category has a single row, where stack().squeeze() collapsed to 1-D and
    # broke the argmax over axis 1.
    predicted = np.concatenate(predicted, axis=0)
    cat_test_df['sirna'] = np.argmax(predicted, axis=1).astype(int)
    output_df.append(cat_test_df[['id_code', 'experiment', 'sirna']])
    output_predicted.append(predicted)

    del model, ds_test_1, ds_test_2
    gc.collect()
    torch.cuda.empty_cache()


CURRENT CATEGORY: HEPG2
----------------------------------------
Loaded pretrained weights for efficientnet-b2
EfficientNet_b2_AMSLoss_RAdam/HEPG2_EfficientNet_b2_AMSLoss_RAdam_lr1e-4_12.pth


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



CURRENT CATEGORY: HUVEC
----------------------------------------
Loaded pretrained weights for efficientnet-b2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



CURRENT CATEGORY: RPE
----------------------------------------
Loaded pretrained weights for efficientnet-b2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



CURRENT CATEGORY: U2OS
----------------------------------------
Loaded pretrained weights for efficientnet-b2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
# `output_df` starts as a list of per-category frames. Only concatenate it the
# first time, so re-running this cell does not fail on an already-merged frame.
if isinstance(output_df, list):
    output_df = pd.concat(output_df)
submission = output_df[['id_code', 'sirna']]

# Make sure the target directory exists before writing.
os.makedirs('submits', exist_ok=True)
submission.to_csv('submits/{}_each_exps_best_epoch_lr{}.csv'.format(config['MODEL_NAME'], config['LR_STR']),
                  index=False, columns=['id_code','sirna'])

In [11]:
# First rows of the submission built from the raw argmax predictions.
submission.head()

Unnamed: 0,id_code,sirna
0,HEPG2-08_1_B03,855
1,HEPG2-08_1_B04,19
2,HEPG2-08_1_B05,836
3,HEPG2-08_1_B06,609
4,HEPG2-08_1_B07,420


## Use plates leak

In [12]:
# `output_predicted` starts as a list of per-category 2-D arrays. Only merge it
# once: calling np.concatenate on the merged 2-D array would join its rows
# into one flat 1-D array without raising.
if isinstance(output_predicted, list):
    output_predicted = np.concatenate(output_predicted)

# Make sure the target directory exists before writing.
os.makedirs('predictions', exist_ok=True)
np.save('predictions/{}_each_exps_best_epoch_by_loss_lr{}'.format(config['MODEL_NAME'], config['LR_STR']),
        output_predicted)

In [15]:
# plate_groups[s, 0:3] holds the plate numbers on which sirna `s` occurs in the
# training data, in value_counts() order; column 3 is 10 minus their sum
# (the one remaining plate, assuming plates are numbered 1-4).
n_sirna = 1108
plate_groups = np.zeros((n_sirna, 4), int)
for sirna in range(n_sirna):
    plates_seen = df.loc[df.sirna == sirna, 'plate'].value_counts().index.values
    # Every sirna is expected on exactly three distinct plate numbers.
    assert len(plates_seen) == 3
    plate_groups[sirna, :3] = plates_seen
    plate_groups[sirna, 3] = 10 - plates_seen.sum()

print(plate_groups[:10, :])

[[4 2 3 1]
 [1 3 4 2]
 [2 4 1 3]
 [1 3 4 2]
 [3 1 2 4]
 [1 3 4 2]
 [1 3 4 2]
 [2 4 1 3]
 [1 3 4 2]
 [4 2 3 1]]


In [16]:
all_test_exp = df_test.experiment.unique()

# group_plate_probs[e, j]: fraction of rows of test experiment `e` whose
# predicted sirna has, in column j of plate_groups, the plate the row is on.
group_plate_probs = np.zeros((len(all_test_exp), 4))
for idx, experiment in enumerate(all_test_exp):
    in_experiment = df_test.experiment == experiment

    # One-hot encode the predicted sirna of every row in this experiment.
    preds = submission.loc[in_experiment, 'sirna'].values
    pp_mult = np.zeros((len(preds), 1108))
    pp_mult[np.arange(len(preds)), preds] = 1

    sub_test = df_test.loc[in_experiment, :]
    assert len(pp_mult) == len(sub_test)

    # Column vector of row plates; broadcasts against a (1, 1108) row of
    # per-sirna plates to give a (rows, 1108) mask.
    row_plates = sub_test.plate.values[:, np.newaxis]
    for j in range(4):
        mask = plate_groups[np.newaxis, :, j] == row_plates
        group_plate_probs[idx, j] = pp_mult[mask].sum() / len(pp_mult)

In [17]:
# For every test experiment, keep the plate_groups column with the highest
# agreement score computed above.
exp_to_group = np.argmax(group_plate_probs, axis=1)
print(exp_to_group)

[3 1 0 0 0 0 2 2 3 0 0 3 1 0 0 0 3 3]


In [18]:
# this is the function that sets 75% of the sirnas to zero according to the selected assignment

def select_plate_group(pp_mult, idx):
    """Zero the scores of sirnas that do not match each row's plate.

    Reads the module-level ``df_test``, ``all_test_exp``, ``plate_groups`` and
    ``exp_to_group``. For test experiment ``all_test_exp[idx]``, an entry
    ``pp_mult[r, s]`` is set to 0 when the plate listed for sirna ``s`` in the
    experiment's selected ``plate_groups`` column differs from the plate of
    row ``r``.

    Args:
        pp_mult: Array of shape (rows of the experiment, number of sirnas);
            modified in place.
        idx: Position of the experiment in ``all_test_exp``.

    Returns:
        The same ``pp_mult`` object, after masking.

    Raises:
        AssertionError: If ``pp_mult`` does not have one row per test row of
            the experiment.
    """
    sub_test = df_test.loc[df_test.experiment == all_test_exp[idx], :]
    assert len(pp_mult) == len(sub_test)
    # Broadcasting (1, n_sirna) against (rows, 1) yields the (rows, n_sirna)
    # mask directly, with no hard-coded class count and no repeated copies.
    sirna_plates = plate_groups[np.newaxis, :, exp_to_group[idx]]
    row_plates = sub_test.plate.values[:, np.newaxis]
    pp_mult[sirna_plates != row_plates] = 0
    return pp_mult

In [19]:
# First rows of the merged per-category predictions (still includes 'experiment').
output_df.head()

Unnamed: 0,id_code,experiment,sirna
0,HEPG2-08_1_B03,HEPG2-08,855
1,HEPG2-08_1_B04,HEPG2-08,19
2,HEPG2-08_1_B05,HEPG2-08,836
3,HEPG2-08_1_B06,HEPG2-08,609
4,HEPG2-08_1_B07,HEPG2-08,420


In [20]:
# Work on a copy so the raw submission stays available for comparison.
sub = submission.copy()

for idx, experiment in enumerate(all_test_exp):
    # Rows of this experiment; output_df and output_predicted share row order.
    indices = (output_df.experiment == experiment)
    # select_plate_group masks in place, so hand it a copy of the scores.
    masked_scores = select_plate_group(output_predicted[indices, :].copy(), idx)
    sub.loc[indices, 'sirna'] = masked_scores.argmax(1)

In [21]:
# Make sure the target directory exists before writing the leak-adjusted file.
os.makedirs('submits', exist_ok=True)
sub.to_csv('submits/{}_each_exps_best_epoch_lr{}_plates_leak.csv'.format(config['MODEL_NAME'], config['LR_STR']),
           index=False, columns=['id_code','sirna'])

In [22]:
# Fraction of rows whose predicted sirna is the same before and after the plate masking.
print((sub.sirna == submission.sirna).mean())

0.7289541136854802


In [23]:
# Number of distinct predicted sirnas: raw submission vs. plate-masked submission.
len(submission.sirna.unique()), len(sub.sirna.unique())

(1106, 1108)

In [24]:
# First rows of the plate-masked submission.
sub.head()

Unnamed: 0,id_code,sirna
0,HEPG2-08_1_B03,855
1,HEPG2-08_1_B04,710
2,HEPG2-08_1_B05,836
3,HEPG2-08_1_B06,609
4,HEPG2-08_1_B07,420
