In [1]:
!pip install 'scikit-learn==0.23.2'

Collecting scikit-learn==0.23.2
[?25l  Downloading https://files.pythonhosted.org/packages/5c/a1/273def87037a7fb010512bbc5901c31cfddfca8080bc63b42b26e3cc55b3/scikit_learn-0.23.2-cp36-cp36m-manylinux1_x86_64.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 5.4MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.23.2 threadpoolctl-2.1.0


In [2]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import math
import random
import time
from sklearn.metrics import log_loss

pd.options.display.max_rows = 999
pd.options.display.max_colwidth = 999

In [3]:
# sys.path.append('../input/iterative-stratification/iterative-stratification-master')

!pip install iterative-stratification

Collecting iterative-stratification
  Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


In [4]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [5]:
import torch
import torch.nn as nn
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [6]:
def get_logger(filename='log'):
    """Return this module's logger, emitting bare messages to the console and a file.

    Args:
        filename: Path stem of the log file; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` registered under ``__name__``, set to INFO with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger hands back the same object on every call, so without this
    # cleanup each re-run of the cell would stack another handler pair and
    # every message would be emitted (and written) multiple times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)

    return logger

logger = get_logger()


def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for any child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Explicitly cover every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run,
    # so it has to be off for the deterministic flag to be meaningful.
    torch.backends.cudnn.benchmark = False
    
seed_everything(seed=42)

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# global variables

BASE_PATH = './drive/My Drive/Kaggle-MoA/input_data/'
NN_PATH = './drive/My Drive/Kaggle-MoA/NN_models/'

In [9]:
# Competition inputs, all read from BASE_PATH.
train_features = pd.read_csv(f"{BASE_PATH}train_features.csv")
train_targets_scored = pd.read_csv(f"{BASE_PATH}train_targets_scored.csv")
train_targets_nonscored = pd.read_csv(f"{BASE_PATH}train_targets_nonscored.csv")
train_drug = pd.read_csv(f"{BASE_PATH}train_drug.csv")
test_features = pd.read_csv(f"{BASE_PATH}test_features.csv")

# Template that fixes the row/column layout of the final submission.
submission = pd.read_csv(f"{BASE_PATH}sample_submission.csv")

In [10]:
train_features.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,g-8,g-9,g-10,g-11,g-12,g-13,g-14,g-15,g-16,g-17,g-18,g-19,g-20,g-21,g-22,g-23,g-24,g-25,g-26,g-27,g-28,g-29,g-30,g-31,g-32,g-33,g-34,g-35,...,c-60,c-61,c-62,c-63,c-64,c-65,c-66,c-67,c-68,c-69,c-70,c-71,c-72,c-73,c-74,c-75,c-76,c-77,c-78,c-79,c-80,c-81,c-82,c-83,c-84,c-85,c-86,c-87,c-88,c-89,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,-0.0326,0.5548,-0.0921,1.183,0.153,0.5574,-0.4015,0.1789,-0.6528,-0.7969,0.6342,0.1778,-0.3694,-0.5688,-1.136,-1.188,0.694,0.4393,0.2664,0.1907,0.1628,-0.2853,0.5819,0.2934,-0.5584,-0.0916,-0.301,-0.1537,0.2198,...,0.4805,0.4965,0.368,0.8427,0.1042,0.1403,0.1758,1.257,-0.5979,1.225,-0.0553,0.7351,0.581,0.959,0.2427,0.0495,0.4141,0.8432,0.6162,-0.7318,1.212,0.6362,-0.4427,0.1288,1.484,0.1799,0.5367,-0.1111,-1.012,0.6685,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,0.3372,-0.4047,0.8507,-1.152,-0.4201,-0.0958,0.459,0.0803,0.225,0.5293,0.2839,-0.3494,0.2883,0.9449,-0.1646,-0.2657,-0.3372,0.3135,-0.4316,0.4773,0.2075,-0.4216,-0.1161,-0.0499,-0.2627,0.9959,-0.2483,0.2655,-0.2102,...,0.4083,0.0319,0.3905,0.7099,0.2912,0.4151,-0.284,-0.3104,-0.6373,0.2887,-0.0765,0.2539,0.4443,0.5932,0.2031,0.7639,0.5499,-0.3322,-0.0977,0.4329,-0.2782,0.7827,0.5934,0.3402,0.1499,0.442,0.9366,0.8193,-0.4236,0.3192,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,0.2155,0.0065,1.23,-0.4797,-0.5631,-0.0366,-1.83,0.6057,-0.3278,0.6042,-0.3075,-0.1147,-0.057,-0.0799,-0.8181,-1.532,0.2307,0.4901,0.478,-1.397,4.624,-0.0437,1.287,-1.853,0.6069,0.429,0.1783,0.0018,-1.18,...,-0.5477,-0.7576,-0.0444,0.1894,-0.0014,-2.364,-0.4682,0.121,-0.5177,-0.0604,0.1682,-0.4436,0.4963,0.1363,0.3335,0.976,-0.0427,-0.1235,0.0959,0.069,-0.9416,-0.7548,-0.1109,-0.6272,0.3019,0.1172,0.1093,-0.3113,0.3019,-0.0873,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,0.1792,-0.1321,-1.06,-0.8269,-0.3584,-0.8511,-0.5844,-2.569,0.8183,-0.0532,-0.8554,0.116,-2.352,2.12,-1.158,-0.7191,-0.8004,-1.467,-0.0107,-0.8995,0.2406,-0.2479,-1.089,-0.7575,0.0881,-2.737,0.8745,0.5787,-1.674,...,-2.122,-0.3752,-2.382,-3.735,-2.974,-1.493,-1.66,-3.166,0.2816,-0.299,-1.187,-0.5044,-1.775,-1.612,-0.9215,-1.081,-3.052,-3.447,-2.774,-1.846,-0.5568,-3.396,-2.951,-1.155,-3.262,-1.539,-2.46,-0.9417,-1.555,0.2431,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,-0.1498,-0.8789,0.863,-0.2219,-0.5121,-0.9577,1.175,0.2042,0.197,0.1244,-1.709,-0.3543,-0.516,-0.333,-0.2685,0.7649,0.2057,1.372,0.6835,0.8056,-0.3754,-1.209,0.2965,-0.0712,0.6389,0.6674,-0.0783,1.174,-0.711,...,-0.2274,0.3215,0.1535,-0.464,-0.5943,0.3973,0.15,0.5178,0.5159,0.6091,0.1813,-0.4249,0.7832,0.6529,0.5648,0.4817,0.0587,0.5303,0.6376,-0.3966,-1.495,-0.9625,-0.0541,0.6273,0.4563,0.0698,0.8134,0.1924,0.6054,-0.1824,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


## Data Preprocessing: QuantileTransformer + PCA + Variance Thresholding

In [11]:
from sklearn.preprocessing import QuantileTransformer, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import pickle

In [12]:
def preprocessing_nn_train_only(train_features_raw, test_features_raw, random_state=42, is_train=True, n_quantiles=100, pca_g=600, pca_c=50, variance_threshold=0.8):
    """Quantile-transform, PCA-augment and variance-filter the 'g-'/'c-' features.

    All transformers are fitted on the training frame only. When ``is_train``
    is true they are fitted and pickled under ``NN_PATH`` (file names are
    prefixed with ``SEED{random_state}_``); otherwise the previously pickled
    transformers are loaded and only applied.

    Args:
        train_features_raw: Training features; must contain 'g-*', 'c-*',
            'cp_time' and 'cp_dose' columns. Not modified.
        test_features_raw: Test features with the same columns. Not modified.
        random_state: Seed for the transformers and part of the pickle names.
        is_train: Fit and save the transformers (True) or load them (False).
        n_quantiles: ``n_quantiles`` for the QuantileTransformer.
        pca_g: Number of PCA components computed from the 'g-' columns.
        pca_c: Number of PCA components computed from the 'c-' columns.
        variance_threshold: Threshold passed to ``VarianceThreshold``.

    Returns:
        ``(train_processed, test_processed)``: the non-'g-'/'c-' columns
        followed by the selected features named ``col_0 .. col_k``, with
        'cp_time' and 'cp_dose' label-encoded. Each frame keeps the row index
        of its input.
    """

    def _artifact_path(name):
        return f"{NN_PATH}SEED{random_state}_{name}.pkl"

    def _save(obj, name):
        # Context manager so the file is flushed and closed deterministically.
        with open(_artifact_path(name), 'wb') as fh:
            pickle.dump(obj, fh)

    def _load(name):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # artifacts that were written by this notebook.
        with open(_artifact_path(name), 'rb') as fh:
            return pickle.load(fh)

    def _to_frame(values, prefix, index):
        # Re-use the source index so the axis=1 concats below align row-for-row
        # even when the input frame does not carry a default RangeIndex.
        columns = ['{}_{}'.format(prefix, i) for i in range(values.shape[1])]
        return pd.DataFrame(values, columns=columns, index=index)

    train_features = train_features_raw.copy()
    test_features = test_features_raw.copy()

    g_cols = [col for col in train_features.columns if col.startswith('g-')]
    c_cols = [col for col in train_features.columns if col.startswith('c-')]
    gc_cols = g_cols + c_cols
    other_cols = [col for col in train_features.columns if col not in g_cols and col not in c_cols]

    # QuantileTransformer
    if is_train:
        transformer = QuantileTransformer(n_quantiles=n_quantiles, random_state=random_state, output_distribution="normal")
        transformer.fit(train_features[gc_cols].values)
        _save(transformer, 'transformer')
    else:
        transformer = _load('transformer')

    train_features[gc_cols] = transformer.transform(train_features[gc_cols].values)
    test_features[gc_cols] = transformer.transform(test_features[gc_cols].values)

    # PCA (separate decompositions for the 'g-' and 'c-' groups). The fitted
    # models get their own names so the integer parameters are not shadowed.
    if is_train:
        pca_g_model = PCA(n_components=pca_g, random_state=random_state)
        pca_g_model.fit(train_features[g_cols])
        _save(pca_g_model, 'pca_g')

        pca_c_model = PCA(n_components=pca_c, random_state=random_state)
        pca_c_model.fit(train_features[c_cols])
        _save(pca_c_model, 'pca_c')
    else:
        pca_g_model = _load('pca_g')
        pca_c_model = _load('pca_c')

    train_g = _to_frame(pca_g_model.transform(train_features[g_cols]), 'g_pca', train_features.index)
    test_g = _to_frame(pca_g_model.transform(test_features[g_cols]), 'g_pca', test_features.index)
    train_c = _to_frame(pca_c_model.transform(train_features[c_cols]), 'c_pca', train_features.index)
    test_c = _to_frame(pca_c_model.transform(test_features[c_cols]), 'c_pca', test_features.index)

    # PCA components are appended to (not substituted for) the transformed columns.
    train_features_pca = pd.concat([train_features[gc_cols], train_g, train_c], axis=1)
    test_features_pca = pd.concat([test_features[gc_cols], test_g, test_c], axis=1)

    # Variance thresholding
    if is_train:
        selector = VarianceThreshold(variance_threshold)
        selector.fit(train_features_pca)
        _save(selector, 'variance')
    else:
        selector = _load('variance')

    train_features_variance = _to_frame(selector.transform(train_features_pca), 'col', train_features.index)
    test_features_variance = _to_frame(selector.transform(test_features_pca), 'col', test_features.index)

    # Categorical variable encoding
    train_features_processed = pd.concat([train_features[other_cols], train_features_variance], axis=1)
    test_features_processed = pd.concat([test_features[other_cols], test_features_variance], axis=1)

    for col in ['cp_time', 'cp_dose']:
        # The encoder is always fitted on the training column, in both modes.
        le = LabelEncoder()
        train_features_processed[col] = le.fit_transform(train_features_processed[col])
        test_features_processed[col] = le.transform(test_features_processed[col])

    return train_features_processed, test_features_processed

## Dataset

In [13]:
class MoADataset:
    """Map-style dataset pairing each feature row with its target row.

    ``features`` and ``targets`` are indexed as ``array[idx, :]``, so both
    must support two-dimensional indexing and expose ``shape``.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Both tensors are cast to 32-bit float regardless of the source dtype.
        x_row = torch.tensor(self.features[idx, :], dtype=torch.float)
        y_row = torch.tensor(self.targets[idx, :], dtype=torch.float)
        return {'x': x_row, 'y': y_row}

class TestDataset:
    """Map-style dataset over a feature matrix only (no targets).

    ``features`` is indexed as ``features[idx, :]`` and must expose ``shape``.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Cast to 32-bit float regardless of the source dtype.
        x_row = torch.tensor(self.features[idx, :], dtype=torch.float)
        return {'x': x_row}

In [14]:
class Model(nn.Module):
    """MLP built from repeated BatchNorm -> Dropout -> Linear stages.

    Every stage except the last is followed by LeakyReLU(0.01); the last
    stage's Linear layer is wrapped in weight normalisation and produces the
    raw logits.

    Args:
        num_features: Width of the input.
        hidden_sizes: Output width of each stage; the final entry is the
            number of model outputs.
        dropout_values: Dropout probability of each stage. Must have at least
            as many entries as ``hidden_sizes`` (extra entries are ignored).

    Raises:
        ValueError: If ``dropout_values`` is shorter than ``hidden_sizes``,
            which would otherwise silently drop the trailing stages.
    """

    def __init__(self, num_features, hidden_sizes, dropout_values):
        super(Model, self).__init__()

        if len(dropout_values) < len(hidden_sizes):
            raise ValueError(
                f"dropout_values has {len(dropout_values)} entries but "
                f"hidden_sizes has {len(hidden_sizes)}; one dropout value per layer is required"
            )

        self.num_features = num_features
        # Private copies: callers (and other models built from the same lists)
        # can mutate their own list without silently changing this model's metadata.
        self.hidden_sizes = list(hidden_sizes)
        self.dropout_values = list(dropout_values)
        # Layer indices (as strings) currently frozen, in freeze order.
        self.frozen_layers = []

        self.mlp = nn.Sequential()
        num_dim = num_features
        last_index = len(self.hidden_sizes) - 1
        for i, (hidden_size, dropout) in enumerate(zip(self.hidden_sizes, self.dropout_values)):
            self.mlp.add_module(f'batch_norm_{i}', nn.BatchNorm1d(num_dim))
            self.mlp.add_module(f'dropout_{i}', nn.Dropout(dropout))

            if i != last_index:
                self.mlp.add_module(f'linear_{i}', nn.Linear(num_dim, hidden_size))
                self.mlp.add_module(f'activation_{i}', nn.LeakyReLU(0.01))
            else:
                # Output stage: weight-normalised linear layer, no activation.
                self.mlp.add_module(f'linear_{i}', nn.utils.weight_norm(nn.Linear(num_dim, hidden_size)))

            # update the current dimension
            num_dim = hidden_size

    @staticmethod
    def _layer_index(param_name):
        """Extract the stage index from a parameter name such as 'linear_3.weight'."""
        return param_name.split('.')[0].split('_')[-1]

    def forward(self, x):
        """Return the logits for a batch ``x`` of width ``num_features``."""
        return self.mlp(x)

    def freeze(self):
        """Freeze all the layers and record their indices in ``frozen_layers``."""

        for name, param in self.mlp.named_parameters():
            layer_index = self._layer_index(name)

            # freeze the parameters
            param.requires_grad = False

            # Save frozen layer names (once per stage, in network order)
            if layer_index not in self.frozen_layers:
                self.frozen_layers.append(layer_index)

    def unfreeze(self):
        """Un-freeze the most recently recorded frozen stage (deepest first).

        Does nothing when no stage is frozen.
        """

        if not self.frozen_layers:
            return

        layer_index_to_defreeze = self.frozen_layers.pop()

        for name, param in self.mlp.named_parameters():
            if self._layer_index(name) == layer_index_to_defreeze:
                param.requires_grad = True

In [15]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training epoch and return the mean per-batch loss.

    Each batch is a dict with 'x' (inputs) and 'y' (targets). The scheduler
    is stepped once per batch, right after the optimizer.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        optimizer.zero_grad()
        features = batch['x'].to(device)
        labels = batch['y'].to(device)
        batch_loss = loss_fn(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(batch_loss.item())

    # Averaged over the number of batches, not the number of samples.
    return sum(batch_losses) / len(dataloader)

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate ``model`` on a labelled loader without tracking gradients.

    Args:
        model: Network returning logits.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        dataloader: Iterable of dicts with 'x' (inputs) and 'y' (targets).
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, preds)``: the loss averaged over batches and a NumPy
        array of sigmoid probabilities for every row, in loader order.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    # Validation never calls backward(), so building the autograd graph only
    # costs memory and time; this also matches inference_fn.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Return sigmoid probabilities for every row yielded by ``dataloader``.

    Batches are dicts with an 'x' entry; the per-batch results are stacked
    into one NumPy array in loader order.
    """
    model.eval()
    batch_probs = []

    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['x'].to(device))
            batch_probs.append(logits.sigmoid().detach().cpu().numpy())

    return np.concatenate(batch_probs)

In [16]:
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5: ``t * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: Optional rescaling weight forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: 'mean', 'sum', or anything else to get the element-wise loss.
        smoothing: Smoothing factor in ``[0, 1)``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The base class already stores ``weight`` (as a buffer) and ``reduction``.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return the smoothed targets. ``n_labels`` is accepted but not used."""
        assert 0 <= smoothing < 1

        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing

        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE between ``inputs`` (logits) and ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss; with the functional's default 'mean'
        # the branches below would operate on a scalar, so 'sum' and 'none'
        # would both silently return the mean.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()

        return loss

In [17]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed one-hot distribution.

    The true class receives ``1 - smoothing``; the remaining mass is spread
    evenly as ``smoothing / (classes - 1)`` over the other classes.
    """

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.cls = classes
        self.dim = dim
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, pred, target):
        log_probs = pred.log_softmax(dim=self.dim)

        # The target distribution is a constant; keep it out of the graph.
        with torch.no_grad():
            true_dist = torch.zeros_like(log_probs)
            off_value = self.smoothing / (self.cls - 1)
            true_dist.fill_(off_value)
            # Class indices are scattered along dimension 1.
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)

        per_sample = torch.sum(-true_dist * log_probs, dim=self.dim)
        return torch.mean(per_sample)

In [18]:
# Training hyper-parameters

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
EPOCHS = 24
BATCH_SIZE = 128

# Per-stage settings, keyed by which target set the stage trains on.
WEIGHT_DECAY = dict(ALL_TARGETS=1e-5, SCORED_ONLY=3e-6)
MAX_LR = dict(ALL_TARGETS=1e-2, SCORED_ONLY=3e-3)
DIV_FACTOR = dict(ALL_TARGETS=1e3, SCORED_ONLY=1e2)

# Passed as pct_start to the one-cycle learning-rate schedule.
PCT_START = 0.1

## Model Training

In [19]:
class NNTrainer:
    """K-fold trainer that pre-trains on all targets, then fine-tunes on scored ones.

    For every seed and fold a ``Model`` is first trained on scored + non-scored
    targets, its body is transferred to a new model with a fresh output layer,
    and that model is fine-tuned on the scored targets with gradual unfreezing.

    Relies on the notebook-level names ``preprocessing_nn_train_only``,
    ``Model``, ``MoADataset``, ``TestDataset``, ``train_fn``, ``valid_fn``,
    ``inference_fn``, ``SmoothBCEwLogits``, ``seed_everything`` and the
    constants ``DEVICE``, ``EPOCHS``, ``BATCH_SIZE``, ``WEIGHT_DECAY``,
    ``MAX_LR``, ``DIV_FACTOR``, ``PCT_START`` and ``NN_PATH``.

    Args:
        params: Dict with keys 'pca_g', 'pca_c', 'variance_threshold',
            'hidden_sizes', 'dropout_values' and 'lr'.
        train_features: Raw training features (with 'sig_id' and 'cp_type').
        test_features: Raw test features.
        train_targets_scored: Scored targets keyed by 'sig_id'.
        train_targets_nonscored: Non-scored targets keyed by 'sig_id'.
        train_drug: 'sig_id' -> 'drug_id' mapping.
        sample_submission: Submission template keyed by 'sig_id'.
    """

    def __init__(self, params, train_features, test_features, train_targets_scored, train_targets_nonscored, train_drug, sample_submission):
        super(NNTrainer, self).__init__()

        self.params = params

        self.train_features = train_features
        self.test_features = test_features
        self.train_targets_scored = train_targets_scored
        self.train_targets_nonscored = train_targets_nonscored
        self.train_drug = train_drug
        self.sample_submission = sample_submission

        self.target_cols = [col for col in train_targets_scored.columns if col != 'sig_id']
        self.target_nonscored_cols = [col for col in train_targets_nonscored.columns if col != 'sig_id']

    def _set_seed(self, seed_id):
        """Remember ``seed_id`` as the active seed and seed all RNGs with it."""
        self.seed_id = seed_id
        seed_everything(self.seed_id)

    def _preprocess(self, seed_id, is_train=True):
        """Build the modelling frames for one seed.

        Returns:
            ``(train, test, feature_cols)`` where ``train`` holds features,
            scored and non-scored targets and 'drug_id' for non-control rows,
            ``test`` holds the non-control test rows, and ``feature_cols``
            lists the model input columns.
        """

        train_features_processed, test_features_processed = preprocessing_nn_train_only(
            self.train_features,
            self.test_features,
            random_state=seed_id,
            is_train=is_train,
            pca_g=self.params['pca_g'],
            pca_c=self.params['pca_c'],
            variance_threshold=self.params['variance_threshold']
        )

        # Use the frames handed to the constructor, never notebook globals,
        # so the trainer behaves the same whatever else is defined in the kernel.
        train = train_features_processed \
            .merge(self.train_targets_scored, on='sig_id') \
            .merge(self.train_targets_nonscored, on='sig_id') \
            .merge(self.train_drug, on='sig_id')

        # Control-vehicle rows are excluded from both training and prediction.
        train = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True).drop('cp_type', axis=1)
        test = test_features_processed[test_features_processed['cp_type'] != 'ctl_vehicle'].reset_index(drop=True).drop('cp_type', axis=1)
        feature_cols = [col for col in train \
                                if col not in self.target_cols \
                                and col not in self.target_nonscored_cols \
                                and col != 'sig_id' \
                                and col != 'drug_id']

        return train, test, feature_cols

    def _split_kfold(self, train, num_folds=5, drug_threshold=18):
        """Add a 'fold' column to ``train`` (in place) and return it.

        Drugs occurring at most ``drug_threshold`` times are assigned to a
        fold as a whole; rows of more frequent drugs are split individually.
        Both splits use ``MultilabelStratifiedKFold`` seeded with ``self.seed_id``.
        """

        vc = train['drug_id'].value_counts()
        vc1 = vc.loc[vc <= drug_threshold].index.sort_values()
        vc2 = vc.loc[vc > drug_threshold].index.sort_values()

        # STRATIFY DRUGS 18X OR LESS (one fold per drug_id)
        dict1 = {}
        dict2 = {}

        kfold = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=self.seed_id)
        tmp = train.groupby('drug_id')[self.target_cols].mean().loc[vc1]

        for fold, (idxT, idxV) in enumerate(kfold.split(tmp, tmp[self.target_cols])):
            dd = {k: fold for k in tmp.index[idxV].values}
            dict1.update(dd)

        # STRATIFY DRUGS MORE THAN 18X (one fold per sig_id)
        kfold = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=self.seed_id)
        tmp = train.loc[train['drug_id'].isin(vc2)].reset_index(drop=True)

        for fold, (idxT, idxV) in enumerate(kfold.split(tmp, tmp[self.target_cols])):
            dd = {k: fold for k in tmp.sig_id[idxV].values}
            dict2.update(dd)

        # ASSIGN FOLDS: by drug first, then by sig_id for rows still unassigned
        train['fold'] = train['drug_id'].map(dict1)
        train.loc[train['fold'].isna(), 'fold'] = train.loc[train['fold'].isna(), 'sig_id'].map(dict2)
        train['fold'] = train['fold'].astype('int8')

        return train

    def _train_single_fold(self, model, seed_id, fold_id, targets='ALL_TARGETS'):
        """Train ``model`` on all folds except ``fold_id`` and checkpoint the best epoch.

        Args:
            model: Network to train (already on ``DEVICE``).
            seed_id: Seed used in the checkpoint file names.
            fold_id: Fold held out for validation.
            targets: 'ALL_TARGETS' (scored + non-scored) or 'SCORED_ONLY';
                also selects the per-stage hyper-parameters.

        Returns:
            Array of shape ``(len(self.train), n_targets)`` that is zero except
            for the validation rows, which hold the best epoch's predictions.
        """

        feature_cols, target_cols = self.feature_cols.copy(), self.target_cols.copy()
        if targets == 'ALL_TARGETS':
            target_cols += self.target_nonscored_cols

        df_train = self.train[self.train['fold'] != fold_id]
        df_val = self.train[self.train['fold'] == fold_id]

        val_idx = df_val.index

        X_train, y_train = df_train[feature_cols], df_train[target_cols]
        X_val, y_val = df_val[feature_cols], df_val[target_cols]

        train_dataset = MoADataset(X_train.values, y_train.values)
        val_dataset = MoADataset(X_val.values, y_val.values)

        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

        optimizer = Adam(model.parameters(), lr=self.params['lr'], weight_decay=WEIGHT_DECAY[targets])
        scheduler = OneCycleLR(optimizer=optimizer,
                               steps_per_epoch=len(train_loader),
                               pct_start=PCT_START,
                               div_factor=DIV_FACTOR[targets],
                               max_lr=MAX_LR[targets],
                               epochs=EPOCHS
                               )

        # Validation is scored with plain BCE; training uses lightly smoothed labels.
        loss_fn = nn.BCEWithLogitsLoss()
        loss_tr = SmoothBCEwLogits(smoothing=0.001)

        oof = np.zeros((len(self.train), len(target_cols)))
        best_loss = np.inf
        best_train_loss = np.inf

        for epoch in range(EPOCHS):

            # gradually de-freeze layers if in the fine-tuning mode
            if targets == 'SCORED_ONLY' and len(model.frozen_layers) > 0 and epoch % 4 == 0:
                model.unfreeze()

            train_loss = train_fn(model, optimizer, scheduler, loss_tr, train_loader, DEVICE)
            val_loss, val_preds = valid_fn(model, loss_fn, val_loader, DEVICE)

            if np.isnan(val_loss):
                break

            if val_loss < best_loss:
                best_loss = val_loss
                best_train_loss = train_loss
                oof[val_idx] = val_preds
                # Local copy (reloaded right after pre-training) plus a persistent copy.
                torch.save(model.state_dict(), f"SEED{seed_id}_FOLD{fold_id}.pth")
                torch.save(model.state_dict(), f"{NN_PATH}SEED{seed_id}_FOLD{fold_id}.pth")

        # Report the checkpointed epoch, not whatever the final epoch happened to produce.
        print(f"SEED: {self.seed_id}, FOLD: {fold_id}, targets: {targets}, best train_loss: {best_train_loss:.6f}, best val_loss: {best_loss:.6f}")

        return oof

    def _evaluate_single_model(self, seed_id, num_folds):
        """Recompute out-of-fold predictions from the saved fold checkpoints."""

        num_features = len(self.feature_cols)
        num_targets = len(self.target_cols)
        loss_fn = nn.BCEWithLogitsLoss()
        oof = np.zeros((len(self.train), num_targets))

        for fold_id in range(num_folds):
            # load the best model
            model = Model(num_features,
                          self.params['hidden_sizes'] + [num_targets],
                          self.params['dropout_values'])
            # map_location lets checkpoints written on a GPU load on a CPU-only runtime.
            model.load_state_dict(torch.load(f"{NN_PATH}SEED{seed_id}_FOLD{fold_id}.pth", map_location=DEVICE))
            model = model.to(DEVICE)

            df_val = self.train[self.train['fold'] == fold_id]
            val_idx = df_val.index

            X_val, y_val = df_val[self.feature_cols], df_val[self.target_cols]
            val_dataset = MoADataset(X_val.values, y_val.values)
            val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

            _, val_preds = valid_fn(model, loss_fn, val_loader, DEVICE)
            oof[val_idx] = val_preds

        return oof

    def _predict_single_fold(self, seed_id, fold_id):
        """Predict the test set with the saved checkpoint of one seed/fold."""

        num_features = len(self.feature_cols)
        num_targets = len(self.target_cols)

        # load the best model
        model = Model(num_features,
                      self.params['hidden_sizes'] + [num_targets],
                      self.params['dropout_values'])
        model.load_state_dict(torch.load(f"{NN_PATH}SEED{seed_id}_FOLD{fold_id}.pth", map_location=DEVICE))
        model = model.to(DEVICE)

        # prediction
        X_test = self.test[self.feature_cols]
        test_dataset = TestDataset(X_test.values)
        test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

        return inference_fn(model, test_loader, DEVICE)

    def _transfer_model(self, model_old):
        """Copy ``model_old`` into a new, fully frozen model with a fresh scored-target head."""

        num_targets = len(self.target_cols)

        # create a new model from scratch and transfer the parameters over;
        # the size list is copied so resizing the head below cannot alter model_old
        model_new = Model(model_old.num_features,
                          list(model_old.hidden_sizes),
                          model_old.dropout_values
                          ).to(DEVICE)
        model_new.load_state_dict(model_old.state_dict())

        # do not transfer the last one layer
        # add the last layer with updated num_targets
        last_index = len(model_old.hidden_sizes)-1
        del model_new.mlp[-1]
        model_new.mlp.add_module(f'linear_{last_index}', nn.utils.weight_norm(nn.Linear(model_new.hidden_sizes[-2], num_targets)))
        model_new.hidden_sizes[-1] = num_targets

        model_new = model_new.to(DEVICE)

        # freeze all the layers
        model_new.freeze()

        return model_new

    def _get_cv_score(self, train, train_targets_scored):
        """Mean column-wise log loss of ``train`` predictions over all rows of ``train_targets_scored``.

        Rows without a prediction (the control rows) are scored with a prediction of 0.
        """

        target_cols = self.target_cols

        val_results = train_targets_scored.drop(columns=target_cols) \
            .merge(train[['sig_id']+target_cols], on='sig_id', how='left') \
            .fillna(0)

        y_true = train_targets_scored[target_cols].values
        y_pred = val_results[target_cols].values

        score = 0
        for i in range(len(target_cols)):
            # Explicit labels keep log_loss from failing on a target column
            # that happens to contain a single class.
            score += log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])

        return score / y_pred.shape[1]

    def run_multiple_seeds(self, seeds, num_folds, run_type="training"):
        """Run training, evaluation or inference for every seed.

        Args:
            seeds: Seeds to run; results are averaged across them.
            num_folds: Number of CV folds per seed.
            run_type: "training" fits and checkpoints the models,
                "evaluation" rebuilds OOF predictions from checkpoints, and
                any other value runs test-set inference.

        Returns:
            "training": the CV score. "evaluation": ``(cv_score, train_oof)``.
            Otherwise: the submission DataFrame.
        """

        train_oof = self.train_features[self.train_features['cp_type'] != 'ctl_vehicle'][['sig_id']].copy()
        train_oof[self.target_cols] = 0.0

        test_preds = self.test_features[self.test_features['cp_type'] != 'ctl_vehicle'][['sig_id']].copy()
        test_preds[self.target_cols] = 0.0

        for seed_id in seeds:

            # preprocess
            self._set_seed(seed_id)
            self.train, self.test, self.feature_cols = self._preprocess(seed_id=seed_id, is_train=(run_type=="training"))
            num_features = len(self.feature_cols)
            num_targets = len(self.target_cols)
            num_targets_nonscored = len(self.target_nonscored_cols)

            if run_type == "training":

                # CV split
                self.train = self._split_kfold(self.train, num_folds)

                # kfold training
                for fold_id in range(num_folds):

                    # pretrain the model on all the targets (scored + nonscored)
                    model = Model(num_features,
                                  self.params['hidden_sizes'] + [num_targets+num_targets_nonscored],
                                  self.params['dropout_values'])
                    model = model.to(DEVICE)
                    _ = self._train_single_fold(model, seed_id, fold_id, targets='ALL_TARGETS')

                    # load the best pretrained model
                    pretrained_model = Model(num_features,
                                             self.params['hidden_sizes'] + [num_targets+num_targets_nonscored],
                                             self.params['dropout_values'])
                    pretrained_model.load_state_dict(torch.load(f"SEED{seed_id}_FOLD{fold_id}.pth", map_location=DEVICE))
                    pretrained_model = pretrained_model.to(DEVICE)

                    # transfer the model
                    final_model = self._transfer_model(pretrained_model)

                    # Fine-tune the model on scored targets only
                    oof = self._train_single_fold(final_model, seed_id, fold_id, targets='SCORED_ONLY')

                    # Accumulate OOF (each fold fills only its own validation rows)
                    train_oof[self.target_cols] += oof / len(seeds)
            elif run_type == "evaluation":

                # CV split
                self.train = self._split_kfold(self.train, num_folds)

                # oof preds
                oof = self._evaluate_single_model(seed_id, num_folds)
                train_oof[self.target_cols] += oof / len(seeds)
            else:
                ### model inference
                for fold_id in range(num_folds):
                    predictions = self._predict_single_fold(seed_id, fold_id)
                    test_preds[self.target_cols] += predictions / (len(seeds)*num_folds)

        if run_type == "training":
            cv_score = self._get_cv_score(train_oof, self.train_targets_scored)
            return cv_score
        elif run_type == "evaluation":
            cv_score = self._get_cv_score(train_oof, self.train_targets_scored)
            return cv_score, train_oof
        else:
            submission = self._create_submission(self.sample_submission, test_preds)
            return submission

    def _create_submission(self, sample_submission, test_preds):
        """Fill the submission template with ``test_preds``; rows without a prediction get 0."""

        submission = sample_submission.drop(columns=self.target_cols) \
            .merge(test_preds, on='sig_id', how='left') \
            .fillna(0)

        return submission

In [20]:
# Experiment configuration handed to NNTrainer.
params = {
    'pca_g': 600,  # PCA components computed from the 'g-' columns
    'pca_c': 80,  # PCA components computed from the 'c-' columns
    'variance_threshold': 0.8,  # threshold given to VarianceThreshold during preprocessing
    'hidden_sizes': [2000, 1500, 1000, 500],  # hidden widths; NNTrainer appends the output size
    'dropout_values': [0, 0.5, 0.4, 0.3, 0.1],  # one value per layer, including the appended output layer
    'lr': 1e-3  # initial lr given to Adam; NOTE(review): OneCycleLR presumably overrides it - confirm
}

In [21]:
nn_trainer = NNTrainer(params, train_features, test_features, train_targets_scored, train_targets_nonscored, train_drug, submission)

In [22]:
%%time

cv_score = nn_trainer.run_multiple_seeds(seeds=[42], num_folds=7, run_type="training")



SEED: 42, FOLD: 0, targets: ALL_TARGETS, best train_loss: 0.012180, best val_loss: 0.009017
SEED: 42, FOLD: 0, targets: SCORED_ONLY, best train_loss: 0.018225, best val_loss: 0.017099
SEED: 42, FOLD: 1, targets: ALL_TARGETS, best train_loss: 0.012123, best val_loss: 0.009221
SEED: 42, FOLD: 1, targets: SCORED_ONLY, best train_loss: 0.018174, best val_loss: 0.017034
SEED: 42, FOLD: 2, targets: ALL_TARGETS, best train_loss: 0.012157, best val_loss: 0.009150
SEED: 42, FOLD: 2, targets: SCORED_ONLY, best train_loss: 0.018046, best val_loss: 0.017672
SEED: 42, FOLD: 3, targets: ALL_TARGETS, best train_loss: 0.012230, best val_loss: 0.008963
SEED: 42, FOLD: 3, targets: SCORED_ONLY, best train_loss: 0.018522, best val_loss: 0.016700
SEED: 42, FOLD: 4, targets: ALL_TARGETS, best train_loss: 0.012125, best val_loss: 0.009292
SEED: 42, FOLD: 4, targets: SCORED_ONLY, best train_loss: 0.018032, best val_loss: 0.017419
SEED: 42, FOLD: 5, targets: ALL_TARGETS, best train_loss: 0.012115, best val_los

In [23]:
print(cv_score)

0.015862005413236926


In [24]:
cv_score, train_oof = nn_trainer.run_multiple_seeds(seeds=[42], num_folds=7, run_type="evaluation")



In [25]:
print(cv_score)

0.015862005413236926


## Create submission

In [26]:
%%time

submission = nn_trainer.run_multiple_seeds(seeds=[42], num_folds=7, run_type="submission")

CPU times: user 12.5 s, sys: 534 ms, total: 13.1 s
Wall time: 12.4 s


In [27]:
submission.shape

(3982, 207)

In [28]:
submission

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,adrenergic_receptor_antagonist,akt_inhibitor,aldehyde_dehydrogenase_inhibitor,alk_inhibitor,ampk_activator,analgesic,androgen_receptor_agonist,androgen_receptor_antagonist,anesthetic_-_local,angiogenesis_inhibitor,angiotensin_receptor_antagonist,anti-inflammatory,antiarrhythmic,antibiotic,anticonvulsant,antifungal,antihistamine,antimalarial,antioxidant,antiprotozoal,antiviral,apoptosis_stimulant,aromatase_inhibitor,atm_kinase_inhibitor,atp-sensitive_potassium_channel_antagonist,atp_synthase_inhibitor,atpase_inhibitor,atr_kinase_inhibitor,aurora_kinase_inhibitor,...,protein_synthesis_inhibitor,protein_tyrosine_kinase_inhibitor,radiopaque_medium,raf_inhibitor,ras_gtpase_inhibitor,retinoid_receptor_agonist,retinoid_receptor_antagonist,rho_associated_kinase_inhibitor,ribonucleoside_reductase_inhibitor,rna_polymerase_inhibitor,serotonin_receptor_agonist,serotonin_receptor_antagonist,serotonin_reuptake_inhibitor,sigma_receptor_agonist,sigma_receptor_antagonist,smoothened_receptor_antagonist,sodium_channel_inhibitor,sphingosine_receptor_agonist,src_inhibitor,steroid,syk_inhibitor,tachykinin_antagonist,tgf-beta_receptor_inhibitor,thrombin_inhibitor,thymidylate_synthase_inhibitor,tlr_agonist,tlr_antagonist,tnf_inhibitor,topoisomerase_inhibitor,transient_receptor_potential_channel_antagonist,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001465,0.001591,0.002146,0.016962,0.024648,0.005407,0.004150,0.004173,0.000507,0.011175,0.022979,0.001304,0.000662,0.000709,0.001376,0.001372,0.003506,0.005864,0.004618,0.002225,0.003078,0.004402,0.000847,0.002298,0.000997,0.000934,0.001195,0.001345,0.004887,0.002707,0.002056,0.002920,0.003273,0.000472,0.000554,0.000585,0.002517,0.000558,0.000931,...,0.003583,0.001200,0.004556,0.001328,0.000827,0.002951,0.001039,0.001173,0.001613,0.001682,0.014755,0.017149,0.002927,0.002875,0.001892,0.001782,0.019545,0.002410,0.000970,0.001024,0.000764,0.002668,0.000478,0.001401,0.001884,0.002368,0.001024,0.002525,0.001373,0.001379,0.001074,0.001207,0.004416,0.002474,0.001538,0.000849,0.001475,0.002229,0.002205,0.001841
1,id_001897cda,0.001032,0.001075,0.001089,0.002363,0.001492,0.002366,0.004590,0.019142,0.029156,0.033403,0.008973,0.002155,0.000697,0.009224,0.000433,0.000830,0.001175,0.001868,0.001725,0.004787,0.001163,0.001148,0.000884,0.001578,0.001548,0.003576,0.000940,0.000727,0.005000,0.002346,0.001188,0.001236,0.001265,0.001390,0.000595,0.000421,0.008582,0.008053,0.003697,...,0.001615,0.001353,0.000924,0.000738,0.001689,0.001216,0.000656,0.031223,0.000660,0.003287,0.018633,0.005614,0.001596,0.001166,0.002910,0.002150,0.008028,0.000298,0.013970,0.000560,0.009579,0.006060,0.001661,0.000774,0.000681,0.001312,0.000755,0.003488,0.002088,0.001983,0.000533,0.001404,0.001700,0.000610,0.018066,0.000670,0.008099,0.002305,0.001275,0.004330
2,id_002429b5b,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,id_00276f245,0.001279,0.001308,0.001622,0.011054,0.020669,0.004152,0.003879,0.006234,0.000529,0.011068,0.027586,0.002134,0.000666,0.001530,0.001378,0.001386,0.002987,0.006298,0.005127,0.002726,0.001942,0.002864,0.000972,0.002733,0.001232,0.001153,0.001952,0.002452,0.006023,0.002092,0.002118,0.002528,0.003061,0.001499,0.000688,0.000661,0.003551,0.000796,0.000812,...,0.003187,0.001168,0.002913,0.000642,0.001701,0.001425,0.000818,0.000701,0.001161,0.000994,0.016972,0.057865,0.002783,0.002339,0.004424,0.002539,0.011522,0.001833,0.001517,0.000761,0.000599,0.006065,0.000393,0.002064,0.001483,0.002699,0.000912,0.002404,0.000791,0.001641,0.000717,0.001249,0.002628,0.003433,0.003375,0.000715,0.001563,0.002037,0.003058,0.002001
4,id_0027f1083,0.001972,0.001777,0.001723,0.015567,0.023992,0.005800,0.004231,0.003051,0.000586,0.014584,0.025292,0.001420,0.000513,0.000665,0.001253,0.001310,0.003933,0.005511,0.004194,0.001972,0.002998,0.004643,0.000922,0.002218,0.001167,0.000916,0.000979,0.001126,0.004959,0.003090,0.001832,0.002797,0.003992,0.000432,0.000542,0.000435,0.002239,0.000453,0.000687,...,0.003611,0.001143,0.005665,0.001250,0.000765,0.002576,0.000954,0.001291,0.001650,0.001817,0.011102,0.017334,0.002940,0.002656,0.001709,0.001317,0.019241,0.001934,0.000980,0.000983,0.000770,0.003159,0.000514,0.001540,0.002122,0.002180,0.000952,0.002348,0.002027,0.001322,0.000917,0.000835,0.003511,0.002788,0.001494,0.000909,0.001166,0.001858,0.001378,0.001986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.000530,0.001109,0.001694,0.004154,0.003072,0.001340,0.001898,0.004565,0.000585,0.002180,0.004996,0.002310,0.002377,0.024155,0.001055,0.001187,0.001660,0.003131,0.002304,0.002256,0.001084,0.001729,0.001596,0.002352,0.000839,0.004236,0.001546,0.001246,0.002855,0.000693,0.000882,0.003056,0.000794,0.000999,0.000831,0.000995,0.016796,0.010426,0.050614,...,0.007396,0.002331,0.000612,0.000673,0.001020,0.001278,0.000924,0.002575,0.000576,0.000724,0.013570,0.004780,0.001076,0.000908,0.002376,0.001269,0.006700,0.000912,0.031804,0.000737,0.001822,0.001765,0.001998,0.001022,0.000482,0.001416,0.000649,0.001451,0.000442,0.001757,0.000986,0.008484,0.002844,0.104799,0.010246,0.001053,0.004915,0.001437,0.000841,0.000697
3978,id_ff925dd0d,0.001983,0.001981,0.001751,0.010068,0.018437,0.005669,0.005648,0.004829,0.001063,0.019823,0.026821,0.002307,0.000578,0.000722,0.000958,0.001195,0.003150,0.004876,0.003837,0.003002,0.002410,0.003864,0.000921,0.002331,0.001503,0.001161,0.001092,0.001240,0.004037,0.003680,0.001927,0.002433,0.003882,0.000577,0.000619,0.000531,0.003636,0.000664,0.001034,...,0.002550,0.001454,0.005119,0.001957,0.001001,0.001806,0.000935,0.001970,0.001403,0.002030,0.012207,0.019847,0.003013,0.002979,0.002040,0.002007,0.018906,0.001417,0.001808,0.000877,0.001221,0.004253,0.000458,0.001351,0.001498,0.002418,0.000952,0.003072,0.001757,0.001557,0.000818,0.001054,0.003845,0.002905,0.002288,0.000854,0.002638,0.001883,0.001739,0.002383
3979,id_ffb710450,0.002299,0.001954,0.001523,0.012810,0.032309,0.006448,0.003840,0.003467,0.000527,0.015424,0.029545,0.001103,0.000416,0.000723,0.001283,0.001143,0.005181,0.007297,0.006554,0.001833,0.003352,0.004753,0.000902,0.001703,0.001186,0.000811,0.000870,0.001128,0.005308,0.003112,0.002125,0.002419,0.003807,0.000455,0.000524,0.000393,0.002034,0.000417,0.000622,...,0.003574,0.001222,0.006076,0.000907,0.000741,0.003689,0.000859,0.001396,0.001416,0.001453,0.012139,0.024091,0.002923,0.002752,0.002145,0.001020,0.020692,0.001691,0.000931,0.000841,0.000672,0.003134,0.000602,0.002036,0.002072,0.002449,0.000832,0.002043,0.001970,0.001531,0.000814,0.000664,0.003578,0.001966,0.001463,0.000933,0.001446,0.001895,0.001115,0.001655
3980,id_ffbb869f2,0.002392,0.001559,0.001346,0.013928,0.027880,0.006124,0.004950,0.003572,0.000675,0.023560,0.035277,0.001069,0.000363,0.000776,0.001207,0.001161,0.003912,0.006346,0.005557,0.002115,0.002574,0.003989,0.000817,0.001916,0.001239,0.000850,0.000991,0.000992,0.005623,0.003303,0.001925,0.001965,0.004049,0.000509,0.000524,0.000315,0.002181,0.000415,0.000571,...,0.003402,0.001172,0.006536,0.000565,0.000848,0.001837,0.000716,0.001408,0.001303,0.001610,0.013308,0.022459,0.003388,0.003453,0.002276,0.001381,0.020718,0.001745,0.000920,0.000790,0.000672,0.004163,0.000523,0.001998,0.001750,0.002587,0.000854,0.002446,0.001411,0.001373,0.000768,0.000536,0.002684,0.001379,0.001506,0.000712,0.001223,0.001958,0.001310,0.002293
