In [147]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn import model_selection

In [148]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# CSV files stored under "My Drive" can be read with ordinary paths below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [192]:
# Directory on the mounted Drive that holds the input CSV files.
# Change this single constant to run the notebook against another location.
DATA_DIR = '/content/drive/My Drive/moa'

# Training inputs and their scored targets (joined on 'sig_id' further down).
train_features = pd.read_csv(f'{DATA_DIR}/train_features.csv')
train_targets_scored = pd.read_csv(f'{DATA_DIR}/train_targets_scored.csv')
# Submission template and the feature rows that predictions are made for.
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
test_features = pd.read_csv(f'{DATA_DIR}/test_features.csv')

In [193]:
# Group the feature columns by name prefix: 'g-' columns go to GENES,
# 'c-' columns go to CELLS. Column order is preserved.
GENES = list(filter(lambda name: name.startswith('g-'), train_features.columns))
CELLS = list(filter(lambda name: name.startswith('c-'), train_features.columns))

In [194]:
# Count the rows per cp_type value.
# Open question: can the cp_type column be dropped? The ctl_vehicle rows make up
# only about 8% of the training rows.
train_features["cp_type"].value_counts()

trt_cp         21948
ctl_vehicle     1866
Name: cp_type, dtype: int64

In [195]:
# Attach the scored targets to the feature row that has the same 'sig_id'.
train = train_features.merge(train_targets_scored, on='sig_id')
test = test_features
# To exclude the 'ctl_vehicle' rows instead, uncomment the next two lines.
# train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
# test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

# 'sig_id' plus every scored target column (23814 rows x 207 columns).
# As long as no rows are filtered out above, this should hold the same rows as
# train_targets_scored (assuming every sig_id finds a match in the merge).
target = train[train_targets_scored.columns]

# 'cp_type' holds strings ('trt_cp' / 'ctl_vehicle') and is not encoded anywhere
# below, and 'sig_id' is only the join key, so neither is kept as a model input.
train = train.drop(columns=['cp_type', 'sig_id'])

In [196]:
class MoADataset:
    """Dataset that pairs each feature row with the target row of the same index.

    `features` and `targets` must support ``arr[idx, :]`` indexing and expose
    ``.shape`` (the notebook passes 2-D NumPy arrays with one row per sample).
    """

    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        """Number of samples, taken from the first dimension of `features`."""
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of float tensors with keys 'x' and 'y'."""
        feature_row = self.features[idx, :]
        target_row = self.targets[idx, :]
        return {
            'x': torch.tensor(feature_row, dtype=torch.float),
            'y': torch.tensor(target_row, dtype=torch.float),
        }
    
class TestDataset:
    """Features-only counterpart of the training dataset, used for inference.

    `features` must support ``arr[idx, :]`` indexing and expose ``.shape``
    (the notebook passes a 2-D NumPy array with one row per sample).
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        """Number of samples, taken from the first dimension of `features`."""
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return sample `idx` as ``{'x': float tensor}``."""
        feature_row = self.features[idx, :]
        return {'x': torch.tensor(feature_row, dtype=torch.float)}
    

In [197]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Train `model` for one pass over `dataloader` and return the mean batch loss.

    Every batch is a dict with an 'x' (inputs) and a 'y' (targets) tensor.
    The scheduler is stepped after every optimizer step, i.e. once per batch,
    not once per epoch.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch in dataloader:
        features = batch['x'].to(device)
        labels = batch['y'].to(device)

        # Clear gradients left over from the previous batch before backprop.
        optimizer.zero_grad()
        batch_loss = loss_fn(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate `model` on `dataloader` without updating it.

    Every batch is a dict with an 'x' (inputs) and a 'y' (targets) tensor.
    The model is switched to eval mode and the whole pass runs under
    `torch.no_grad()`, so no autograd graph is built for the validation
    batches (consistent with `inference_fn`).

    Returns:
        (mean_loss, predictions): the sum of the per-batch losses divided by
        the number of batches, and a NumPy array with the sigmoid of the model
        outputs for every sample, in loader order.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    # Gradients are never used here; disabling them saves memory and time.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Predict probabilities for every sample served by `dataloader`.

    Every batch is a dict with an 'x' (inputs) tensor. The model is put in
    eval mode and called with gradients disabled.

    Returns:
        A NumPy array holding the sigmoid of the model outputs, with the
        batches concatenated in loader order.
    """
    model.eval()
    batch_probs = []

    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['x'].to(device))
            batch_probs.append(logits.sigmoid().detach().cpu().numpy())

    return np.concatenate(batch_probs)

In [198]:
def process_data(data):
    """One-hot encode the 'cp_time' and 'cp_dose' columns of `data`.

    Each of the two columns is replaced by one indicator column per distinct
    value (e.g. cp_time_24 / cp_time_48 / cp_time_72 and cp_dose_D1 /
    cp_dose_D2). All other columns are passed through unchanged, and the
    input frame itself is not modified.
    """
    categorical_columns = ['cp_time', 'cp_dose']
    return pd.get_dummies(data, columns=categorical_columns)

In [199]:
# Names of the target columns: every column of `target` except the 'sig_id' key.
target_cols = list(target.drop(columns='sig_id').columns)
# Model inputs: every column of the one-hot encoded training frame that is not a
# target. Going through process_data() makes the dummy columns part of the list.
feature_cols = [name for name in process_data(train).columns if name not in target_cols]

In [200]:
# Hyperparameters
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
EPOCHS = 2                  # training epochs per fold
BATCH_SIZE = 128            # mini-batch size for the data loaders
LEARNING_RATE = 1e-3        # lr passed to the Adam optimizer
WEIGHT_DECAY = 1e-5         # weight_decay passed to the Adam optimizer
NFOLDS = 5                  # number of cross-validation folds
# Early-stopping settings. NOTE(review): the training code visible in this
# notebook does not read either of them.
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

# Network dimensions: input width, output width and hidden-layer width.
num_features = len(feature_cols)
num_targets = len(target_cols)
hidden_size = 1024

In [201]:
class Model(nn.Module):
    """Fully connected network with two hidden layers and one output per target.

    The network is three stages of BatchNorm1d -> Dropout -> weight-normalised
    Linear. The first two stages are followed by ReLU; the last stage returns
    its raw outputs (no sigmoid is applied inside the model).

    Args:
        num_features: width of the input vectors.
        num_targets: width of the output vectors.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, num_features, num_targets, hidden_size):
        super().__init__()

        # Stage 1: input -> hidden (lighter dropout on the raw inputs).
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        # Stage 2: hidden -> hidden.
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.5)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        # Stage 3: hidden -> outputs.
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.5)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        """Map a (batch, num_features) tensor to a (batch, num_targets) tensor."""
        hidden = F.relu(self.dense1(self.dropout1(self.batch_norm1(x))))
        hidden = F.relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        return self.dense3(self.dropout3(self.batch_norm3(hidden)))

In [202]:
# Seed for the row shuffle so every run produces the same fold assignment.
FOLD_SEED = 42

# Shuffle the training rows, then let KFold cut them into NFOLDS validation blocks.
folds = train.copy()
folds = folds.sample(frac=1, random_state=FOLD_SEED).reset_index(drop=True)
kf = model_selection.KFold(n_splits=NFOLDS)

# Record, for every row, the index of the fold in which it is a validation row.
# Filling an integer array avoids the float/NaN column that a row-by-row .loc
# assignment would create first.
fold_ids = np.empty(len(folds), dtype=int)
for fold, (_, v_idx) in enumerate(kf.split(X=folds)):
    fold_ids[v_idx] = fold
folds['kfold'] = fold_ids

# One-hot encode cp_time / cp_dose; 'kfold' is carried along as an extra column.
train = process_data(folds)

# Same encoding for the test rows; only the model input columns are kept.
test_df = process_data(test)
x_test = test_df[feature_cols].values
test_dataset = TestDataset(x_test)
# Batch the test rows (the DataLoader default is batch_size=1) and keep their
# order so the predictions line up with the rows of the test frame.
testloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Accumulator for the per-fold test predictions: one row per test sample and
# one column per target.
predictions = np.zeros((len(test_df), len(target_cols)))

In [203]:
def run_training(fold):
    """Train one model with `fold` held out and add its test predictions.

    Rows of the global `train` frame whose 'kfold' equals `fold` form the
    validation split; all other rows form the training split. A fresh `Model`
    is trained for EPOCHS epochs, and its weights are written to
    ``FOLD{fold}_.pth`` whenever the validation loss improves. The best
    checkpoint is then reloaded, used to predict the test loader, and the
    result is added to the global `predictions` array.

    Args:
        fold: index of the fold to hold out for validation.
    """
    global predictions

    # Split the encoded training frame on the fold column (mask computed once).
    is_valid = train['kfold'] == fold
    train_df = train[~is_valid].reset_index(drop=True)
    valid_df = train[is_valid].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)

    # len(trainloader) is ceil(len(train_df) / BATCH_SIZE); the scheduler below
    # needs that number as its steps_per_epoch.
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    # NOTE(review): OneCycleLR manages the optimizer's learning rate itself
    # (driven by max_lr and div_factor), so the lr given to Adam above is
    # presumably overridden by the schedule - confirm this is intended.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    loss_fn = nn.BCEWithLogitsLoss()

    checkpoint_path = f"FOLD{fold}_.pth"
    best_loss = np.inf  # any finite validation loss beats this

    for epoch in range(EPOCHS):
        train_loss = train_fn(model, optimizer, scheduler, loss_fn, trainloader, DEVICE)
        print(f"EPOCH: {epoch}, train_loss: {train_loss}")
        # The validation predictions are not needed here, only the loss.
        valid_loss, _ = valid_fn(model, loss_fn, validloader, DEVICE)
        print(f"EPOCH: {epoch}, valid_loss: {valid_loss}")

        if valid_loss < best_loss:
            print(f"updating best model on Fold={fold}") 
            best_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_path)

    # Rebuild the network and restore the best weights seen during training.
    # map_location lets the checkpoint load on whatever DEVICE is active now.
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
    model.to(DEVICE)

    fold_preds = inference_fn(model, testloader, DEVICE)
    predictions = predictions + fold_preds
      
# Start from an empty accumulator so that re-running this cell does not add new
# fold predictions on top of the (already averaged) result of a previous run.
predictions = np.zeros_like(predictions)

# Train one model per fold; each call adds that fold's test predictions.
for run_k_fold in range(NFOLDS):
    run_training(run_k_fold)

# Average the accumulated predictions over the folds.
predictions /= NFOLDS

EPOCH: 0, train_loss: 0.14032905092525402
EPOCH: 0, valid_loss: 0.01896585004502221
updating best model on Fold=0
EPOCH: 1, train_loss: 0.018855824510273116
EPOCH: 1, valid_loss: 0.018226633853230038
updating best model on Fold=0
EPOCH: 0, train_loss: 0.14102164883441573
EPOCH: 0, valid_loss: 0.01894737917341684
updating best model on Fold=1
EPOCH: 1, train_loss: 0.018950409087158688
EPOCH: 1, valid_loss: 0.017634862445686992
updating best model on Fold=1
EPOCH: 0, train_loss: 0.1388026420777076
EPOCH: 0, valid_loss: 0.0197015078248162
updating best model on Fold=2
EPOCH: 1, train_loss: 0.018738220152038857
EPOCH: 1, valid_loss: 0.018097600875128256
updating best model on Fold=2
EPOCH: 0, train_loss: 0.1390480728532444
EPOCH: 0, valid_loss: 0.019028457126727228
updating best model on Fold=3
EPOCH: 1, train_loss: 0.019025844903100258
EPOCH: 1, valid_loss: 0.018177134818152377
updating best model on Fold=3
EPOCH: 0, train_loss: 0.14043975121422902
EPOCH: 0, valid_loss: 0.0188878219035503

In [204]:
# Read the submission template again (the same file is already loaded near the
# top of the notebook) - presumably to start from a fresh, unmodified copy.
sample_submission = pd.read_csv('/content/drive/My Drive/moa/sample_submission.csv')

In [205]:
# Wrap the fold-averaged test predictions in a frame whose columns carry the
# target names.
y = pd.DataFrame(predictions, columns=target_cols)

In [206]:
# Keep only the non-target columns of the template and place the predicted
# target columns next to them (aligned on the row index).
frames = [sample_submission.drop(columns=target_cols), y]
sub = pd.concat(frames, axis=1)

In [209]:
# Shape of the submission template, shown as the reference that the assembled
# submission is expected to match.
sample_submission.shape

(3982, 207)

In [210]:
# Write the submission file. DataFrame.to_csv returns None when it is given a
# path, so its result is deliberately not bound to a name.
sub.to_csv("submission.csv", index=False)

# Read the file back as a sanity check: its shape should equal that of the
# submission template.
teste = pd.read_csv("submission.csv")
teste.shape

(3982, 207)