In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import LongformerModel, LongformerConfig

In [2]:
import os
import sys
import tables

import pandas as pd
import numpy as np
from numba import cuda

from sklearn.model_selection import KFold
from scipy.stats import pearsonr
# import seaborn as sns

In [3]:
# Pick the accelerator when torch can see a CUDA device, otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [4]:
# Switch for torch automatic mixed precision (autocast + GradScaler).
# `train` and `val` capture this value as the default of their `use_amp`
# parameter when they are defined, so after changing it the cell defining
# those functions has to be re-run for the new value to take effect.
use_amp = True

In [5]:
# device = "cpu"

# load the data 

In [6]:
def identify_constant_columns(data):
    """Return the indices of the columns of `data` that hold one distinct value.

    Parameters
    ----------
    data : array supporting `data[:, j]` indexing and `.shape`
        Columns are taken along axis 1.

    Returns
    -------
    list of int
        Column indices in ascending order for which `np.unique` finds
        exactly one value. A column with no rows is not reported.
    """
    return [
        col_idx
        for col_idx in range(data.shape[1])
        if np.unique(data[:, col_idx]).size == 1
    ]

In [7]:
# get the paths to load the data

def load_dataset(data_name, dir_path="/kaggle/input/open-problems-multimodal/"):
    """Load the train inputs, train targets and test inputs of one modality.

    Parameters
    ----------
    data_name : str
        Either "citeseq" or "multiome".
    dir_path : str, optional
        Directory holding the competition `.h5` files. Defaults to the
        Kaggle input directory that was previously hard-coded.

    Returns
    -------
    train_x, train_y, test_x : numpy.ndarray
        Input columns that are constant in the training inputs are removed
        from both `train_x` and `test_x`; `train_y` is returned unchanged.

    Raises
    ------
    NameError
        If `data_name` is not one of the two supported names.
    """
    if data_name == "citeseq":
        modality = "cite"
    elif data_name == "multiome":
        modality = "multi"
    else:
        raise NameError(f"{data_name} is not a valid name: choose between 'citeseq' and 'multiome'")

    train_x_path = os.path.join(dir_path, f"train_{modality}_inputs.h5")
    train_y_path = os.path.join(dir_path, f"train_{modality}_targets.h5")
    test_x_path = os.path.join(dir_path, f"test_{modality}_inputs.h5")

    train_x = pd.read_hdf(train_x_path).to_numpy()
    train_y = pd.read_hdf(train_y_path).to_numpy()

    test_x = pd.read_hdf(test_x_path).to_numpy()

    # some columns in the training sets are constant, remove them.
    # A set makes each membership test O(1); the list version rescanned
    # the whole list of constant columns for every single column.
    constant_columns_train = set(identify_constant_columns(train_x))
    train_keep = [i for i in range(train_x.shape[1]) if i not in constant_columns_train]
    test_keep = [i for i in range(test_x.shape[1]) if i not in constant_columns_train]
    train_x = train_x[:, train_keep]
    test_x = test_x[:, test_keep]

    return train_x, train_y, test_x

In [8]:
# submission_path = os.path.join(dir_path,"sample_submission.csv")
# evaluation_ids_path = os.path.join(dir_path,"evaluation_ids.csv")

# Pytorch util functions

In [9]:
# training loop for each epoch

def train(dataloader, model, loss_fn, optimizer, device, use_amp=use_amp, scaler=None):
    """Train `model` for one pass over `dataloader`.

    Parameters
    ----------
    dataloader : iterable with `len()`, yielding dicts with keys 'x' and 'y'
    model : torch.nn.Module
    loss_fn : callable(pred, y) -> scalar tensor
    optimizer : torch.optim.Optimizer
        Optimizer stepped after every batch.
    device : str or torch.device
        Device the batches are moved to; its type also selects the autocast backend.
    use_amp : bool
        Enables autocast (float16) for the forward pass.
    scaler : torch.cuda.amp.GradScaler, optional
        Gradient scaler to use. When omitted, a module-level `scaler` is used
        if one exists (the previous implicit behaviour); otherwise a new
        scaler is created for this call.

    Returns
    -------
    (loss, corr) : tuple of float
        Loss averaged over batches, and the per-batch mean of row-wise
        Pearson correlations averaged over batches.
    """
    if scaler is None:
        # Backward compatibility: earlier versions read a global `scaler`.
        scaler = globals().get("scaler")
    if scaler is None:
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    # autocast wants the bare device type ("cuda"/"cpu"), not "cuda:0" or a torch.device.
    device_type = torch.device(device).type

    num_batches = len(dataloader)
    model.train()
    loss = 0
    corr = 0

    for data_dic in dataloader:
        X, y = data_dic['x'].to(device), data_dic['y'].to(device)

        # Compute prediction error
        with torch.autocast(device_type=device_type, dtype=torch.float16, enabled=use_amp):
            pred = model(X)
            train_loss = loss_fn(pred, y)

        loss += train_loss.item()

        # One device-to-host copy per batch (instead of two per row); float32
        # keeps the correlation from being computed in half precision.
        pred_np = pred.detach().float().cpu().numpy()
        y_np = y.detach().float().cpu().numpy()
        corr += np.mean([pearsonr(pred_row, y_row)[0] for pred_row, y_row in zip(pred_np, y_np)])

        # Backpropagation. The optimizer passed in by the caller is the one
        # that is stepped. With a disabled scaler these calls reduce to the
        # plain zero_grad / backward / step sequence.
        optimizer.zero_grad()
        scaler.scale(train_loss).backward()
        scaler.step(optimizer)
        scaler.update()

    loss /= num_batches
    corr /= num_batches

    return loss, corr


# validation loop for each epoch
def val(dataloader, model, loss_fn, device, use_amp=use_amp):
    """Evaluate `model` on `dataloader` without updating it.

    Parameters
    ----------
    dataloader : iterable with `len()`, yielding dicts with keys 'x' and 'y'
    model : torch.nn.Module
        Switched to eval mode by this function.
    loss_fn : callable(pred, y) -> scalar tensor
    device : str or torch.device
    use_amp : bool
        Enables autocast (float16) for the forward pass.

    Returns
    -------
    (loss, corr) : tuple of float
        Loss averaged over batches, and the per-batch mean of row-wise
        Pearson correlations averaged over batches.
    """
    # autocast wants the bare device type ("cuda"/"cpu"), not "cuda:0" or a torch.device.
    device_type = torch.device(device).type

    num_batches = len(dataloader)
    model.eval()
    loss = 0
    corr = 0

    with torch.no_grad():
        for data_dic in dataloader:
            X, y = data_dic['x'].to(device), data_dic['y'].to(device)

            with torch.autocast(device_type=device_type, dtype=torch.float16, enabled=use_amp):
                pred = model(X)
                val_loss = loss_fn(pred, y)

            loss += val_loss.item()

            # One device-to-host copy per batch (instead of two per row); float32
            # keeps the correlation from being computed in half precision.
            pred_np = pred.detach().float().cpu().numpy()
            y_np = y.detach().float().cpu().numpy()
            corr += np.mean([pearsonr(pred_row, y_row)[0] for pred_row, y_row in zip(pred_np, y_np)])

    loss /= num_batches
    corr /= num_batches

    return loss, corr

# make predictions on the test set (dataloader is only made of x)

def test(dataloader, model, loss_fn, device):
    
    pred_list = []
    num_batches = len(dataloader)
    model.eval()
    
    with torch.no_grad():
        for data_dic in dataloader:
            
            X = data_dic['x'].to(device)
            pred = model(X)
            pred_list.append(pred)
    
    return pred_list

In [10]:
# Dataset class that loads the data and prepare it for the pytorch dataloader

class CompetitionDataset(Dataset):
    """Map-style dataset over in-memory arrays, yielding one dict per sample.

    Parameters
    ----------
    data_tuple : tuple
        `(x, y)` when `mode == "train"`, `(x,)` when `mode == "test"`.
        Each element is indexed along its first axis and the selected item
        is passed to `torch.from_numpy`.
    mode : str
        "train" (samples carry 'x' and 'y') or "test" (samples carry only 'x').

    Raises
    ------
    AssertionError
        If `data_tuple` does not have the length required by `mode`.
    NameError
        If `mode` is neither "train" nor "test".
    """

    def __init__(self, data_tuple, mode='train'):
        self.mode = mode

        if self.mode == "train":
            assert len(data_tuple) == 2, "`data_tuple` should have length 2"
            data_x, data_y = data_tuple
        elif self.mode == "test":
            assert len(data_tuple) == 1, "`data_tuple` should have length 1"
            data_x = data_tuple[0]
        else:
            raise NameError(f"{self.mode} is not a valid mode: choose between 'train' and 'test'")

        # Despite the attribute name, this holds the arrays themselves, keyed by 'x' / 'y'.
        self.filenames = dict()
        self.filenames['x'] = data_x

        if self.mode == "train":
            self.filenames['y'] = data_y

    def __getitem__(self, index):
        """Return `{'x': tensor}` plus `'y'` in train mode for sample `index`."""
        batch = dict()

        batch['x'] = torch.from_numpy(self.filenames['x'][index])

        if self.mode == "train":
            batch['y'] = torch.from_numpy(self.filenames['y'][index])

        return batch

    def __len__(self):
        """Number of samples, taken from the length of the 'x' array."""
        return len(self.filenames['x'])

In [11]:
def pytorch_dataset_and_dataloader(train_x, train_y, train_index, val_index, batch_size = 256, mode = "train"):
    """Split arrays by index and wrap both parts in datasets and dataloaders.

    Parameters
    ----------
    train_x, train_y : array-like indexable by an index array
    train_index, val_index : index arrays selecting the training / validation rows
    batch_size : int
    mode : str
        Forwarded to `CompetitionDataset` for both splits. Both splits are
        built from an `(x, y)` pair, so "train" is also used for validation.

    Returns
    -------
    train_dataloader, val_dataloader, val_x, val_y
        The training loader is shuffled. The validation loader iterates in
        the order of `val_x` / `val_y`, so evaluation is deterministic and
        predictions collected from it line up with the returned arrays.
    """
    tr_x, tr_y = train_x[train_index], train_y[train_index]
    val_x, val_y = train_x[val_index], train_y[val_index]

    train_dataset = CompetitionDataset((tr_x, tr_y), mode)
    val_dataset = CompetitionDataset((val_x, val_y), mode) # here "train" is also used for validation

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, drop_last=False)
    # No shuffling for validation: it adds nothing to an evaluation pass and
    # makes per-batch metrics vary from run to run.
    val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size, drop_last=False)

    return train_dataloader, val_dataloader, val_x, val_y

# Pytorch longformer

In [10]:
class ModifiedLongformer(nn.Module):
    """Stack of encoder layers followed by a flatten and a linear read-out.

    Parameters
    ----------
    num_hidden_layers : int
        How many entries of `model` to apply; the first one is always applied.
    model : indexable collection of modules
        Each entry is called on a tensor and must return a sequence whose
        first element is the tensor handed to the next stage.
    output_size : int
        Number of output features of the final `nn.LazyLinear`.
    """

    def __init__(self, num_hidden_layers, model, output_size):
        super().__init__()
        self.num_hidden_layers = num_hidden_layers

        self.model = model
        self.flatten = nn.Flatten(start_dim=1)
        self.lazy_linear = nn.LazyLinear(output_size)

    def forward(self, x):
        """Apply the layers in order, flatten per sample, then project."""
        # Every layer returns a tuple; only its first element is carried forward.
        hidden = self.model[0](x)[0]

        for layer_idx in range(1, self.num_hidden_layers):
            hidden = self.model[layer_idx](hidden)[0]

        flat = self.flatten(hidden)
        return self.lazy_linear(flat)

In [11]:
# Load the CITE-seq arrays; load_dataset already removes the input columns
# that are constant in the training set.
train_x, train_y, test_x = load_dataset(data_name="citeseq")

# Disabled experiment: pad the inputs with 3 extra zero columns.
#train_x = np.hstack((train_x, np.zeros((train_x.shape[0], 3))))
# Insert a new axis at position 2 (a trailing size-1 axis for a 2-D train_x).
# NOTE(review): test_x is not reshaped here.
train_x = np.expand_dims(train_x, 2)

In [12]:
# Drop the last position along axis 1.
# NOTE(review): presumably done so the sequence length suits `attention_window` — TODO confirm.
# This cell rebinds train_x from itself, so every re-run removes one more position.
train_x = train_x[:, :-1, :]

In [13]:
# hyperparams

# for the longformer (passed to LongformerConfig in the cross-validation cell)

num_hidden_layers = 1
hidden_size = 10 # hidden size must be a multiple of number of heads, as the dim of each heads will then be hidden_size/num_attention_heads
num_attention_heads = 5
attention_window = 216

# for the output layer: one output per target column
output_size = train_y.shape[1]

In [14]:
# Adam learning rate and mini-batch size used by the Longformer cross-validation cell.
lr = 1e-4
batch_size = 8

In [15]:
# Number of KFold splits and number of training epochs per fold.
n_cv_fold = 5
n_epoch = 10

In [None]:
# K-fold cross-validation of the Longformer-based regressor.
# enumerate() supplies the fold counter, so the progress message advances with each fold.
for fold_number, (train_index_outer, val_index_outer) in enumerate(
        KFold(n_cv_fold, shuffle=True, random_state=0).split(train_x)):
    print(f"computing for fold number {fold_number+1}")

    train_dataloader, val_dataloader, val_x, val_y = pytorch_dataset_and_dataloader(train_x, train_y, train_index_outer, val_index_outer, batch_size, "train")

    # instantiate a newly initialized model and a new optimizer for every fold
    # (Adam keeps running gradient statistics, so it must not be shared across folds)
    config = LongformerConfig(num_hidden_layers = num_hidden_layers,
                              hidden_size = hidden_size,
                              num_attention_heads = num_attention_heads,
                              attention_window = attention_window,
                              max_position_embeddings = 512,
                              vocab_size = 512,
                              use_cache = False)

    # Only the stack of encoder layers is used; embeddings and pooler are discarded.
    model = LongformerModel(config).encoder.layer
    model = ModifiedLongformer(num_hidden_layers, model, output_size).to(device)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Gradient scaler for mixed precision, created here so the cell does not
    # depend on a `scaler` left behind by another cell.
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    for epoch in range(n_epoch):

        # train for one full epoch and gather training loss / correlation
        loss_train, corr_train = train(train_dataloader, model, loss_fn, optimizer, device)
        # val() returns (loss, corr); unpack so each value is reported under its own label
        loss_val, corr_val = val(val_dataloader, model, loss_fn, device)

        print(f"OUTER LOOP epoch {epoch}")
        print(f"training loss: {loss_train}, corr train: {corr_train}, validation_loss: {loss_val}, corr val: {corr_val}")

computing for fold number 1




OUTER LOOP epoch 0
training loss: 5.686488068259959, corr train: 0.766577659080332, validation_loss: (4.280856034990768, 0.834438721262542)
OUTER LOOP epoch 1
training loss: 4.208691983042275, corr train: 0.8259350987057289, validation_loss: (3.2409450081032767, 0.8635845790247748)
OUTER LOOP epoch 2
training loss: 3.0387585826920263, corr train: 0.872311985638506, validation_loss: (2.991883960038843, 0.8828303415887628)


# Pytorch Simple MLP

In [12]:
# Define model
class NeuralNetwork(nn.Module):
    """Fully connected regressor: Linear -> ReLU -> Dropout -> BatchNorm blocks plus a linear head.

    Parameters
    ----------
    n_hidden_layers : int
        Number of hidden blocks (at least 1).
    hidden_size_list : list of int
        Width of each hidden block; must have `n_hidden_layers` entries.
    dropout_list : list of float
        Dropout probability of each hidden block; must have `n_hidden_layers` entries.
    output_size : int
        Number of output features.

    The input width is inferred on the first forward pass (`nn.LazyLinear`).
    """

    def __init__(self, n_hidden_layers, hidden_size_list, dropout_list, output_size):
        super(NeuralNetwork, self).__init__()

        assert len(hidden_size_list) == n_hidden_layers, f"`hidden_size_list` should have length {n_hidden_layers}"
        assert len(dropout_list) == n_hidden_layers, f"`dropout_list` should have length {n_hidden_layers}"

        self.n_hidden_layers = n_hidden_layers
        self.hidden_size_list = hidden_size_list
        self.dropout_list = dropout_list
        self.output_size = output_size

        # first layer
        self.first_layer = nn.Sequential(
            nn.LazyLinear(self.hidden_size_list[0]),
            nn.ReLU(),
            nn.Dropout(self.dropout_list[0]),
            nn.BatchNorm1d(self.hidden_size_list[0]))

        # hidden layers: a flat list holding 4 modules per additional block
        # (Linear, ReLU, Dropout, BatchNorm1d), in application order
        self.hidden_layers_list = nn.ModuleList()
        for l in range(1, self.n_hidden_layers):
            self.hidden_layers_list.append(nn.Linear(self.hidden_size_list[l-1], self.hidden_size_list[l]))
            self.hidden_layers_list.append(nn.ReLU())
            self.hidden_layers_list.append(nn.Dropout(self.dropout_list[l]))
            self.hidden_layers_list.append(nn.BatchNorm1d(self.hidden_size_list[l]))

        self.output_layer = nn.Linear(self.hidden_size_list[-1], output_size)

    def forward(self, x):
        """Map a batch of inputs to a batch of `output_size` predictions."""
        x = self.first_layer(x)

        # Apply every module of the flat list in order. Indexing the list by
        # block number (1..n-1) only ever reached the activation/dropout
        # entries and skipped the hidden Linear layers altogether.
        for layer in self.hidden_layers_list:
            x = layer(x)

        x = self.output_layer(x)

        return x

In [None]:
# Reload the CITE-seq arrays: the Longformer section rebinds train_x after
# adding an axis and slicing it, while this section starts again from the
# arrays returned by load_dataset.
train_x, train_y, test_x = load_dataset(data_name="citeseq")

In [57]:
# network parameters

n_hidden_layers = 2
drop_out = 0.25

# Both lists get n_hidden_layers-1 entries, matching the
# `NeuralNetwork(n_hidden_layers-1, ...)` call in the cross-validation cell.
# NOTE(review): 2056 may have been meant as 2048 — confirm.
hidden_size_list = [2056] * (n_hidden_layers-1)
dropout_list = [drop_out] * (n_hidden_layers-1)

# one output per target column
output_size = train_y.shape[1]

In [52]:
# Adam learning rate and mini-batch size used by the MLP cross-validation cell.
lr = 1e-4
batch_size = 256

In [53]:
# Number of outer KFold splits and number of training epochs per outer fold.
n_cv_fold = 5
n_epoch = 10

In [59]:
# cross validation
#
# Outer loop: fit the MLP on each training split and evaluate on the held-out fold.
# Inner loop: fit a second MLP that maps the first model's held-out predictions
# to the targets, itself cross-validated on the held-out fold.

n_inner_fold = 10   # KFold splits used for the second-stage model
n_inner_epoch = 50  # training epochs of the second-stage model

for fold_number, (train_index_outer, val_index_outer) in enumerate(
        KFold(n_cv_fold, shuffle=True, random_state=0).split(train_x)):
    print(f"computing for fold number {fold_number+1}")

    train_dataloader, val_dataloader, val_x, val_y = pytorch_dataset_and_dataloader(train_x, train_y, train_index_outer, val_index_outer, batch_size, "train")

    # instantiate a newly initialized model and a new optimizer for every fold
    # (Adam keeps running gradient statistics, so it must not be shared across folds)
    model = NeuralNetwork(n_hidden_layers-1, hidden_size_list, dropout_list, output_size).to(device)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    for epoch in range(n_epoch):

        # train for one full epoch and gather training loss / correlation
        loss_train, corr_train = train(train_dataloader, model, loss_fn, optimizer, device)
        # val() returns (loss, corr); unpack so each value is reported under its own label
        loss_val, corr_val = val(val_dataloader, model, loss_fn, device)

        print(f"OUTER LOOP epoch {epoch}")
        print(f"training loss: {loss_train}, corr train: {corr_train}, validation_loss: {loss_val}, corr val: {corr_val}")

    # Predictions of the fitted model on the whole held-out fold. Inference
    # only: eval mode and no autograd graph over the full fold.
    model.eval()
    with torch.no_grad():
        pred_train = model(torch.from_numpy(val_x).to(device))
    pred_train = pred_train.cpu().detach().numpy()

    for train_index_inner, val_index_inner in KFold(n_inner_fold, shuffle=True, random_state=0).split(pred_train):

        train_dataloader, val_dataloader, _, _ = pytorch_dataset_and_dataloader(pred_train, val_y, train_index_inner, val_index_inner, batch_size, "train")

        # instantiate a newly initialized second-stage model with its own
        # optimizer and gradient scaler (none of them is shared with the outer model)
        model = NeuralNetwork(n_hidden_layers-1, [1024], dropout_list, output_size).to(device)
        loss_fn = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

        for epoch in range(n_inner_epoch):

            # train for one full epoch and gather training loss / correlation
            loss_train, corr_train = train(train_dataloader, model, loss_fn, optimizer, device)
            # compute the validation loss / correlation after the epoch
            loss_val, corr_val = val(val_dataloader, model, loss_fn, device)

            print(f"INNER LOOP epoch {epoch}")
            print(f"training loss: {loss_train}, corr train: {corr_train}, validation_loss: {loss_val}, corr val: {corr_val}")

# predictions from the model and computing metrics

In [39]:
from scipy.stats import pearsonr

In [53]:
def compute_pearson(preds, truth):
    """Mean row-wise Pearson correlation between predictions and ground truth.

    Parameters
    ----------
    preds, truth : torch.Tensor or array-like
        Two 2-D collections of identical shape, one row per sample.
        Tensors are detached and moved to the CPU first.

    Returns
    -------
    float
        Average over rows of the Pearson correlation between `preds[i]` and
        `truth[i]`. NaN if any row correlation is NaN or if there are no rows.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    def _as_numpy(values):
        # Accept both tensors (possibly on an accelerator) and array-likes.
        if torch.is_tensor(values):
            values = values.detach().cpu().numpy()
        return np.asarray(values)

    preds_np = _as_numpy(preds)
    truth_np = _as_numpy(truth)

    if preds_np.shape != truth_np.shape:
        raise ValueError(f"shape mismatch: preds {preds_np.shape} vs truth {truth_np.shape}")

    row_corr = [pearsonr(pred_row, truth_row)[0] for pred_row, truth_row in zip(preds_np, truth_np)]

    return float(np.mean(row_corr))

In [132]:
# Row-wise Pearson correlation between predictions and the held-out targets.
# NOTE(review): `pred` is not assigned by any cell shown in this notebook; it
# has to exist in the kernel already (a tensor with one row per row of val_y).

# Move the predictions to host memory once instead of once per row.
pred_np = pred.cpu().detach().numpy()

corr_list = []

for index in range(pred_np.shape[0]):

    # lightweight progress indicator
    if index % 5000 == 0:
        print(index)

    corr = pearsonr(pred_np[index], val_y[index])[0]
    corr_list.append(corr)

0
5000
10000


In [134]:
# Average of the row-wise Pearson correlations collected in corr_list.
np.mean(corr_list)

0.8875630494726452