In [41]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim.swa_utils import AveragedModel
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics import PearsonCorrCoef
from torch.nn import functional 
from torch.nn.modules import dropout
from torch import Tensor

import os
import sys
import tables
import pickle
import copy
import tables
import random
import zipfile
import sklearn
import scipy
import gc

import pandas as pd
import numpy as np
from numba import cuda

from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, scale
from sklearn.decomposition import TruncatedSVD

from scipy.stats import pearsonr, mode
import scipy.sparse as sps
from scipy.sparse import csr_matrix

from matplotlib import pyplot as plt

In [42]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


# feature engineering and tricks utils

In [43]:
def get_most_associated_target_input_pairs(data_x, data_y):
    
    list_argsort_correlations = []
    list_correlations = []
    
    for j in range(data_y.shape[1]):
        if j % 1000 == 0:
            print(f"computing for target number {j+1}")
            
        random_indices = np.random.choice(data_x.shape[0], 1000, replace=False)
        
        target = data_y[random_indices, j]
        
        corr_list = [np.abs(np.corrcoef(data_x[random_indices, i], target)[0,1])*-1 for i in range(data_x.shape[1])]
        argsort_array = np.argsort(corr_list)
        
        list_argsort_correlations.append(argsort_array)
        list_correlations.append(np.array(corr_list)[argsort_array])
        
    return list_argsort_correlations, list_correlations

In [44]:
def get_extra_data(data_x, dataset, correlation_threshold=0.05, max_features=100):
    
    list_argsort, list_correlations = pickle.load(open(f"{dataset}_files/list_correlations_{dataset}.p", "rb"))
    
    my_range = range(len(list_correlations))
    
    list_number_features = []
    data_to_add = np.empty((data_x.shape[0], len(list_correlations)*max_features))
    running_column_count = 0
    for i in my_range:
        data = data_x[:, list_argsort[i][list_correlations[i] < -correlation_threshold][:max_features]]
        data_to_add[:, running_column_count:running_column_count+data.shape[1]] = data
        
        running_column_count += data.shape[1]
        list_number_features.append(data.shape[1])
        
    return data_to_add[:, :np.sum(list_number_features)], list_number_features

In [45]:
def add_corr_columns(train_x, test_x, dataset, min_correlation, max_num_correlated_features):
    
    list_pca_scale_models = []

    data_to_add_train, list_number_features = get_extra_data(train_x, dataset, min_correlation, max_num_correlated_features)
    data_to_add_test, _ = get_extra_data(test_x, dataset, min_correlation, max_num_correlated_features)
    indices_to_split =  np.cumsum(list_number_features)[:-1]
    
    return data_to_add_train, data_to_add_test, indices_to_split

In [46]:
def add_landmark_similarities(train_x, val_x, test_x, n_samples=1000):
    
    def stack_data(data, data_to_add):
        cos_distance_matrix = sklearn.metrics.pairwise.cosine_similarity(data, data_to_add)
        data = np.hstack((data, cos_distance_matrix))
        # data = cos_distance_matrix
        return data
    
    if n_samples is None:
        data_to_stack = copy.deepcopy(test_x)
    else:
        random_indices = np.random.choice(test_x.shape[0], n_samples, replace=False)
        data_to_stack = copy.deepcopy(test_x[random_indices, :])
    
    train_x = stack_data(train_x, data_to_stack)
    val_x = stack_data(val_x, data_to_stack)
    test_x = stack_data(test_x, data_to_stack)
    
    return train_x, val_x, test_x
    

# load the data

In [47]:
def identify_constant_columns(train_x, test_x):
    
    constant_cols_list_train = train_x.columns[train_x.nunique() <= 1]
    constant_cols_list_test = test_x.columns[test_x.nunique() <= 1]
    constant_cols = set(list(constant_cols_list_train) + list(constant_cols_list_test))
    
    train_x = train_x.drop(columns=constant_cols)
    test_x = test_x.drop(columns=constant_cols)
            
    return train_x, test_x

In [48]:
def fit_scale_data_and_pca(train_data, dataset):
    
    scaler = None
    if dataset == "citeseq":
        scaler = StandardScaler()
        scaler.fit(train_data)
        train_data = scaler.transform(train_data).astype(np.float32)
    
    file_name = f"{dataset}_files/pca_{dataset}.p"
    if os.path.exists(file_name):
        pca = pickle.load(open(file_name, "rb"))
    else:
        raise ValueError(f"pca file does not exist") 

    train_data = pca.transform(train_data)
        
    scaler_pca = StandardScaler()
    scaler_pca.fit(train_data)
    train_data = scaler_pca.transform(train_data).astype(np.float32)
    
    return train_data, pca, scaler, scaler_pca

def apply_scale_data_and_pca(data, scaler, pca, scaler_pca):
    
    if scaler is not None:
        data = scaler.transform(data).astype(np.float32)
    
    data = pca.transform(data)
    data = scaler_pca.transform(data)
    
    return data

In [49]:
def pca_keep_top_variance(data, variance_threshold):
    
    pca = PCA(data.shape[1])
    pca.fit(data)
    
    k = np.where(np.cumsum(pca.explained_variance_ratio_) > variance_threshold)[0][0]
    
    return pca, k

In [50]:
def load_metadata():
    
    metadata = pd.read_csv("/kaggle/input/open-problems-multimodal/metadata.csv")

    return metadata

In [51]:
# get the paths to load the data

def load_dataset(data_name):
    
    dir_path = "/kaggle/input/open-problems-multimodal/"

    if data_name == "citeseq":
        train_x_path = os.path.join(dir_path,"train_cite_inputs.h5")
        train_y_path = os.path.join(dir_path,"train_cite_targets.h5")
        test_x_path = os.path.join(dir_path,"test_cite_inputs.h5")
        
        train_x = pd.read_hdf(train_x_path)
        train_y = pd.read_hdf(train_y_path)
        test_x = pd.read_hdf(test_x_path)

    elif data_name == "multiome":
        
        train_indices, test_indices, cols_name = pickle.load(open("multiome_files/index_train_test_cols.p", "rb"))
        
        train_x = pickle.load(open("train_x_multi_svd.p", "rb"))
        train_y = scipy.sparse.load_npz("train_multi_targets_values.sparse.npz").toarray()
        test_x = pickle.load(open("test_x_multi_svd.p", "rb"))
        
        train_x = pd.DataFrame(train_x, index=train_indices)
        test_x = pd.DataFrame(test_x, index=test_indices)
        train_y = pd.DataFrame(train_y, index=train_indices)
        
    else:
        raise NameError(f"{data_name} is not a valid name: choose between 'siteseq' and 'multiome'")
    
    return train_x, train_y, test_x

In [52]:
def remove_constant_targets(train_y):

    constant_cols = np.all(train_y == train_y[0,:], axis = 0)
    indices = [(i, train_y[0, i]) for i in range(len(constant_cols)) if constant_cols[i]]

    train_y_modify = np.delete(train_y, [tup[0] for tup in indices], axis=1)

    return train_y_modify, indices

In [53]:
def get_donor_subet(train_x, test_x, metadata, dataset, split_by_donor=False):
    
    data_subset_dic = dict()
    
    if split_by_donor:
        meta_data_donors = np.unique(meta_data["donor"])
        meta_subset_tech = meta_data[meta_data["technology"] == dataset]

        for donor in meta_data_donors:
            donor_subset_train = meta_subset_tech[meta_subset_tech["donor"] == donor]["cell_id"]
            cell_id_in_common_train = donor_subset_train[donor_subset_train.isin(train_x.index)]

            if np.sum(donor_subset_train) != 0:
                donor_subset_test = meta_subset_tech[meta_subset_tech["donor"] == donor]["cell_id"]
                cell_id_in_common_test = donor_subset_test[donor_subset_test.isin(test_x.index)]
                data_subset_dic[donor] = [cell_id_in_common_train, cell_id_in_common_train, cell_id_in_common_test]

            else:
                donor_subset_test = meta_subset_tech[meta_subset_tech["donor"] == donor]["cell_id"]
                cell_id_in_common_test = donor_subset_test[donor_subset_test.isin(test_x.index)]
                data_subset_dic[donor] = [train_x.index, train_x.index, cell_id_in_common_test]
    
    else:
        data_subset_dic["all_donor"] = [train_x.index, train_x.index, test_x.index]
            
    return data_subset_dic
            

In [54]:
def select_indices_multiome(prediction_array):
    
    rows = pickle.load(open("multiome_files/cell_id_to_numbers.p", "rb"))
    cols = pickle.load(open("multiome_files/gene_id_to_numbers.p", "rb"))
    prediction_array = prediction_array[rows, cols]
    
    return prediction_array

# get data

In [55]:
# load the data

In [56]:
dataset = "citeseq"

In [57]:
train_x, train_y, test_x = load_dataset(dataset)

In [58]:
if dataset == "citeseq":
    train_x, test_x = identify_constant_columns(train_x, test_x)

In [59]:
meta_data = load_metadata()

In [60]:
_, indices = remove_constant_targets(np.array(train_y))

In [61]:
meta_multiome = meta_data[meta_data["technology"] == dataset]

# Pytorch utils

In [62]:
def torch_corrcoef(preds, target):
    
    metric = PearsonCorrCoef().to(device)
    
    output = torch.empty(size=(preds.shape[0],)).to(device)
    for i in range(preds.shape[0]):
        score = metric(preds[i], target[i])
        output[i] = score
    
    return output.mean()

def corrcoef(preds, target):
    
    pred_centred = preds-torch.unsqueeze(torch.mean(preds, 1), 1)
    target_centred = target-torch.unsqueeze(torch.mean(target, 1), 1)
    
    pred_std = torch.unsqueeze(torch.sqrt(torch.mean(pred_centred**2, 1)), 1)
    target_std = torch.unsqueeze(torch.sqrt(torch.mean(target_centred**2, 1)), 1)
    
    corr_torch = pred_centred*target_centred/(pred_std*target_std)

    return corr_torch.mean()

In [63]:
# training loop for each epoch

def train(dataloader, model, loss_fn, loss, optimizer, device):
    
    num_batches = len(dataloader)
    model.train()
    loss_list = []
    corr_list = []
    
    for X, y in dataloader:

        # Compute prediction error
        pred = model(X)
        if (loss == "mse") or (loss == "mae"):
            train_loss = loss_fn(pred, y)
        elif loss == "huber":
            train_loss = loss_fn(pred, y)
        elif (loss == "cosine") or (loss == "corrcoef") or (loss == "torchcorrcoef"):
            train_loss = loss_fn(pred, y)*-1
        
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
            
        loss_list.append(train_loss.item())
        
        pred = pred.cpu().detach().numpy()
        y = y.cpu().detach().numpy()
        
        corr_list.append([pearsonr(pred[i], y[i])[0] for i in range(pred.shape[0])])
    
    loss = np.mean(loss_list)
    corr = np.mean(np.concatenate(corr_list))
    
    return loss, corr


# validation loop for each epoch
def val(dataloader, model, loss_fn, loss, device):
    
    pred_val_list = []
    num_batches = len(dataloader)
    model.eval()
    loss_list = []
    corr_list = []
    
    with torch.no_grad():
        for X, y in dataloader:
            
            # Compute prediction error
            #pred = torch.from_numpy(np.mean([model(X).cpu().detach().numpy() for _ in range(100)], axis=0)).to(device)
            pred = model(X)
            
            if (loss == "mse") or (loss == "mae"):
                val_loss = loss_fn(pred, y)
            elif loss == "huber":
                val_loss = loss_fn(pred, y)
            elif (loss == "cosine") or (loss == "corrcoef") or (loss == "torchcorrcoef"):
                val_loss = loss_fn(pred, y)*-1
                
            loss_list.append(val_loss.item())
            
            pred = pred.cpu().detach().numpy()
            if pred.shape[1] > 20000:
                for i in range(pred.shape[0]):
                    zero_value = np.mean(pred[i][[ind[0] for ind in indices]])
                    pred[i][pred[i] < zero_value] = zero_value
            
            y = y.cpu().detach().numpy()
            pred_val_list.append(pred)
            
            corr_list.append([pearsonr(pred[i], y[i])[0] for i in range(pred.shape[0])])

    loss = np.mean(loss_list)
    corr = np.mean(np.concatenate(corr_list))
    
    return loss, corr, pred_val_list


# make predictions on the test set (dataloader is only made of x)
def test(dataloader, model, device, num_eval=100, d=False):
    
    pred_list = []
    num_batches = len(dataloader)
    model.eval()
    
    with torch.no_grad():
        for X in dataloader:
            
            if d:
                pred = np.mean([model(X).cpu().detach().numpy() for _ in range(num_eval)], axis=0)
            else:
                pred = model(X)
                pred = pred.cpu().detach().numpy()
                
            if pred.shape[1] > 20000:
                for i in range(pred.shape[0]):
                    zero_value = np.mean(pred[i][[ind[0] for ind in indices]])
                    pred[i][pred[i] < zero_value] = zero_value
                    
            pred_list.append(pred)
    
    return pred_list

In [64]:
# Dataset class that loads the data and prepare it for the pytorch dataloader

class CompetitionDataset(Dataset):

    def __init__(self, data_tuple, mode='train'):
        self.mode = mode
        
        if self.mode == "train":
            # assert len(data_tuple) == 2, "`data_tuple` should have lenght 2"
            data_x, data_y = data_tuple
        elif self.mode == "test":
            # assert len(data_tuple) == 1, "`data_tuple` should have length 1"
            data_x = data_tuple[0]
        else:
            raise NameError(f"{self.mode} is not a valid mode: choose between 'train' and 'test'")

        self.filenames = dict()
        self.filenames['x'] = data_x
        
        if self.mode == "train":
            self.filenames['y'] = data_y

    def __getitem__(self, index):
        batch = dict()
        
        batch['x'] = torch.from_numpy(self.filenames['x'][index])
        
        if self.mode == "train":
            batch['y'] = torch.from_numpy(self.filenames['y'][index]).to(device)
            return batch['x'].to(device), batch['y']
        else:
            return batch['x'].to(device)
            
    def __len__(self):
        return len(self.filenames['x'])

In [65]:
def pytorch_dataset_and_dataloader(tr_x, tr_y, val_x, val_y, batch_size = 256, mode = "train"):

    train_dataset = CompetitionDataset((tr_x, tr_y), mode)
    val_dataset = CompetitionDataset((val_x, val_y), mode) # here "train" is also used for validation
    
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, drop_last=False)
    val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size, drop_last=False)
    
    return train_dataloader, val_dataloader

# Pytorch simple MLP

In [66]:
def obtain_model_dropout(model):
    
    new_model =  NeuralNetwork(n_shared_hidden_layers, shared_hidden_size_list, n_non_shared_hidden_layers, 
                              non_shared_hidden_size_list, drop, n_components_to_keep, indices_to_split, 
                              output_size, n_components_to_keep, True).to(device)
    
    new_model.parameters = model.parameters
    
    return new_model

In [67]:
class DropoutAlwaysActivated(dropout._DropoutNd):

    def forward(self, x):
        return functional.dropout(x, self.p, True, self.inplace)

In [68]:
class custom_layer_input_dropout(nn.Module):
    def __init__(self, input_size, dropout_p):
        super().__init__()
        self.dropout_p = dropout_p
        self.input_size = input_size
        self.identity = nn.Identity(self.input_size)

    def forward(self, x):
        
        x = self.identity(x)
        
        self.training = True
        if self.training:
            drop_out_values = (np.random.uniform(0.0, 1.0, self.input_size) < self.dropout_p).astype(np.float32)
            drop_out_values *= (1/self.dropout_p)
            x = x*torch.from_numpy(drop_out_values).to(device)
        
        return x

In [69]:
# Define model
class NeuralNetwork(nn.Module):
    def __init__(self, n_shared_hidden_layers, shared_hidden_size_list, n_non_shared_hidden_layers, non_shared_hidden_size_list, 
                 drop, shared_input_size, indices_to_split, output_size, n_components_to_keep, dropout_montecarlo = False):
        super(NeuralNetwork, self).__init__()
        
        assert len(shared_hidden_size_list) == n_shared_hidden_layers, f"`hidden_size_list` should have length {n_shared_hidden_layers}"
        assert len(non_shared_hidden_size_list) == n_non_shared_hidden_layers, f"`hidden_size_list` should have length {n_non_shared_hidden_layers}"
        
        self.n_shared_hidden_layers = n_shared_hidden_layers
        self.shared_hidden_size = shared_hidden_size_list
        self.n_non_shared_hidden_layers = n_non_shared_hidden_layers
        self.non_shared_hidden_size = non_shared_hidden_size_list
        self.drop = drop
        self.shared_input_size = shared_input_size
        self.indices_to_split = indices_to_split
        self.output_size = output_size
        self.input_size = n_components_to_keep
        self.dropout_montecarlo = dropout_montecarlo
        
        # explained_variance = pickle.load(open("multiome_files/pca_multiome.p", "rb")).explained_variance_[:self.input_size]
        # p = 0.8
        # self.input_dropout = ((explained_variance - np.min(explained_variance))/(np.max(explained_variance) - np.min(explained_variance)))*(1-p)+p
        # self.custom_layer_input_dropout = custom_layer_input_dropout(self.input_size, self.input_dropout)
        
        # hidden layers
        self.shared_hidden_layers_list = nn.ModuleList()
        for l in range(self.n_shared_hidden_layers):
            self.shared_hidden_layers_list.append(nn.LazyLinear(self.shared_hidden_size[l]))
            self.shared_hidden_layers_list.append(nn.SELU())
            if dropout_montecarlo:
                self.shared_hidden_layers_list.append(DropoutAlwaysActivated(self.drop))
            else:
                self.shared_hidden_layers_list.append(nn.Dropout(self.drop))
            self.shared_hidden_layers_list.append(nn.BatchNorm1d(self.shared_hidden_size[l]))
            
        self.shared_hidden_layers_list = nn.Sequential(*self.shared_hidden_layers_list)
        
        if n_non_shared_hidden_layers > 0:
            self.non_shared_concat = nn.ModuleList(nn.ModuleList() for _ in range(self.output_size))
            for i in range(self.output_size):
                for l in range(self.n_non_shared_hidden_layers):
                    self.non_shared_concat[i].append(nn.LazyLinear(self.non_shared_hidden_size[l]))
                    self.non_shared_concat[i].append(nn.SELU())
                    self.non_shared_concat[i].append(nn.Dropout(self.drop))
                    self.non_shared_concat[i].append(nn.BatchNorm1d(self.non_shared_hidden_size[l]))

                self.non_shared_concat[i].append(nn.LazyLinear(1))
                self.non_shared_concat[i] = nn.Sequential(*self.non_shared_concat[i])
        else:
            self.output_layer = nn.LazyLinear(self.output_size)
        
    def forward(self, x):
        
        # x = self.custom_layer_input_dropout(x)
        
        if indices_to_split is not None:
            x_non_shared = np.split(x[:, self.shared_input_size:], self.indices_to_split, 1)
            x = x[:, :self.shared_input_size]
        
        shared_x = self.shared_hidden_layers_list(x)
        
        if n_non_shared_hidden_layers > 0:
            output = torch.empty(size=(shared_x.shape[0], self.output_size)).to("cuda")
            for i in range(self.output_size):
                if indices_to_split is not None:
                    not_shared_x = torch.hstack((shared_x, x_non_shared[i]))
                else:
                    not_shared_x = shared_x
                not_shared_x = self.non_shared_concat[i](not_shared_x)

                output[:, i] = torch.squeeze(not_shared_x)
        
        else:
            output = self.output_layer(shared_x)
        
        return output

In [70]:
n_components_to_keep = 200

In [71]:
split_by_donor = False

In [72]:
add_correlated_columns = True

min_correlation = 0.1
max_num_correlated_features = 50
indices_to_split = None

In [76]:
# network parameters
output_size = train_y.shape[1]

n_shared_hidden_layers = 3
n_non_shared_hidden_layers = 1

drop = 0.5

shared_hidden_size_list = [1024, 1024, 1024] 
non_shared_hidden_size_list = [512]

In [74]:
loss = "corrcoef"

num_folds = 10

In [79]:
lr = 1e-3
batch_size = 256
n_epoch = 1
early_stopping_epochs = 10

# cross validation and inference

In [77]:
data_subset_dic = get_donor_subet(train_x, test_x, meta_data, dataset, split_by_donor)

In [78]:
# cross validation and inference

test_predictions_dic = dict()

for k in data_subset_dic.keys():
    test_predictions_dic[k] = []
    
    print(f"computing for donor {k}")
    
    ind_tr_x, in_tr_y, ind_test_x = data_subset_dic[k]
    
    subset_train_x = np.array(train_x.loc[ind_tr_x])
    subset_train_y = np.array(train_y.loc[in_tr_y])
    subset_test_x = np.array(test_x.loc[ind_test_x])

    if add_correlated_columns and dataset == "citeseq":
        data_to_add_train, data_to_add_test, indices_to_split = add_corr_columns(subset_train_x, subset_test_x, 
                                                                                 dataset, min_correlation, max_num_correlated_features)
    
    fold_number = 0
    for train_index, val_index in list(KFold(num_folds, shuffle=True, random_state=0).split(subset_train_x))[:]:
        print(f"computing for fold number {fold_number+1}")
        print(f"n for training set: {train_index.shape[0]}, n for validation set: {val_index.shape[0]}")

        tr_x, tr_y = subset_train_x[train_index], subset_train_y[train_index]
        val_x, val_y = subset_train_x[val_index], subset_train_y[val_index]
        te_x = copy.deepcopy(subset_test_x)
        
        if dataset == "citeseq":
            tr_x, pca_model, scaler_org_data, scaler_pca_data = fit_scale_data_and_pca(tr_x, dataset)
            val_x = apply_scale_data_and_pca(val_x, scaler_org_data, pca_model, scaler_pca_data)
            te_x = apply_scale_data_and_pca(te_x, scaler_org_data, pca_model, scaler_pca_data)
            
        tr_x = tr_x[:, :n_components_to_keep]
        val_x = val_x[:, :n_components_to_keep]
        te_x = te_x[:, :n_components_to_keep]
    
        tr_x, val_x, te_x = add_landmark_similarities(tr_x, val_x, te_x, 2000)
        
        scaler = StandardScaler()
        scaler.fit(tr_x)
        tr_x = scaler.transform(tr_x).astype(np.float32)
        val_x = scaler.transform(val_x).astype(np.float32)
        te_x = scaler.transform(te_x).astype(np.float32)

        if add_correlated_columns and dataset == "citeseq":
            important_subset_tr = copy.deepcopy(data_to_add_train)[train_index]
            important_subset_val = copy.deepcopy(data_to_add_train)[val_index]
            test_data_to_add = copy.deepcopy(data_to_add_test)

            scaler = StandardScaler()
            scaler.fit(important_subset_tr)
            important_subset_tr = scaler.transform(important_subset_tr).astype(np.float32)
            important_subset_val = scaler.transform(important_subset_val).astype(np.float32)
            test_data_to_add = scaler.transform(test_data_to_add).astype(np.float32)

            tr_x = np.hstack((tr_x, important_subset_tr))
            val_x = np.hstack((val_x, important_subset_val))
            te_x = np.hstack((te_x, test_data_to_add))
            
        train_dataloader, val_dataloader = pytorch_dataset_and_dataloader(tr_x, tr_y, val_x, val_y, batch_size, "train")
        
        # instantiate a newly initialized model and a new optimize (adam aggregates gradients so it needs to be reset)
        model = NeuralNetwork(n_shared_hidden_layers, shared_hidden_size_list, n_non_shared_hidden_layers, 
                              non_shared_hidden_size_list, drop, n_components_to_keep, indices_to_split, 
                              output_size, n_components_to_keep, False).to(device)

        if loss == "mse":
            loss_fn = nn.MSELoss()
        elif loss == "huber":
            loss_fn = nn.HuberLoss(delta=0.5)
        elif loss == "cosine":
            loss_fn = nn.CosineSimilarity()
        elif loss == "corrcoef":
            loss_fn = corrcoef
        elif loss == "torchcorrcoef":
            loss_fn = torch_corrcoef

        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        early_stop_counter = 0
        best_loss = -1
        for epoch in range(n_epoch):

            # train for one full epochearly_stop_counter gather training_loss
            loss_train, corr_train = train(train_dataloader, model, loss_fn, loss, optimizer, device)
            # compute the validation loss after the epoch
            loss_val, corr_val, pred_val_list = val(val_dataloader, model, loss_fn, loss, device)

            # early stopping stuff
            early_stop_counter += 1
            if corr_val > best_loss:
                best_loss = corr_val
                early_stop_counter = 0
                swa_model = AveragedModel(model)

            if early_stop_counter == early_stopping_epochs:
                break
            
            if early_stop_counter > 1:
                swa_model.update_parameters(model)

            print(f"OUTER LOOP epoch {epoch}")
            print(f"training loss: {loss_train}, corr train: {corr_train}, validation_loss: {loss_val}, corr val: {corr_val}")
    
        torch.optim.swa_utils.update_bn(train_dataloader, swa_model)
        # compute the validation loss after the epoch
        loss_val, corr_val, pred_val_list = val(val_dataloader, swa_model, loss_fn, loss, device)
        print(f"Computing for the SWA model. validation_loss: {loss_val}, corr val: {corr_val}")

        test_dataset = CompetitionDataset((te_x,), mode='test')
        test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size, drop_last=False)
        test_predictions = test(test_dataloader, swa_model, device)
        test_predictions = np.concatenate(test_predictions)
        test_predictions = scale(test_predictions, axis=1)
        if dataset == "multiome":
            test_predictions = select_indices_multiome(test_predictions)
        
        test_predictions_dic[k].append(test_predictions)
        # pickle.dump(test_predictions_dic, open(f"test_predictions_dic_{dataset}_{k}.p", "wb"))

        fold_number += 1
        break


computing for donor all_donor
computing for fold number 1
n for training set: 63889, n for validation set: 7099


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


OUTER LOOP epoch 0
training loss: -0.7451705948840827, corr train: 0.7449323183720531, validation_loss: -0.89251266845635, corr val: 0.8928068192052483
OUTER LOOP epoch 1
training loss: -0.8845920932292938, corr train: 0.884609153500778, validation_loss: -0.8965694989476886, corr val: 0.8968598606395376
OUTER LOOP epoch 2
training loss: -0.8896425764560699, corr train: 0.8896518871446821, validation_loss: -0.8978621533938816, corr val: 0.8981496399845107
OUTER LOOP epoch 3
training loss: -0.8921798818111419, corr train: 0.8921797383824087, validation_loss: -0.8992516355855125, corr val: 0.8995345454943495
OUTER LOOP epoch 4
training loss: -0.8937819457054138, corr train: 0.8937799993862684, validation_loss: -0.89969944528171, corr val: 0.8999779418398287
OUTER LOOP epoch 5
training loss: -0.8949581780433655, corr train: 0.8949710647269145, validation_loss: -0.9003759750298092, corr val: 0.9006496908739977
OUTER LOOP epoch 6
training loss: -0.8958580868244171, corr train: 0.895852574311



In [39]:
torch.cuda.empty_cache()

In [484]:
def modify_predictions(std_preds):

    cutoff_one = np.quantile(std_preds, 0.8)

    std_preds[std_preds > cutoff_one] = std_preds[std_preds > cutoff_one]-0.05*std_preds[std_preds > cutoff_one]
    
    return std_preds

In [485]:
preds = np.concatenate(pred_val_list)

In [486]:
new_preds = copy.deepcopy(preds)

In [None]:
new_preds = modify_predictions(new_preds)

In [482]:
corr_list = []

for i in range(preds.shape[0]):
    
    corr = pearsonr(new_preds[i], val_y[i])[0]
    corr_list.append(corr)
    

In [483]:
print(np.mean(corr_list))

0.9045159243193208


# generate predictions

In [48]:
preds_final = test_predictions_dic["all_donor"][0]

In [45]:
best_sub = pd.read_csv("best_submission.csv")

In [46]:
rows = pickle.load(open("multiome_files/cell_id_to_numbers.p", "rb"))
cols = pickle.load(open("multiome_files/gene_id_to_numbers.p", "rb"))

In [50]:
# preds_final = test_predictions[rows, cols]

In [49]:
pearsonr(best_sub["target"].values[-len(preds_final):], preds_final)

(0.9861383382105229, 0.0)

In [51]:
best_sub["target"][-len(preds_final):] = preds_final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_sub["target"][-len(preds_final):] = preds_final


In [52]:
best_sub = best_sub.set_index("row_id")

In [53]:
display(best_sub)

Unnamed: 0_level_0,target
row_id,Unnamed: 1_level_1
0,-1.346263
1,-1.257694
2,-0.737524
3,5.998595
4,8.257316
...,...
65744175,4.328752
65744176,-0.571734
65744177,-0.597907
65744178,0.230664


In [54]:
best_sub.to_csv('submission.csv')

In [12]:
test_predictions_dic = test_predictions_dic["all_donor"]

In [13]:
preds_all_donor = [np.concatenate(test_predictions_dic[i]) for i in range(len(test_predictions_dic))]

In [14]:
preds = np.median(preds_all_donor, axis=0)

In [15]:
preds.shape

(48663, 140)

In [16]:
preds = np.concatenate(preds)

In [17]:
submission_df = pd.read_csv("best_submission.csv")
submission_df = submission_df.set_index("row_id")

In [47]:
submission_df["target"][:test_x.shape[0]*140] = preds

In [48]:
display(submission_df)

Unnamed: 0_level_0,target
row_id,Unnamed: 1_level_1
0,-2.950046
1,-2.661454
2,-1.602814
3,9.410881
4,12.907983
...,...
65744175,6.206671
65744176,0.045758
65744177,0.032102
65744178,1.350810


In [49]:
submission_df.to_csv('submission.csv')