### Kaggle notebook address: https://www.kaggle.com/code/oliverwang15/4th-solution-cite-online-nn

In [1]:
import os, gc, pickle, datetime, scipy.sparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from cycler import cycler
from IPython.display import display

import scipy.sparse

# Root folder of the competition data; every FP_* path below is built from it.
DATA_DIR = "../input/open-problems-multimodal"
# Per-cell metadata table (read later with `cell_id` as index and filtered on `technology`).
FP_CELL_METADATA = os.path.join(DATA_DIR,"metadata.csv")

# CITE-seq HDF5 files. In the cells visible here only FP_CITE_TRAIN_TARGETS is actually read;
# the input features are loaded from separate preprocessed .npz archives instead.
FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR,"train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR,"train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR,"test_cite_inputs.h5")

# Submission helper files (not referenced by the cells visible here).
FP_SUBMISSION = os.path.join(DATA_DIR,"sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR,"evaluation_ids.csv")

# Forwarded as `verbose=` to the Keras callbacks and to `model.fit` in the training cell.
VERBOSE = 0

## ------ CITEseq MODEL ---------

In [15]:
# Read the per-cell metadata (indexed by cell_id) and keep only the rows
# whose `technology` column is "citeseq".
metadata_df = (
    pd.read_csv(FP_CELL_METADATA, index_col="cell_id")
    .loc[lambda frame: frame["technology"] == "citeseq"]
)

In [19]:
# Load the row index stored alongside the sparse CITE-seq training matrix and
# align the metadata to that exact row order.
# The archive is opened in a `with` block so the underlying file handle is
# closed as soon as the array has been read (np.load on an .npz returns a
# lazily-read NpzFile that otherwise stays open).
# NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code -- only acceptable because this is a trusted dataset.
with np.load(
    "/kaggle/input/multimodal-single-cell-as-sparse-matrix/train_cite_inputs_idxcol.npz",
    allow_pickle=True,
) as idxcol_npz:
    train_index = idxcol_npz["index"]

# reindex() follows `train_index` order; ids missing from the metadata would become NaN rows.
meta = metadata_df.reindex(train_index)
meta.shape

(70988, 4)

In [20]:
# Row index stored alongside the sparse CITE-seq test matrix.
# Opened in a `with` block so the NpzFile handle is closed once the array is read.
# NOTE(security): allow_pickle=True can execute arbitrary code while unpickling --
# only acceptable because this is a trusted dataset.
with np.load(
    "/kaggle/input/multimodal-single-cell-as-sparse-matrix/test_cite_inputs_idxcol.npz",
    allow_pickle=True,
) as idxcol_npz:
    cell_index_test = idxcol_npz["index"]

cell_index_test.shape

(48663,)

## Input features and target normalization

In [7]:
# Preprocessed feature matrices for train (X) and test (Xt), stored under the
# default "arr_0" key of each archive.
# Each archive is opened in a `with` block so its file handle is closed right
# after the array has been materialised.
with np.load("../input/cite-final/new_cite_train_final.npz") as train_npz:
    X = train_npz["arr_0"]
with np.load("../input/cite-final/new_cite_test_final.npz") as test_npz:
    Xt = test_npz["arr_0"]

X.shape,Xt.shape

((70988, 735), (48663, 735))

In [8]:
# Training targets as a plain array, standardised row by row: each row is
# shifted to zero mean and then scaled by its (post-centering) standard deviation.
# `negative_correlation_loss` normalises with float(y_true.shape[-1]) only,
# which presumably relies on this unit-variance scaling of every target row.
Y = pd.read_hdf(FP_CITE_TRAIN_TARGETS).values
Y -= Y.mean(axis=1, keepdims=True)
Y /= Y.std(axis=1, keepdims=True)
Y.shape

(70988, 140)

## TensorFlow Keras libraries

In [9]:
import math

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout, BatchNormalization

## Metric and loss function

In [10]:
def correlation_score(y_true, y_pred):
    """Average row-wise Pearson correlation between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : pandas.DataFrame or 2-D array-like
        Row ``i`` of ``y_pred`` is compared with row ``i`` of ``y_true``.
        DataFrames -- including DataFrame subclasses -- are converted to their
        underlying arrays first, so rows are always addressed by position
        (indexing a DataFrame with ``[i]`` would select a *column* instead).

    Returns
    -------
    float
        Mean over all rows of ``np.corrcoef(y_true[i], y_pred[i])[1, 0]``.

    Raises
    ------
    ZeroDivisionError
        If ``y_true`` has no rows.
    """
    # isinstance (not an exact type comparison) so DataFrame subclasses are unwrapped too.
    if isinstance(y_true, pd.DataFrame): y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame): y_pred = y_pred.values
    corrsum = 0
    for i in range(len(y_true)):
        # corrcoef returns the 2x2 correlation matrix; [1, 0] is the cross term.
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / len(y_true)

def negative_correlation_loss(y_true, y_pred):
    """Keras loss: minus the batch mean of a row-wise correlation term.

    Only ``y_pred`` is centred here. The denominator is
    ``sqrt(sum(centred_pred ** 2) * n)`` with ``n = y_true.shape[-1]``, so the
    value matches a Pearson correlation only when every row of ``y_true``
    already has zero mean and unit (population) standard deviation.

    Parameters
    ----------
    y_true : 2-D tensor of targets, one sample per row.
    y_pred : 2-D tensor of predictions with the same shape.

    Returns
    -------
    Scalar tensor: the negated mean of the per-row ratios.
    """
    # Per-row mean kept as a column so it broadcasts across the target axis.
    row_mean = K.mean(tf.convert_to_tensor(y_pred), axis=1, keepdims=True)
    centred_pred = y_pred - row_mean

    numerator = K.sum(tf.multiply(y_true, centred_pred), axis=1)
    pred_sq_sum = K.sum(K.square(centred_pred), axis=1)
    denominator = tf.sqrt(pred_sq_sum * float(y_true.shape[-1]))

    return -tf.reduce_mean(numerator / denominator)

## Model and parameters

In [11]:
LR_START = 0.01    # initial learning rate handed to the Adam optimizer
BATCH_SIZE = 512   # mini-batch size handed to model.fit


def create_model():
    """Build the (uncompiled) dense network used for every CV fold.

    Architecture:
      * five Dense(selu, L2) + Dropout blocks of width 512, 512, 512, 256 and
        ``Y.shape[1]``, chained one after another;
      * the outputs of the last three blocks are concatenated;
      * a final linear Dense layer maps the concatenation to ``Y.shape[1]`` outputs.

    Input width and output width are read from the module-level ``X`` and ``Y``.
    (An extra 256-unit linear layer before the head existed as commented-out
    code and stays disabled.)

    Returns
    -------
    tensorflow.keras.Model
    """
    hidden_l2 = tf.keras.regularizers.l2(9.613e-06)
    head_l2 = tf.keras.regularizers.l2(1e-07)
    dropout_rate = 0.1
    hidden_activation = 'selu'
    n_targets = Y.shape[1]

    def dense_dropout(tensor, units):
        # One hidden block: regularised Dense followed by Dropout.
        hidden = Dense(
            units,
            kernel_regularizer=hidden_l2,
            activation=hidden_activation,
        )(tensor)
        return Dropout(dropout_rate)(hidden)

    inputs = Input(shape=(X.shape[1],))

    # Build the blocks sequentially, remembering each block's output.
    block_outputs = []
    tensor = inputs
    for units in (512, 512, 512, 256, n_targets):
        tensor = dense_dropout(tensor, units)
        block_outputs.append(tensor)

    # Skip connections: only the last three blocks feed the output head.
    merged = Concatenate()(block_outputs[2:])

    outputs = Dense(
        n_targets,
        kernel_regularizer=head_l2,
        activation='linear',
    )(merged)

    return Model(inputs, outputs)

In [21]:
# Re-check the metadata row count; `meta.donor` is passed as `groups` next to X in the training loop.
meta.shape

(70988, 4)

In [22]:
# Re-check the feature matrix shape before training.
X.shape

(70988, 735)

In [None]:
# from tqdm.notebook import tqdm
# meta["id"] = [i for i in range(meta.shape[0])]
# people_list = [32606,13176,31800]
# day_list = [2,3,4]
# FOLDS_LIST = []
# num_fold = 3

# for val_people in [32606,13176,31800]:
#     train_people = [i for i in people_list if i != val_people]
#     for val_day in tqdm(day_list):
#         train_day = [i for i in day_list if i != val_day]
#         train_idx = meta[meta.day.isin(train_day)][meta.donor.isin(train_people)].id.to_list()
#         val_idx = meta[meta.day == val_day].id.to_list()
        

#         one_type = [
#             train_idx,val_idx
#         ]
#         FOLDS_LIST.append(one_type)
# len(FOLDS_LIST)

## Training

In [23]:
%%time
import warnings
warnings.filterwarnings("ignore")

EPOCHS = 300 
N_SPLITS = 3

pred_train = np.zeros((Y.shape[0],Y.shape[1]))

np.random.seed(1)
tf.random.set_seed(1)
score_list = []
kf = GroupKFold(n_splits=N_SPLITS)
score_list = []

for fold, (idx_tr, idx_va) in enumerate(kf.split(X, groups=meta.donor)):
    start_time = datetime.datetime.now()
    model = None
    gc.collect()
    
    X_tr = X[idx_tr]
    y_tr = Y[idx_tr]
    X_va = X[idx_va]
    y_va = Y[idx_va]

    lr = ReduceLROnPlateau(
                    monitor = "val_loss",
                    factor = 0.9, 
                    patience = 4, 
                    verbose = VERBOSE)

    es = EarlyStopping(
                    monitor = "val_loss",
                    patience = 40, 
                    verbose = VERBOSE,
                    mode = "min", 
                    restore_best_weights = True)

    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                    filepath = './citeseq',
                    save_weights_only = True,
                    monitor = 'val_loss',
                    mode = 'min',
                    save_best_only = True)

    callbacks = [
                    lr, 
                    es, 
                    model_checkpoint_callback
                    ]
    
    model = create_model()
    
    model.compile(
                optimizer = tf.keras.optimizers.Adam(learning_rate=LR_START),
                metrics = [negative_correlation_loss],
                loss = negative_correlation_loss
                 )
    # Training
    model.fit(
                X_tr,
                y_tr, 
                validation_data=(
                                X_va,
                                y_va), 
                epochs = EPOCHS,
                verbose = VERBOSE,
                batch_size = BATCH_SIZE,
                shuffle = True,
                callbacks = callbacks)

    del X_tr, y_tr 
    gc.collect()
    
    model.load_weights('./citeseq')
    model.save(f"./submissions/model_{fold}")
    print('model saved')
    
    #  Model validation
    y_va_pred = model.predict(X_va)
    corrscore = correlation_score(y_va, y_va_pred)
    pred_train[idx_va] = y_va_pred
    
    print(f"Fold {fold}, correlation =  {corrscore:.5f}")
    del X_va, y_va, y_va_pred
    gc.collect()
    score_list.append(corrscore)

# Show overall score
print(f"{Fore.GREEN}{Style.BRIGHT}Mean corr = {np.array(score_list).mean():.5f}{Style.RESET_ALL}")
score_total = correlation_score(Y, pred_train)
print(f"{Fore.BLUE}{Style.BRIGHT}Oof corr   = {score_total:.5f}{Style.RESET_ALL}")

2022-12-06 11:37:24.260082: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-06 11:37:24.349723: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-06 11:37:24.350468: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-06 11:37:24.351582: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

model saved
Fold 0, correlation =  0.89399
model saved
Fold 1, correlation =  0.89825
model saved
Fold 2, correlation =  0.89547
[32m[1mMean corr = 0.89590[0m
[34m[1mOof corr   = 0.89589[0m
CPU times: user 7min 39s, sys: 34.9 s, total: 8min 13s
Wall time: 6min 35s


## CITEseq Test prediction

In [24]:
# Ensemble weight of each fold model = that fold's validation correlation.
# Read from `score_list` (filled by the training loop) instead of retyping the
# printed numbers, so the weights cannot go stale when training is re-run.
# Values are full precision here (the printed log shows them rounded to 5 decimals).
# Converted to plain Python floats so multiplying a float32 prediction array by
# a weight does not promote it to float64.
weights = [float(score) for score in score_list]
assert len(weights) == N_SPLITS, "expected one validation score per saved fold model"

In [25]:
def std(x):
    """Standardise every row of a 2-D array.

    Each row has its own mean subtracted and is then divided by its own
    standard deviation (NumPy default, i.e. population std).

    Parameters
    ----------
    x : 2-D array-like, one sample per row.

    Returns
    -------
    numpy.ndarray with the same shape as ``x``.
    """
    # keepdims leaves the statistics as column vectors so they broadcast per row.
    row_mean = np.mean(x, axis=1, keepdims=True)
    row_std = np.std(x, axis=1, keepdims=True)
    return (x - row_mean) / row_std

In [27]:
# Weighted sum of the row-standardised test predictions of every fold model.
# The output width comes from the training targets instead of a hard-coded 140.
test_pred = np.zeros((len(Xt), Y.shape[1]), dtype=np.float32)
for fold in range(N_SPLITS):
    print(f"Predicting with fold {fold}")
    # The custom loss has to be supplied again so Keras can deserialise the model.
    model = load_model(f"./submissions/model_{fold}",
                       custom_objects={'negative_correlation_loss': negative_correlation_loss})
    test_pred += std(model.predict(Xt))*weights[fold]

# Start from an existing full submission (credited to Juan Smith Perera in the
# original notebook), which supplies the Multiome part, and overwrite only its
# leading rows with the flattened CITE-seq predictions.
# `.squeeze("columns")` replaces read_csv's `squeeze=True`, which is deprecated
# in pandas 1.4 and removed in pandas 2.0; a single data column still becomes a Series.
submission = pd.read_csv('/kaggle/input/4th-solution-ensemble/submission.zip',index_col='row_id').squeeze("columns")
cite_values = test_pred.ravel()
submission.iloc[:len(cite_values)] = cite_values
assert not submission.isna().any()

submission.to_csv('submission.csv')
display(submission)

Predicting with fold 0
Predicting with fold 1
Predicting with fold 2


row_id
0           -1.723919
1           -1.245159
2           -0.858834
3            3.764647
4            3.960902
              ...    
65744175    21.163233
65744176    -2.931209
65744177    -2.938488
65744178     0.965316
65744179    20.061283
Name: target, Length: 65744180, dtype: float64

In [28]:
# Shape of the accumulated CITE-seq test predictions.
test_pred.shape

(48663, 140)