## KERAS for Multiome

Ideas and code from :

https://www.kaggle.com/code/xiafire/lb-t15-msci-multiome-catboostregressor

https://www.kaggle.com/code/ambrosm/msci-citeseq-keras-quickstart

dataset from :

https://www.kaggle.com/datasets/fabiencrom/multimodal-single-cell-as-sparse-matrix


My apologies to the Kagglers not mentioned.

In [1]:
! pip install tables

[0m

In [2]:
import gc,os,pickle

import pandas as pd
import numpy as np

import glob
from tqdm.notebook import tqdm

from sklearn.model_selection import KFold
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

import time
import psutil
import tables

import scipy.sparse

## Target already normalized and reduced
Due to memory limitations, the targets were normalized and reduced with TruncatedSVD (n_components = 128) in another notebook; the result is available in the dataset '../input/targets-multiome-sparse-scaled/'.

In [3]:
%%time
with open('../input/targets-multiome-sparse-scaled/Y.pkl','rb') as f: Y = pickle.load(f)
Y.shape

CPU times: user 16.3 ms, sys: 117 ms, total: 133 ms
Wall time: 1.08 s


(105942, 128)

## Target dimension reduction 

In [4]:
# Load the fitted target-side TruncatedSVD; its components_ are used later to
# map latent predictions back to gene space.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('../input/targets-multiome-sparse-scaled/pca_target.pkl', 'rb') as svd_file:
    pca_target = pickle.load(svd_file)
pca_target

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


TruncatedSVD(n_components=128, random_state=42)

## Train dimension reduction

In [5]:
%%time
train = scipy.sparse.load_npz("../input/multimodal-single-cell-as-sparse-matrix/train_multi_inputs_values.sparse.npz")
train = train.astype('float16',copy = False)

CPU times: user 39.3 s, sys: 3.98 s, total: 43.3 s
Wall time: 1min 6s


In [6]:
%%time
# Fit a 128-component TruncatedSVD on the training inputs. The fitted estimator
# is kept in `pca_train` because it is reused later to project the test inputs.
# NOTE: `train` is overwritten with its reduced representation, so this cell is
# not idempotent: re-running it without reloading `train` would refit the SVD
# on the already-reduced matrix.
pca_train = TruncatedSVD(n_components=128, random_state=42)
train = pca_train.fit_transform(train)

CPU times: user 23min 58s, sys: 18 s, total: 24min 16s
Wall time: 23min 56s


In [7]:
train.shape, Y.shape

((105942, 128), (105942, 128))

## TensorFlow Keras libraries

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate,BatchNormalization

## Metrics

In [9]:
def correlation_score(y_true, y_pred):
    """Mean row-wise Pearson correlation between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like or pandas.DataFrame
        One row per sample. DataFrames (including subclasses) are converted to
        their underlying arrays so that rows, not columns, are compared.

    Returns
    -------
    float
        Average over rows of the Pearson correlation of ``y_true[i]`` with
        ``y_pred[i]``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # isinstance (rather than an exact type comparison) also covers DataFrame
    # subclasses, which would otherwise be indexed by column label below.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values
    if np.shape(y_true) != np.shape(y_pred):
        raise ValueError("Shapes are different.")
    corrsum = 0
    for true_row, pred_row in zip(y_true, y_pred):
        corrsum += np.corrcoef(true_row, pred_row)[1, 0]
    return corrsum / len(y_true)

## Model and parameters

In [10]:
# Both callbacks watch the validation loss and share the same improvement threshold.
# Halve the learning rate after 2 epochs without improvement.
plateau = ReduceLROnPlateau(
    monitor="val_loss", mode="min", min_delta=1e-7,
    factor=0.5, patience=2, verbose=0,
)
# Stop after 10 epochs without improvement and roll back to the best weights seen.
es = EarlyStopping(
    monitor="val_loss", mode="min", min_delta=1e-7,
    patience=10, restore_best_weights=True, verbose=0,
)

In [11]:
# Architecture from AmbrosM used for CITEseq

def create_model(n_inputs=None, n_outputs=None, activation='selu'):
    """Build the dense network with skip-concatenation used for every fold.

    Four stacked Dense layers (256, 256, 256, 128 units) are applied in
    sequence; their outputs are concatenated and fed to a linear output layer.

    Parameters
    ----------
    n_inputs : int, optional
        Number of input features. Defaults to ``train.shape[1]`` (module-level
        ``train``), which is what the original hard-wired version used.
    n_outputs : int, optional
        Number of regression outputs. Defaults to ``Y.shape[1]``.
    activation : str, optional
        Activation of the hidden layers (default ``'selu'``).

    Returns
    -------
    tensorflow.keras.Model
        The uncompiled model.
    """
    if n_inputs is None:
        n_inputs = train.shape[1]
    if n_outputs is None:
        n_outputs = Y.shape[1]

    # Keras documents `shape` as a tuple; a bare int is rejected by newer versions.
    inputs = Input(shape=(n_inputs,))

    x0 = Dense(units=256, activation=activation)(inputs)
    x1 = Dense(units=256, activation=activation)(x0)
    x2 = Dense(units=256, activation=activation)(x1)
    x3 = Dense(units=128, activation=activation)(x2)

    # Expose every hidden representation to the output layer.
    x = Concatenate()([x0, x1, x2, x3])

    x = Dense(units=n_outputs, activation='linear')(x)

    model = Model(inputs, x)

    return model

gc.collect()

85

In [12]:
gc.collect()

21

## Training

In [13]:
import warnings
warnings.filterwarnings("ignore")

# 5-fold cross-validation: one model is trained and saved per fold.
N_SPLIT = 5
kf = KFold(n_splits=N_SPLIT, shuffle=True, random_state=42)

for fold, (idx_tr, idx_va) in enumerate(kf.split(train)):

    X_tr = train[idx_tr]
    y_tr = Y[idx_tr]

    X_va = train[idx_va]
    y_va = Y[idx_va]

    model = create_model()

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2),
                  loss='mse',
                  metrics=None)
    # The same `es` / `plateau` callback instances are passed to every fold.
    # epochs=1000 is only an upper bound; early stopping ends training.
    model.fit(X_tr,
              y_tr,
              validation_data=(X_va, y_va),
              epochs=1000,
              verbose=0,
              batch_size=256,
              callbacks=[es, plateau]
             )
    pred = model.predict(X_va)

    print(f'\n --------- FOLD {fold} -----------')
    # Format in the f-string: np.round() on a float32 still prints the full
    # float repr (e.g. 3.609999895095825) instead of two decimals.
    print(f'Mean squared error = {mean_squared_error(y_va, pred):.2f}')

    filename = f"model_{fold}"
    model.save(filename)
    print('model saved :', filename)

    # Free the per-fold slices before the next fold allocates its own.
    del X_tr, X_va, y_tr, y_va
    gc.collect()

2022-09-18 09:35:17.418460: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-09-18 09:35:17.786310: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)



 --------- FOLD 0 -----------
Mean squared error = 3.609999895095825


2022-09-18 09:37:33.588257: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


model saved : model_0

 --------- FOLD 1 -----------
Mean squared error = 3.609999895095825
model saved : model_1

 --------- FOLD 2 -----------
Mean squared error = 3.609999895095825
model saved : model_2

 --------- FOLD 3 -----------
Mean squared error = 3.630000114440918
model saved : model_3

 --------- FOLD 4 -----------
Mean squared error = 3.619999885559082
model saved : model_4


## Test prediction

In [14]:
%%time
multi_test_x = scipy.sparse.load_npz("../input/multimodal-single-cell-as-sparse-matrix/test_multi_inputs_values.sparse.npz")
multi_test_x = pca_train.transform(multi_test_x)
multi_test_x.shape

CPU times: user 1min 3s, sys: 2.15 s, total: 1min 5s
Wall time: 1min 22s


(55935, 128)

In [15]:
# The back-projection to gene space is linear, so averaging the fold predictions
# in latent space and projecting once gives the same result as projecting every
# fold and averaging. It needs one large matmul instead of N_SPLIT, and rounds
# to float16 once instead of on every accumulation.
latent_mean = np.zeros((multi_test_x.shape[0], pca_target.components_.shape[0]),
                       dtype='float32')

for fold in range(N_SPLIT):
    print(f'fold {fold} prediction')
    model = tf.keras.models.load_model(f"model_{fold}")
    latent_mean += model.predict(multi_test_x) / N_SPLIT
    gc.collect()

# The output width comes from the fitted SVD rather than a hard-coded gene count.
# Multiplying in float32 keeps the temporary small before the float16 downcast.
preds = (latent_mean @ pca_target.components_.astype('float32', copy=False)).astype('float16')

del latent_mean
gc.collect()

fold 0 prediction
fold 1 prediction
fold 2 prediction
fold 3 prediction
fold 4 prediction


# Submission preparation

In [16]:
%%time
eval_ids = pd.read_parquet("../input/multimodal-single-cell-as-sparse-matrix/evaluation.parquet")
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)
submission

CPU times: user 46.2 s, sys: 11.8 s, total: 58 s
Wall time: 54.3 s


row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

In [17]:
%%time
y_columns = np.load("../input/multimodal-single-cell-as-sparse-matrix/train_multi_targets_idxcol.npz",
                   allow_pickle=True)["columns"]

test_index = np.load("../input/multimodal-single-cell-as-sparse-matrix/test_multi_inputs_idxcol.npz",
                    allow_pickle=True)["index"]

cell_dict = dict((k,v) for v,k in enumerate(test_index)) 
assert len(cell_dict)  == len(test_index)

gene_dict = dict((k,v) for v,k in enumerate(y_columns))
assert len(gene_dict) == len(y_columns)

eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))
valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)

submission.iloc[valid_multi_rows] = preds[eval_ids_cell_num[valid_multi_rows].to_numpy(),
eval_ids_gene_num[valid_multi_rows].to_numpy()]

del eval_ids_cell_num, eval_ids_gene_num, valid_multi_rows, eval_ids, test_index, y_columns
gc.collect()

submission

CPU times: user 3.51 s, sys: 1.51 s, total: 5.02 s
Wall time: 4.78 s


row_id    cell_id       gene_id        
0         c2150f55becb  CD86                    NaN
1         c2150f55becb  CD274                   NaN
2         c2150f55becb  CD270                   NaN
3         c2150f55becb  CD155                   NaN
4         c2150f55becb  CD112                   NaN
                                             ...   
65744175  2c53aa67933d  ENSG00000134419    2.765625
65744176  2c53aa67933d  ENSG00000186862   -0.413574
65744177  2c53aa67933d  ENSG00000170959   -0.405273
65744178  2c53aa67933d  ENSG00000107874    0.226562
65744179  2c53aa67933d  ENSG00000166012    2.412109
Name: target, Length: 65744180, dtype: float32

In [18]:
# Replace the MultiIndex with a plain positional index named 'row_id'.
submission = submission.reset_index(drop=True)
submission.index.name = 'row_id'

# Targets from the separately produced submission_cite.csv, indexed by row_id.
cite_submission = (
    pd.read_csv("../input/targets-multiome-sparse-scaled/submission_cite.csv")
    .set_index("row_id")["target"]
)

# Every row still empty after the multiome fill takes its value from that file.
missing = submission.isnull()
submission[missing] = cite_submission[missing]
submission

row_id
0           0.094605
1          -0.162362
2          -0.405332
3          -0.302582
4           1.114355
              ...   
65744175    2.765625
65744176   -0.413574
65744177   -0.405273
65744178    0.226562
65744179    2.412109
Name: target, Length: 65744180, dtype: float32

In [19]:
submission.to_csv("submission_keras_multiome.csv")

In [20]:
del preds, Y, train, multi_test_x
gc.collect()
gc.collect()

0

## Ensemble submission, following the approach from

https://www.kaggle.com/code/vslaykovsky/lb-0-811-normalized-ensembles-for-pearson-s-r

with files from :

the current notebook +

https://www.kaggle.com/code/sskknt/msci-citeseq-keras-quickstart-dropout/data

https://www.kaggle.com/code/xiafire/lb-t15-msci-multiome-catboostregressor/data


In [21]:
def std(x):
    """Standardize ``x`` to zero mean and unit standard deviation.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series
        Values to standardize.

    Returns
    -------
    Same type as ``x``
        ``(x - mean) / std``. If the standard deviation is exactly zero
        (constant input), the centered values (all zeros) are returned
        instead of dividing 0 by 0 and producing NaN.
    """
    centered = x - np.mean(x)
    scale = np.std(x)
    if scale == 0:
        return centered
    return centered / scale

def gen_std_submission(path, cell_ids):
    """
    Standardize submission per cell_id

    Parameters
    ----------
    path : str
        CSV file with a ``target`` column, one row per evaluation row.
    cell_ids : pandas.Series
        Cell id of every row, in the same row order as the CSV.

    Returns
    -------
    numpy.ndarray
        Standardized targets in the CSV's row order.

    Raises
    ------
    ValueError
        If the CSV and ``cell_ids`` do not have the same number of rows.
    """
    df = pd.read_csv(path)
    if len(df) != len(cell_ids):
        raise ValueError(
            f"{path} has {len(df)} rows but {len(cell_ids)} cell ids were given"
        )
    df['cell_id'] = cell_ids
    # Write each group back to its own row positions. Appending groups one after
    # another only reproduces the file order when every cell's rows are contiguous.
    # read_csv yields a 0..n-1 index, so the group index labels are row positions.
    vals = np.full(len(df), np.nan)
    for idx, g in tqdm(df.groupby('cell_id', sort=False), desc=f'Standardizing {path}', miniters=1000):
        vals[g.index.to_numpy()] = std(g.target).values
    return vals

def gen_ensemble(technology):
    """Weighted sum of the standardized submissions belonging to one technology.

    A submission is selected when ``technology`` is a substring of its path key
    in ``SUBMISSIONS``; its standardized predictions are multiplied by the
    matching weight and summed. Returns ``None`` when no path matches.
    """
    matching_paths = [candidate for candidate in SUBMISSIONS if technology in candidate]
    total = None
    for submission_path in tqdm(matching_paths, desc='Process submission'):
        weight = SUBMISSIONS[submission_path]
        weighted = gen_std_submission(submission_path, cell_ids) * weight
        if total is not None:
            total += weighted
        else:
            total = weighted
    return total

In [22]:
# Submission files to blend, mapped to their blending weight.
# gen_ensemble() picks the files for a technology by substring match on the
# path ('citeseq' / 'multiome'), so each path must contain the technology name.
# Weights are applied as given; they are not renormalized per technology.
SUBMISSIONS = {
    'submission_keras_multiome.csv':0.6,
    '../input/msci-citeseq-keras-quickstart-dropout/submission.csv': 0.2,         
    '../input/lb-t15-msci-multiome-catboostregressor/submission.csv':0.2,
}

In [23]:
# Cell id of every evaluation row, used to standardize predictions per cell.
cell_ids = pd.read_parquet('../input/multimodal-single-cell-as-sparse-matrix/evaluation.parquet').cell_id

# Row ranges of the final submission: the first is paired with 'citeseq',
# the second with 'multiome'.
PRED_SEGMENTS = [(0, 6812820), (6812820, 65744180)]
tech_segments = list(zip(['citeseq', 'multiome'], PRED_SEGMENTS))

ensemble = []
for tech, (from_idx, to_idx) in tqdm(tech_segments, desc='Technology'):
    # Each technology's ensemble covers all rows; keep only its own range.
    tech_scores = gen_ensemble(tech)
    ensemble.append(tech_scores[from_idx: to_idx])

ensemble = np.concatenate(ensemble)

df_submit = pd.read_parquet('../input/multimodal-single-cell-as-sparse-matrix/sample_submission.parquet')
df_submit['target'] = ensemble
df_submit.to_csv('submission_06_02_02_v1.csv', index=False)
df_submit

Technology:   0%|          | 0/2 [00:00<?, ?it/s]

Process submission:   0%|          | 0/1 [00:00<?, ?it/s]

Standardizing ../input/msci-citeseq-keras-quickstart-dropout/submission.csv:   0%|          | 0/65443 [00:00<?…

Process submission:   0%|          | 0/2 [00:00<?, ?it/s]

Standardizing submission_keras_multiome.csv:   0%|          | 0/65443 [00:00<?, ?it/s]

Standardizing ../input/lb-t15-msci-multiome-catboostregressor/submission.csv:   0%|          | 0/65443 [00:00<…

Unnamed: 0,row_id,target
0,0,0.018921
1,1,-0.032472
2,2,-0.081066
3,3,-0.060516
4,4,0.222871
...,...,...
65744175,65744175,3.421738
65744176,65744176,-0.514589
65744177,65744177,-0.505025
65744178,65744178,0.263628
