In [5]:
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append("../../utils")
from loss import partial_correlation_score_torch_faster,correl_loss
from sklearn.model_selection import train_test_split,KFold,GroupKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from tqdm.notebook import tqdm
import lightgbm as lgb
import pandas as pd
import numpy as np
import joblib
import scipy
import pickle
import os
import gc

## Train

In [7]:
# %%time

# Folder holding the preprocessing outputs (reduced matrices, fitted reducer, index pickles).
raw_path = "../../data_preprocessing/"
# Folder holding the sparse-matrix version of the competition data.
sparse_path = "../../../input/multimodal-single-cell-as-sparse-matrix/"

# Row-selection indices written by the preprocessing step.
# NOTE(review): pickle.load can execute arbitrary code - only load files you produced yourself.
with open(raw_path+"multi_in_ori_raw_idx.pkl","rb") as f:
    multi_in_ori_raw_idx = pickle.load(f)
with open(raw_path+"multi_in_ori_ori_idx.pkl","rb") as f:
    multi_in_ori_ori_idx = pickle.load(f)

# Reduced inputs are filtered with the "raw" index; both target matrices and the
# metadata below are filtered with the "ori" index.
train_inputs = np.load(raw_path+"new_multi_train_tsvd.npz")["arr_0"]  # tsvd
train_inputs = train_inputs[multi_in_ori_raw_idx]
train_target = np.load(raw_path+"new_multi_target_tsvd.npz")["arr_0"] # tsvd targets
train_target = train_target[multi_in_ori_ori_idx]
train_targets = scipy.sparse.load_npz(sparse_path+"train_multi_targets_values.sparse.npz") # real targets
train_targets = train_targets[multi_in_ori_ori_idx]
train_targets = train_targets.toarray()  # dense copy of the selected rows

train_index = np.load(sparse_path+"train_multi_inputs_idxcol.npz",allow_pickle=True)
meta = pd.read_csv("../../../input/open-problems-multimodal/metadata.csv",index_col = "cell_id")
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the full metadata table.
meta = meta[meta.technology=="multiome"].copy()
lbe = LabelEncoder()
meta["cell_type"] = lbe.fit_transform(meta["cell_type"])
# Binary feature: 0 for donor 13176, 1 for every other donor (vectorised, no row-wise apply).
meta["gender"] = (meta["donor"] != 13176).astype("int64")
meta_train = meta.reindex(train_index["index"])
meta_train = meta_train.iloc[multi_in_ori_ori_idx]
train_meta = meta_train["gender"].values.reshape(-1, 1)
# Append the donor-derived flag as one extra feature column.
train_inputs = np.concatenate([train_inputs,train_meta],axis= -1)

train_inputs.shape,train_target.shape,train_targets.shape

((105868, 513), (105868, 1024), (105868, 23418))

In [6]:
# Unpickle the fitted reducer saved by preprocessing and keep its `components_`
# matrix, which `multi_correlation_score` multiplies predictions by.
# NOTE(review): presumably the TruncatedSVD fitted on the targets - confirm against preprocessing.
tsvd_pickle_path = raw_path + "new_multi_tsvd_1024.pkl"
with open(tsvd_pickle_path, "rb") as fh:
    pca2 = pickle.load(fh)
components = pca2.components_
def multi_correlation_score(tgt,pred):
    """Score reduced-space predictions against ``tgt``.

    ``pred`` is right-multiplied by the module-level ``components`` matrix and
    the product is passed, together with ``tgt``, to ``correl_loss``; whatever
    ``correl_loss`` returns is returned unchanged.
    """
    projected = pred @ components
    result = correl_loss(tgt, projected)
    return result

In [3]:
# MOLGBM: multi-output LightGBM wrapper (one booster per target column)
class MultiOutputLGBMRegressor:
    """Train, apply, save and restore one LightGBM booster per target column.

    Attributes:
        params: parameter dict handed unchanged to ``lgb.train`` for every column.
        model_list: fitted boosters, where position ``i`` belongs to target column ``i``.
    """

    def __init__(self,params):
        self.params = params
        self.model_list = []

    @staticmethod
    def _model_file(path, index):
        """Return the file path used for the booster of target column ``index``."""
        # os.path.join works with or without a trailing separator on ``path``.
        return os.path.join(path, f"model_{index}.pkl")

    def fit(self,train_data,train_label,val_data,val_label,**fit_params):
        """Fit one booster per column of ``train_label``.

        Each booster is trained with the matching column of ``val_label`` as its
        validation set and stops early after 20 rounds without improvement.
        ``fit_params`` is accepted for signature compatibility but is not used.
        """
        # Start clean so that calling fit() twice does not stack stale boosters,
        # which would make predict() return too many columns.
        self.model_list = []
        output_num = train_label.shape[1]
        for i in tqdm(range(output_num),leave=False):
            train_set = lgb.Dataset(train_data,train_label[:,i])
            val_set = lgb.Dataset(val_data,val_label[:,i])
            model = lgb.train(
                self.params,
                train_set,
                valid_sets = [val_set],
                callbacks=[
                    lgb.early_stopping(20,verbose = False),
                ]
            )
            self.model_list.append(model)

    def predict(self,test_data):
        """Return an array with one column per fitted booster, in ``model_list`` order.

        Raises:
            ValueError: if no booster has been fitted or loaded yet.
        """
        if not self.model_list:
            raise ValueError("No models available: call fit() or load() before predict().")
        columns = [model.predict(test_data) for model in tqdm(self.model_list,leave=False)]
        return np.stack(columns,axis = 1)

    def dump(self,path = "./models/MOLGB/" ):
        """Write every booster to ``path`` as ``model_<i>.pkl`` (creating the folder if needed)."""
        os.makedirs(path,exist_ok=True)
        for index, model in enumerate(tqdm(self.model_list,leave=False)):
            joblib.dump(model, self._model_file(path, index))
        print("Model saved")

    def load(self,path = "./models/MOLGB/" ):
        """Load the boosters previously written by :meth:`dump` from ``path``.

        Only files named ``model_<i>.pkl`` are considered, so unrelated files in
        the folder are ignored. Boosters are loaded in index order.

        Raises:
            ValueError: if this instance already holds models.
            FileNotFoundError: if the indices found are not exactly 0..n-1.
        """
        # Check state first so a misuse is reported before touching the filesystem.
        if len(self.model_list) != 0:
            raise ValueError("Don't load! Already loaded!")

        indices = []
        for name in os.listdir(path):
            stem, ext = os.path.splitext(name)
            suffix = stem[len("model_"):]
            # Require the canonical spelling (no leading zeros) so that the file
            # name rebuilt by _model_file is guaranteed to exist.
            if ext == ".pkl" and stem.startswith("model_") and suffix.isdigit() and str(int(suffix)) == suffix:
                indices.append(int(suffix))
        indices.sort()
        if indices != list(range(len(indices))):
            raise FileNotFoundError(
                f"Model files in {path!r} are not numbered contiguously from 0: {indices}"
            )

        # NOTE(review): joblib.load unpickles the file; only load model folders you trust.
        for index in tqdm(indices,leave=False):
            self.model_list.append(joblib.load(self._model_file(path, index)))
        print("Model loaded")

In [4]:
# LightGBM parameters shared by every per-column booster.
# The dict used to list 'learning_rate' twice (0.2, then the tuned value); Python
# keeps only the last duplicate key, so the dead 0.2 entry has been removed.
params = {
  'objective': 'mse', 
  'metric': ['mse', 'mae'], 
  'n_estimators': 10000,  # upper bound on boosting rounds; fit() adds early stopping
  'learning_rate': 0.011322411312518462, 
  'num_leaves': 350, 
  'verbose': -1, 
  'boosting_type': 'gbdt', 
  'reg_alpha': 0.40300033428422216, 
  'reg_lambda': 1.6473388122802188, 
  'colsample_bytree': 0.5, 
  # NOTE(review): LightGBM only applies row subsampling when a bagging frequency
  # ('subsample_freq' / 'bagging_freq') > 0 is set; none is set here, so this value
  # is presumably inert. Left unchanged to keep the recorded scores reproducible.
  'subsample': 0.7, 
  'max_depth': -1, 
  'min_child_samples': 54, 
  'cat_smooth': 41.24648150772993,
  
    # 'device':"gpu",
    # "gpu_device_id":0,
    # "gpu_platform_id":1,
    }

In [None]:
np.random.seed(42)
# Group the split by donor so that no donor appears in both the training and
# the validation part of a fold.
kf = GroupKFold(n_splits=3) 

# Validation correlation of each fold, in fold order; reused later as ensemble weights.
score = []
 
for fold,(idx_tr, idx_va) in enumerate(kf.split(range(train_inputs.shape[0]),groups= meta_train.donor)):
    Xtr, Xva = train_inputs[idx_tr], train_inputs[idx_va]
    Ytr, Yva = train_target[idx_tr], train_target[idx_va]
    # Full-size targets for the correlation metric (fancy indexing makes copies).
    Ytrs, Yvas = train_targets[idx_tr], train_targets[idx_va]

    print(f'Fold {fold}..')
    model = MultiOutputLGBMRegressor(params)
    # NOTE(review): the validation split also drives early stopping inside fit(),
    # so the validation scores printed below are somewhat optimistic.
    model.fit(Xtr, Ytr,Xva,Yva,)

    y_tr_pred = model.predict(Xtr)
    mse_tr = mean_squared_error(Ytr, y_tr_pred)
    mae_tr = mean_absolute_error(Ytr, y_tr_pred)
    pearson_tr = multi_correlation_score(Ytrs, y_tr_pred)
    print(f"Fold_{fold}_train  mse:{mse_tr},  mae:{mae_tr},  pearson:{pearson_tr}")

    y_va_pred = model.predict(Xva)
    mse = mean_squared_error(Yva, y_va_pred)
    mae = mean_absolute_error(Yva, y_va_pred)
    pearson = multi_correlation_score(Yvas, y_va_pred)
    print(f"Fold_{fold}_test   mse:{mse},  mae:{mae},  pearson:{pearson}\n")

    score.append(pearson)
    # Release every per-fold array (including the full-size target copies and the
    # predictions) before the next fold allocates its own.
    del Xtr, Ytr, Ytrs, y_tr_pred
    del Xva, Yva, Yvas, y_va_pred
    gc.collect()
    
    d_path = f"./models/CV/Fold_{fold}/"
    os.makedirs(d_path,exist_ok=True)
    model.dump(d_path)

gc.collect()

## Predict

In [None]:
# Reduced test inputs written by the preprocessing step.
test = np.load(raw_path + "new_multi_test_tsvd.npz")["arr_0"]  # tsvd

# Look up each test cell in the metadata table to get the donor-derived
# "gender" flag, then append it as one extra feature column.
test_index = np.load(
    "../../../input/multimodal-single-cell-as-sparse-matrix/test_multi_inputs_idxcol.npz",
    allow_pickle=True,
)
meta_test = meta.reindex(test_index["index"])
test_meta = meta_test["gender"].values.reshape(-1, 1)
test = np.concatenate([test, test_meta], axis=-1)
test.shape

In [None]:
score # per-fold validation correlation appended by the CV loop; recorded run: [0.6636945377072712, 0.6701864710383082, 0.6676791305484101]

In [None]:
np.mean(score) # mean of the per-fold validation correlations; recorded run: 0.6671867130979964

In [None]:
def std(x):
    """Standardise every row of a 2-D array to zero mean and unit variance.

    Args:
        x: 2-D array-like; each row is standardised independently.

    Returns:
        Array of the same shape. A row whose values are all identical has zero
        standard deviation; it is returned as all zeros instead of NaN/inf so
        that it cannot poison a sum of standardised predictions.
    """
    x = np.asarray(x)
    row_mean = np.mean(x,axis=1,keepdims=True)
    row_std = np.std(x,axis=1,keepdims=True)
    # Dividing by 1 leaves the (already zero) centred values of a constant row at 0.
    row_std[row_std == 0] = 1
    return (x - row_mean) / row_std

In [None]:
import glob

model_path = "./models/CV/Fold_*"
# glob.glob returns paths in arbitrary order, so sort by the numeric fold index
# parsed from the folder name; that index is also what selects the matching
# entry of `score`.
model_list = sorted(
    glob.glob(model_path),
    key=lambda fn: int(os.path.basename(os.path.normpath(fn)).split("_")[-1]),
)
if not model_list:
    raise FileNotFoundError(f"No fold directories match {model_path!r}; run the CV cell first.")

# Output width follows the projection matrix instead of a hard-coded gene count.
preds = np.zeros((test.shape[0], components.shape[1]))
for fn in tqdm(model_list):
    fold = int(os.path.basename(os.path.normpath(fn)).split("_")[-1])
    if fold >= len(score):
        raise IndexError(f"No validation score recorded for fold {fold} ({fn}).")
    model_ = MultiOutputLGBMRegressor(params)
    model_.load(fn+"/")
    # Project back to full size, standardise per cell, weight by the fold's own validation score.
    preds += std(model_.predict(test)@ components)* score[fold]
    gc.collect()

In [None]:
def submit(test_pred,cite_path = r"C:\Users\Olive\Downloads\Compressed\submission_best.zip",
           data_dir = "../data/others/",out_path = "submission.zip"):
    """Build the combined submission file from Multiome predictions and a CITEseq submission.

    Rows of the evaluation table whose cell and gene both appear in the Multiome
    index files are filled from ``test_pred``; every remaining row is filled
    from the ``target`` column of the CSV at ``cite_path``.

    Args:
        test_pred: 2-D array indexed as ``[cell position, gene position]``, where
            positions follow the order of the Multiome index files.
        cite_path: CSV (optionally zipped) with ``row_id`` and ``target`` columns.
            NOTE(review): the default is a machine-specific absolute path kept
            only for backward compatibility; pass the path explicitly.
        data_dir: folder containing ``evaluation.parquet``,
            ``train_multi_targets_idxcol.npz`` and ``test_multi_inputs_idxcol.npz``.
        out_path: destination file passed to ``Series.to_csv``.

    Raises:
        ValueError: if any row is still empty after both sources were merged.
    """
    # Read the table of rows and columns required for submission
    eval_ids = pd.read_parquet(os.path.join(data_dir,"evaluation.parquet"))

    # Convert the string columns to more efficient categorical types
    eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
    eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

    # Empty (all-NaN) target series, one entry per evaluation row.
    submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)
    print("data loaded")
    y_columns = np.load(os.path.join(data_dir,"train_multi_targets_idxcol.npz"),allow_pickle=True)["columns"]
    test_index = np.load(os.path.join(data_dir,"test_multi_inputs_idxcol.npz"),allow_pickle=True)["index"]

    # Map each identifier to its position; the asserts guard against duplicate identifiers.
    cell_dict = dict((k,v) for v,k in enumerate(test_index)) 
    assert len(cell_dict)  == len(test_index)

    gene_dict = dict((k,v) for v,k in enumerate(y_columns))
    assert len(gene_dict) == len(y_columns)

    # -1 marks identifiers that are absent from the Multiome index files.
    eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
    eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))

    valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)
    valid_multi_rows = valid_multi_rows.to_numpy()

    submission.iloc[valid_multi_rows] = test_pred[eval_ids_cell_num[valid_multi_rows].to_numpy(),
    eval_ids_gene_num[valid_multi_rows].to_numpy()]

    del eval_ids_cell_num, eval_ids_gene_num, valid_multi_rows, eval_ids, test_index, y_columns
    gc.collect()

    submission.reset_index(drop=True, inplace=True)
    submission.index.name = 'row_id'

    # Merging in the CITEseq submission
    cite_submission = pd.read_csv(cite_path)
    cite_submission = cite_submission.set_index("row_id")
    cite_submission = cite_submission["target"]

    # Compute the mask once and use it on both sides of the assignment.
    missing = submission.isnull()
    submission[missing] = cite_submission[missing]

    # The original evaluated `submission.isnull().any()` and discarded the result;
    # make the check effective so a file with empty targets is never written.
    remaining = int(submission.isnull().sum())
    if remaining:
        raise ValueError(f"{remaining} submission rows are still empty after merging {cite_path!r}")

    print(f"start -> {out_path}")
    submission.to_csv(out_path)

    print(f"{out_path} saved!")


In [None]:
# Location of the CITEseq submission merged into the final file.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
cite_submission_path = r"D:\python_project\MSCI\model_ensemble\submission_best.zip"
submit(preds, cite_path=cite_submission_path)