In [1]:
import sys
sys.path.append("../../utils")
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split,KFold,GroupKFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from tqdm.notebook import tqdm
import lightgbm as lgb
import pandas as pd
import numpy as np
import joblib
import os
import gc

In [2]:
root_path = "../../../input/"
train = np.load("../../data_preprocessing/new_cite_train_final.npz")["arr_0"]
target = pd.read_hdf(f"{root_path}open-problems-multimodal/train_cite_targets.h5").values
target -= target.mean(axis=1).reshape(-1, 1)
target /= target.std(axis=1).reshape(-1, 1)
print(train.shape,target.shape)

(70988, 735) (70988, 140)


In [3]:
train_index = np.load(f"{root_path}/multimodal-single-cell-as-sparse-matrix/train_cite_inputs_idxcol.npz",allow_pickle=True)
meta = pd.read_csv(f"{root_path}open-problems-multimodal/metadata.csv",index_col = "cell_id")
meta = meta[meta.technology=="citeseq"]
lbe = LabelEncoder()
meta["cell_type"] = lbe.fit_transform(meta["cell_type"])
meta["gender"] = meta.apply(lambda x:0 if x["donor"]==13176 else 1,axis =1)
meta_train = meta.reindex(train_index["index"])
train_meta = meta_train["gender"].values.reshape(-1, 1)
train = np.concatenate([train,train_meta],axis= -1)
train_meta = meta_train["cell_type"].values.reshape(-1, 1)
ohe = OneHotEncoder(sparse=False)
train_meta = ohe.fit_transform(train_meta)
train = np.concatenate([train,train_meta],axis= -1)
train.shape

(70988, 743)

In [4]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules. 
    
    It is assumed that the predictions are not constant.
    
    Returns the average of each sample's Pearson correlation coefficient"""
    if type(y_true) == pd.DataFrame: y_true = y_true.values
    if type(y_pred) == pd.DataFrame: y_pred = y_pred.values
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    corrsum = 0
    for i in range(len(y_true)):
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / len(y_true)


In [6]:
# MOLGBM
class MultiOutputLGBMRegressor:
    def __init__(self,params):
        self.params = params
        self.model_list = []

    def fit(self,train_data,train_label,val_data,val_label,**fit_params):
        output_num = train_label.shape[1]
        for i in tqdm(range(output_num),leave=False):
            train_set = lgb.Dataset(train_data,train_label[:,i])
            val_set = lgb.Dataset(val_data,val_label[:,i])
            model = lgb.train(
                self.params,
                train_set,
                valid_sets = val_set,
                callbacks=[
                    lgb.early_stopping(20,verbose = False),
                    # lgb.log_evaluation(100),
                ]
            )
            self.model_list.append(model)
            
    def predict(self,test_data):
        res_list = []
        for model in tqdm(self.model_list,leave=False):
            res = model.predict(test_data)
            res_list.append(res)
        res_list = np.stack(res_list,axis = 1)
        return res_list
        
    def dump(self,path = "./models/MOLGB/" ):
        count = 0
        os.makedirs(path,exist_ok=True)
        for model in tqdm(self.model_list,leave=False):
            joblib.dump(model, f'{path}model_{str(count)}.pkl')
            count += 1
        print("Model saved")

    def load(self,path = "./models/MOLGB/" ):
        models = os.listdir(path)
        if len(self.model_list) != 0:
            raise ValueError("Don't load! Already loaded!")
        else:
            for i in tqdm(range(len(models)),leave=False):
                model = joblib.load(f'{path}model_{i}.pkl')
                self.model_list.append(model)
            print("Model loaded")

In [5]:
params = {
  'learning_rate': 0.1, 
  'objective': 'mse', 
  'metric': ['mse', 'mae'], 
  'n_estimators': 10000, 
  'learning_rate': 0.011322411312518462, 
  'num_leaves': 350, 
  'verbose': -1, 
  'boosting_type': 'gbdt', 
  'reg_alpha': 0.40300033428422216, 
  'reg_lambda': 1.6473388122802188, 
  'colsample_bytree': 0.5, 
  'subsample': 0.7, 
  'max_depth': -1, 
  'min_child_samples': 54, 
  'cat_smooth': 41.24648150772993
  
    # 'device':"gpu",
    # "gpu_device_id":0,
    # "gpu_platform_id":1,
    }

In [7]:
%%time
np.random.seed(42)
kf = GroupKFold(n_splits=3) 
scores = []
 
for id,(idx_tr, idx_va) in enumerate(kf.split(range(train.shape[0]),groups= meta_train.donor)):
    Xtr, Xva = train[idx_tr], train[idx_va]
    Ytr, Yva = target[idx_tr], target[idx_va]
    print(f'Fold {id}..')
    model = MultiOutputLGBMRegressor(params)
    model.fit(Xtr, Ytr,Xva,Yva,)

    y_tr_pred = model.predict(Xtr)
    mse_tr = mean_squared_error(Ytr, y_tr_pred)
    mae_tr = mean_absolute_error(Ytr, y_tr_pred)
    pearson_tr = correlation_score(Ytr, y_tr_pred)
    print(f"Flod_{id}_train  mse:{mse_tr},  mae:{mae_tr},  pearson:{pearson_tr}")

    y_va_pred = model.predict(Xva)
    mse = mean_squared_error(Yva, y_va_pred)
    mae = mean_absolute_error(Yva, y_va_pred)
    pearson = correlation_score(Yva, y_va_pred)
    print(f"Flod-{id}_test   mse:{mse},  mae:{mae},  pearson:{pearson}\n")

    scores.append(pearson)
    del Xtr, Ytr
    del Xva, Yva
    gc.collect()
    
    d_path = f"./models/CV/Fold_{id}/"
    os.makedirs(d_path,exist_ok=True)
    model.dump(d_path)

gc.collect()

#### training outputs:
Fold 0..

Flod_0_train  mse:0.08878915679206573,  mae:0.20941672083422697,  pearson:0.9552240083673027

Flod-0_test   mse:0.20303296180899078,  mae:0.30586355881144645,  pearson:0.8924862687034453

Model saved

Fold 1..

Flod_1_train  mse:0.08535758777930302,  mae:0.21115628878616854,  pearson:0.9570751551709759

Flod-1_test   mse:0.1949198555221245,  mae:0.30004321722735716,  pearson:0.8971982153227883

Model saved

Fold 2..

Flod_2_train  mse:0.08978115117208632,  mae:0.2106828369575134,  pearson:0.9551558939737768

Flod-2_test   mse:0.20189180835630965,  mae:0.3125771411516383,  pearson:0.8930300743198056

Model saved 

## Predict

In [9]:
test = np.load("../../data_preprocessing/new_cite_test_final.npz")["arr_0"]

test_index = np.load(f"{root_path}/multimodal-single-cell-as-sparse-matrix/test_cite_inputs_idxcol.npz",allow_pickle=True)
meta_test = meta.reindex(test_index["index"])
test_meta = meta_test["gender"].values.reshape(-1, 1)
test = np.concatenate([test,test_meta],axis= -1)
test_meta = meta_test["cell_type"].values.reshape(-1, 1)
test_meta = ohe.transform(test_meta)
test = np.concatenate([test,test_meta],axis= -1)
test.shape

(48663, 741)

In [11]:
scores # = [0.890046787050592, 0.895745355307898, 0.8921687207940432]

In [14]:
np.mean(scores)

0.8926536210508443

In [12]:
def std(x):
    return (x - np.mean(x,axis=1).reshape(-1,1)) / np.std(x,axis=1).reshape(-1,1)

In [13]:
from tqdm.notebook import tqdm
import glob
model_path = "./models/CV/Fold_*"
model_list = glob.glob(model_path)
preds = np.zeros((test.shape[0], 140))
for id,fn in enumerate(tqdm(model_list)):
    model_ = MultiOutputLGBMRegressor(params)
    model_.load(fn+"/")
    preds += std(model_.predict(test))* scores[id]
    gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/140 [00:00<?, ?it/s]

Model loaded


  0%|          | 0/140 [00:00<?, ?it/s]

  0%|          | 0/140 [00:00<?, ?it/s]

Model loaded


  0%|          | 0/140 [00:00<?, ?it/s]

  0%|          | 0/140 [00:00<?, ?it/s]

Model loaded


  0%|          | 0/140 [00:00<?, ?it/s]

In [None]:
import seaborn as sns
sns.heatmap(preds)

In [15]:
def submit(test_pred,multi_path):
    submission = pd.read_csv(multi_path,index_col = 0)
    submission = submission["target"]
    print("data loaded")
    submission.iloc[:len(test_pred.ravel())] = test_pred.ravel()
    assert not submission.isna().any()
    # submission = submission.round(6) # reduce the size of the csv
    print("start -> submission.zip")
    submission.to_csv('submission.zip')
    print("submission.zip saved!")

In [16]:
%%time
submit(preds,multi_path = r"D:\python_project\MSCI\model_ensemble\submission_best.zip")

data loaded
start -> submission.zip
submission.zip saved!
CPU times: total: 1min 56s
Wall time: 3min 46s
