In [1]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Basic utilities
import os
import gc
import glob
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# import plotly.figure_factory as ff
# import plotly.express as px

# Scientific computing
from scipy import stats
from itertools import groupby

# Machine Learning
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA


# Set the folder path for data
folder_path = "./input"

In [2]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd

pd.set_option('display.max_columns', 30)

import numpy as np

SEED = 6174
np.random.seed(SEED)

In [3]:
de_train = pd.read_parquet(f'{folder_path}/de_train.parquet')
genes = de_train.columns[5:]
id_map = pd.read_csv (f'{folder_path}/id_map.csv')

sm_lincs_id = de_train.set_index('sm_name')["sm_lincs_id"].to_dict()
sm_name_to_smiles = de_train.set_index('sm_name')['SMILES'].to_dict()

id_map['sm_lincs_id'] = id_map['sm_name'].map(sm_lincs_id)
id_map['SMILES'] = id_map['sm_name'].map(sm_name_to_smiles)

de_train

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZSWIM5,ZSWIM6,ZSWIM7,ZSWIM8,ZSWIM9,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.104720,-0.077524,-1.625596,-0.144545,0.143555,0.073229,-0.016823,0.101717,-0.005153,1.043629,...,0.299807,0.319123,0.179530,0.220086,-0.206053,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.884380,0.371834,-0.081677,-0.498266,0.203559,0.604656,0.498592,-0.317184,0.375550,...,0.091576,0.717595,1.262570,0.357003,-0.168803,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,-0.480681,0.467144,-0.293205,-0.005098,0.214918,...,-0.590645,-0.542832,0.225485,0.131672,-0.393695,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,0.718590,-0.162145,0.157206,-3.654218,-0.212402,...,0.760570,-0.217246,-0.203936,2.060546,0.899520,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,2.022829,0.600011,1.231275,0.236739,0.338703,...,1.005788,0.106344,-0.145054,0.965736,0.248029,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,Atorvastatin,LSM-5771,CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F...,False,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,-0.544709,0.282458,-0.431359,-0.364961,0.043123,...,0.092460,-0.960509,0.000051,-0.626368,-0.261534,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,NK cells,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,-0.455549,0.188181,0.595734,-0.100299,0.786192,0.090954,0.169523,0.428297,0.106553,0.435088,...,0.883842,0.611697,-0.538152,0.047483,-0.602049,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,T cells CD4+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.338168,-0.109079,0.270182,-0.436586,-0.069476,-0.061539,0.002818,-0.027167,-0.383696,0.226289,...,0.169480,-0.084077,0.697416,0.225507,0.063579,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,T cells CD8+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,-0.706087,-0.620919,-1.485381,0.059303,-0.032584,...,-1.149889,-0.977296,0.369929,0.625152,-0.885209,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


In [4]:
def split_sign(text):
    text = text.replace(')(', ' ')
    text = text.replace('(' , ' ')
    text = text.replace(')' , ' ')
    return text.split(" ")

de_train['_SMILES'] = [split_sign(text) for text in de_train['SMILES'].values]

sign = []
for row in de_train['_SMILES'].values:
    for ele in row:
        sign.append(ele)
        
sign_list = list(set(sign))

data = np.zeros((len(de_train), len(sign_list)), dtype=int)
de_features = pd.DataFrame(data=data, columns=sign_list)

for sign in sign_list:
    for i in range(len(de_train)):
        row = de_train['_SMILES'].values[i]

        for ele in row:
            if ele == sign:
                de_features[sign][i] += 1

                
id_map['_SMILES'] = [split_sign(text) for text in id_map['SMILES'].values]

sign = []
for row in id_map['_SMILES'].values:
    for ele in row:
        sign.append(ele)
        
sign_list = list(set(sign))

data = np.zeros((len(id_map), len(sign_list)), dtype=int)
test_features = pd.DataFrame(data=data, columns=sign_list)

for sign in sign_list:
    for i in range(len(id_map)):
        row = id_map['_SMILES'].values[i]

        for ele in row:
            if ele == sign:
                test_features[sign][i] += 1
                
uncommon = [f for f in de_features if f not in test_features]
de_features = de_features.drop(columns=uncommon)

de_features = de_features.sort_index(axis = 1)
test_features = test_features.sort_index(axis = 1)

print("Columns Check", list(de_features.columns) == list(test_features.columns))

Columns Check True


In [5]:
sm_name = pd.get_dummies(de_train['sm_name'], dtype=float)
de_features = pd.concat([sm_name, de_features], axis=1)
de_features

Unnamed: 0,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,ABT-199 (GDC-0199),ABT737,AMD-070 (hydrochloride),AT 7867,AT13387,AVL-292,AZ628,AZD-8330,AZD3514,AZD4547,Alogliptin,Alvocidib,Amiodarone,Atorvastatin,...,ncnc3c2,nn12,nn1Cc1ccnc,nnc5C,no1,no2,noc1C,noc4C,o1,oc6,on4,s2,s3,sc2cc,sc3cc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
cell_type = pd.get_dummies(de_train['cell_type'], dtype=float)
de_features = pd.concat([cell_type, de_features], axis=1)
de_features

Unnamed: 0,B cells,Myeloid cells,NK cells,T cells CD4+,T cells CD8+,T regulatory cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,ABT-199 (GDC-0199),ABT737,AMD-070 (hydrochloride),AT 7867,AT13387,AVL-292,AZ628,AZD-8330,...,ncnc3c2,nn12,nn1Cc1ccnc,nnc5C,no1,no2,noc1C,noc4C,o1,oc6,on4,s2,s3,sc2cc,sc3cc
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
610,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
611,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
612,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
sm_name = sm_name.iloc[:len(id_map)]
sm_name.iloc[:, :] = 0.0
for i, item in enumerate(id_map['sm_name']):
    if item in sm_name.columns:
        sm_name.loc[i, item] = 1.0
test_features = pd.concat([sm_name, test_features], axis=1)

cell_type = cell_type.iloc[:len(id_map)]
cell_type.iloc[:, :] = 0.0
cell_type.iloc[:, :2] = pd.get_dummies(id_map['cell_type'], dtype=float)
test_features = pd.concat([cell_type, test_features], axis=1)
test_features

Unnamed: 0,B cells,Myeloid cells,NK cells,T cells CD4+,T cells CD8+,T regulatory cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,ABT-199 (GDC-0199),ABT737,AMD-070 (hydrochloride),AT 7867,AT13387,AVL-292,AZ628,AZD-8330,...,ncnc3c2,nn12,nn1Cc1ccnc,nnc5C,no1,no2,noc1C,noc4C,o1,oc6,on4,s2,s3,sc2cc,sc3cc
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
251,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
252,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
253,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

def mrrmse(y_true, y_pred):
		return np.sqrt(np.mean(np.square(y_true - y_pred))).mean()

# Assuming de_features and genes are defined elsewhere in your script
X_train, X_cv, y_train, y_cv = train_test_split(de_features.values, de_train[genes].values, test_size=0.2, random_state=6174)

# Define the number of components for Truncated SVD
n_components = 100  # Adjust based on your dataset

# KFold setup
kf = KFold(n_splits=5, shuffle=True, random_state=6174)
fold_mrrmse = []

for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
		print(f'Fold {fold+1}')

		X_train_fold, y_train_fold = X_train[train_index], y_train[train_index]
		X_val_fold, y_val_fold = X_train[val_index], y_train[val_index]

		# Using Random Forest as the model
		#model = RandomForestRegressor(n_estimators=100, random_state=0)
		model = Ridge(alpha=1.0)

		model.fit(X_train_fold, y_train_fold)
		y_val_pred = model.predict(X_val_fold)
		fold_mrrmse.append(mrrmse(y_val_fold, y_val_pred))
		print(f'Fold {fold+1} MRRMSE: {fold_mrrmse[-1]}')

avg_mrrmse = np.mean(fold_mrrmse)
print(f'Average MRRMSE: {avg_mrrmse}')

# Averaging predictions
y_cv_pred = model.predict(X_cv)

final_mrrmse = mrrmse(y_cv, y_cv_pred)
print(f'Final MRRMSE on CV set: {final_mrrmse}')


Fold 1
Fold 1 MRRMSE: 2.031414670911738
Fold 2
Fold 2 MRRMSE: 1.674695842789679
Fold 3
Fold 3 MRRMSE: 2.2469117543271313
Fold 4
Fold 4 MRRMSE: 2.4272529078892826
Fold 5
Fold 5 MRRMSE: 2.7529882485437236
Average MRRMSE: 2.2266526848923105
Final MRRMSE on CV set: 2.0883329247430775


In [9]:
from sklearn.linear_model import LinearRegression

for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
		print(f'Fold {fold+1}')

		X_train_fold, y_train_fold = X_train[train_index], y_train[train_index]
		X_val_fold, y_val_fold = X_train[val_index], y_train[val_index]

		model = LinearRegression()
		model.fit(X_train_fold, y_train_fold)
		y_val_pred = model.predict(X_val_fold)
		fold_mrrmse.append(mrrmse(y_val_fold, y_val_pred))
		print(f'Fold {fold+1} MRRMSE: {fold_mrrmse[-1]}')

avg_mrrmse = np.mean(fold_mrrmse)
print(f'Average MRRMSE: {avg_mrrmse}')

# Averaging predictions
y_cv_pred = model.predict(X_cv)

final_mrrmse = mrrmse(y_cv, y_cv_pred)
print(f'Final MRRMSE on CV set: {final_mrrmse}')


Fold 1
Fold 1 MRRMSE: 5169821660704.472
Fold 2
Fold 2 MRRMSE: 1830268461659.0366
Fold 3
Fold 3 MRRMSE: 5989792589576.021
Fold 4
Fold 4 MRRMSE: 6709688301251.57
Fold 5
Fold 5 MRRMSE: 725274685352.6796
Average MRRMSE: 2042484569855.4915
Final MRRMSE on CV set: 1121304664577.3467


In [10]:
from sklearn.neighbors import KNeighborsRegressor

for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
		print(f'Fold {fold+1}')

		X_train_fold, y_train_fold = X_train[train_index], y_train[train_index]
		X_val_fold, y_val_fold = X_train[val_index], y_train[val_index]

		model = KNeighborsRegressor()
		model.fit(X_train_fold, y_train_fold)
		y_val_pred = model.predict(X_val_fold)
		fold_mrrmse.append(mrrmse(y_val_fold, y_val_pred))
		print(f'Fold {fold+1} MRRMSE: {fold_mrrmse[-1]}')

avg_mrrmse = np.mean(fold_mrrmse)
print(f'Average MRRMSE: {avg_mrrmse}')

# Averaging predictions
y_cv_pred = model.predict(X_cv)

final_mrrmse = mrrmse(y_cv, y_cv_pred)
print(f'Final MRRMSE on CV set: {final_mrrmse}')


Fold 1
Fold 1 MRRMSE: 1.9190973003430332
Fold 2
Fold 2 MRRMSE: 1.804263147534912
Fold 3
Fold 3 MRRMSE: 2.2078493368637497
Fold 4
Fold 4 MRRMSE: 2.177870462820775
Fold 5
Fold 5 MRRMSE: 2.937694511002063
Average MRRMSE: 1361656379904.3975
Final MRRMSE on CV set: 2.1190234621012283


In [11]:
from sklearn.tree import DecisionTreeRegressor
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
		print(f'Fold {fold+1}')

		X_train_fold, y_train_fold = X_train[train_index], y_train[train_index]
		X_val_fold, y_val_fold = X_train[val_index], y_train[val_index]

		model = DecisionTreeRegressor()

		model.fit(X_train_fold, y_train_fold)
		y_val_pred = model.predict(X_val_fold)
		fold_mrrmse.append(mrrmse(y_val_fold, y_val_pred))
		print(f'Fold {fold+1} MRRMSE: {fold_mrrmse[-1]}')

avg_mrrmse = np.mean(fold_mrrmse)
print(f'Average MRRMSE: {avg_mrrmse}')

# Averaging predictions
y_cv_pred = model.predict(X_cv)

final_mrrmse = mrrmse(y_cv, y_cv_pred)
print(f'Final MRRMSE on CV set: {final_mrrmse}')


Fold 1
Fold 1 MRRMSE: 2.4354683954596004
Fold 2
Fold 2 MRRMSE: 1.8432261458123833
Fold 3
Fold 3 MRRMSE: 2.3738410639407315
Fold 4
Fold 4 MRRMSE: 2.6045008861773526
Fold 5
Fold 5 MRRMSE: 3.208157134991823
Average MRRMSE: 1021242284928.9215
Final MRRMSE on CV set: 2.3032552529011285


In [12]:
#from sklearn.svm import LinearSVR
#from sklearn.multioutput import MultiOutputRegressor
#from sklearn.model_selection import cross_val_score

#fold_mrrmse = []
#for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
#		print(f'Fold {fold+1}')

#		X_train_fold, y_train_fold = X_train[train_index], y_train[train_index]
#		X_val_fold, y_val_fold = X_train[val_index], y_train[val_index]

#		model = LinearSVR()
#		# define the direct multioutput wrapper model
#		wrapper = MultiOutputRegressor(model)

#		scores = cross_val_score(wrapper, X_train_fold, y_train_fold, cv=5, scoring='neg_mean_squared_error')
#		fold_mrrmse.append(np.sqrt(-scores.mean()))
#		print(f'Fold {fold+1} MRRMSE: {fold_mrrmse[-1]}')


#avg_mrrmse = np.mean(fold_mrrmse)
#print(f'Average MRRMSE: {avg_mrrmse}')

## Averaging predictions
#y_cv_pred = model.predict(X_cv)

#final_mrrmse = mrrmse(y_cv, y_cv_pred)
#print(f'Final MRRMSE on CV set: {final_mrrmse}')


In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error

def mrrmse(y_true, y_pred):
    return np.sqrt(np.mean(np.square(y_true - y_pred))).mean()

# Assuming de_features and genes are defined elsewhere in your script
X_train, X_cv, y_train, y_cv = train_test_split(de_features.values, de_train[genes].values, test_size=0.2, random_state=6174)

# Define the number of components for Truncated SVD (if needed)
n_components = 100  # Adjust based on your dataset

# KFold setup
kf = KFold(n_splits=5, shuffle=True, random_state=6174)
fold_mrrmse = []

# Hyperparameter grid (example, adjust according to your needs)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'subsample': [0.7, 0.8, 1.0]
}

for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    print(f'Fold {fold+1}')

    X_train_fold, y_train_fold = X_train[train_index], y_train[train_index]
    X_val_fold, y_val_fold = X_train[val_index], y_train[val_index]

    # Initialize XGBoost model
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)

    # Hyperparameter tuning
    grid_search = RandomizedSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    grid_search.fit(X_train_fold, y_train_fold)

    # Best model for the current fold
    best_model = grid_search.best_estimator_

    y_val_pred = best_model.predict(X_val_fold)
    fold_mrrmse.append(mrrmse(y_val_fold, y_val_pred))
    print(f'Fold {fold+1} MRRMSE: {fold_mrrmse[-1]}')

avg_mrrmse = np.mean(fold_mrrmse)
print(f'Average MRRMSE: {avg_mrrmse}')

# Retrain the model on the entire training set with the best parameters
final_model = xgb.XGBRegressor(**grid_search.best_params_, objective='reg:squarederror', random_state=0)
final_model.fit(X_train, y_train)

# Make predictions on the cross-validation set
y_cv_pred = final_model.predict(X_cv)

final_mrrmse = mrrmse(y_cv, y_cv_pred)
print(f'Final MRRMSE on CV set: {final_mrrmse}')


Fold 1
Fitting 3 folds for each of 10 candidates, totalling 30 fits


exception calling callback for <Future at 0x7f441510d490 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/joblib/externals/loky/_base.py", line 26, in _invoke_callbacks
    callback(self)
  File "/usr/local/lib/python3.9/dist-packages/joblib/parallel.py", line 385, in __call__
    self.parallel.dispatch_next()
  File "/usr/local/lib/python3.9/dist-packages/joblib/parallel.py", line 834, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/usr/local/lib/python3.9/dist-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.9/dist-packages/joblib/parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py", line 556, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "/usr/local/lib/p

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [9]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.2.0-py3-none-any.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.7/484.7 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting category-encoders>=2.4.0
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting plotly>=5.0.0
  Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scipy~=1.10.1
  Downloading scipy-1.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.5/34.5 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pyod>=1.0.8
  Downloading pyod-1.1.2.tar.gz (160 kB)
[2K     [90m━━━━━━━━━

In [17]:
from pycaret.regression import *
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming de_features and genes are defined elsewhere in your script
X_train, X_cv, y_train, y_cv = train_test_split(de_features.values, de_train[genes].values, test_size=0.2, random_state=6174)

# Convert X_train to DataFrame for compatibility with PyCaret
X_train_df = pd.DataFrame(X_train)
X_cv_df = pd.DataFrame(X_cv)

# Function to calculate MRRMSE
def mrrmse(y_true, y_pred):
		return np.sqrt(np.mean(np.square(y_true - y_pred)))

# Initialize an empty list to store MRRMSE for each target
mrrmse_scores = []

# Loop over each target variable
for i in range(y_train.shape[1]):
		print(f"Modeling for Target {i+1}/{y_train.shape[1]}")

		# Prepare the dataset for the current target
		train_data = X_train_df.copy()
		train_data['Target'] = y_train[:, i]

		# Setup the PyCaret environment for the current target
		reg_experiment = setup(data=train_data, target='Target', verbose=False, fold=5, fold_strategy='kfold', session_id=6174)

		# Compare models to find the best one for the current target
		best_model = compare_models(n_select=1)  # Select the top model

		# Tune the hyperparameters of the best model
		tuned_model = tune_model(best_model)

		# Finalize the model and make predictions on the unseen data
		final_model = finalize_model(tuned_model)
		unseen_predictions = predict_model(final_model, data=X_cv_df)

		# Calculate MRRMSE for the current target
		current_mrrmse = mrrmse(y_cv[:, i], unseen_predictions['Label'])
		mrrmse_scores.append(current_mrrmse)
		print(f'Target {i+1} MRRMSE: {current_mrrmse}\n')

# Calculate average MRRMSE across all targets
average_mrrmse = sum(mrrmse_scores) / len(mrrmse_scores)
print(f'Average MRRMSE on CV set: {average_mrrmse}')


Modeling for Target 1/18211


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.7751,2.9559,1.4228,-0.1149,0.4062,2.8157,0.009
en,Elastic Net,0.7751,2.9559,1.4228,-0.1149,0.4062,2.8157,0.01
llar,Lasso Least Angle Regression,0.7751,2.9559,1.4228,-0.1149,0.4062,2.8157,0.009
br,Bayesian Ridge,0.7715,2.7472,1.3971,-0.1859,0.41,3.1377,0.014
huber,Huber Regressor,0.8503,2.8826,1.4593,-0.4834,0.4236,3.5367,0.022
knn,K Neighbors Regressor,0.8125,3.0953,1.5385,-0.5965,0.4464,2.6116,0.009
ridge,Ridge Regression,0.9107,3.0899,1.5212,-0.6572,0.4405,4.3033,0.011
ada,AdaBoost Regressor,0.8855,3.0588,1.5105,-0.7478,0.4237,4.5285,0.031
par,Passive Aggressive Regressor,1.0223,3.2352,1.5824,-0.7571,0.4735,5.2893,0.011
rf,Random Forest Regressor,0.7567,3.0192,1.4898,-1.2734,0.4415,3.8598,0.15


KeyboardInterrupt: 

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZSWIM5,ZSWIM6,ZSWIM7,ZSWIM8,ZSWIM9,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.104720,-0.077524,-1.625596,-0.144545,0.143555,0.073229,-0.016823,0.101717,-0.005153,1.043629,...,0.299807,0.319123,0.179530,0.220086,-0.206053,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.884380,0.371834,-0.081677,-0.498266,0.203559,0.604656,0.498592,-0.317184,0.375550,...,0.091576,0.717595,1.262570,0.357003,-0.168803,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,-0.480681,0.467144,-0.293205,-0.005098,0.214918,...,-0.590645,-0.542832,0.225485,0.131672,-0.393695,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,0.718590,-0.162145,0.157206,-3.654218,-0.212402,...,0.760570,-0.217246,-0.203936,2.060546,0.899520,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,2.022829,0.600011,1.231275,0.236739,0.338703,...,1.005788,0.106344,-0.145054,0.965736,0.248029,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,Atorvastatin,LSM-5771,CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F...,False,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,-0.544709,0.282458,-0.431359,-0.364961,0.043123,...,0.092460,-0.960509,0.000051,-0.626368,-0.261534,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,NK cells,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,-0.455549,0.188181,0.595734,-0.100299,0.786192,0.090954,0.169523,0.428297,0.106553,0.435088,...,0.883842,0.611697,-0.538152,0.047483,-0.602049,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,T cells CD4+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.338168,-0.109079,0.270182,-0.436586,-0.069476,-0.061539,0.002818,-0.027167,-0.383696,0.226289,...,0.169480,-0.084077,0.697416,0.225507,0.063579,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,T cells CD8+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,-0.706087,-0.620919,-1.485381,0.059303,-0.032584,...,-1.149889,-0.977296,0.369929,0.625152,-0.885209,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


Columns Check True


Unnamed: 0,B cells,Myeloid cells,NK cells,T cells CD4+,T cells CD8+,T regulatory cells,-c1ccc,-c1cccc,-c2[nH]c,-c2[nH]nc3ccc,-c2cc,-c2cc3c,-c2cc3nccc,-c2cc3nccn3c,-c2ccc,...,ncnc3c2,nn12,nn1Cc1ccnc,nnc5C,no1,no2,noc1C,noc4C,o1,oc6,on4,s2,s3,sc2cc,sc3cc
0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
610,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
611,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
612,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,B cells,Myeloid cells,NK cells,T cells CD4+,T cells CD8+,T regulatory cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,ABT-199 (GDC-0199),ABT737,AMD-070 (hydrochloride),AT 7867,AT13387,AVL-292,AZ628,AZD-8330,...,TR-14035,Tacalcitol,Tamatinib,Tipifarnib,Tivantinib,Tivozanib,Topotecan,Tosedostat,Trametinib,UNII-BXU45ZH6LI,Vandetanib,Vanoxerine,Vardenafil,Vorinostat,YK 4-279
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
611,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
612,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Fold 1


Fold 1
Epoch 0, MRRMSE: 1.0047153499962092
Epoch 1, MRRMSE: 1.0084927101677392
Epoch 2, MRRMSE: 1.0426744226083833
Epoch 3, MRRMSE: 1.23896465885814
Epoch 4, MRRMSE: 1.1638127635054814
Epoch 5, MRRMSE: 1.4890449759182902
Epoch 6, MRRMSE: 0.9906527371697679
Epoch 7, MRRMSE: 1.059663677998504
Epoch 8, MRRMSE: 0.9552710828090751
Epoch 9, MRRMSE: 0.9986350581653908
Epoch 10, MRRMSE: 1.5427229646140297
Epoch 11, MRRMSE: 1.2187074205755466
Epoch 12, MRRMSE: 1.4005966736794389
Epoch 13, MRRMSE: 1.069363867032207
Epoch 14, MRRMSE: 1.0403135234128698
Epoch 00015: reducing learning rate of group 0 to 1.0000e-04.
Epoch 15, MRRMSE: 1.0790136100145296
Epoch 16, MRRMSE: 1.015792021841304
Epoch 17, MRRMSE: 0.990410492513625
Epoch 18, MRRMSE: 1.102805383445942
Early stopping triggered


Fold 2
Epoch 0, MRRMSE: 0.8553150788339188
Epoch 1, MRRMSE: 0.8402973896161112
Epoch 2, MRRMSE: 0.8196683777440318
Epoch 3, MRRMSE: 0.8879514396524437
Epoch 4, MRRMSE: 0.9357821670989924
Epoch 5, MRRMSE: 1.019059323812

Ensemble CV MRRMSE: 1.100290298461914
