In [None]:
!pip install rdkit

In [1]:
import warnings # suppress warnings
warnings.filterwarnings('ignore')
#:::::::::::::::::::::::::::::::::::
import os
import gc
import glob
import random
import numpy as np 
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from scipy import stats
from pathlib import Path
from itertools import groupby
#:::::::::::::::::::::::::::::::::::
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.express as px

from rdkit import Chem
from rdkit.Chem import Draw

from sklearn.model_selection import KFold
%matplotlib inline

folder_path = "./input"

In [2]:
de_train = pd.read_parquet(f'{folder_path}/de_train.parquet')
de_train

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.104720,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.884380,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,Atorvastatin,LSM-5771,CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F...,False,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,...,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,NK cells,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,-0.455549,0.188181,0.595734,-0.100299,0.786192,...,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,T cells CD4+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.338168,-0.109079,0.270182,-0.436586,-0.069476,...,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,T cells CD8+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,...,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


In [3]:
id_map = pd.read_csv(f'{folder_path}/id_map.csv', index_col='id')
id_map

Unnamed: 0_level_0,cell_type,sm_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...
1,B cells,ABT-199 (GDC-0199)
2,B cells,ABT737
3,B cells,AMD-070 (hydrochloride)
4,B cells,AT 7867
...,...,...
250,Myeloid cells,Vandetanib
251,Myeloid cells,Vanoxerine
252,Myeloid cells,Vardenafil
253,Myeloid cells,Vorinostat


In [4]:
genes = de_train.columns[5:]
genes

Index(['A1BG', 'A1BG-AS1', 'A2M', 'A2M-AS1', 'A2MP1', 'A4GALT', 'AAAS', 'AACS',
       'AAGAB', 'AAK1',
       ...
       'ZUP1', 'ZW10', 'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11B',
       'ZYX', 'ZZEF1'],
      dtype='object', length=18211)

In [5]:
def add_columns(de_train, id_map):
    sm_lincs_id = de_train.set_index('sm_name')["sm_lincs_id"].to_dict()
    sm_name_to_smiles = de_train.set_index('sm_name')['SMILES'].to_dict()

    id_map['sm_lincs_id'] = id_map['sm_name'].map(sm_lincs_id)
    id_map['SMILES'] = id_map['sm_name'].map(sm_name_to_smiles)
    
    return id_map

add_columns(de_train, id_map)

Unnamed: 0_level_0,cell_type,sm_name,sm_lincs_id,SMILES
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,LSM-47134,Cc1nc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc2n1C(C)C
1,B cells,ABT-199 (GDC-0199),LSM-45468,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...
2,B cells,ABT737,LSM-1180,CN(C)CC[C@H](CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)...
3,B cells,AMD-070 (hydrochloride),LSM-45591,NCCCCN(Cc1nc2ccccc2[nH]1)[C@H]1CCCc2cccnc21
4,B cells,AT 7867,LSM-1155,Clc1ccc(C2(c3ccc(-c4cn[nH]c4)cc3)CCNCC2)cc1
...,...,...,...,...
250,Myeloid cells,Vandetanib,LSM-1199,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1
251,Myeloid cells,Vanoxerine,LSM-2703,Fc1ccc(C(OCCN2CCN(CCCc3ccccc3)CC2)c2ccc(F)cc2)cc1
252,Myeloid cells,Vardenafil,LSM-2292,CCCc1nc(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(CC...
253,Myeloid cells,Vorinostat,LSM-3828,O=C(CCCCCCC(=O)Nc1ccccc1)NO


In [35]:
kf = KFold(n_splits=6, shuffle=True, random_state=6174)

# Create k-fold columns
kf_val_index = []
for fold, (train_index, test_index) in enumerate(kf.split(de_train)):
	kf_val_index.append(test_index)
kf_val_index

[array([  6,  14,  24,  28,  33,  50,  53,  60,  67,  77,  86,  92,  95,
         98, 106, 114, 119, 120, 122, 126, 129, 133, 136, 140, 145, 157,
        169, 171, 176, 179, 188, 195, 196, 197, 200, 202, 206, 211, 213,
        243, 245, 259, 262, 270, 271, 273, 274, 298, 310, 311, 315, 316,
        317, 321, 325, 326, 345, 354, 355, 357, 363, 365, 366, 371, 389,
        394, 395, 402, 427, 428, 430, 432, 438, 445, 447, 450, 458, 461,
        473, 474, 476, 489, 490, 502, 508, 529, 531, 544, 556, 559, 563,
        577, 579, 585, 589, 592, 595, 596, 597, 599, 603, 607, 608]),
 array([  4,   7,  13,  21,  23,  37,  39,  42,  49,  57,  59,  66,  68,
         75,  78,  83,  88,  94, 100, 103, 105, 112, 113, 123, 135, 138,
        146, 161, 164, 175, 180, 185, 193, 194, 199, 234, 235, 236, 238,
        251, 254, 257, 290, 292, 294, 300, 301, 308, 318, 328, 336, 341,
        346, 347, 350, 351, 368, 369, 375, 376, 377, 393, 407, 416, 418,
        420, 422, 423, 425, 433, 434, 435, 436, 439, 4

In [17]:
xlist  = ['cell_type','sm_name']
_ylist = ['cell_type','sm_name','sm_lincs_id','SMILES','control']

y = de_train.drop(columns=_ylist)

train = pd.get_dummies(de_train[xlist], columns=xlist)
test = pd.get_dummies(id_map[xlist], columns=xlist)

In [18]:
uncommon = [f for f in train if f not in test]
len(uncommon)

21

In [19]:
X = train.drop(columns=uncommon)
X.shape, test.shape

((614, 131), (255, 131))

In [20]:
def mrrmse_pd(y_pred: pd.DataFrame, y_true: pd.DataFrame):
	return ((y_pred - y_true)**2).mean(axis=1).apply(np.sqrt).mean()

def mrrmse_np(y_pred, y_true):
  return np.sqrt(np.square(y_true - y_pred).mean(axis=1)).mean()

In [21]:
de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))]
de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))]

de_cell_type.shape, de_sm_name.shape

((614, 18212), (614, 18212))

In [22]:
de_cell_type

Unnamed: 0,cell_type,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,0.104720,-0.077524,-1.625596,-0.144545,0.143555,0.073229,-0.016823,0.101717,-0.005153,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,0.915953,-0.884380,0.371834,-0.081677,-0.498266,0.203559,0.604656,0.498592,-0.317184,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,T cells CD8+,-0.387721,-0.305378,0.567777,0.303895,-0.022653,-0.480681,0.467144,-0.293205,-0.005098,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,T regulatory cells,0.232893,0.129029,0.336897,0.486946,0.767661,0.718590,-0.162145,0.157206,-3.654218,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,NK cells,4.290652,-0.063864,-0.017443,-0.541154,0.570982,2.022829,0.600011,1.231275,0.236739,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,-0.544709,0.282458,-0.431359,-0.364961,...,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,NK cells,-0.455549,0.188181,0.595734,-0.100299,0.786192,0.090954,0.169523,0.428297,0.106553,...,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,T cells CD4+,0.338168,-0.109079,0.270182,-0.436586,-0.069476,-0.061539,0.002818,-0.027167,-0.383696,...,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,T cells CD8+,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,-0.706087,-0.620919,-1.485381,0.059303,...,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


In [23]:
mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()

display(mean_cell_type)
display(mean_sm_name)

Unnamed: 0,cell_type,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,B cells,1.38089,0.530585,1.340812,1.594307,4.927551,3.613191,0.02864,0.544636,0.723079,...,0.257778,0.674977,0.217386,1.439374,0.952903,0.581303,0.637408,0.517737,-0.207092,0.079199
1,Myeloid cells,1.570336,0.752564,-2.856826,0.887845,6.658911,4.034911,0.442943,0.403543,0.196285,...,-0.270423,-0.103318,-1.307952,-0.166312,1.883588,0.612681,-0.583563,-0.427938,-0.292768,-0.067723
2,NK cells,0.417735,0.409016,-0.224808,-0.425929,0.282997,1.324508,0.050034,0.405179,-0.065836,...,0.273242,-0.114225,0.019743,0.228173,0.226144,0.366789,0.205059,0.27148,-0.262843,-0.095723
3,T cells CD4+,0.020208,0.116092,0.107412,-0.327098,-0.034363,0.734447,-0.185652,0.279729,0.218412,...,-0.054699,0.262059,0.090887,0.173507,0.437869,0.169841,0.185947,0.084912,-0.14418,-0.037697
4,T cells CD8+,0.028166,-0.063453,0.019265,0.038879,0.138214,-0.090901,0.024953,0.086375,-0.220386,...,0.091154,-0.026132,-0.161742,0.080242,0.226675,0.061512,0.116773,-0.004632,-0.087825,-0.140895
5,T regulatory cells,0.733468,0.442569,0.89868,1.081128,1.145535,1.118455,0.058883,0.585039,-0.005645,...,-0.006385,0.48075,0.251817,0.495385,0.875552,0.873839,0.50152,0.315104,-0.125323,-0.030542


Unnamed: 0,sm_name,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,0.300267,-0.112432,0.413144,1.468632,0.733237,0.722462,0.125359,0.210903,-0.876761,...,-0.769578,-0.690020,0.303616,0.260685,0.555278,0.837875,0.444535,0.432414,-0.219858,0.551906
1,ABT-199 (GDC-0199),-0.081286,0.007314,0.081242,-0.125777,0.219469,0.258288,-0.160568,0.023898,0.317472,...,0.430786,0.094845,-0.088646,-0.140509,0.216322,-0.065943,0.113272,-0.181743,0.068095,-0.093228
2,ABT737,0.408012,0.322574,0.107448,-0.049174,0.422284,1.151523,0.751861,0.189453,-0.121147,...,0.186543,-0.180051,0.028183,0.413515,0.166978,0.327588,0.256550,-0.069630,-0.135967,-0.728025
3,AMD-070 (hydrochloride),-0.031131,0.533648,0.124738,0.241484,-0.017756,0.039647,-0.173965,0.806999,-0.019594,...,-0.100840,0.065319,0.193013,0.310814,0.018807,0.144418,0.000372,0.204476,-0.077820,0.166340
4,AT 7867,0.242736,-0.275840,0.158312,0.267365,-0.003346,0.183553,-0.228290,0.162294,-0.240859,...,-0.704684,-0.088803,-0.120544,-0.337481,0.051235,0.466585,-0.157225,0.622629,0.022401,0.079217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Vandetanib,-0.006076,-0.672747,-0.230338,-0.492947,0.109427,-0.528983,0.030436,0.155058,-0.130232,...,0.113566,-0.056856,-0.375688,-0.098679,-0.203246,-0.313355,0.105695,-0.004788,0.135301,0.254045
142,Vanoxerine,0.188002,-0.459637,0.107419,0.442630,0.288657,0.239626,-0.191168,-0.257659,-0.149061,...,0.023219,-0.114899,0.148367,0.166318,0.220331,-0.095794,-0.191277,0.358461,-0.246971,-0.221379
143,Vardenafil,-0.097319,0.526795,-0.339189,-0.156595,0.021584,-0.269225,-0.216612,-0.447963,0.047921,...,0.044047,0.050485,0.528853,0.162957,0.079624,-0.091698,-0.154987,0.191808,0.325303,-0.385319
144,Vorinostat,0.075208,0.014575,0.454048,-0.009477,0.342563,-0.226654,0.517033,0.845815,1.743839,...,0.320819,0.542119,-0.402185,-0.104306,-0.586294,0.382534,-0.107183,0.212404,-0.029705,0.270488


In [28]:
rows = []
for name in de_cell_type['cell_type']:
	mean_rows = mean_cell_type[mean_cell_type['cell_type'] == name].copy()
	rows.append(mean_rows)

tr_cell_type = pd.concat(rows)
tr_cell_type = tr_cell_type.reset_index(drop=True)

rows = []
for name in de_sm_name['sm_name']:
	mean_rows = mean_sm_name[mean_sm_name['sm_name'] == name].copy()
	rows.append(mean_rows)

tr_sm_name = pd.concat(rows)
tr_sm_name = tr_sm_name.reset_index(drop=True)


rows = []
for name in id_map['cell_type']:
    mean_rows = mean_cell_type[mean_cell_type['cell_type'] == name].copy()
    rows.append(mean_rows)

te_cell_type = pd.concat(rows)
te_cell_type = te_cell_type.reset_index(drop=True)


rows = []
for name in id_map['sm_name']:
    mean_rows = mean_sm_name[mean_sm_name['sm_name'] == name].copy()
    rows.append(mean_rows)

te_sm_name = pd.concat(rows)
te_sm_name = te_sm_name.reset_index(drop=True)


In [29]:
y0 = y.iloc[:, 0].copy()
y0

0      0.104720
1      0.915953
2     -0.387721
3      0.232893
4      4.290652
         ...   
609   -0.014372
610   -0.455549
611    0.338168
612    0.101138
613   -0.757116
Name: A1BG, Length: 614, dtype: float64

In [30]:
X0 = X.join(tr_cell_type.iloc[:, 0+1]).copy()
X0 = X0.join(tr_sm_name.iloc[:, 0+1], lsuffix='_cell_type', rsuffix='_sm_name')
X0

Unnamed: 0,cell_type_B cells,cell_type_Myeloid cells,sm_name_5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,sm_name_ABT-199 (GDC-0199),sm_name_ABT737,sm_name_AMD-070 (hydrochloride),sm_name_AT 7867,sm_name_AT13387,sm_name_AVL-292,sm_name_AZ628,...,sm_name_Tosedostat,sm_name_Trametinib,sm_name_UNII-BXU45ZH6LI,sm_name_Vandetanib,sm_name_Vanoxerine,sm_name_Vardenafil,sm_name_Vorinostat,sm_name_YK 4-279,A1BG_cell_type,A1BG_sm_name
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.417735,0.216461
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.020208,0.216461
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.028166,0.216461
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.733468,0.216461
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.417735,1.330514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.733468,0.280193
610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.417735,-0.193340
611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.020208,-0.193340
612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.028166,-0.193340


In [31]:
test0 = test.join(te_cell_type.iloc[:, 0+1]).copy()
test0 = test0.join(te_sm_name.iloc[:, 0+1], lsuffix='_cell_type', rsuffix='_sm_name')
test0

Unnamed: 0_level_0,cell_type_B cells,cell_type_Myeloid cells,sm_name_5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,sm_name_ABT-199 (GDC-0199),sm_name_ABT737,sm_name_AMD-070 (hydrochloride),sm_name_AT 7867,sm_name_AT13387,sm_name_AVL-292,sm_name_AZ628,...,sm_name_Tosedostat,sm_name_Trametinib,sm_name_UNII-BXU45ZH6LI,sm_name_Vandetanib,sm_name_Vanoxerine,sm_name_Vardenafil,sm_name_Vorinostat,sm_name_YK 4-279,A1BG_cell_type,A1BG_sm_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.380890,0.300267
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.380890,-0.081286
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.380890,0.408012
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1.380890,-0.031131
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1.380890,0.242736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1.570336,-0.006076
251,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1.570336,0.188002
252,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1.570336,-0.097319
253,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1.570336,0.075208


# Model Training

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Create an imputer instance
imputer = SimpleImputer(strategy='mean')  # You can choose mean, median, or most_frequent

model0 = lgb.LGBMRegressor()
model1 = KNeighborsRegressor(n_neighbors=13)
model2 = LinearSVR(max_iter= 2000, epsilon= 0.1)

tr_x, te_x, tr_y, te_y = train_test_split(X, y, test_size=0.2, random_state=6174)

preds0 = []
preds1 = []
preds2 = []
preds = []

test_preds = []

for i in range(tr_y.shape[1]):
	tr_yi = tr_y.iloc[:, i].copy()
	te_yi = te_y.iloc[:, i].copy()

	tr_xi = tr_x.join(tr_cell_type.iloc[:, i+1], lsuffix='_', rsuffix='__').copy()
	tr_xi = tr_xi.join(tr_sm_name.iloc[:, i+1], lsuffix='_cell_type', rsuffix='_sm_name')
	tr_xi = imputer.fit_transform(tr_xi)
	
	te_xi = te_x.join(tr_cell_type.iloc[:, i+1], lsuffix='_', rsuffix='__').copy()
	te_xi = te_xi.join(tr_sm_name.iloc[:, i+1], lsuffix='_cell_type', rsuffix='_sm_name')
	te_xi = imputer.fit_transform(te_xi)
	
	model0.fit(tr_xi, tr_yi)
	model1.fit(tr_xi, tr_yi)
	model2.fit(tr_xi, tr_yi)
	
	pred0 = model0.predict(te_xi)
	pred1 = model1.predict(te_xi)
	pred2 = model2.predict(te_xi)
	
	pred = (pred0 + pred1 + pred2) / 3

	preds0.append(pred0)
	preds1.append(pred1)
	preds2.append(pred2)
	preds.append(pred)

	test_xi = test.join(te_cell_type.iloc[:, i+1], lsuffix='_', rsuffix='__').copy()
	test_xi = test_xi.join(te_sm_name.iloc[:, i+1], lsuffix='_cell_type', rsuffix='_sm_name')
	test_xi = imputer.fit_transform(test_xi)
	test_pred = (model0.predict(test_xi) + model1.predict(test_xi) + model2.predict(test_xi)) / 3
	test_preds.append(test_pred)

print(fold, "LGB", mrrmse_np(preds0, te_y))
print(fold, "KNN", mrrmse_np(preds1, te_y))
print(fold, "SVR", mrrmse_np(preds2, te_y))

print(fold, "ENS", mrrmse_np(preds, te_y))
	

de_train = pd.read_parquet('../input/open-problems-single-cell-perturbations/de_train.parquet')
prediction = pd.DataFrame(test_preds).T
prediction.columns = de_train.columns[5:]
prediction.index.name = 'id'
prediction.to_csv('submission.csv')

	


ModuleNotFoundError: No module named 'lightgbm'

In [41]:
def split_sign(text):
	text = text.replace(')(', ' ')
	text = text.replace('(' , ' ')
	text = text.replace(')' , ' ')
	return text.split(" ")

de_train['_SMILES'] = [split_sign(text) for text in de_train['SMILES'].values]
de_train['_SMILES']

0             [Clc1ccccc1C, c1ccccc1, c1ccccc1, n1ccnc1]
1             [Clc1ccccc1C, c1ccccc1, c1ccccc1, n1ccnc1]
2             [Clc1ccccc1C, c1ccccc1, c1ccccc1, n1ccnc1]
3             [Clc1ccccc1C, c1ccccc1, c1ccccc1, n1ccnc1]
4      [C[C@@H]1C[C@H]2[C@@H]3CCC4=CC, =O, C=C[C@]4, ...
                             ...                        
609    [CC, C, c1c, C, =O, Nc2ccccc2, c, -c2ccccc2, c...
610    [COC, =O, N, C, c1c, N, nc, -c2nn, Cc3ccccc3F,...
611    [COC, =O, N, C, c1c, N, nc, -c2nn, Cc3ccccc3F,...
612    [COC, =O, N, C, c1c, N, nc, -c2nn, Cc3ccccc3F,...
613    [COC, =O, N, C, c1c, N, nc, -c2nn, Cc3ccccc3F,...
Name: _SMILES, Length: 614, dtype: object

In [43]:
sign = []
for row in de_train['_SMILES'].values:
	for ele in row:
		sign.append(ele)
        
de_train_sign_list = list(set(sign))

data = np.zeros((len(de_train), len(de_train_sign_list)), dtype=int)
de_train_sign = pd.DataFrame(data=data, columns=de_train_sign_list)

for sign in de_train_sign_list:
	for i in range(len(de_train)):
		row = de_train['_SMILES'].values[i]
		
		if (sign in row):
			de_train_sign[sign].iloc[i] = 1
            
de_train_sign

Unnamed: 0,Br,c12,=C,OCCO,Clc1ccc2c,cc12,cc4,N6CCNCC6,Nc3ccccc3S,C[C@]12C=CC,...,C1CCCC1,N1CCCCC1,ccc,C[C@@H]1O[C@@H],C7CC7,-c2ccc3ncnc,c3s2,c3c,Nc1cc,Fc1ccc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
id_map['_SMILES'] = [split_sign(text) for text in id_map['SMILES'].values]

sign = []
for row in id_map['_SMILES'].values:
    for ele in row:
        sign.append(ele)
        
id_map_sign_list = list(set(sign))

data = np.zeros((len(id_map), len(id_map_sign_list)), dtype=int)
id_map_sign = pd.DataFrame(data=data, columns=id_map_sign_list)

for sign in id_map_sign_list:
	for i in range(len(id_map)):
		row = id_map['_SMILES'].values[i]
		
		if (sign in row):
			id_map_sign[sign].iloc[i] = 1
            
id_map_sign

Unnamed: 0,Br,c12,=C,OCCO,Clc1ccc2c,cc12,cc4,Nc3ccccc3S,C[C@]12C=CC,ncc2F,...,C1CCCC1,N1CCCCC1,ccc,C[C@@H]1O[C@@H],C7CC7,-c2ccc3ncnc,c3s2,c3c,Nc1cc,Fc1ccc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
251,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
252,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
253,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
uncommon = [f for f in de_train_sign if f not in id_map_sign]
train_sign = de_train_sign.drop(columns=uncommon)
train_sign = train_sign.sort_index(axis = 1)
id_map_sign = id_map_sign.sort_index(axis = 1)

X = X.join(train_sign).copy()
test = test.join(id_map_sign).copy()

In [None]:
model0 = lgb.LGBMRegressor()
model1 = KNeighborsRegressor(n_neighbors=13)
model2 = LinearSVR(max_iter= 2000, epsilon= 0.1)

kf = KFold(n_splits=6, shuffle=True, random_state=6174)
for fold, (train_index, test_index) in enumerate(kf.split(X, y0)):
	tr_x, te_x = X.iloc[train_index], X.iloc[test_index]
	tr_y, te_y = y0.iloc[train_index], y0.iloc[test_index]
	model0.fit(tr_x, tr_y)
	model1.fit(tr_x, tr_y)
	model2.fit(tr_x, tr_y)
	pred0 = model0.predict(te_x)
	pred1 = model1.predict(te_x)
	pred2 = model2.predict(te_x)
	print(fold, "LGBM", mrrmse_np(pred0, te_y))
	print(fold, "KNN", mrrmse_np(pred1, te_y))
	print(fold, "SVR", mrrmse_np(pred2, te_y))
	pred = (pred0 + pred1 + pred2) / 3
	print(fold, "ENS", mrrmse_np(pred, te_y))
	print()