In [12]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Basic utilities
import os
import gc
import glob
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.figure_factory as ff
import plotly.express as px

# Scientific computing
from scipy import stats
from itertools import groupby

# Machine Learning
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA


# Set the folder path for data
folder_path = "./input"


In [21]:
de_train = pd.read_parquet(f'{folder_path}/de_train.parquet')
de_train

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.104720,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.884380,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,Atorvastatin,LSM-5771,CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F...,False,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,...,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,NK cells,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,-0.455549,0.188181,0.595734,-0.100299,0.786192,...,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,T cells CD4+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.338168,-0.109079,0.270182,-0.436586,-0.069476,...,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,T cells CD8+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,...,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


In [20]:
id_map = pd.read_csv(f'{folder_path}/id_map.csv', index_col='id')
id_map

Unnamed: 0_level_0,cell_type,sm_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...
1,B cells,ABT-199 (GDC-0199)
2,B cells,ABT737
3,B cells,AMD-070 (hydrochloride)
4,B cells,AT 7867
...,...,...
250,Myeloid cells,Vandetanib
251,Myeloid cells,Vanoxerine
252,Myeloid cells,Vardenafil
253,Myeloid cells,Vorinostat


In [22]:
genes = de_train.columns[5:]
genes

Index(['A1BG', 'A1BG-AS1', 'A2M', 'A2M-AS1', 'A2MP1', 'A4GALT', 'AAAS', 'AACS',
       'AAGAB', 'AAK1',
       ...
       'ZUP1', 'ZW10', 'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11B',
       'ZYX', 'ZZEF1'],
      dtype='object', length=18211)

In [23]:
def add_columns(de_train, id_map):
	sm_lincs_id = de_train.set_index('sm_name')["sm_lincs_id"].to_dict()
	sm_name_to_smiles = de_train.set_index('sm_name')['SMILES'].to_dict()

	id_map['sm_lincs_id'] = id_map['sm_name'].map(sm_lincs_id)
	id_map['SMILES'] = id_map['sm_name'].map(sm_name_to_smiles)
	
	return id_map

add_columns(de_train, id_map)

Unnamed: 0_level_0,cell_type,sm_name,sm_lincs_id,SMILES
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,LSM-47134,Cc1nc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc2n1C(C)C
1,B cells,ABT-199 (GDC-0199),LSM-45468,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...
2,B cells,ABT737,LSM-1180,CN(C)CC[C@H](CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)...
3,B cells,AMD-070 (hydrochloride),LSM-45591,NCCCCN(Cc1nc2ccccc2[nH]1)[C@H]1CCCc2cccnc21
4,B cells,AT 7867,LSM-1155,Clc1ccc(C2(c3ccc(-c4cn[nH]c4)cc3)CCNCC2)cc1
...,...,...,...,...
250,Myeloid cells,Vandetanib,LSM-1199,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1
251,Myeloid cells,Vanoxerine,LSM-2703,Fc1ccc(C(OCCN2CCN(CCCc3ccccc3)CC2)c2ccc(F)cc2)cc1
252,Myeloid cells,Vardenafil,LSM-2292,CCCc1nc(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(CC...
253,Myeloid cells,Vorinostat,LSM-3828,O=C(CCCCCCC(=O)Nc1ccccc1)NO


In [24]:
def mrrmse_pd(y_pred: pd.DataFrame, y_true: pd.DataFrame):
	return ((y_pred - y_true)**2).mean(axis=1).apply(np.sqrt).mean()

def mrrmse_np(y_pred, y_true):
	return np.sqrt(np.square(y_true - y_pred).mean()).mean()

In [25]:
def split_sign(text):
	text = text.replace(')(', ' ')
	text = text.replace('(' , ' ')
	text = text.replace(')' , ' ')
	return text.split(" ")

de_train['_SMILES'] = [split_sign(text) for text in de_train['SMILES'].values]
de_train['_SMILES']

0             [Clc1ccccc1C, c1ccccc1, c1ccccc1, n1ccnc1]
1             [Clc1ccccc1C, c1ccccc1, c1ccccc1, n1ccnc1]
2             [Clc1ccccc1C, c1ccccc1, c1ccccc1, n1ccnc1]
3             [Clc1ccccc1C, c1ccccc1, c1ccccc1, n1ccnc1]
4      [C[C@@H]1C[C@H]2[C@@H]3CCC4=CC, =O, C=C[C@]4, ...
                             ...                        
609    [CC, C, c1c, C, =O, Nc2ccccc2, c, -c2ccccc2, c...
610    [COC, =O, N, C, c1c, N, nc, -c2nn, Cc3ccccc3F,...
611    [COC, =O, N, C, c1c, N, nc, -c2nn, Cc3ccccc3F,...
612    [COC, =O, N, C, c1c, N, nc, -c2nn, Cc3ccccc3F,...
613    [COC, =O, N, C, c1c, N, nc, -c2nn, Cc3ccccc3F,...
Name: _SMILES, Length: 614, dtype: object

In [30]:
sign = []
for row in de_train['_SMILES'].values:
	for ele in row:
		sign.append(ele)
        
de_train_sign_list = list(set(sign))

data = np.zeros((len(de_train), len(de_train_sign_list)), dtype=int)
de_train_sign = pd.DataFrame(data=data, columns=de_train_sign_list)

for sign in de_train_sign_list:
	for i in range(len(de_train)):
		row = de_train['_SMILES'].values[i]
		
		for ele in row:
			if ele == sign:
				de_train_sign[sign][i] += 1
					
de_train_sign

Unnamed: 0,Nc2c,COc1cc2ncn,o1,c1=O,sc3cc,-c2ccc3ncnc,O=C,CN2CCN,Nc1cccc2c1C,n1CC[C@@H],...,cc1Nc1nc,nc2-c2ccc3nccnc3c2,-c2cc3c,nc12,nc2ccccc2c1-c1ccc,-c2cnc,F,CCCCc1oc2ccccc2c1C,Cc5ccccc5,Nc1ncnc2[nH]cnc12
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))]
de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))]

mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()

display(mean_cell_type)
display(mean_sm_name)

Unnamed: 0,cell_type,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,B cells,1.38089,0.530585,1.340812,1.594307,4.927551,3.613191,0.02864,0.544636,0.723079,...,0.257778,0.674977,0.217386,1.439374,0.952903,0.581303,0.637408,0.517737,-0.207092,0.079199
1,Myeloid cells,1.570336,0.752564,-2.856826,0.887845,6.658911,4.034911,0.442943,0.403543,0.196285,...,-0.270423,-0.103318,-1.307952,-0.166312,1.883588,0.612681,-0.583563,-0.427938,-0.292768,-0.067723
2,NK cells,0.417735,0.409016,-0.224808,-0.425929,0.282997,1.324508,0.050034,0.405179,-0.065836,...,0.273242,-0.114225,0.019743,0.228173,0.226144,0.366789,0.205059,0.27148,-0.262843,-0.095723
3,T cells CD4+,0.020208,0.116092,0.107412,-0.327098,-0.034363,0.734447,-0.185652,0.279729,0.218412,...,-0.054699,0.262059,0.090887,0.173507,0.437869,0.169841,0.185947,0.084912,-0.14418,-0.037697
4,T cells CD8+,0.028166,-0.063453,0.019265,0.038879,0.138214,-0.090901,0.024953,0.086375,-0.220386,...,0.091154,-0.026132,-0.161742,0.080242,0.226675,0.061512,0.116773,-0.004632,-0.087825,-0.140895
5,T regulatory cells,0.733468,0.442569,0.89868,1.081128,1.145535,1.118455,0.058883,0.585039,-0.005645,...,-0.006385,0.48075,0.251817,0.495385,0.875552,0.873839,0.50152,0.315104,-0.125323,-0.030542


Unnamed: 0,sm_name,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,0.300267,-0.112432,0.413144,1.468632,0.733237,0.722462,0.125359,0.210903,-0.876761,...,-0.769578,-0.690020,0.303616,0.260685,0.555278,0.837875,0.444535,0.432414,-0.219858,0.551906
1,ABT-199 (GDC-0199),-0.081286,0.007314,0.081242,-0.125777,0.219469,0.258288,-0.160568,0.023898,0.317472,...,0.430786,0.094845,-0.088646,-0.140509,0.216322,-0.065943,0.113272,-0.181743,0.068095,-0.093228
2,ABT737,0.408012,0.322574,0.107448,-0.049174,0.422284,1.151523,0.751861,0.189453,-0.121147,...,0.186543,-0.180051,0.028183,0.413515,0.166978,0.327588,0.256550,-0.069630,-0.135967,-0.728025
3,AMD-070 (hydrochloride),-0.031131,0.533648,0.124738,0.241484,-0.017756,0.039647,-0.173965,0.806999,-0.019594,...,-0.100840,0.065319,0.193013,0.310814,0.018807,0.144418,0.000372,0.204476,-0.077820,0.166340
4,AT 7867,0.242736,-0.275840,0.158312,0.267365,-0.003346,0.183553,-0.228290,0.162294,-0.240859,...,-0.704684,-0.088803,-0.120544,-0.337481,0.051235,0.466585,-0.157225,0.622629,0.022401,0.079217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Vandetanib,-0.006076,-0.672747,-0.230338,-0.492947,0.109427,-0.528983,0.030436,0.155058,-0.130232,...,0.113566,-0.056856,-0.375688,-0.098679,-0.203246,-0.313355,0.105695,-0.004788,0.135301,0.254045
142,Vanoxerine,0.188002,-0.459637,0.107419,0.442630,0.288657,0.239626,-0.191168,-0.257659,-0.149061,...,0.023219,-0.114899,0.148367,0.166318,0.220331,-0.095794,-0.191277,0.358461,-0.246971,-0.221379
143,Vardenafil,-0.097319,0.526795,-0.339189,-0.156595,0.021584,-0.269225,-0.216612,-0.447963,0.047921,...,0.044047,0.050485,0.528853,0.162957,0.079624,-0.091698,-0.154987,0.191808,0.325303,-0.385319
144,Vorinostat,0.075208,0.014575,0.454048,-0.009477,0.342563,-0.226654,0.517033,0.845815,1.743839,...,0.320819,0.542119,-0.402185,-0.104306,-0.586294,0.382534,-0.107183,0.212404,-0.029705,0.270488


In [19]:
def filter_and_concat(df, column_name, filter_values):
	filtered_rows = [df[df[column_name] == value].copy() for value in filter_values]
	return pd.concat(filtered_rows, ignore_index=True)

# Assuming de_cell_type, mean_cell_type, de_sm_name, mean_sm_name, and id_map are predefined DataFrames
tr_cell_type = filter_and_concat(mean_cell_type, 'cell_type', de_cell_type['cell_type'])
tr_sm_name = filter_and_concat(mean_sm_name, 'sm_name', de_sm_name['sm_name'])
te_cell_type = filter_and_concat(mean_cell_type, 'cell_type', id_map['cell_type'])
te_sm_name = filter_and_concat(mean_sm_name, 'sm_name', id_map['sm_name'])

tr_cell_type

Unnamed: 0,cell_type,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,0.417735,0.409016,-0.224808,-0.425929,0.282997,1.324508,0.050034,0.405179,-0.065836,...,0.273242,-0.114225,0.019743,0.228173,0.226144,0.366789,0.205059,0.271480,-0.262843,-0.095723
1,T cells CD4+,0.020208,0.116092,0.107412,-0.327098,-0.034363,0.734447,-0.185652,0.279729,0.218412,...,-0.054699,0.262059,0.090887,0.173507,0.437869,0.169841,0.185947,0.084912,-0.144180,-0.037697
2,T cells CD8+,0.028166,-0.063453,0.019265,0.038879,0.138214,-0.090901,0.024953,0.086375,-0.220386,...,0.091154,-0.026132,-0.161742,0.080242,0.226675,0.061512,0.116773,-0.004632,-0.087825,-0.140895
3,T regulatory cells,0.733468,0.442569,0.898680,1.081128,1.145535,1.118455,0.058883,0.585039,-0.005645,...,-0.006385,0.480750,0.251817,0.495385,0.875552,0.873839,0.501520,0.315104,-0.125323,-0.030542
4,NK cells,0.417735,0.409016,-0.224808,-0.425929,0.282997,1.324508,0.050034,0.405179,-0.065836,...,0.273242,-0.114225,0.019743,0.228173,0.226144,0.366789,0.205059,0.271480,-0.262843,-0.095723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,0.733468,0.442569,0.898680,1.081128,1.145535,1.118455,0.058883,0.585039,-0.005645,...,-0.006385,0.480750,0.251817,0.495385,0.875552,0.873839,0.501520,0.315104,-0.125323,-0.030542
610,NK cells,0.417735,0.409016,-0.224808,-0.425929,0.282997,1.324508,0.050034,0.405179,-0.065836,...,0.273242,-0.114225,0.019743,0.228173,0.226144,0.366789,0.205059,0.271480,-0.262843,-0.095723
611,T cells CD4+,0.020208,0.116092,0.107412,-0.327098,-0.034363,0.734447,-0.185652,0.279729,0.218412,...,-0.054699,0.262059,0.090887,0.173507,0.437869,0.169841,0.185947,0.084912,-0.144180,-0.037697
612,T cells CD8+,0.028166,-0.063453,0.019265,0.038879,0.138214,-0.090901,0.024953,0.086375,-0.220386,...,0.091154,-0.026132,-0.161742,0.080242,0.226675,0.061512,0.116773,-0.004632,-0.087825,-0.140895


In [10]:
tr_cell_type.iloc[:, 1:].isna().any().any()

False

In [11]:
pca = PCA(n_components=200)

#tr_cell_type
tr_cell_type_pca = pca.fit_transform(tr_cell_type.iloc[:, 1:])
tr_cell_type_pca = pd.DataFrame(tr_cell_type_pca, columns=[f'pca_{i}' for i in range(200)])
#tr_cell_type_pca['cell_type'] = tr_cell_type['cell_type']

te_cell_type_pca = pca.transform(te_cell_type.iloc[:, 1:])
te_cell_type_pca = pd.DataFrame(te_cell_type_pca, columns=[f'pca_{i}' for i in range(200)])
#te_cell_type_pca['cell_type'] = te_cell_type['cell_type']

#tr_sm_name
tr_sm_name_pca = pca.fit_transform(tr_sm_name.iloc[:, 1:])
tr_sm_name_pca = pd.DataFrame(tr_sm_name_pca, columns=[f'pca_{i}' for i in range(200)])
#tr_sm_name_pca['sm_name'] = tr_sm_name['sm_name']

te_sm_name_pca = pca.transform(te_sm_name.iloc[:, 1:])
te_sm_name_pca = pd.DataFrame(te_sm_name_pca, columns=[f'pca_{i}' for i in range(200)])
#te_sm_name_pca['sm_name'] = te_sm_name['sm_name']


ValueError: array must not contain infs or NaNs

In [None]:
pca_train = pd.concat([tr_cell_type_pca, tr_sm_name_pca], axis=1)
pca_test = pd.concat([te_cell_type_pca, te_sm_name_pca], axis=1)

pca_train["sm_name"] = tr_sm_name["sm_name"]

In [10]:

n_components = 100
alpha = 5

i = 0
val_df = de_train.loc[val_index[i]].copy()
train_df = de_train.loc[~de_train.index.isin(val_index[i])].copy()

train_pca = pca_train.loc[~pca_train.index.isin(val_index[i])].copy()
val_pca = pca_train.loc[val_index[i]].copy()

pred_df = val_df.copy()
pred_df[genes] = 0

# Model
for gene in tqdm(genes):
    moodel = LinearSVR()
    model.fit(train_pca, train_df[gene])

    pred_df[gene] = model.predict(val_pca)
    id_map[gene] = model.predict(pca_test)

print(f'Fold {i} MRRMSE: {mrrmse_pd(pred_df[genes], val_df[genes]):.6f}')

id_map = id_map.loc[:, genes]
id_map.to_csv('submission.csv')
id_map

ValueError: could not convert string to float: 'T cells CD4+'