### Import

In [1]:
!pip install rdkit



In [1]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
CFG = {
    'NBITS':2048,
    'SEED':42,
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

### DataLoad

In [4]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

In [49]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('train_with_descriptors.csv')  # 예시 파일 이름
chembl_data.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles,Molecular_Weight,LogP
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...,995.188,2.7436
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,535.624,2.7209
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,537.596,1.5672
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,545.566,2.4335
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...,936.189,6.49322


### Data Pre-processing 1

In [67]:
train = chembl_data[['Smiles', 'pIC50', 'Molecular_Weight']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)


In [68]:
X_Fingerprint = np.array(train['Fingerprint'].tolist())

X_molecular_weight = np.array(train['Molecular_Weight']).reshape(-1, 1)

In [69]:
train_x = np.hstack((X_Fingerprint, X_molecular_weight))

In [70]:
train_x

array([[  1.   ,   1.   ,   1.   , ...,   0.   ,   0.   , 995.188],
       [  0.   ,   1.   ,   0.   , ...,   0.   ,   0.   , 535.624],
       [  0.   ,   1.   ,   0.   , ...,   0.   ,   0.   , 537.596],
       ...,
       [  0.   ,   0.   ,   0.   , ...,   0.   ,   0.   , 318.239],
       [  0.   ,   0.   ,   0.   , ...,   0.   ,   0.   , 449.536],
       [  0.   ,   1.   ,   0.   , ...,   0.   ,   0.   , 179.219]])

In [71]:
train_y = train['pIC50']

# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

### Data Pre-processing 2

In [76]:
train = chembl_data[['Smiles', 'pIC50', 'Molecular_Weight']]

In [77]:
fingerprints = np.array([smiles_to_fingerprint(smiles) for smiles in train['Smiles']])

# 지문 데이터프레임으로 변환
fingerprints_df = pd.DataFrame(fingerprints, columns=[f'feature_{i}' for i in range(fingerprints.shape[1])])



In [78]:
train = train.drop(columns=['Smiles'])
train = pd.concat([train, fingerprints_df], axis=1)

In [79]:
train

Unnamed: 0,pIC50,Molecular_Weight,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_2038,feature_2039,feature_2040,feature_2041,feature_2042,feature_2043,feature_2044,feature_2045,feature_2046,feature_2047
0,10.66,995.188,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10.59,535.624,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,10.11,537.596,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,10.09,545.566,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,10.00,936.189,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1947,4.52,283.247,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1948,4.52,327.815,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1949,4.52,318.239,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1950,4.38,449.536,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
X = train.drop(columns=['pIC50'])
y = train['pIC50']

In [81]:
train_x, val_x, train_y, val_y = train_test_split(X, y, test_size=0.3, random_state=42)

### Train & Validation

In [82]:
# 랜덤 포레스트 모델 학습
model = RandomForestRegressor(random_state=CFG['SEED'])
model.fit(train_x, train_y)

In [83]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

# Validation 데이터로부터의 학습 모델 평가
val_y_pred = model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

RMSE: 2185.288074422494


### Inference

In [29]:
test = pd.read_csv('./test_with_descriptors.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

X_Fingerprint = np.array(test['Fingerprint'].tolist())

X_molecular_weight = np.array(test['Molecular_Weight']).reshape(-1, 1)

test_x = np.hstack((X_Fingerprint, X_molecular_weight))

test_y_pred = model.predict(test_x)



### Submission

In [31]:
submit = pd.read_csv('./sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,70.07786
1,TEST_001,25.052246
2,TEST_002,17.640058
3,TEST_003,16.919986
4,TEST_004,20.539979


In [32]:
submit.to_csv('./MW_submit.csv', index=False)