In [46]:
import os
import warnings
warnings.filterwarnings("ignore")

import duckdb
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from rdkit import Chem
from rdkit.Chem import DataStructs, Descriptors, rdFingerprintGenerator

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Data Load

In [2]:
data_dir = "/home/pervinco/Datasets/leash-bio"
save_dir = f"{data_dir}/split_sets"

train_csv = f"{data_dir}/train.csv"
test_csv = f"{data_dir}/test.csv"

train_parquet = f"{data_dir}/train.parquet"
test_parquet = f'{data_dir}/test.parquet'

os.listdir(data_dir)

['test.parquet',
 'train.csv',
 'test.csv',
 'train.parquet',
 'sample_submission.csv']

In [3]:
# data = pd.read_csv(train_csv) ## 전체 데이터가 295246830. 2억 9천 5백 24만 6830개.

In [4]:
con = duckdb.connect()
data = con.query(f"""(SELECT *
                      FROM parquet_scan('{train_parquet}')
                      WHERE binds = 0
                      ORDER BY random()
                      LIMIT 100000)
                      UNION ALL
                      (SELECT *
                      FROM parquet_scan('{train_parquet}')
                      WHERE binds = 1
                      ORDER BY random()
                      LIMIT 100000)""").df()

con.close()

In [5]:
data.shape

(200000, 7)

In [6]:
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,151356204,O=C(Nc1c(F)ccc(Br)c1C(=O)O)OCC1c2ccccc2-c2ccccc21,Nc1ccncc1[N+](=O)[O-],Nc1nnc(S)s1,O=C(N[Dy])c1c(Br)ccc(F)c1Nc1nc(Nc2nnc(S)s2)nc(...,BRD4,0
1,129947405,O=C(N[C@H]1CCC[C@@H]1C(=O)O)OCC1c2ccccc2-c2ccc...,CS(=O)(=O)NC(=O)CCN.Cl,CN(Cc1ccco1)Cc1ccccc1CN,CN(Cc1ccco1)Cc1ccccc1CNc1nc(NCCC(=O)NS(C)(=O)=...,sEH,0
2,236759874,O=C(O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)c1ccc(Br...,Cc1cc(C#N)cnc1N,Nc1nccc(=O)[nH]1,Cc1cc(C#N)cnc1Nc1nc(Nc2nccc(=O)[nH]2)nc(NC(CC(...,BRD4,0
3,157888956,O=C(Nc1cc(Br)c(Cl)cc1C(=O)O)OCC1c2ccccc2-c2ccc...,Nc1ccnc(Cl)c1,Cl.Cl.NCc1ncccc1F,O=C(N[Dy])c1cc(Cl)c(Br)cc1Nc1nc(NCc2ncccc2F)nc...,BRD4,0
4,168414976,O=C(Nc1cc(Cl)cc(C(=O)O)c1)OCC1c2ccccc2-c2ccccc21,Cl.NC[C@H]1CC[C@H](C(N)=O)CC1,Cc1cccc2oc(CCN)nc12.Cl.Cl,Cc1cccc2oc(CCNc3nc(NC[C@H]4CC[C@H](C(N)=O)CC4)...,HSA,0


# EDA

데이터셋은 매우 불균형하다. bind=0이 293656924개, bind=1이 1589906개.

In [7]:
bind_0_count = data[data['binds'] == 0].shape[0]
bind_1_count = data[data['binds'] == 1].shape[0]

print(bind_0_count, bind_1_count)

100000 100000


In [8]:
bb1_set = set(data['buildingblock1_smiles'])
bb2_set = set(data['buildingblock2_smiles'])
bb3_set = set(data['buildingblock3_smiles'])


bb1_bb2_intersection = bb1_set.intersection(bb2_set)
bb1_bb3_intersection = bb1_set.intersection(bb3_set)
bb2_bb3_intersection = bb2_set.intersection(bb3_set)

print(f"Building block 1 & 2 중복 : {'있음' if bb1_bb2_intersection else '없음'}")
print(f"Building block 1 & 3 중복 : {'있음' if bb1_bb3_intersection else '없음'}")
print(f"Building block 2 & 3 중복 : {'있음' if bb2_bb3_intersection else '없음'}")

print(f"Building block 1과 2 사이의 중복된 값: {bb1_bb2_intersection}")
print(f"Building block 1과 3 사이의 중복된 값: {bb1_bb3_intersection}")
print(f"Building block 2와 3 사이의 중복된 값: {bb2_bb3_intersection}")

Building block 1 & 2 중복 : 없음
Building block 1 & 3 중복 : 없음
Building block 2 & 3 중복 : 있음
Building block 1과 2 사이의 중복된 값: set()
Building block 1과 3 사이의 중복된 값: set()
Building block 2와 3 사이의 중복된 값: {'Cc1ccc(-c2cc(C(F)(F)F)nc(OCCN)n2)cc1', 'Cl.Cl.NCc1nc2cnccc2s1', 'Cl.NCCOc1ccc(F)c(F)c1', 'COc1ncc(N)cn1', 'Cl.Nc1cccc2c(=O)cc(-c3nn[nH]n3)oc12', 'Cn1ccc2cc(N)ccc21', 'Nc1nc(-c2cccc([N+](=O)[O-])c2)cs1', 'Cl.NCc1cscc1C(F)(F)F', 'Nc1cc(Cl)cnc1Cl', 'Cl.NCCC1CN(c2ncnc3[nH]ncc23)c2ccccc21', 'COCC1(CN)CCCCC1', 'Cl.Cl.NCC(=O)Nc1nccs1', 'CN(C)c1cccnc1CN', 'COC(=O)c1cc(N)ccc1F', 'Nc1ccc(-c2ncc[nH]2)cc1', 'Nc1cc2cccnc2c2ncccc12', 'Cl.NCc1cc2ccccc2[nH]c1=O', 'COc1ccc(Cl)c(N)c1.Cl', 'CC(C)(C)OC(=O)n1ncc2cc(N)ccc21', 'CC(C)(C)NS(=O)(=O)c1cccc(N)c1', 'Cl.NCCCCF', 'COC(=O)c1cc(N)cc(O)c1', 'NCC1(CO)CCOC1', 'CC1(CCCCN)OCCO1', 'CN1CC(CN)CC1=O', 'Nc1ccc(-n2cncn2)cc1', 'Cl.NCCC1CC1', 'CCN(CCCN)S(C)(=O)=O', 'Cl.NCCCNC(=O)c1ccc(F)cc1', 'CC1(F)CCN(CCN)C1.Cl.Cl', 'Cc1cc(N)ncc1[N+](=O)[O-]', 'COc1ccnc(N)n1', 'Cc1csc(N)n

molecule_smiles는 SMILES 표기법으로 작성된 화학식이자 building block들이 화학적으로 결합한 소분자.  
기계학습을 위해서는 컴퓨터가 이해할 수 있는 형태로 Encoding 해야한다.

In [9]:
mols = [Chem.MolFromSmiles(i) for i in data['molecule_smiles']]
print(mols[0])

<rdkit.Chem.rdchem.Mol object at 0x7fe57c85b890>


In [10]:
def get_fp_with_ao(mol, radius=2, fpSize=2048):
    fpg = rdFingerprintGenerator.GetMorganGenerator(radius=radius,fpSize=fpSize)
    
    ao = rdFingerprintGenerator.AdditionalOutput()
    ao.AllocateAtomCounts() ## 원자의 수
    ao.AllocateAtomToBits() ## 원자가 어떤 비트로 맵핑되는지
    ao.AllocateBitInfoMap()
    
    fp = fpg.GetFingerprint(mol, additionalOutput=ao)
    return fp, ao

In [11]:
fp, ao = get_fp_with_ao(mols[0])
print(fp)
print(ao)

<rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7fe57c876580>
<rdkit.Chem.rdFingerprintGenerator.AdditionalOutput object at 0x7fe668a651d0>


In [12]:
total_bits = fp.GetNumBits()
on_bits = fp.GetNumOnBits()

on_bits_list = list(fp.GetOnBits())

print(f'Total bits: {total_bits}, On Bits: {on_bits}, Sparsity: {on_bits/total_bits}')

Total bits: 2048, On Bits: 64, Sparsity: 0.03125


하나의 분자식에서도 여러 가지 추가 특성들을 계산할 수 있다.

In [13]:
# SMILES 문자열
smiles = 'CC(=O)OC1=CC=CC=C1C(=O)O'

# SMILES를 분자 객체로 변환
mol = Chem.MolFromSmiles(smiles)

# 분자량 계산
mol_weight = Descriptors.ExactMolWt(mol)
print(f"Molecular Weight: {mol_weight:.2f}")

# 로그P (리피드 용해도) 계산
log_p = Descriptors.MolLogP(mol)
print(f"LogP: {log_p:.2f}")

# 수소 결합 주개 계산
h_bond_donor = Descriptors.NumHDonors(mol)
print(f"Number of Hydrogen Bond Donors: {h_bond_donor}")

# 수소 결합 받개 계산
h_bond_acceptor = Descriptors.NumHAcceptors(mol)
print(f"Number of Hydrogen Bond Acceptors: {h_bond_acceptor}")

# 회전 가능한 결합 수 계산
rotatable_bonds = Descriptors.NumRotatableBonds(mol)
print(f"Number of Rotatable Bonds: {rotatable_bonds}")

# TPSA (위상학적 극성 표면적) 계산
tpsa = Descriptors.TPSA(mol)
print(f"Topological Polar Surface Area (TPSA): {tpsa:.2f}")

# 방향족 고리 수 계산
aromatic_rings = Descriptors.NumAromaticRings(mol)
print(f"Number of Aromatic Rings: {aromatic_rings}")

# 헤테로 원자 수 계산
hetero_atoms = Descriptors.NumHeteroatoms(mol)
print(f"Number of Heteroatoms: {hetero_atoms}")

Molecular Weight: 180.04
LogP: 1.31
Number of Hydrogen Bond Donors: 1
Number of Hydrogen Bond Acceptors: 3
Number of Rotatable Bonds: 2
Topological Polar Surface Area (TPSA): 63.60
Number of Aromatic Rings: 1
Number of Heteroatoms: 4


# Preprocessing

```GetMorganGenerator```을 정의할 때 radius의 설정을 바꿔가면서 학습을 해보는 것도 방법이될 수 있을 것.

In [14]:
data['molecule'] = data['molecule_smiles'].apply(Chem.MolFromSmiles) ## 문자열 화학식을 rdkit 객체로 변환한다.

In [15]:
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule
0,151356204,O=C(Nc1c(F)ccc(Br)c1C(=O)O)OCC1c2ccccc2-c2ccccc21,Nc1ccncc1[N+](=O)[O-],Nc1nnc(S)s1,O=C(N[Dy])c1c(Br)ccc(F)c1Nc1nc(Nc2nnc(S)s2)nc(...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c8767b0>
1,129947405,O=C(N[C@H]1CCC[C@@H]1C(=O)O)OCC1c2ccccc2-c2ccc...,CS(=O)(=O)NC(=O)CCN.Cl,CN(Cc1ccco1)Cc1ccccc1CN,CN(Cc1ccco1)Cc1ccccc1CNc1nc(NCCC(=O)NS(C)(=O)=...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876820>
2,236759874,O=C(O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)c1ccc(Br...,Cc1cc(C#N)cnc1N,Nc1nccc(=O)[nH]1,Cc1cc(C#N)cnc1Nc1nc(Nc2nccc(=O)[nH]2)nc(NC(CC(...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c8763c0>
3,157888956,O=C(Nc1cc(Br)c(Cl)cc1C(=O)O)OCC1c2ccccc2-c2ccc...,Nc1ccnc(Cl)c1,Cl.Cl.NCc1ncccc1F,O=C(N[Dy])c1cc(Cl)c(Br)cc1Nc1nc(NCc2ncccc2F)nc...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876890>
4,168414976,O=C(Nc1cc(Cl)cc(C(=O)O)c1)OCC1c2ccccc2-c2ccccc21,Cl.NC[C@H]1CC[C@H](C(N)=O)CC1,Cc1cccc2oc(CCN)nc12.Cl.Cl,Cc1cccc2oc(CCNc3nc(NC[C@H]4CC[C@H](C(N)=O)CC4)...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876900>


In [16]:
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

def compute_fingerprint(mol):
    if mol is None:
        return None
    return fpg.GetFingerprint(mol)

data['fingerprints'] = data['molecule'].apply(compute_fingerprint)

In [17]:
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,fingerprints
0,151356204,O=C(Nc1c(F)ccc(Br)c1C(=O)O)OCC1c2ccccc2-c2ccccc21,Nc1ccncc1[N+](=O)[O-],Nc1nnc(S)s1,O=C(N[Dy])c1c(Br)ccc(F)c1Nc1nc(Nc2nnc(S)s2)nc(...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c8767b0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,129947405,O=C(N[C@H]1CCC[C@@H]1C(=O)O)OCC1c2ccccc2-c2ccc...,CS(=O)(=O)NC(=O)CCN.Cl,CN(Cc1ccco1)Cc1ccccc1CN,CN(Cc1ccco1)Cc1ccccc1CNc1nc(NCCC(=O)NS(C)(=O)=...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876820>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,236759874,O=C(O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)c1ccc(Br...,Cc1cc(C#N)cnc1N,Nc1nccc(=O)[nH]1,Cc1cc(C#N)cnc1Nc1nc(Nc2nccc(=O)[nH]2)nc(NC(CC(...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c8763c0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,157888956,O=C(Nc1cc(Br)c(Cl)cc1C(=O)O)OCC1c2ccccc2-c2ccc...,Nc1ccnc(Cl)c1,Cl.Cl.NCc1ncccc1F,O=C(N[Dy])c1cc(Cl)c(Br)cc1Nc1nc(NCc2ncccc2F)nc...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876890>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,168414976,O=C(Nc1cc(Cl)cc(C(=O)O)c1)OCC1c2ccccc2-c2ccccc21,Cl.NC[C@H]1CC[C@H](C(N)=O)CC1,Cc1cccc2oc(CCN)nc12.Cl.Cl,Cc1cccc2oc(CCNc3nc(NC[C@H]4CC[C@H](C(N)=O)CC4)...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876900>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [18]:
# 분자량 계산 함수
def calculate_mol_weight(mol):
    return Descriptors.ExactMolWt(mol)

# 로그P 계산 함수
def calculate_log_p(mol):
    return Descriptors.MolLogP(mol)

# 수소 결합 주개 계산 함수
def calculate_h_bond_donor(mol):
    return Descriptors.NumHDonors(mol)

# 수소 결합 받개 계산 함수
def calculate_h_bond_acceptor(mol):
    return Descriptors.NumHAcceptors(mol)

# 회전 가능한 결합 수 계산 함수
def calculate_rotatable_bonds(mol):
    return Descriptors.NumRotatableBonds(mol)

# TPSA 계산 함수
def calculate_tpsa(mol):
    return Descriptors.TPSA(mol)

# 방향족 고리 수 계산 함수
def calculate_aromatic_rings(mol):
    return Descriptors.NumAromaticRings(mol)

# 헤테로 원자 수 계산 함수
def calculate_hetero_atoms(mol):
    return Descriptors.NumHeteroatoms(mol)

In [19]:
data['MolWeight'] = data['molecule'].apply(calculate_mol_weight)
data['LogP'] = data['molecule'].apply(calculate_log_p)
data['HBondDonor'] = data['molecule'].apply(calculate_h_bond_donor)
data['HBondAcceptor'] = data['molecule'].apply(calculate_h_bond_acceptor)
data['RotatableBonds'] = data['molecule'].apply(calculate_rotatable_bonds)
data['TPSA'] = data['molecule'].apply(calculate_tpsa)
data['AromaticRings'] = data['molecule'].apply(calculate_aromatic_rings)
data['HeteroAtoms'] = data['molecule'].apply(calculate_hetero_atoms)

In [20]:
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,fingerprints,MolWeight,LogP,HBondDonor,HBondAcceptor,RotatableBonds,TPSA,AromaticRings,HeteroAtoms
0,151356204,O=C(Nc1c(F)ccc(Br)c1C(=O)O)OCC1c2ccccc2-c2ccccc21,Nc1ccncc1[N+](=O)[O-],Nc1nnc(S)s1,O=C(N[Dy])c1c(Br)ccc(F)c1Nc1nc(Nc2nnc(S)s2)nc(...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c8767b0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",741.886865,3.6415,5,14,8,185.67,4,19
1,129947405,O=C(N[C@H]1CCC[C@@H]1C(=O)O)OCC1c2ccccc2-c2ccc...,CS(=O)(=O)NC(=O)CCN.Cl,CN(Cc1ccco1)Cc1ccccc1CN,CN(Cc1ccco1)Cc1ccccc1CNc1nc(NCCC(=O)NS(C)(=O)=...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876820>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",762.185186,1.7476,5,12,15,183.48,3,16
2,236759874,O=C(O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)c1ccc(Br...,Cc1cc(C#N)cnc1N,Nc1nccc(=O)[nH]1,Cc1cc(C#N)cnc1Nc1nc(Nc2nccc(=O)[nH]2)nc(NC(CC(...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c8763c0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",724.019831,2.9035,5,11,9,186.29,4,15
3,157888956,O=C(Nc1cc(Br)c(Cl)cc1C(=O)O)OCC1c2ccccc2-c2ccc...,Nc1ccnc(Cl)c1,Cl.Cl.NCc1ncccc1F,O=C(N[Dy])c1cc(Cl)c(Br)cc1Nc1nc(NCc2ncccc2F)nc...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876890>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",739.915752,5.1634,4,9,8,129.64,4,15
4,168414976,O=C(Nc1cc(Cl)cc(C(=O)O)c1)OCC1c2ccccc2-c2ccccc21,Cl.NC[C@H]1CC[C@H](C(N)=O)CC1,Cc1cccc2oc(CCN)nc12.Cl.Cl,Cc1cccc2oc(CCNc3nc(NC[C@H]4CC[C@H](C(N)=O)CC4)...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876900>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",740.153013,4.27052,5,10,11,172.98,4,14


In [27]:
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_encoded = onehot_encoder.fit_transform(data['protein_name'].values.reshape(-1, 1)).astype(np.uint8)
protein_columns = onehot_encoder.get_feature_names_out(['protein'])
protein_encoded_df = pd.DataFrame(protein_encoded, columns=protein_columns)
data = pd.concat([data, protein_encoded_df], axis=1)

In [28]:
data.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,fingerprints,MolWeight,...,RotatableBonds,TPSA,AromaticRings,HeteroAtoms,protein_BRD4,protein_HSA,protein_sEH,protein_BRD4.1,protein_HSA.1,protein_sEH.1
0,151356204,O=C(Nc1c(F)ccc(Br)c1C(=O)O)OCC1c2ccccc2-c2ccccc21,Nc1ccncc1[N+](=O)[O-],Nc1nnc(S)s1,O=C(N[Dy])c1c(Br)ccc(F)c1Nc1nc(Nc2nnc(S)s2)nc(...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c8767b0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",741.886865,...,8,185.67,4,19,1.0,0.0,0.0,1,0,0
1,129947405,O=C(N[C@H]1CCC[C@@H]1C(=O)O)OCC1c2ccccc2-c2ccc...,CS(=O)(=O)NC(=O)CCN.Cl,CN(Cc1ccco1)Cc1ccccc1CN,CN(Cc1ccco1)Cc1ccccc1CNc1nc(NCCC(=O)NS(C)(=O)=...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876820>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",762.185186,...,15,183.48,3,16,0.0,0.0,1.0,0,0,1
2,236759874,O=C(O)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)c1ccc(Br...,Cc1cc(C#N)cnc1N,Nc1nccc(=O)[nH]1,Cc1cc(C#N)cnc1Nc1nc(Nc2nccc(=O)[nH]2)nc(NC(CC(...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c8763c0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",724.019831,...,9,186.29,4,15,1.0,0.0,0.0,1,0,0
3,157888956,O=C(Nc1cc(Br)c(Cl)cc1C(=O)O)OCC1c2ccccc2-c2ccc...,Nc1ccnc(Cl)c1,Cl.Cl.NCc1ncccc1F,O=C(N[Dy])c1cc(Cl)c(Br)cc1Nc1nc(NCc2ncccc2F)nc...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876890>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",739.915752,...,8,129.64,4,15,1.0,0.0,0.0,1,0,0
4,168414976,O=C(Nc1cc(Cl)cc(C(=O)O)c1)OCC1c2ccccc2-c2ccccc21,Cl.NC[C@H]1CC[C@H](C(N)=O)CC1,Cc1cccc2oc(CCN)nc12.Cl.Cl,Cc1cccc2oc(CCNc3nc(NC[C@H]4CC[C@H](C(N)=O)CC4)...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x7fe57c876900>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",740.153013,...,11,172.98,4,14,0.0,1.0,0.0,0,1,0


In [29]:
data.to_csv("./train.csv", index=False)

# Training

In [45]:
def convert_to_numpy_array(explicit_bit_vect):
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(explicit_bit_vect, arr)
    return arr

protein_onehot = onehot_encoder.fit_transform(data['protein_name'].values.reshape(-1, 1))
fingerprints = [convert_to_numpy_array(fp) for fp in data['fingerprints'].tolist()]
X = [np.concatenate([fp, protein]) for fp, protein in zip(fingerprints, protein_onehot.tolist())]
y = data['binds'].tolist()

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [48]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [49]:
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9068


In [38]:
features = ['MolWeight', 'LogP', 'HBondDonor', 'HBondAcceptor', 'RotatableBonds',
            'TPSA', 'AromaticRings', 'HeteroAtoms', 'protein_BRD4', 'protein_HSA', 'protein_sEH']
X = data[features].values

X_fingerprints = np.array([list(fp) for fp in data['fingerprints']])

X = np.concatenate([X, X_fingerprints], axis=1)

y = data[['protein_BRD4', 'protein_HSA', 'protein_sEH']].values

In [41]:
train_indices, test_indices = train_test_split(range(len(X)), test_size=0.2, random_state=42)

X_train = [X[i] for i in train_indices]
X_test = [X[i] for i in test_indices]
y_train = y[train_indices]
y_test = y[test_indices]

X_train = np.array([np.array(x) for x in X_train])
X_test = np.array([np.array(x) for x in X_test])

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(160000, 2062) (160000, 6)
(40000, 2062) (40000, 6)


In [42]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [43]:
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 1.0000
