In [None]:
# Install the notebook's third-party dependencies into the running kernel (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases — consider pinning.
%pip install -q rdkit pandas numpy scikit-learn matplotlib seaborn tqdm tensorflow tensorflow-gnn

In [None]:
# Colab: connect Google Drive
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime — confirm before running locally.
from google.colab import drive

# Mount point used by the dataset path constants defined further down.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import os
import random
import tensorflow as tf
import tensorflow_gnn as tfgnn
from sklearn.model_selection import train_test_split
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

# Configurations: tunable values read by the cells below.
CFG = {
    # NOTE(review): 'FP_SIZE' and 'RADIUS' are not referenced by the visible cells —
    # presumably left over from a fingerprint-based baseline; confirm before removing.
    'FP_SIZE': 1024,
    'RADIUS': 2,
    'SEED': 42,        # passed to seed_everything() and to train_test_split(random_state=...)
    'BATCH_SIZE': 32,  # batch size applied to the train/val/test datasets
    'LR': 1e-3,        # Adam learning rate
    'EPOCHS': 10       # number of epochs passed to model.fit
}

def seed_everything(seed):
    """Seed Python's ``random``, NumPy and TensorFlow with the same value.

    Args:
        seed: Seed forwarded unchanged to each library's seeding function.
    """
    # Order: standard library, then NumPy, then TensorFlow.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

# Seed every RNG once, before any data splitting or model construction below.
seed_everything(CFG['SEED'])


# Dataset root directories
# NOTE(review): this root ends with '/', and the loading cell appends '/data/...',
# producing a doubled slash — presumably harmless on POSIX paths; confirm.
ROOT_DIR_GOOGLEDRIVE = '/content/drive/MyDrive/Contest/New-Medinine-Dev/'
# NOTE(review): not referenced by the visible cells — presumably the root for runs outside Colab.
ROOT_DIR_LOCAL = '..'

In [None]:
# Convert a SMILES string into graph arrays
def smiles_to_graph(smiles):
    """Parse a SMILES string into node-feature and edge-index arrays.

    Args:
        smiles: SMILES string describing one molecule.

    Returns:
        ``None`` when ``smiles`` is not a string (e.g. the NaN pandas produces
        for an empty CSV cell) or when RDKit cannot parse it. Otherwise a tuple
        ``(atom_features, edge_index)``:

        * ``atom_features`` -- float32 array of shape ``(num_atoms, 1)``
          holding each atom's atomic number.
        * ``edge_index`` -- int32 array of shape ``(2 * num_bonds, 2)``; every
          bond contributes both a ``[begin, end]`` and an ``[end, begin]`` row.
    """
    # Guard before touching RDKit: a non-string input cannot be parsed as SMILES.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # reshape(-1, 1) keeps the array two-dimensional even when there are no atoms.
    atom_features = np.array(
        [[atom.GetAtomicNum()] for atom in mol.GetAtoms()],
        dtype=np.float32,
    ).reshape(-1, 1)

    edge_index = []
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        # Store both directions so the graph is undirected.
        edge_index.append([begin, end])
        edge_index.append([end, begin])
    # reshape(-1, 2) yields shape (0, 2) for a bond-free molecule (single atom,
    # ion) instead of (0,), so transposing still gives a (2, num_edges) array.
    edge_index = np.array(edge_index, dtype=np.int32).reshape(-1, 2)

    return atom_features, edge_index

# Load a CSV and convert every parsable molecule into graph form
def load_data(file_path):
    """Read a CSV of molecules and convert each row's SMILES into a graph.

    The file must contain a ``'Smiles'`` column. A ``'pIC50'`` column is
    optional, so files without targets can be loaded through the same function.
    Rows whose SMILES cannot be converted (``smiles_to_graph`` returns ``None``)
    are skipped, so the result may contain fewer entries than the file has rows.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        Tuple ``(graphs, labels)``. ``graphs`` is a list of whatever
        ``smiles_to_graph`` returned for each kept row. ``labels`` is a NumPy
        array of the matching ``'pIC50'`` values, or ``None`` when the file has
        no ``'pIC50'`` column.
    """
    data = pd.read_csv(file_path)
    has_labels = 'pIC50' in data.columns
    # Placeholder targets keep the loop uniform for files without a label column.
    targets = data['pIC50'] if has_labels else [None] * len(data)

    graphs = []
    labels = []
    # Iterate the two columns directly; cheaper than building a Series per row.
    for smiles, target in zip(data['Smiles'], targets):
        graph = smiles_to_graph(smiles)
        if graph is None:
            continue
        graphs.append(graph)
        labels.append(target)

    return graphs, (np.array(labels) if has_labels else None)

In [None]:
# Build graph lists from the train and test CSVs; the second value returned
# for the test file is discarded.
train_graphs, train_labels = load_data(f'{ROOT_DIR_GOOGLEDRIVE}/data/train.csv')
test_graphs, _ = load_data(f'{ROOT_DIR_GOOGLEDRIVE}/data/test.csv')

# Hold out 20% of the training graphs for validation, seeded for repeatability.
(train_graphs, val_graphs,
 train_labels, val_labels) = train_test_split(
    train_graphs,
    train_labels,
    test_size=0.2,
    random_state=CFG['SEED'],
)


In [None]:
# TensorFlow Dataset
def create_tf_dataset(graphs, labels=None):
    """Wrap graph tuples (and optional targets) in an unbatched ``tf.data.Dataset``.

    Args:
        graphs: Sequence of ``(atom_features, edge_index)`` pairs, where
            ``atom_features`` has one row per atom and ``edge_index`` one
            ``[source, target]`` row per directed edge.
        labels: Optional sequence of numeric targets, one per graph. When
            omitted, every element is paired with a placeholder target of 0.

    Returns:
        A dataset yielding ``(features, target)``. ``features`` is a dict with
        ``'node_features'`` (float32, ``(num_atoms, 1)``), ``'edge_features'``
        (float32 ones, ``(num_edges,)``) and ``'edge_index'`` (int32,
        ``(2, num_edges)``); ``target`` is a float32 scalar.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``graphs``.
    """
    if labels is None:
        # Placeholder targets so labelled and unlabelled datasets share one structure.
        targets = np.zeros(len(graphs), dtype=np.float32)
    else:
        targets = np.asarray(labels, dtype=np.float32).reshape(-1)
        if len(targets) != len(graphs):
            raise ValueError(
                f'labels has {len(targets)} entries but graphs has {len(graphs)}'
            )

    def gen():
        for (atom_features, edge_index), target in zip(graphs, targets):
            # reshape(-1, 2): an edge-free graph may arrive with shape (0,);
            # this turns it into (0, 2) so the transpose matches (2, None).
            edges = np.asarray(edge_index, dtype=np.int32).reshape(-1, 2)
            nodes = np.asarray(atom_features, dtype=np.float32).reshape(-1, 1)
            yield {
                'node_features': nodes,
                'edge_features': np.ones(edges.shape[0], dtype=np.float32),  # Edge weights
                'edge_index': edges.T,
            }, target

    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                'node_features': tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                'edge_features': tf.TensorSpec(shape=(None,), dtype=tf.float32),
                'edge_index': tf.TensorSpec(shape=(2, None), dtype=tf.int32),
            },
            # The target is a float32 scalar whether or not labels were supplied.
            tf.TensorSpec(shape=(), dtype=tf.float32),
        ),
    )

# Graphs differ in atom and bond count, so plain `.batch()` cannot stack them.
# `padded_batch` zero-pads every variable-length axis to the longest graph in
# the batch: padded nodes get feature 0, padded edges get weight 0 in
# 'edge_features' and endpoints (0, 0) in 'edge_index'. Consumers must treat
# zero-weight edges and zero-feature nodes as padding.
train_ds = create_tf_dataset(train_graphs, train_labels).padded_batch(CFG['BATCH_SIZE'])
val_ds = create_tf_dataset(val_graphs, val_labels).padded_batch(CFG['BATCH_SIZE'])
test_ds = create_tf_dataset(test_graphs).padded_batch(CFG['BATCH_SIZE'])

In [None]:
# GCN model definition
class GCNModel(tf.keras.Model):
    """Two-layer graph-convolution regressor over batches of plain graph tensors.

    Built from core TensorFlow ops (``Dense`` plus a dense adjacency matmul)
    because the inputs are ordinary tensors, not a ``tfgnn.GraphTensor``.

    Expected ``inputs`` (a dict, batched along axis 0):

    * ``'node_features'`` -- float, ``[batch, nodes, 1]``
    * ``'edge_features'`` -- float edge weights, ``[batch, edges]``; a weight
      of 0 removes the edge, so zero-padded edges are ignored
    * ``'edge_index'`` -- int, ``[batch, 2, edges]`` (row 0 source, row 1 target)

    Nodes whose features are all zero are treated as padding and excluded from
    pooling. NOTE(review): this also excludes real atoms encoded as 0 (e.g. a
    wildcard atom, presumably) — confirm this is acceptable for the dataset.

    The dense adjacency costs O(batch * nodes^2) memory, which assumes small
    graphs.

    Args:
        hidden_dim: Width of the two graph-convolution layers.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        # Per-node linear maps; neighbourhood mixing happens in `call` by
        # multiplying with the normalised adjacency matrix.
        self.gcn1 = tf.keras.layers.Dense(hidden_dim)
        self.gcn2 = tf.keras.layers.Dense(hidden_dim)
        self.dense = tf.keras.layers.Dense(1)

    @staticmethod
    def _normalized_adjacency(edge_index, edge_weight, num_nodes):
        """Return the row-normalised adjacency with self-loops, ``[batch, nodes, nodes]``."""
        source = tf.one_hot(edge_index[:, 0, :], depth=num_nodes, dtype=edge_weight.dtype)  # [B, E, N]
        target = tf.one_hot(edge_index[:, 1, :], depth=num_nodes, dtype=edge_weight.dtype)  # [B, E, N]
        # adjacency[b, i, j] = summed weight of edges j -> i; zero-weight edges vanish.
        weighted_target = target * edge_weight[..., tf.newaxis]
        adjacency = tf.matmul(weighted_target, source, transpose_a=True)
        # Self-loops keep each node's own features and guarantee degree >= 1.
        adjacency += tf.eye(num_nodes, dtype=edge_weight.dtype)
        degree = tf.reduce_sum(adjacency, axis=-1, keepdims=True)
        return adjacency / degree

    def call(self, inputs, training=False):
        """Predict one value per graph; returns a ``[batch, 1]`` tensor."""
        node_features = inputs['node_features']
        edge_features = tf.cast(inputs['edge_features'], node_features.dtype)
        edge_index = inputs['edge_index']

        num_nodes = tf.shape(node_features)[1]
        adjacency = self._normalized_adjacency(edge_index, edge_features, num_nodes)

        # Graph convolution = linear map per node, then average over neighbours.
        x = tf.matmul(adjacency, self.gcn1(node_features))
        x = tf.nn.relu(x)
        x = tf.matmul(adjacency, self.gcn2(x))

        # Graph pooling: mean over real nodes only (axis 1), never over the batch axis.
        node_mask = tf.cast(
            tf.reduce_any(tf.not_equal(node_features, 0), axis=-1, keepdims=True),
            x.dtype,
        )  # [B, N, 1]
        node_count = tf.maximum(tf.reduce_sum(node_mask, axis=1), 1.0)  # [B, 1]
        x = tf.reduce_sum(x * node_mask, axis=1) / node_count  # [B, hidden_dim]
        x = self.dense(x)
        return x

# Instantiate the model and attach the optimizer and loss
model = GCNModel(hidden_dim=64)
model.compile(
    loss='mean_squared_error',
    optimizer=tf.keras.optimizers.Adam(learning_rate=CFG['LR']),
)

# Train, evaluating on the validation split after every epoch
model.fit(
    train_ds,
    epochs=CFG['EPOCHS'],
    validation_data=val_ds,
)

In [None]:
# Run the trained model over the test dataset
test_y_pred = model.predict(x=test_ds)

In [None]:
# pIC50 -> IC50 conversion
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 using ``10 ** (9 - pIC50)``.

    Args:
        pic50_values: A number or array-like supporting subtraction from an
            int and use as an exponent (e.g. a NumPy array).

    Returns:
        The converted value(s), in the same container type the arithmetic
        produces for the input.
    """
    exponent = 9 - pic50_values
    return 10 ** exponent

# Build the submission file
submit = pd.read_csv('sample_submission.csv')

# Flatten so the column receives a 1-D vector even when the model emits one
# column per sample.
ic50_pred = pIC50_to_IC50(np.asarray(test_y_pred).reshape(-1))

# Rows with unparsable SMILES are skipped while loading, so the prediction
# count can fall short of the template; fail with a descriptive message.
if len(ic50_pred) != len(submit):
    raise ValueError(
        f'Got {len(ic50_pred)} predictions for {len(submit)} submission rows'
    )

submit['IC50_nM'] = ic50_pred
submit.to_csv('baseline_submit.csv', index=False)