# In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool

# Run-wide settings, looked up by key throughout the script.
CFG = dict(
    NBITS=2048,     # not referenced anywhere in the visible code
    SEED=42,        # seed for the RNGs and the train/validation split
    BATCH_SIZE=32,  # graphs per mini-batch in every DataLoader
    LR=1e-3,        # Adam learning rate
    EPOCHS=10,      # number of passes over the training set
)

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable and, when CUDA is available, seeds every CUDA device.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The three CPU-side generators take the same integer seed.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Fix all random number generators before any data or model is created.
seed_everything(seed=CFG['SEED'])

# Convert SMILES data into graph data
def smiles_to_graph(smiles):
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Every atom becomes a node whose only feature is its atomic number, and
    every bond contributes two directed edges (one per direction).

    Args:
        smiles: SMILES string describing a single molecule.

    Returns:
        A ``Data`` object with ``x`` of shape ``(num_atoms, 1)`` and
        ``edge_index`` of shape ``(2, 2 * num_bonds)``, or ``None`` when
        ``Chem.MolFromSmiles`` returns ``None`` for ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Node (atom) features: the atomic number only.
    atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    # view(-1, 1) keeps the (num_atoms, 1) layout even for an empty list.
    atom_features = torch.tensor(atomic_numbers, dtype=torch.float).view(-1, 1)

    # Edges (bonds): store each bond in both directions.
    sources, targets = [], []
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        sources += [begin, end]
        targets += [end, begin]
    # Building the tensor from two rows yields shape (2, 0) for molecules
    # without bonds; transposing an empty list of pairs would instead give a
    # 1-D tensor of shape (0,), which cannot be used as an edge index.
    edge_index = torch.tensor([sources, targets], dtype=torch.long)

    return Data(x=atom_features, edge_index=edge_index)

# Load a CSV file and convert its molecules into graphs
def load_data(file_path):
    """Read a CSV file and turn every parseable SMILES row into a graph.

    Rows for which ``smiles_to_graph`` returns ``None`` are skipped. When the
    file has a ``pIC50`` column, the target is also stored on each graph as
    ``graph.y`` (shape ``(1,)``) so that mini-batches carry their labels.

    Args:
        file_path: Path of a CSV file with a ``Smiles`` column and,
            optionally, a ``pIC50`` column.

    Returns:
        A tuple ``(graphs, labels)``: the list of graphs and a float tensor
        with the ``pIC50`` value of every kept row. ``labels`` is empty when
        the file has no ``pIC50`` column.
    """
    data = pd.read_csv(file_path)
    # Unlabelled files (no 'pIC50' column) still yield graphs, without targets.
    has_labels = 'pIC50' in data.columns
    targets = data['pIC50'] if has_labels else [None] * len(data)

    graphs = []
    labels = []
    for smiles, target in zip(data['Smiles'], targets):
        graph = smiles_to_graph(smiles)
        if graph is None:
            continue
        if has_labels:
            # The training and evaluation loops read the target from data.y.
            graph.y = torch.tensor([target], dtype=torch.float)
            labels.append(target)
        graphs.append(graph)
    return graphs, torch.tensor(labels, dtype=torch.float)

# Build the graph datasets; the labels returned for the test file are discarded.
train_graphs, train_labels = load_data('train.csv')
test_graphs, _ = load_data('test.csv')

# Split into training and validation data (30% held out for validation).
train_graphs, val_graphs, train_labels, val_labels = train_test_split(
    train_graphs,
    train_labels,
    test_size=0.3,
    random_state=CFG['SEED'],
)

# PyTorch Geometric DataLoaders: only the training loader is shuffled.
train_loader = DataLoader(
    train_graphs,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=True,
)
val_loader = DataLoader(
    val_graphs,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
)

# GCN model definition
class GCNModel(torch.nn.Module):
    """Two-layer GCN regressor producing one prediction per graph.

    Node features pass through two ``GCNConv`` layers with ReLU activations,
    are mean-pooled separately for every graph of the mini-batch, and are
    mapped to a single value by a linear layer.

    Args:
        hidden_channels: Width of both graph-convolution layers.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Node features are 1-dimensional (the atomic number).
        self.conv1 = GCNConv(1, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = torch.nn.Linear(hidden_channels, 1)

    def forward(self, data):
        """Predict one value per graph contained in ``data``.

        Args:
            data: A graph or mini-batch exposing ``x``, ``edge_index`` and,
                for mini-batches, ``batch`` (graph index of every node).

        Returns:
            Tensor of shape ``(num_graphs, 1)``.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))

        # Graph-level readout: average the node embeddings of each graph
        # separately. A plain mean over dim 0 would blend all graphs of the
        # mini-batch into a single vector and yield one prediction per batch.
        batch = getattr(data, 'batch', None)
        if batch is None:
            # A single un-batched graph: every node belongs to graph 0.
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)
        x = global_mean_pool(x, batch)

        # Fully connected output layer.
        return self.fc(x)

# Model training function
def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module called as ``model(batch)``.
        loader: Iterable of batches exposing ``y`` and ``num_graphs``; must
            also provide ``loader.dataset`` with a length.
        optimizer: Optimizer updating the parameters of ``model``.
        criterion: Loss called as ``criterion(output, target)``.

    Returns:
        Sum of each batch loss weighted by its ``num_graphs``, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    # Batches have to live on the same device as the model weights; without
    # this, a model placed on the GPU receives CPU tensors and fails.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    total_loss = 0
    for data in loader:
        if device is not None:
            data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data.y.view(-1, 1))
        loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller last batch is not over-counted.
        total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)

# Validation function
def evaluate(model, loader, criterion):
    """Compute the loss of ``model`` over ``loader`` without updating it.

    Args:
        model: Module called as ``model(batch)``.
        loader: Iterable of batches exposing ``y`` and ``num_graphs``; must
            also provide ``loader.dataset`` with a length.
        criterion: Loss called as ``criterion(output, target)``.

    Returns:
        Sum of each batch loss weighted by its ``num_graphs``, divided by
        ``len(loader.dataset)``.
    """
    model.eval()
    # Batches have to live on the same device as the model weights; without
    # this, a model placed on the GPU receives CPU tensors and fails.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    total_loss = 0
    with torch.no_grad():
        for data in loader:
            if device is not None:
                data = data.to(device)
            output = model(data)
            loss = criterion(output, data.y.view(-1, 1))
            # Weight by batch size so a smaller last batch is not over-counted.
            total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)

# Model setup: prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = GCNModel(hidden_channels=64)
model = model.to(device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=CFG['LR'])

# Training run: one training pass and one validation pass per epoch.
for epoch in range(CFG['EPOCHS']):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)
    print(f'Epoch {epoch+1}/{CFG["EPOCHS"]}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

# Test-set prediction
test_loader = DataLoader(test_graphs, batch_size=CFG['BATCH_SIZE'], shuffle=False)
model.eval()
test_y_pred = []
with torch.no_grad():
    for data in test_loader:
        # The model was moved to `device`, so every batch has to follow it.
        data = data.to(device)
        # Flatten so the concatenated result is a 1-D array of predictions.
        pred = model(data).reshape(-1).cpu().numpy()
        test_y_pred.append(pred)

test_y_pred = np.concatenate(test_y_pred)

# pIC50 -> IC50 conversion
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 value(s) to IC50 by evaluating ``10 ** (9 - pIC50)``.

    Args:
        pic50_values: A number or NumPy array of pIC50 values.

    Returns:
        The converted value(s), with the same shape as the input.
    """
    exponent = 9 - pic50_values
    return 10 ** exponent

# Create the submission file from the sample template.
# NOTE(review): assumes the predictions line up one-to-one with the rows of
# sample_submission.csv; rows skipped while loading test.csv would break
# this -- TODO confirm.
ic50_predictions = pIC50_to_IC50(test_y_pred)
submit = pd.read_csv('sample_submission.csv')
submit['IC50_nM'] = ic50_predictions
submit.to_csv('baseline_submit.csv', index=False)