In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import pandas as pd
import torch.optim as opt
import sys
import utils
import hyp
from utils import PrioritizedReplayBuffer 
from rdkit import Chem
from rdkit.Chem import QED, AllChem, rdFingerprintGenerator, Crippen, Descriptors, MACCSkeys
from rdkit.Chem.Crippen import MolLogP
from rdkit.Contrib.SA_Score import sascorer
from environment import Molecule
from torch.utils.tensorboard import SummaryWriter
import os
import subprocess
from tqdm import tqdm
import pickle
import catboost

In [5]:
class NoisyLinear(nn.Module):
    """Noisy Linear Layer for exploration"""
    def __init__(self, in_features, out_features, std_init=0.4):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.std_init = std_init
        
        self.weight_mu = nn.Parameter(torch.Tensor(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.Tensor(out_features, in_features))
        self.bias_mu = nn.Parameter(torch.Tensor(out_features))
        self.bias_sigma = nn.Parameter(torch.Tensor(out_features))
        
        self.register_buffer('weight_epsilon', torch.Tensor(out_features, in_features))
        self.register_buffer('bias_epsilon', torch.Tensor(out_features))
        
        self.reset_parameters()
        self.reset_noise()
    
    def reset_parameters(self):
        mu_range = 1 / np.sqrt(self.in_features)
        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.std_init / np.sqrt(self.in_features))
        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.std_init / np.sqrt(self.out_features))
    
    def reset_noise(self):
        epsilon_in = self.scale_noise(self.in_features)
        epsilon_out = self.scale_noise(self.out_features)
        
        self.weight_epsilon.copy_(epsilon_out.outer(epsilon_in))
        self.bias_epsilon.copy_(epsilon_out)
    
    def forward(self, x):
        if self.training:
            weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
            bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
            return F.linear(x, weight, bias)
        else:
            return F.linear(x, self.weight_mu, self.bias_mu)
    
    @staticmethod
    def scale_noise(size):
        x = torch.randn(size)
        return x.sign().mul_(x.abs().sqrt_())

class RainbowDQN(nn.Module):
    """Rainbow DQN Network with Dueling Architecture and Distributional RL"""
    def __init__(self, input_length, output_length, atoms=51, v_min=-10, v_max=10):
        super(RainbowDQN, self).__init__()
        self.atoms = atoms
        self.v_min = v_min
        self.v_max = v_max
        self.output_length = output_length
        
        # Feature extraction
        self.linear_1 = NoisyLinear(input_length, 1024)
        self.linear_2 = NoisyLinear(1024, 512)
        
        # Dueling streams
        self.value_stream = nn.Sequential(
            NoisyLinear(512, 128),
            nn.ReLU(),
            NoisyLinear(128, atoms)
        )
        
        self.advantage_stream = nn.Sequential(
            NoisyLinear(512, 128),
            nn.ReLU(),
            NoisyLinear(128, output_length * atoms)
        )
            
    def forward(self, x):
       
        x = F.relu(self.linear_1(x))
        x = F.relu(self.linear_2(x))
        
        value = self.value_stream(x).view(-1, 1, self.atoms)
        advantage = self.advantage_stream(x).view(-1, self.output_length, self.atoms)
        
        q_dist = value + advantage - advantage.mean(dim=1, keepdim=True)
        return F.softmax(q_dist, dim=2)
    
    def reset_noise(self):
        for module in self.modules():
            if isinstance(module, NoisyLinear):
                module.reset_noise()
    
    def get_q_values(self, x):
        with torch.no_grad():
            dist = self.forward(x)
            support = torch.linspace(self.v_min, self.v_max, self.atoms).to(x.device)
            q_values = (dist * support).sum(dim=2)
            return q_values


In [6]:

REPLAY_BUFFER_CAPACITY = hyp.replay_buffer_size

def predict_activity(smiles: str) -> dict:
    """
    Предсказывает pIC50 и IC50 для молекулы по SMILES-строке.
    
    Аргументы:
        smiles (str): SMILES-представление молекулы
        
    Возвращает:
        dict: Словарь с предсказаниями pIC50 и IC50
        или сообщение об ошибке
    """
    MODEL = None
    COLUMNS = None
    
    # Загрузка модели и списка колонок при первом вызове
    if MODEL is None:
        try:
            with open('./submodel/final_catboost_model.pkl', 'rb') as f:
                MODEL = pickle.load(f)
            with open('./submodel/descriptor_columns.pkl', 'rb') as f:
                COLUMNS = pickle.load(f)
        except Exception as e:
            return {"error": f"Ошибка загрузки модели: {str(e)}"}
    
    # Преобразование SMILES в молекулярный объект
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {"error": "Невалидный SMILES"}
    
    try:
        # Вычисление обычных дескрипторов
        desc_calc = {name: func for name, func in Descriptors.descList}
        row = {}
        
        # Разделение колонок на обычные и MACCS
        regular_cols = [col for col in COLUMNS if not col.startswith('maccs_')]
        maccs_cols = [col for col in COLUMNS if col.startswith('maccs_')]
        
        # Вычисление химических дескрипторов
        for col in regular_cols:
            if col in desc_calc:
                row[col] = desc_calc[col](mol)
            else:
                return {"error": f"Неизвестный дескриптор: {col}"}
        
        # Генерация MACCS-фингерпринтов
        fp = MACCSkeys.GenMACCSKeys(mol)
        for col in maccs_cols:
            bit_idx = int(col.split('_')[1])
            row[col] = 1 if fp.GetBit(bit_idx) else 0
        

        # Создание DataFrame с сохранением порядка колонок
        input_data = pd.DataFrame([row], columns=COLUMNS)
        
        # Предсказание pIC50
        pIC50 = MODEL.predict(input_data)[0]# Здесь модель обучалась на pIC50
        
        # Конвертация в IC50 (в наномолях)
        IC50_M = 10 ** (-pIC50)  # в молях
        IC50_nM = IC50_M * 1e9   # в наномолях

        # Расчет AlogP
        alogp = MolLogP(mol)
        
        return {
            "pIC50": round(pIC50, 6),
            "IC50": IC50_nM,
            "AlogP": round(alogp, 6)
        }
        
    except Exception as e:
        return {"error": f"Ошибка предсказания: {str(e)}"}


def has_pains(mol):
    """Обнаружение PAINS субструктур"""

    pains = [
             # Реакционноспособные
            "C(Cl)(Cl)Cl", "C#N", "N=C=O",
            # PAINS
            "C1COC1", "C=CC=O", "SC(=S)NC",
            # Металлоиды
            "[As]", "[Hg]",
            # Гетероциклы
            "c1nocc1", "c1nsnc1"
        ]
    PAINS_PATTERNS = [Chem.MolFromSmiles(s) for s in pains]
    for pattern in PAINS_PATTERNS:
        if pattern and mol.HasSubstructMatch(pattern):
            return True
    return False

def calculate_bbbp_score(mol):
    """Эвристика для предсказания проницаемости через ГЭБ на основе свойств молекулы"""
    logp = Descriptors.MolLogP(mol)
    mw = Descriptors.MolWt(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    
    # Критерии BBB проницаемости (Clark's rules)
    bbbp = (
        (logp > 1.0) and 
        (logp < 5.0) and 
        (mw < 500) and 
        (hbd <= 3) and 
        (hba <= 8)
    )
    return float(bbbp)

def reward_function(smiles: str, discount_factor: float) -> float:
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return -10.0  # Штраф за невалидные молекулы
    
    # 1. Предсказание активности
    activity = predict_activity(smiles)
    
    
    
    # 2. Ключевые параметры
    ic50 = activity["IC50"]
    pIC50 = activity["pIC50"]
    qed = QED.qed(mol)
    logp = activity["AlogP"]
    sas_score = sascorer.calculateScore(mol)  # Синтезируемость (1-10, меньше = лучше)
    bbbp_score = calculate_bbbp_score(mol)    # BBB проницаемость (0/1)
    
    # 3. Компоненты награды
    # Активность: экспоненциальное усиление низкого IC50
    activity_reward = 10 / (1 + np.exp(0.5 * (ic50 - 100))) 
    
    # Drug-likeness: QED + Lipinski filters
    druglikeness = qed * 0.7 + 0.3 * float(
        Descriptors.MolWt(mol) < 500 and
        Descriptors.NumHDonors(mol) <= 5 and
        Descriptors.NumHAcceptors(mol) <= 10
    )
    
    # Штрафы
    sas_penalty = np.clip((sas_score - 4) / 6, 0, 1)  # Штраф за сложность синтеза
    tox_penalty = 0.0
    if has_pains(mol):  # Проверка PAINS
        tox_penalty = 1.0
    
    # 4. Итоговая награда
    reward = (
        + 5.0 * activity_reward        # Приоритет активности
        + 2.0 * druglikeness           # Качество лекарства
        + 1.5 * bbbp_score             # Критично для ЦНС
        - 3.0 * sas_penalty            # Штраф за сложный синтез
        - 4.0 * tox_penalty            # Жесткий штраф за токсичность
        + 0.5 * (1 / (1 + abs(logp - 3))) # Оптимальный LogP (2-4)
    )
    
    return float(reward * discount_factor)

class QEDRewardMolecule(Molecule):
    
    def __init__(self, discount_factor, **kwargs):
        
        super(QEDRewardMolecule, self).__init__(**kwargs)
        self.discount_factor = discount_factor

    def _reward(self):
        return reward_function(self._state, self.discount_factor)
        
    def _goal_reached(self):
        mol = Chem.MolFromSmiles(self._state)
        if mol is None:
            return 0.0

        activity = predict_activity(self._state)
        IC50 = float(activity['IC50'])
        sas_score = sascorer.calculateScore(mol)  # Синтезируемость (1-10, меньше = лучше)
        bbbp_score = calculate_bbbp_score(mol)    # BBB проницаемость (0/1)
        qed = QED.qed(mol)

        return (
        IC50 < 50 and 
        qed > 0.6 and 
        sas_score < 5 and 
        bbbp_score == 1 and 
        not has_pains(mol)
        )

In [7]:
 class RainbowAgent:
    def __init__(self, input_length, output_length, device, atoms=51, v_min=-10, v_max=10):
        self.device = device
        self.atoms = atoms
        self.v_min = v_min
        self.v_max = v_max
        self.delta_z = (v_max - v_min) / (atoms - 1)
        self.support = torch.linspace(v_min, v_max, atoms).to(device)
        
        # Main and target networks
        self.dqn = RainbowDQN(input_length, output_length, atoms, v_min, v_max).to(device)
        self.target_dqn = RainbowDQN(input_length, output_length, atoms, v_min, v_max).to(device)
        self.target_dqn.load_state_dict(self.dqn.state_dict())
        
        self.replay_buffer = PrioritizedReplayBuffer(hyp.replay_buffer_size)
        self.optimizer = getattr(opt, hyp.optimizer)(self.dqn.parameters(), lr=hyp.learning_rate)
        self.times_of_update = 0
    
    def get_action(self, observations, epsilon_threshold):
        if np.random.uniform() < epsilon_threshold:
            return np.random.randint(0, observations.shape[0])
        
        observations = observations.to(self.device)
        with torch.no_grad():
            q_values = self.dqn.get_q_values(observations).cpu()
        return torch.argmax(q_values).item()
    
    def update_params(self, batch_size, gamma, polyak):
        if len(self.replay_buffer) < batch_size:
            return None
        
        # Sample from prioritized replay buffer
        samples, indices, weights = self.replay_buffer.sample(batch_size)
        weights = weights.to(self.device)
        
        # Unpack batch
        states, _, rewards, next_states, dones = zip(*samples)
        states = torch.stack([torch.FloatTensor(s) for s in states]).to(self.device)
        next_states = torch.stack([torch.FloatTensor(ns) for ns in next_states]).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        
        # Distributional DQN update
        with torch.no_grad():
            # Next state distribution
            next_dist = self.target_dqn(next_states)
            next_q = (next_dist * self.support).sum(2)
            next_actions = next_q.argmax(1)
            
            # Project next distribution
            next_dist = next_dist[range(batch_size), next_actions]
            rewards = rewards.unsqueeze(1).expand_as(next_dist)
            dones = dones.unsqueeze(1).expand_as(next_dist)
            support = self.support.unsqueeze(0).expand_as(next_dist)
            
            Tz = rewards + gamma * support * (1 - dones)
            Tz = Tz.clamp(self.v_min, self.v_max)
            b = (Tz - self.v_min) / self.delta_z
            l = b.floor().long()
            u = b.ceil().long()
            
            offset = torch.linspace(0, (batch_size - 1) * self.atoms, batch_size).long()\
                .unsqueeze(1).expand(batch_size, self.atoms).to(self.device)
            
            proj_dist = torch.zeros(next_dist.size()).to(self.device)
            proj_dist.view(-1).index_add_(0, (l + offset).view(-1), 
                                          (next_dist * (u.float() - b)).view(-1))
            proj_dist.view(-1).index_add_(0, (u + offset).view(-1), 
                                          (next_dist * (b - l.float())).view(-1))

        
        # Current state distribution
        dist = self.dqn(states)
        actions = torch.argmax(self.dqn.get_q_values(states), dim=1)
        dist = dist[range(batch_size), actions]
        
        # Calculate loss
        log_dist = torch.log(dist.clamp(min=1e-5))
        loss = - (proj_dist * log_dist).sum(1)
        weighted_loss = (weights * loss).mean()
        
        # Backpropagation
        self.optimizer.zero_grad()
        weighted_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.dqn.parameters(), 10)
        self.optimizer.step()
        
        # Update priorities
        priorities = loss.detach().cpu().numpy() + 1e-5
        self.replay_buffer.update_priorities(indices, priorities)
        
        # Update target network
        if self.times_of_update % hyp.update_interval == 0:
            with torch.no_grad():
                for param, target_param in zip(self.dqn.parameters(), self.target_dqn.parameters()):
                    target_param.data.copy_(polyak * target_param.data + (1 - polyak) * param.data)
        
        self.times_of_update += 1
        self.dqn.reset_noise()
        self.target_dqn.reset_noise()
        
        return weighted_loss.item()

In [13]:
# Инициализация
TENSORBOARD_LOG = True
TB_LOG_PATH = "./runs/dqn/run2"
episodes = 0
iterations = 25000
num_updates_per_it = 1

initmols = pd.read_csv("./InitMols.csv", sep=';')["Smiles"].to_numpy()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
environment = QEDRewardMolecule(
    discount_factor=hyp.discount_factor,
    atom_types=set(hyp.atom_types),
    init_mols = initmols,
    allow_removal=hyp.allow_removal,
    allow_no_modification=hyp.allow_no_modification,
    allow_bonds_between_rings=hyp.allow_bonds_between_rings,
    allowed_ring_sizes=set(hyp.allowed_ring_sizes),
    max_steps=hyp.max_steps_per_episode,
)

# Rainbow Agent вместо DQN
agent = RainbowAgent(
    input_length=hyp.fingerprint_length + 1,
    output_length=1,
    device=device,
    atoms=51,
    v_min=-10,
    v_max=10
)

# Загрузка весов
# agent.dqn.load_state_dict(torch.load("best_weights.pt"))

if TENSORBOARD_LOG:
    writer = SummaryWriter(TB_LOG_PATH)

environment.initialize()
eps_threshold = 1
best_reward = -1000

In [15]:
eps_threshold = 0.6
agent.dqn.load_state_dict(torch.load('best_weights.pt', weights_only=True))
for it in tqdm(range(iterations)):
    steps_left = hyp.max_steps_per_episode - environment.num_steps_taken
    valid_actions = list(environment.get_valid_actions())
    
    # Подготовка наблюдений
    observations = np.vstack([
        np.append(
            utils.get_fingerprint(act, hyp.fingerprint_length, hyp.fingerprint_radius),
            steps_left
        ) for act in valid_actions
    ])
    
    observations_tensor = torch.Tensor(observations)
    
    # Выбор действия
    action_idx = agent.get_action(observations_tensor, max(0.1, eps_threshold))
    action = valid_actions[action_idx]
    
    # Шаг среды
    result = environment.step(action)
    next_state, reward, done = result
    
    # Сохранение в буфер (с n-step)
    action_fingerprint = np.append(
        utils.get_fingerprint(action, hyp.fingerprint_length, hyp.fingerprint_radius),
        steps_left
    )

    steps_left_next = steps_left - 1 if not done else 0
    next_state_fp = np.append(
        utils.get_fingerprint(action, hyp.fingerprint_length, hyp.fingerprint_radius),
        steps_left_next
    )
    

    # Добавляем переход в буфер 
    agent.replay_buffer.add((
        action_fingerprint, 
        action_idx, 
        reward, 
        next_state_fp, 
        float(done)
    ), n_step=3, gamma=hyp.discount_factor)
    
    # Обновление модели
    if it % hyp.update_interval == 0 and len(agent.replay_buffer) > hyp.batch_size:
        loss = agent.update_params(hyp.batch_size, hyp.gamma, hyp.polyak)
        
        if TENSORBOARD_LOG and loss is not None:
            writer.add_scalar("training/loss", loss, it)
    
    # Обработка завершения эпизода
    if done:
        final_reward = reward
        
        if TENSORBOARD_LOG:
            writer.add_scalar("episode/reward", final_reward, episodes)
            writer.add_scalar("episode/epsilon", eps_threshold, episodes)
        
        # Логирование и сохранение лучшей модели
        if episodes % 10 == 0:
            print(f"Episode {episodes}, Reward: {final_reward:.2f}, Best Reward: {best_reward:.2f}, Eps: {eps_threshold:.3f}")
            mol = Chem.MolFromSmiles(environment._state)
            if mol:
                activity = predict_activity(environment._state)
                IC50 = float(activity['IC50'])
                AlogP = float(activity['AlogP'])
                qed = QED.qed(mol)

                sa = sascorer.calculateScore(mol)
                print(f"  QED: {qed:.3f}, IC50: {IC50:.3f},  AlogP: {AlogP:.3f}, SA: {sa:.3f} Molecule: {environment._state}")
                
        if final_reward > best_reward:
            torch.save(agent.dqn.state_dict(), 'best_weights.pt')
            print(f"Saved best model with reward: {final_reward:.2f}")
            best_reward = final_reward
        
        # Сброс среды и уменьшение epsilon
        environment.initialize()
        eps_threshold = max(hyp.epsilon_end, eps_threshold * hyp.gamma)
        episodes += 1

# Закрытие логгера
if TENSORBOARD_LOG:
    writer.close()

  activity_reward = 10 / (1 + np.exp(0.5 * (ic50 - 100)))
  0%|                                                                             | 30/25000 [00:10<2:51:06,  2.43it/s]

Episode 0, Reward: -1.17, Best Reward: -1000.00, Eps: 0.600
  QED: 0.199, IC50: 888.274,  AlogP: 1.389, SA: 7.299 Molecule: C=NC12OSc3cc4c(c(-n5sc(=O)n(C6c7c8cc(c(O)c76)NS8)c5=O)c31)=CC21CC=41
Saved best model with reward: -1.17


  0%|▎                                                                           | 120/25000 [00:42<2:13:03,  3.12it/s]

Saved best model with reward: 0.34


  1%|▍                                                                           | 150/25000 [00:53<2:18:15,  3.00it/s]

Saved best model with reward: 48.80


  1%|▋                                                                           | 211/25000 [01:09<2:41:23,  2.56it/s]

Saved best model with reward: 52.80


  1%|▉                                                                           | 301/25000 [01:36<3:00:27,  2.28it/s]

Episode 10, Reward: -4.78, Best Reward: 52.80, Eps: 0.576
  QED: 0.199, IC50: 542.726,  AlogP: 2.506, SA: 8.081 Molecule: N=C1C2C34OC25N3C=C(OS)C12C4C#CN1c3nc4nc6[nH]c7c-4c(F)c(SO)c(c7c36)C125


  2%|█▊                                                                          | 600/25000 [03:29<4:09:00,  1.63it/s]

Episode 20, Reward: -4.75, Best Reward: 52.80, Eps: 0.554
  QED: 0.024, IC50: 288.424,  AlogP: 3.533, SA: 6.310 Molecule: CCOc1cc(-c2n[nH]c3c(S)c(C#N)c4c5nnc([nH]5)c4c23)c2c(c1CN)C(OO)(C(=O)N(C#N)CC(C)(C=S)SN)S2


  4%|██▋                                                                         | 900/25000 [06:57<3:42:36,  1.80it/s]

Episode 30, Reward: -0.10, Best Reward: 52.80, Eps: 0.532
  QED: 0.153, IC50: 129.371,  AlogP: 2.957, SA: 6.783 Molecule: OC=NC1=CC(=S)C2c3c4c5c(S)c(c3n(-c3nc6ccc7cn6c3c3c6nc6n73)c4=N5)C12O


  5%|███▌                                                                       | 1201/25000 [09:51<3:34:28,  1.85it/s]

Episode 40, Reward: -0.94, Best Reward: 52.80, Eps: 0.511
  QED: 0.094, IC50: 130.134,  AlogP: 1.137, SA: 6.519 Molecule: Cc1n[nH]c(CO)c1-c1nc(-n2oc3c4c5c6c(oc5=C(NC(O)=C=S)CC64)c32)nc(NON)c1O


  6%|████▍                                                                      | 1471/25000 [12:17<3:24:27,  1.92it/s]

Episode 50, Reward: 48.16, Best Reward: 52.80, Eps: 0.491
  QED: 0.049, IC50: 67.324,  AlogP: 2.928, SA: 7.770 Molecule: C#Cc1c([N+]([O-])ON=O)cc2c3c4[nH]c2c1NC3(OC(O)=S)C1(O)Oc2c3c(SN=N)c-4c4c2c1nn4C3O


  7%|█████▎                                                                     | 1772/25000 [15:35<3:23:56,  1.90it/s]

Episode 60, Reward: -0.64, Best Reward: 52.80, Eps: 0.472
  QED: 0.038, IC50: 788.508,  AlogP: 6.980, SA: 5.596 Molecule: Cc1c(C(=S)Sc2nnc(-c3c4c5nc(o5)c34)o2)cc2c3cc(C(N)(N)SO)c(c4ssc34)c2c1C(=S)S


  8%|██████▏                                                                    | 2072/25000 [17:53<2:48:33,  2.27it/s]

Episode 70, Reward: -4.63, Best Reward: 52.80, Eps: 0.453
  QED: 0.061, IC50: 842.927,  AlogP: 5.280, SA: 5.835 Molecule: NON(SS)SC(=C=S)c1c(Cl)c2c1c1c2c2c(Cl)c3c(c4[nH]s[nH]c42)c2c(=O)[nH]c(=O)c2n31


  9%|███████                                                                    | 2372/25000 [20:38<4:50:34,  1.30it/s]

Episode 80, Reward: -1.64, Best Reward: 52.80, Eps: 0.435
  QED: 0.220, IC50: 453.735,  AlogP: 0.528, SA: 8.211 Molecule: NC12C3(F)C(O)C4C15C1NC6NOC27C4(N6C(/C=C2/c4cc6scnc6c(SO)c42)C3CS)C157


 10%|███████▏                                                                   | 2403/25000 [20:50<2:44:06,  2.29it/s]

Saved best model with reward: 52.87


 11%|███████▉                                                                   | 2642/25000 [22:45<4:17:58,  1.44it/s]

Episode 90, Reward: -1.02, Best Reward: 52.87, Eps: 0.418
  QED: 0.043, IC50: 529.783,  AlogP: 5.943, SA: 6.434 Molecule: CCN=Nc1c(C(F)(F)F)cc2c3c1C#CC(O)(C3=O)C(O)N2c1nc2c(C(C)SN)c(-c3[nH]c(SC)nc3-c3cc4cc-4c3N)c1-2


 12%|████████▋                                                                  | 2913/25000 [25:32<5:45:22,  1.07it/s]

Episode 100, Reward: -0.85, Best Reward: 52.87, Eps: 0.402
  QED: 0.066, IC50: 153.086,  AlogP: 2.134, SA: 6.444 Molecule: N=Cc1ccc(S(=O)(=O)N2C3CN[C@H](C(SC=S)C(N=N)c4c(-c5nc6nc-6c5O)nc[nH]c4=O)C2C3)cc1S


 13%|█████████▋                                                                 | 3213/25000 [28:18<2:43:44,  2.22it/s]

Episode 110, Reward: -0.77, Best Reward: 52.87, Eps: 0.386
  QED: 0.089, IC50: 320.243,  AlogP: 2.512, SA: 6.475 Molecule: CC(NOS)c1c2c3c-2c(-n2c4nc(ON(O)SNO)c2c(-c2c5nn6nc5ccc26)n4)c1-3


 13%|██████████                                                                 | 3335/25000 [29:23<3:29:44,  1.72it/s]

Saved best model with reward: 53.04


 14%|██████████▍                                                                | 3485/25000 [30:36<2:48:59,  2.12it/s]

Episode 120, Reward: -1.01, Best Reward: 53.04, Eps: 0.371
  QED: 0.036, IC50: 1157.253,  AlogP: 5.731, SA: 6.401 Molecule: NN1c2oc3c(Cl)c(SC#[SH])c(-c4ocnc4S)c(C4(c5cc(SSO)nc6c5C(=S)O6)C5N(N=S)N54)c3c21


 15%|███████████▎                                                               | 3785/25000 [33:07<2:42:43,  2.17it/s]

Episode 130, Reward: -3.86, Best Reward: 53.04, Eps: 0.356
  QED: 0.134, IC50: 315.820,  AlogP: 3.157, SA: 5.045 Molecule: Cn1sn1N(C(=O)O)C(=O)C(=Cc1c(O)c2c(Cl)c(NC(=O)C#[SH])c1-2)n1c#cc2cccc(O)c21


 16%|████████████                                                               | 4027/25000 [35:06<3:02:58,  1.91it/s]

Episode 140, Reward: -0.76, Best Reward: 53.04, Eps: 0.342
  QED: 0.073, IC50: 508.238,  AlogP: 5.295, SA: 6.047 Molecule: CON(S)c1c2c(cc(O)c1-n1s/c(=N/C(C#[SH])c3cncc(S)c3)nc1-c1c(C=[SH])c3c4oc#cc1c43)O2


 17%|████████████▉                                                              | 4327/25000 [37:32<3:09:11,  1.82it/s]

Episode 150, Reward: -0.77, Best Reward: 53.04, Eps: 0.329
  QED: 0.022, IC50: 750.749,  AlogP: 2.086, SA: 6.136 Molecule: COC(N)OC(=S)C1(C(=O)N(O)c2n[nH]c3nc4c(cc23)-c2c(N=N)c3c(OC(O)O)c-4c2-3)C(CC=S)C1OS


 19%|█████████████▉                                                             | 4627/25000 [39:49<2:32:17,  2.23it/s]

Episode 160, Reward: -0.65, Best Reward: 53.04, Eps: 0.316
  QED: 0.055, IC50: 5781.817,  AlogP: 0.842, SA: 6.990 Molecule: Cc1cc2c(cc1C(N)=NS)N2C(=S)NC1(O)N2CC3(OO2)C(=COO)C31C


 20%|██████████████▊                                                            | 4927/25000 [42:13<3:06:04,  1.80it/s]

Episode 170, Reward: -4.35, Best Reward: 53.04, Eps: 0.304
  QED: 0.117, IC50: 868.745,  AlogP: 3.742, SA: 5.698 Molecule: N#CC(=N)C1C(=O)N(N=N)c2nc(N)cc(-c3[nH]c(Cc4c(N)cc5cc4-5)nc3-c3c4[nH]c5cc4c35)c21


 21%|███████████████▋                                                           | 5226/25000 [44:38<2:59:42,  1.83it/s]

Episode 180, Reward: -1.18, Best Reward: 53.04, Eps: 0.292
  QED: 0.156, IC50: 325.631,  AlogP: 2.127, SA: 7.346 Molecule: Cc1nonc1-n1nnc(-c2n[nH]c(-c3ccncc3)n2)c1C1(SS)OOC2=C3SC(=N)C24C(S)C3N14


 22%|████████████████▍                                                          | 5498/25000 [46:49<2:53:39,  1.87it/s]

Episode 190, Reward: -2.86, Best Reward: 53.04, Eps: 0.280
  QED: 0.454, IC50: 278.528,  AlogP: 5.259, SA: 4.551 Molecule: C=Cc1ccc(-c2cc3c4c(n[nH]c4c2OC)N3C(OC#N)C2=C=C2C)c(C)c1C


 23%|█████████████████▎                                                         | 5769/25000 [49:36<3:02:25,  1.76it/s]

Episode 200, Reward: -0.68, Best Reward: 53.04, Eps: 0.269
  QED: 0.078, IC50: 265.030,  AlogP: 2.107, SA: 6.126 Molecule: NN1On2oooc3c(-n4c5c(c6c7cc(OOO)c-7c64)C(O)NC5=O)cc1c(C(O)Nc1cc4ccc1-4)c32


 24%|██████████████████▏                                                        | 6069/25000 [52:14<2:20:21,  2.25it/s]

Episode 210, Reward: 0.57, Best Reward: 53.04, Eps: 0.259
  QED: 0.243, IC50: 120.368,  AlogP: 2.431, SA: 5.364 Molecule: OOc1ncc2c3c1[nH]n3C21C(CS)=NN2c3c(Cl)cc(O)c1c32


 25%|██████████████████▉                                                        | 6312/25000 [54:32<3:45:59,  1.38it/s]

Episode 220, Reward: 0.70, Best Reward: 53.04, Eps: 0.248
  QED: 0.278, IC50: 298.113,  AlogP: 3.079, SA: 4.293 Molecule: COc1ccc2c(CC(=O)N3C(=O)Cc4c3ccc3c4OC3)c(C(=N)C(C)n3cc(C)nc3O)n(C=O)c2c1


 26%|███████████████████▋                                                       | 6554/25000 [56:22<2:52:46,  1.78it/s]

Episode 230, Reward: 52.89, Best Reward: 53.04, Eps: 0.239
  QED: 0.732, IC50: 6.544,  AlogP: 2.350, SA: 3.253 Molecule: Cc1c(-c2ccncn2)nc(N2CCNC(c3ccccc3Cl)C2)n(C)c1=O


 27%|████████████████████▍                                                      | 6825/25000 [58:40<2:01:15,  2.50it/s]

Episode 240, Reward: 0.38, Best Reward: 53.04, Eps: 0.229
  QED: 0.334, IC50: 801.009,  AlogP: 4.309, SA: 5.798 Molecule: O=C(S)C1c2c(Br)c3c4c1c(-c1ccc5cc1N5)[nH]c4c2NN3


 28%|████████████████████▊                                                    | 7124/25000 [1:01:05<2:19:23,  2.14it/s]

Episode 250, Reward: -0.06, Best Reward: 53.04, Eps: 0.220
  QED: 0.045, IC50: 528.592,  AlogP: 3.769, SA: 4.815 Molecule: Cc1c2cc(c(N)c1N)N(C(=O)NO)c1c-2[nH]c2nc(SS)nc(Oc3cc(O)c4c(O)c3-4)c12


 29%|█████████████████████▌                                                   | 7366/25000 [1:03:17<3:34:47,  1.37it/s]

Episode 260, Reward: -0.69, Best Reward: 53.04, Eps: 0.212
  QED: 0.064, IC50: 915.022,  AlogP: 1.245, SA: 5.937 Molecule: CNC(O)c1cc(O)c2c(-c3nc(N4C5=CCCC4=C5)ncc3O)c(-c3c(C(N)(S)NO)ccc(O)c3O)nn2n1


 31%|██████████████████████▎                                                  | 7638/25000 [1:05:16<2:59:12,  1.61it/s]

Episode 270, Reward: 44.35, Best Reward: 53.04, Eps: 0.203
  QED: 0.117, IC50: 95.291,  AlogP: 2.258, SA: 6.636 Molecule: CC(S)(S)c1nc(C2C=C=C3C(N)C32)c(-c2c(OS)c3nc(N4C(=N)C5C(S)C54O)c2-3)[nH]1


 32%|███████████████████████▏                                                 | 7938/25000 [1:08:00<2:17:17,  2.07it/s]

Episode 280, Reward: -3.16, Best Reward: 53.04, Eps: 0.195
  QED: 0.341, IC50: 2272.666,  AlogP: 6.152, SA: 4.788 Molecule: CCC(=O)c1c2c3c(c(-c4cncc(-c5onc6c5s[nH]c5c6c6c7c5c67)n4)c1-3)S2


 33%|████████████████████████                                                 | 8238/25000 [1:10:42<2:42:10,  1.72it/s]

Episode 290, Reward: -0.69, Best Reward: 53.04, Eps: 0.188
  QED: 0.027, IC50: 1538.290,  AlogP: 5.270, SA: 5.770 Molecule: COc1cc(S)c2c3nn(c2c1)c1nc(SS)nc(NN=C(O)c2c4sn5c2cc(OCC(=S)N2C=C2)c45)c31


 34%|████████████████████████▋                                                | 8450/25000 [1:12:41<3:14:01,  1.42it/s]

Episode 300, Reward: -4.53, Best Reward: 53.04, Eps: 0.180
  QED: 0.033, IC50: 744.468,  AlogP: 4.112, SA: 5.714 Molecule: C#Cc1c(SN=C=O)c2c(c3c1[nH]c1c3c(CN)c(C)c3c4cccc(OO)c4n([C@H]4C[C@@H](NC)[C@@H](OCN)[C]O4)c13)OC2


 35%|█████████████████████████▌                                               | 8751/25000 [1:15:20<2:47:29,  1.62it/s]

Episode 310, Reward: -0.72, Best Reward: 53.04, Eps: 0.173
  QED: 0.046, IC50: 396.635,  AlogP: -1.409, SA: 5.762 Molecule: NON=COC(c1nnn2c1c(NC(O)C1=C3OSOC31O)nc1nonc12)n1ncc2c(CO)cncc21


 36%|██████████████████████████▎                                              | 9021/25000 [1:17:35<2:25:29,  1.83it/s]

Episode 320, Reward: -0.82, Best Reward: 53.04, Eps: 0.166
  QED: 0.095, IC50: 2274.792,  AlogP: 5.142, SA: 6.242 Molecule: C=NC1(C(=S)c2cc3c4c5c6c(c7c8ccccc8n(c7c25)C4(S)S3)C(=O)NC6=O)C2SSC21


 37%|███████████████████████████                                              | 9263/25000 [1:19:47<1:17:58,  3.36it/s]

Episode 330, Reward: 1.10, Best Reward: 53.04, Eps: 0.160
  QED: 0.347, IC50: 575.969,  AlogP: 2.519, SA: 4.624 Molecule: Oc1c2cc(-c3nc4[nH]c5n[nH]c4c5c3S)cc1-2


 38%|███████████████████████████▊                                             | 9534/25000 [1:22:15<4:09:07,  1.03it/s]

Episode 340, Reward: -0.62, Best Reward: 53.04, Eps: 0.154
  QED: 0.082, IC50: 600.282,  AlogP: 3.410, SA: 6.194 Molecule: C=CC(=CC(O)NC12SN1Cc1cc2c(OC)cc1OC)c1cc(OC(C)OC(O)S)c(O)cc1S


 39%|████████████████████████████▍                                            | 9746/25000 [1:24:41<2:45:14,  1.54it/s]

Saved best model with reward: 53.19


 39%|████████████████████████████▋                                            | 9805/25000 [1:25:07<2:32:24,  1.66it/s]

Episode 350, Reward: -0.04, Best Reward: 53.19, Eps: 0.148
  QED: 0.088, IC50: 776.094,  AlogP: 4.212, SA: 5.984 Molecule: [CH]C(c1cc(NO)cc2c3c(c4c5c(S)cc6cc5n(c4c12)C6)C(=O)NC3=O)C1SN1S


 40%|████████████████████████████▊                                           | 10025/25000 [1:27:13<2:10:18,  1.92it/s]


KeyboardInterrupt: 

In [19]:
generated_molecules = []
agent.dqn.load_state_dict(torch.load('best_weights.pt', weights_only=True))
num_molecules_to_generate = 300
agent.dqn.eval()
eps_threshold = 0.03

for it in range(num_molecules_to_generate):
    done = False
    environment.initialize()
    while not done:
        steps_left = hyp.max_steps_per_episode - environment.num_steps_taken
        valid_actions = list(environment.get_valid_actions())
    
        observations = np.vstack(
            [
                np.append(
                    utils.get_fingerprint(
                        act, hyp.fingerprint_length, hyp.fingerprint_radius
                    ),
                    steps_left,
                )
                for act in valid_actions
            ]
        ) 
    
        observations_tensor = torch.Tensor(observations)
        a = agent.get_action(observations_tensor, eps_threshold)
        action = valid_actions[a]
        result = environment.step(action)
    
        action_fingerprint = np.append(
            utils.get_fingerprint(action, hyp.fingerprint_length, hyp.fingerprint_radius),
            steps_left,
        )
    
        next_state, reward, done = result
        steps_left = hyp.max_steps_per_episode - environment.num_steps_taken
    
        next_state = utils.get_fingerprint(
            next_state, hyp.fingerprint_length, hyp.fingerprint_radius
        )  
    
        action_fingerprints = np.vstack(
            [
                np.append(
                    utils.get_fingerprint(
                        act, hyp.fingerprint_length, hyp.fingerprint_radius
                    ),
                    steps_left,
                )
                for act in environment.get_valid_actions()
            ]
        )
        if reward > 30:
            generated_molecules.append(environment._state)
            print(generated_molecules[-1])
        if len(generated_molecules) == num_molecules_to_generate:
            break
        print(reward)
    if len(generated_molecules) == num_molecules_to_generate:
        break
        
    #generated_molecules.append(environment._state)
    #print(generated_molecules[-1])

  activity_reward = 10 / (1 + np.exp(0.5 * (ic50 - 100)))


3.180538557836398
3.1235761272087124
3.1194069726140348
3.1194069726140343
2.6722052292129566
2.6722052292129566
2.6722052292129566
2.6722052292129566
2.6722052292129566
3.1194069726140348
3.1781723669982154
3.0489651785208194
2.7630303997671457
2.3617036636134574
2.1409490227414696
2.1409490227414696
2.1409490227414696
2.1409490227414696
-0.09749912989736224
-0.4392138642344335
-0.33564906646560433
-0.3679912968192568
-0.6402631460730884
-0.8504989798915996
-0.8504989798915996
-1.0725266236958462
-1.2688962140143956
-1.211012702794386
-0.5904397257259042
-0.8930788501412705
Cc1cc(F)ccc1C1CN(c2nc(-c3ccncc3)c(C)c(=O)n2C)CCN1.Cl
52.961503468853
3.2369253000761846
2.973251250720048
1.4470483485348973
1.1849760703858803
1.4470483485348973
1.4640956185052474
1.30503020470573
1.4640956185052474
1.6038034698381582
1.3635705579897996
0.4906699514125283
C=COC(CCNC(OO)c1nc(-c2cccnc2)no1)N1CCN(c2ccccc2CO)CC1C
49.94639857606416
3.596904743300353
1.3101051758256308
C=COC(CCNC(OO)c1nc(-c2ccc(C)nc2)n

In [21]:
list(set(generated_molecules))

['[H]C1C2C(=NO)C(CN1c1cc(OC)c(O)cc1CO)N2C(CCNC(OO)c1nc(-c2ccc3nc2C3C)no1)OC=C=S',
 'Cc1cc(CCN(N=O)C(=O)c2ccnc3[nH]c(-c4ccccc4F)nc23)ccc1NS(C)(=O)=O',
 '[H]c1c2c(O)cc(NS(C)(=O)=O)c1N(C(=O)c1ccnc3[nH]c(-c4sc5cc4-5)nc13)CC2=C',
 '[H]c1cc(CN(C=Cc2ccc(NS(C)(=O)=O)c(CO)c2)N=O)c2nc(-c3c(F)c(O)c4c(O)c3-4)[nH]c2n1',
 '[H]c1ccc(-c2nc3c(C(=O)N(CCc4ccc(NS(C)(=O)=O)c(C)c4)N=O)cc(C)nc3[nH]2)c(F)c1O',
 '[H]Cc1c2c(CC(=O)N(S)C(=O)c3ccc(O)c4[nH]c(-c5c6cc(F)c(C=N)c5S6)nc34)c-2c(O)c1N(OO)S(=O)(=O)c1ccc(CO)cc1',
 '[H]c1cc(O)c(-c2ccc3c(NC(=O)C4C=C4)n[nH]c3n2)cc1O',
 'CS(=O)(=O)Nc1ccc(C=CN(Cc2ccnc3[nH]c(-c4c(F)c(O)c5c(O)c4-5)nc23)N=O)cc1CO',
 '[H]c1cc(S(=O)(=O)N2C#CN(C)CC2)c(-c2nc(C(=O)Nc3cccnc3)c3nc2N3)cc1C',
 '[H]NN1c2nc3c(C(=O)n4c5cc(N6CCNC(=O)C6)c4cn5)nc2-c2c(ccc(CO)c2F)N31',
 '[H]c1cc(-c2cnccc2OC)c(S)c2c(C(=O)N(O)C(=O)C3CCN(C(=S)c4ccc(C(=O)O)o4)CC3)n[nH]c12',
 '[H]c1c(O)ccc(O)c1-c1ccc2c(NC(=S)C3=C=C3)n[nH]c2n1',
 '[H]N1CCN(c2cc3ncc2n3C(=O)c2nc(-c3ccccc3F)cnc2N)CC1=O',
 'C=C1C2=C(C=CN1c1nccc(-c3cnn4nc(OC

In [22]:
pd.DataFrame(list(set(generated_molecules))).to_csv("generated_mols.csv")

In [None]:
from rdkit import Chem

valid_smiles = []
for smi in generated_molecules:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        valid_smiles.append(smi)

print(f"Сгенерировано валидных молекул: {len(valid_smiles)}/{len(generated_molecules)}")