In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import pandas as pd
import torch.optim as opt
import utils
import hyp
from rdkit import Chem
from rdkit.Chem import QED
from environment import Molecule
import replay_buffer
from torch.utils.tensorboard import SummaryWriter
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import Crippen
from rdkit.Chem import Descriptors, QED
from rdkit.Contrib.SA_Score import sascorer

In [6]:
import os
import subprocess
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_pdbqt(smiles: str, output_file: str = "legant.pdbqt"):
    """Конвертирует SMILES в PDBQT через Open Babel."""
    # Создание молекулы из SMILES и добавление водородов
    mol = Chem.MolFromSmiles(smiles)
    mol = Chem.AddHs(mol)
    status = AllChem.EmbedMolecule(mol)
    if status == -1:
        raise RuntimeError("Не удалось сгенерировать 3D-структуру")
    
    # Сохранение во временный файл .mol
    temp_mol = "temp.mol"
    Chem.MolToMolFile(mol, temp_mol)
    if not os.path.exists(temp_mol):
        raise FileNotFoundError("Временный файл не создан")
    
    # Конвертация в PDBQT через Open Babel
    cmd = f"obabel {temp_mol} -O legant.pdbqt"
    result = subprocess.run(
        cmd, 
        shell=True, 
        capture_output=True, 
        text=True
    )

    if result.returncode != 0:
        error_msg = f"Ошибка Open Babel:\n{result.stderr}"
        if "Invalid output format" in result.stderr:
            error_msg += "\nУбедитесь, что Open Babel установлен и добавлен в PATH"
        raise RuntimeError(error_msg)

    if not os.path.exists(output_file):
            raise FileNotFoundError(f"Файл {output_file} не создан")
    
    #os.remove(temp_mol)

def run_vina_docking(protein_pdbqt: str, ligand_pdbqt: str, center: tuple = (27.116, 24.090, 14.936), size: tuple = (10, 10, 10)) -> float:
    """Запускает докинг и возвращает энергию связывания."""
    # Создание конфигурационного файла для Vina
    config = f"""
    receptor = {protein_pdbqt}
    ligand = {ligand_pdbqt}
    out = result.pdbqt
    center_x = {center[0]}
    center_y = {center[1]}
    center_z = {center[2]}
    size_x = {size[0]}
    size_y = {size[1]}
    size_z = {size[2]}
    exhaustiveness = 16
    cpu = 12
    """
    with open("config.txt", "w") as f:
        f.write(config)
    
    # Запуск AutoDock Vina
    with open("log.txt", "w") as log_file:
        result = subprocess.run(
            "vina_1.2.7_win --config config.txt",
            stdout=log_file,
            text=True,
            check=True,
            shell=True
        )
        if result.returncode != 0:
            print("Ошибка Vina:", result.stderr)
            return None
    
    # Извлечение энергии связывания из лога
    with open("log.txt", "r") as f:
        log = f.read()
        affinity_values = []
        for line in log.split("\n"):
            if line.strip().startswith("1"):  # Первая строка с результатами
                parts = line.split()
                if len(parts) >= 2:
                    try:
                        affinity = float(parts[1])
                        affinity_values.append(affinity)
                    except ValueError:
                        continue
    
        # Возвращаем лучшую энергию
        if affinity_values:
            return min(affinity_values)
        else:
            print("Энергии связывания не найдены")
            return None

In [7]:
smiles_to_pdbqt("CC(=O)c1cc(OCCN)c(C)cc1C1=C(C)C(=O)COC=C1")


In [8]:
run_vina_docking("COX-2.pdbqt", "legant.pdbqt")

-6.972

In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('error', category=DeprecationWarning)

In [4]:
class MolDQN(nn.Module):
    def __init__(self, input_length, output_length):
        super(MolDQN, self).__init__()

        self.linear_1 = nn.Linear(input_length, 1024)
        self.linear_2 = nn.Linear(1024, 512)
        self.linear_3 = nn.Linear(512, 128)
        self.linear_4 = nn.Linear(128, 32)
        self.linear_5 = nn.Linear(32, output_length)

        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.activation(self.linear_1(x))
        x = self.activation(self.linear_2(x))
        x = self.activation(self.linear_3(x))
        x = self.activation(self.linear_4(x))
        x = self.linear_5(x)

        return x

In [5]:
from joblib import dump, load
REPLAY_BUFFER_CAPACITY = hyp.replay_buffer_size

irritation_model = load('model_irrit.joblib')
melanin_model = load('model_melanin.joblib')
corneal_model = load('catboost_model_joblib.pkl')

def get_fingerprint(molecule):
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=hyp.fingerprint_radius,fpSize=hyp.fingerprint_length)
    if molecule is None:
        return np.zeros((hyp.fingerprint_length,))
    fingerprint = mfpgen.GetFingerprint(molecule)
    return np.array(fingerprint)

def smiles_to_features(smiles:str):
    mol = Chem.MolFromSmiles(smiles)
    descriptor_names = [name for name, func in Descriptors.descList]
    def calc_descriptors(mol):
        if mol is not None:
            descriptor_values = [func(mol) for name, func in Descriptors.descList]
            return descriptor_values
        else:
            return [None] * len(descriptor_names)
    
    descriptor_values = calc_descriptors(mol)
    
    
    features = pd.Series(dict(zip(descriptor_names, descriptor_values)))
    
    return features

class QEDRewardMolecule(Molecule):
    
    def __init__(self, discount_factor, **kwargs):
        
        super(QEDRewardMolecule, self).__init__(**kwargs)
        self.discount_factor = discount_factor

    def _reward(self):
        
        molecule = Chem.MolFromSmiles(self._state)
        if molecule is None:
            return 0.0
        qed = QED.qed(molecule)
        irrit_proba = irritation_model.predict_proba(np.expand_dims(get_fingerprint(molecule), axis=0))[0, 1]
        melanin_proba = melanin_model.predict_proba(np.expand_dims(get_fingerprint(molecule), axis=0))[0, 1]
        sa_score = sascorer.calculateScore(molecule)
        corneal = corneal_model.predict(smiles_to_features(self._state))
        """try:
            smiles_to_pdbqt(self._state)
            docking = run_vina_docking("COX-2.pdbqt", "legant.pdbqt")
        except:
            docking = 0"""
        
        docking = 0
        return (2*np.sqrt(qed) -( 1/3*sa_score if sa_score > 3.5 else 0.4) * sa_score + 0.1 * melanin_proba + docking + 0.1*(corneal-1.7) - irrit_proba*2) * self.discount_factor ** (self.num_steps_taken)


In [6]:
class Agent(object):
    def __init__(self, input_length, output_length, device):
        self.device = device
        self.dqn, self.target_dqn = (
            MolDQN(input_length, output_length).to(self.device),
            MolDQN(input_length, output_length).to(self.device),
        )
        for p in self.target_dqn.parameters():
            p.requires_grad = False
        self.replay_buffer = replay_buffer.ReplayBuffer(REPLAY_BUFFER_CAPACITY)
        self.optimizer = getattr(opt, hyp.optimizer)(
            self.dqn.parameters(), lr=hyp.learning_rate
        )
        self.times_of_update = 0

    def get_action(self, observations, epsilon_threshold):

        if np.random.uniform() < epsilon_threshold:
            action = np.random.randint(0, observations.shape[0])
        else:
            q_value = self.dqn.forward(observations.to(self.device)).cpu()
            action = torch.argmax(q_value).numpy()

        return action

    def update_params(self, batch_size, gamma, polyak):
        # update target network

        # sample batch of transitions
        states, _, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        q_t = torch.zeros(batch_size, 1, requires_grad=False)
        v_tp1 = torch.zeros(batch_size, 1, requires_grad=False)
        for i in range(batch_size):
            state = (
                torch.FloatTensor(states[i])
                .reshape(-1, hyp.fingerprint_length + 1)
                .to(self.device)
            )
            q_t[i] = self.dqn(state)

            next_state = (
                torch.FloatTensor(next_states[i])
                .reshape(-1, hyp.fingerprint_length + 1)
                .to(self.device)
            )
            v_tp1[i] = torch.max(self.target_dqn(next_state))

        rewards = torch.FloatTensor(rewards).reshape(q_t.shape).to(self.device)
        q_t = q_t.to(self.device)
        v_tp1 = v_tp1.to(self.device)
        dones = torch.FloatTensor(dones).reshape(q_t.shape).to(self.device)

        # # get q values
        q_tp1_masked = (1 - dones) * v_tp1
        q_t_target = rewards + gamma * q_tp1_masked
        td_error = q_t - q_t_target

        q_loss = torch.where(
            torch.abs(td_error) < 1.0,
            0.5 * td_error * td_error,
            1.0 * (torch.abs(td_error) - 0.5),
        )
        q_loss = q_loss.mean()

        # backpropagate
        self.optimizer.zero_grad()
        q_loss.backward()
        self.optimizer.step()

        if self.times_of_update % 10 == 0:
            with torch.no_grad():
                for p, p_targ in zip(self.dqn.parameters(), self.target_dqn.parameters()):
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)
            self.times_of_update += 1
        return q_loss
        

In [7]:
from tqdm import tqdm
TENSORBOARD_LOG = True
TB_LOG_PATH = "./runs/dqn/run2"
episodes = 0
iterations = 25000
update_interval = 16
batch_size = 512
num_updates_per_it = 1

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

environment = QEDRewardMolecule(
    discount_factor=hyp.discount_factor,
    atom_types=set(hyp.atom_types),
    init_mol=hyp.start_molecule,
    allow_removal=hyp.allow_removal,
    allow_no_modification=hyp.allow_no_modification,
    allow_bonds_between_rings=hyp.allow_bonds_between_rings,
    allowed_ring_sizes=set(hyp.allowed_ring_sizes),
    max_steps=hyp.max_steps_per_episode,
)

# DQN Inputs and Outputs:
# input: appended action (fingerprint_length + 1) .
# Output size is (1).

agent = Agent(hyp.fingerprint_length + 1, 1, device)
agent.dqn.load_state_dict(torch.load("best_weights.pt", weights_only=True))


if TENSORBOARD_LOG:
    writer = SummaryWriter(TB_LOG_PATH)

environment.initialize()

eps_threshold = 0.8
batch_losses = []

best_reward = -1000

In [8]:
for it in tqdm(range(iterations)):

    steps_left = hyp.max_steps_per_episode - environment.num_steps_taken

    valid_actions = list(environment.get_valid_actions())

    observations = np.vstack(
        [
            np.append(
                utils.get_fingerprint(
                    act, hyp.fingerprint_length, hyp.fingerprint_radius
                ),
                steps_left,
            )
            for act in valid_actions
        ]
    )

    observations_tensor = torch.Tensor(observations)

    a = agent.get_action(observations_tensor, max(0.1, eps_threshold))

   
    action = valid_actions[a]
    result = environment.step(action)

    action_fingerprint = np.append(
        utils.get_fingerprint(action, hyp.fingerprint_length, hyp.fingerprint_radius),
        steps_left,
    )

    next_state, reward, done = result

    steps_left = hyp.max_steps_per_episode - environment.num_steps_taken

    next_state = utils.get_fingerprint(
        next_state, hyp.fingerprint_length, hyp.fingerprint_radius
    ) 

    action_fingerprints = np.vstack(
        [
            np.append(
                utils.get_fingerprint(
                    act, hyp.fingerprint_length, hyp.fingerprint_radius
                ),
                steps_left,
            )
            for act in environment.get_valid_actions()
        ]
    )  


    agent.replay_buffer.add(
        obs_t=action_fingerprint,  # (fingerprint_length + 1)
        action=0,  # No use
        reward=reward,
        obs_tp1=action_fingerprints,  # (num_actions, fingerprint_length + 1)
        done=float(result.terminated),
    )

    if done:
        final_reward = reward
        
        
        if episodes != 0 and TENSORBOARD_LOG and len(batch_losses) != 0:
            writer.add_scalar("episode_reward", final_reward, episodes)
            writer.add_scalar("episode_loss", np.array(batch_losses).mean(), episodes)
        if episodes != 0 and episodes % 2 == 0 and len(batch_losses) != 0:
            try:
                smiles_to_pdbqt(environment._state)
                docking = run_vina_docking("COX-2.pdbqt", "legant.pdbqt")
            except:
                docking = 0
            print(
                "reward of final molecule at episode {} is {}, qed is {},sa is {}, dock is {} , irrit is {},  corneal is {}, melanin is {} , molecule is {}".format(
                    episodes, final_reward, QED.qed(Chem.MolFromSmiles(environment._state)), sascorer.calculateScore(Chem.MolFromSmiles(environment._state)), docking, 
                    irritation_model.predict_proba(np.expand_dims(get_fingerprint(Chem.MolFromSmiles(environment._state)), axis=0))[0, 1], corneal_model.predict(smiles_to_features(environment._state)) ,
                    melanin_model.predict_proba(np.expand_dims(get_fingerprint(Chem.MolFromSmiles(environment._state)), axis=0))[0, 1] ,  environment._state
                )
            )
            print(
                "mean loss in episode {} is {}".format(
                    episodes, np.array(batch_losses).mean()
                )
            )
        if final_reward > best_reward:
            torch.save(agent.dqn.state_dict(), 'best_weights.pt')
            print(f"ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда {final_reward}") 
            best_reward = final_reward
        episodes += 1
        eps_threshold *= 0.998
        batch_losses = []
        environment.initialize()

    if it % update_interval == 0 and agent.replay_buffer.__len__() >= batch_size:
        for update in range(num_updates_per_it):
            loss = agent.update_params(batch_size, hyp.gamma, hyp.polyak)
            loss = loss.item()
            batch_losses.append(loss)

  0%|                                                                               | 33/25000 [00:02<23:52, 17.43it/s]

ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда -2.049457660622108


  0%|▏                                                                              | 63/25000 [00:04<26:27, 15.71it/s]

ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда -1.3392045257120528


  0%|▎                                                                              | 94/25000 [00:06<26:15, 15.81it/s]

ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда -1.3286227857933626


  1%|▊                                                                             | 244/25000 [00:17<28:05, 14.69it/s]

ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда -1.2544200695344456


  2%|█▎                                                                            | 423/25000 [00:30<29:10, 14.04it/s]

ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда -1.1275825587938615


  2%|█▋                                                                          | 574/25000 [00:58<5:54:37,  1.15it/s]

reward of final molecule at episode 18 is 1.8331135794025883, qed is 0.25776717383960474,sa is 3.5888662897524366, dock is -4.373 , irrit is 0.2168171270527135,  corneal is 3.2389326746640013, melanin is 0.9826699773371248 , molecule is CNCC(O)=C(O)C(O)O
mean loss in episode 18 is 0.39676640927791595
ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда 1.8331135794025883


  3%|█▉                                                                         | 633/25000 [01:40<23:43:59,  3.51s/it]

reward of final molecule at episode 20 is -1.3065364170947205, qed is 0.2031003895548678,sa is 4.863209652759915, dock is -5.497 , irrit is 0.00032102806038313784,  corneal is 3.1229973926069055, melanin is 0.9809659970409773 , molecule is CCN(C)OC(=O)C1(OONC(N)CN)C=C1
mean loss in episode 20 is 0.2518680691719055


  3%|██                                                                         | 693/25000 [02:17<17:29:12,  2.59s/it]

reward of final molecule at episode 22 is -1.3602113884910094, qed is 0.12265080706420729,sa is 4.731372156761147, dock is -6.187 , irrit is 0.06569525500531133,  corneal is 3.1349008608033264, melanin is 0.9775619611646477 , molecule is NOC(C(=O)O)=C(CO)N(C#COO)C1=NC=C1
mean loss in episode 22 is 0.17792390286922455


  3%|██▎                                                                         | 751/25000 [02:30<2:43:41,  2.47it/s]

reward of final molecule at episode 24 is -0.9605650521213758, qed is 0.4330819126741767,sa is 4.962160000939962, dock is -4.64 , irrit is 0.21679138497823225,  corneal is 3.920495157608922, melanin is 0.9623860136986613 , molecule is C1OC2CN3N=C3OC12
mean loss in episode 24 is 0.13839265704154968


  3%|██▍                                                                         | 813/25000 [02:52<6:55:47,  1.03s/it]

reward of final molecule at episode 26 is -1.4300702638472818, qed is 0.27268455378952405,sa is 5.384140829168668, dock is -5.813 , irrit is 0.14139802992894804,  corneal is 3.7539090125777177, melanin is 0.9380220934605771 , molecule is CN=CN(NC(=C=O)C(C)C=N)OC
mean loss in episode 26 is 0.14669674634933472


  3%|██▋                                                                         | 873/25000 [03:13<6:45:23,  1.01s/it]

reward of final molecule at episode 28 is -0.6849463628380351, qed is 0.6290847846455951,sa is 4.701984353386004, dock is -5.823 , irrit is 0.002013985416537951,  corneal is 3.39084341544452, melanin is 0.9734040011982686 , molecule is CCOCC1C2=C(OC2)C(O)C1N
mean loss in episode 28 is 0.13411349058151245


  4%|██▊                                                                        | 933/25000 [03:55<21:22:31,  3.20s/it]

reward of final molecule at episode 30 is -1.5154084456763999, qed is 0.19208337981873141,sa is 4.980018005912909, dock is -5.354 , irrit is 0.2539338374086408,  corneal is 3.501008905128252, melanin is 0.9640764220964158 , molecule is C=CC(OOOC(=O)O)C(OO)(OO)C(C)=NC
mean loss in episode 30 is 0.12549689412117004


  4%|███                                                                         | 990/25000 [04:14<9:01:34,  1.35s/it]

reward of final molecule at episode 32 is -1.5624672327363969, qed is 0.25508981788127116,sa is 5.481243514993945, dock is -5.988 , irrit is 0.0027593126786644574,  corneal is 3.3451123776154494, melanin is 0.9828416449728616 , molecule is CCOC1(C)C(=O)OC2(OC2N)C1=NN
mean loss in episode 32 is 0.11222074925899506


  4%|███▏                                                                       | 1054/25000 [04:40<7:01:54,  1.06s/it]

reward of final molecule at episode 34 is -1.643523922864368, qed is 0.15123770431546701,sa is 5.626797548415757, dock is -5.407 , irrit is 0.00028696470584559737,  corneal is 3.546912169451271, melanin is 0.9382291264769608 , molecule is CC=NOOOC(N)(N=O)C12N=C1C2=O
mean loss in episode 34 is 0.10936999693512917


  4%|███▎                                                                       | 1113/25000 [05:04<7:18:36,  1.10s/it]

reward of final molecule at episode 36 is -1.918983608049826, qed is 0.3655970472166478,sa is 4.374942917429641, dock is -5.271 , irrit is 0.9880226708732617,  corneal is 3.6562348625217087, melanin is 0.9565898898651702 , molecule is CC(=O)N(COO)N(OO)C1=CC1
mean loss in episode 36 is 0.10743515938520432


  5%|███▍                                                                      | 1173/25000 [05:59<36:57:29,  5.58s/it]

reward of final molecule at episode 38 is -1.6772497634356167, qed is 0.13524633456776494,sa is 5.190166442328655, dock is -6.697 , irrit is 3.506649217769019e-05,  corneal is 3.0716663167716245, melanin is 0.9439013922536882 , molecule is CNn1c2c(n1C)OC(=O)C(O)(OC(C)(C)N(N)C(N)(O)O)O2
mean loss in episode 38 is 0.10405859351158142


  5%|███▋                                                                      | 1230/25000 [06:58<93:28:52, 14.16s/it]

reward of final molecule at episode 40 is -2.3777248961215927, qed is 0.10522377521713229,sa is 5.692520780420354, dock is -6.659 , irrit is 0.14294789625328397,  corneal is 2.9548055156303787, melanin is 0.9359626665476967 , molecule is CC(=N)C(N=C=N)C(=O)ON=CC(O)=C=NCNONO
mean loss in episode 40 is 0.0925673209130764


  5%|███▉                                                                         | 1263/25000 [07:05<58:09,  6.80it/s]

ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда 2.297400802313816


  5%|███▊                                                                      | 1293/25000 [07:45<26:50:40,  4.08s/it]

reward of final molecule at episode 42 is -0.8566442690861472, qed is 0.2865487990681238,sa is 4.665481581461419, dock is -6.074 , irrit is 0.0167902129508324,  corneal is 3.481895300963528, melanin is 0.981681068348881 , molecule is CCOc1non1C(ON(OC)C(C)OC)=C(O)C(=O)O
mean loss in episode 42 is 0.0946304090321064


  5%|████                                                                       | 1353/25000 [08:02<3:31:06,  1.87it/s]

reward of final molecule at episode 44 is -2.135977616548567, qed is 0.40235217613804075,sa is 4.7121916640966, dock is -6.151 , irrit is 0.9334436269136575,  corneal is 3.537258820076777, melanin is 0.9678949874680268 , molecule is C=C1OC(=O)C1C1ON1C(C)=O
mean loss in episode 44 is 0.08315292000770569


  6%|████▏                                                                     | 1413/25000 [08:40<18:04:59,  2.76s/it]

reward of final molecule at episode 46 is -0.5947447199515398, qed is 0.5697303882355536,sa is 5.544937913036765, dock is -6.52 , irrit is 0.04849543409096938,  corneal is 4.559229562009374, melanin is 0.9392894637797955 , molecule is C=C1CC2C3C1=CN(C)OC23N
mean loss in episode 46 is 0.0754447802901268


  6%|████▍                                                                      | 1470/25000 [08:56<4:38:27,  1.41it/s]

reward of final molecule at episode 48 is -2.407865856778383, qed is 0.5117881247009514,sa is 5.598306192708077, dock is -5.322 , irrit is 0.8255745844220519,  corneal is 3.722239166596273, melanin is 0.9734915075835312 , molecule is OC12ON=C1OC(C1=NO1)O2
mean loss in episode 48 is 0.06806677207350731


  6%|████▌                                                                      | 1533/25000 [09:23<9:28:55,  1.45s/it]

reward of final molecule at episode 50 is -0.811645643830272, qed is 0.20727306553378244,sa is 5.241019744005108, dock is -5.457 , irrit is 0.076855808779128,  corneal is 4.391451143196445, melanin is 0.9324962884281361 , molecule is CCCNOC=C=NC(C)ON=N
mean loss in episode 50 is 0.058277104049921036


  6%|████▊                                                                      | 1593/25000 [09:40<3:38:38,  1.78it/s]

reward of final molecule at episode 52 is -0.6737308254490184, qed is 0.468404101148885,sa is 4.954529947012511, dock is -5.695 , irrit is 0.007706230119150885,  corneal is 3.8229343391829915, melanin is 0.9785254102734289 , molecule is CN(C)OC1N=NNC1=C1CO1
mean loss in episode 52 is 0.0584194865077734


  7%|████▊                                                                    | 1653/25000 [16:16<250:17:52, 38.59s/it]

reward of final molecule at episode 54 is -2.6643604796078244, qed is 0.23660633632811068,sa is 6.855875607327678, dock is -6.441 , irrit is 0.0006163693010063516,  corneal is 3.257561127446319, melanin is 0.9739968778754288 , molecule is CC1(OO)N2C(=CCN)N(N)C3=C4C3C421
mean loss in episode 54 is 0.055832820013165474


  7%|█████▏                                                                     | 1710/25000 [16:30<6:19:10,  1.02it/s]

reward of final molecule at episode 56 is -2.1957811022454874, qed is 0.3009158969979917,sa is 6.562567336087521, dock is 0 , irrit is 1.978813036807812e-05,  corneal is 3.5415098997494177, melanin is 0.9678049455705007 , molecule is C#CC1C2=C3CN4C(=C2)C4(CC(=O)OCN)N31
mean loss in episode 56 is 0.07319789379835129


  7%|█████▎                                                                     | 1773/25000 [16:51<4:41:26,  1.38it/s]

reward of final molecule at episode 58 is 1.6980938812742115, qed is 0.24293317863060052,sa is 3.831806171840997, dock is -5.34 , irrit is 0.16721728852247614,  corneal is 3.056148226306088, melanin is 0.9427898409667022 , molecule is NN=NC(=O)C=CCC(N)=O
mean loss in episode 58 is 0.07540586218237877


  7%|█████▍                                                                     | 1833/25000 [17:09<4:15:15,  1.51it/s]

reward of final molecule at episode 60 is -1.74085103408112, qed is 0.3359370915289796,sa is 6.023822934544889, dock is -6.042 , irrit is 0.05567735087026612,  corneal is 3.713754416989442, melanin is 0.9546878765088443 , molecule is COC1=NC23CN(C)CN2C3(OO)O1
mean loss in episode 60 is 0.07989434525370598


  8%|█████▋                                                                     | 1894/25000 [17:26<3:24:08,  1.89it/s]

reward of final molecule at episode 62 is -2.273201322192734, qed is 0.43922105327258437,sa is 5.364484149064273, dock is -4.257 , irrit is 0.9467191666907636,  corneal is 3.99542877820531, melanin is 0.9667564374763974 , molecule is N#CC1NC(O)C1N=O
mean loss in episode 62 is 0.0748961791396141


  8%|█████▊                                                                     | 1950/25000 [17:46<7:51:33,  1.23s/it]

reward of final molecule at episode 64 is -1.042549446733794, qed is 0.12845440388524185,sa is 4.377354182451077, dock is -4.992 , irrit is 0.10000916468161583,  corneal is 3.2922673899152652, melanin is 0.9648196256522036 , molecule is NCN(O)N=C(OO)C(=O)O
mean loss in episode 64 is 0.07240292429924011


  8%|██████                                                                       | 1984/25000 [17:52<34:44, 11.04it/s]

ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда 3.3607430278687853


  8%|██████                                                                     | 2014/25000 [18:09<6:43:57,  1.05s/it]

reward of final molecule at episode 66 is -1.6959031179645299, qed is 0.17798331335887146,sa is 4.966476335470299, dock is -5.756 , irrit is 0.2837175774152707,  corneal is 3.3482599478318837, melanin is 0.9433192980681434 , molecule is CN(N)C1COC(C(=O)NN)C1=CO
mean loss in episode 66 is 0.06797211244702339


  8%|██████▏                                                                    | 2073/25000 [18:34<9:34:17,  1.50s/it]

reward of final molecule at episode 68 is -0.11191435647203092, qed is 0.3050078921248165,sa is 4.652655129649093, dock is -6.386 , irrit is 0.01616679839443412,  corneal is 4.482174838779105, melanin is 0.9643394122995617 , molecule is CCC(C)C(C=N)OOC(=O)C1C=C1
mean loss in episode 68 is 0.060365770012140274


  9%|██████▍                                                                    | 2133/25000 [18:59<8:47:35,  1.38s/it]

reward of final molecule at episode 70 is -2.267982985582912, qed is 0.13668689981971438,sa is 5.917105405558614, dock is -6.255 , irrit is 2.0151318712828126e-05,  corneal is 2.970058402203172, melanin is 0.9628780493920519 , molecule is N=C1COOOC1OC(N=O)NCNN
mean loss in episode 70 is 0.06126335822045803


  9%|██████▌                                                                    | 2190/25000 [19:15<9:26:20,  1.49s/it]

reward of final molecule at episode 72 is -1.3289307359304947, qed is 0.18169899254912147,sa is 4.717452199644129, dock is 0 , irrit is 0.027916737365444283,  corneal is 3.0222664681635845, melanin is 0.9818269541141473 , molecule is NOOC(O)(C(=O)O)C(=C=O)NC(=O)CC1=COCC1
mean loss in episode 72 is 0.06109953299164772


  9%|██████▊                                                                    | 2254/25000 [19:39<6:24:46,  1.01s/it]

reward of final molecule at episode 74 is -0.5789557433073103, qed is 0.504676264528786,sa is 4.663701638323717, dock is -5.445 , irrit is 0.06603515839257716,  corneal is 3.7834647766560536, melanin is 0.9499609174614903 , molecule is C=C(OCC(=N)C#N)N1CN=C1C#N
mean loss in episode 74 is 0.06436359137296677


  9%|██████▉                                                                    | 2313/25000 [19:58<4:06:24,  1.53it/s]

reward of final molecule at episode 76 is -1.5939918706657423, qed is 0.25192714061360655,sa is 5.199488893971161, dock is 0 , irrit is 0.006446540705087649,  corneal is 3.0779418053620926, melanin is 0.9517316225549421 , molecule is C=CCN(O)C1(C(=O)O)OC(=O)C(N=N)O1
mean loss in episode 76 is 0.07060384750366211


  9%|███████                                                                    | 2373/25000 [20:23<7:58:00,  1.27s/it]

reward of final molecule at episode 78 is -1.8714874836719473, qed is 0.2864925077825296,sa is 5.691453131666548, dock is -5.422 , irrit is 0.034134656989817456,  corneal is 3.188712354673362, melanin is 0.9696373460495219 , molecule is CC1(N)C(COO)C1(N)ON1C=N1
mean loss in episode 78 is 0.042719315737485886


 10%|███████▎                                                                   | 2431/25000 [20:40<6:27:44,  1.03s/it]

reward of final molecule at episode 80 is -1.3442903416261436, qed is 0.4492826320933836,sa is 5.1799447929464435, dock is -5.901 , irrit is 0.0265503738801865,  corneal is 3.214945143300699, melanin is 0.9676489008614221 , molecule is CCC1OC(=NN=O)C(N)C(O)O1
mean loss in episode 80 is 0.06925786286592484


 10%|███████▍                                                                   | 2493/25000 [20:57<3:41:28,  1.69it/s]

reward of final molecule at episode 82 is -0.6968062275124933, qed is 0.48019173689413036,sa is 5.152311347663239, dock is -5.747 , irrit is 0.4445753205649847,  corneal is 4.868089734055105, melanin is 0.9674460396311408 , molecule is CC1C(=NOOO)CC2=C1O2
mean loss in episode 82 is 0.04521620646119118


 10%|███████▋                                                                   | 2554/25000 [21:15<4:05:20,  1.52it/s]

reward of final molecule at episode 84 is 0.10536975511997135, qed is 0.5845008255082595,sa is 4.260805896879071, dock is -5.317 , irrit is 0.6526816365865612,  corneal is 5.361906678432862, melanin is 0.9748072699323643 , molecule is CC=CCC(O)C1N=C1C
mean loss in episode 84 is 0.05512072704732418


 10%|███████▋                                                                  | 2613/25000 [21:50<15:16:36,  2.46s/it]

reward of final molecule at episode 86 is -1.8339748422704472, qed is 0.08486912338953077,sa is 5.626105990827793, dock is -5.44 , irrit is 0.0025288732811207314,  corneal is 3.313891079180576, melanin is 0.9687035088057687 , molecule is C#COC(N)(C=O)C(C)NNCOC1OO1
mean loss in episode 86 is 0.06341546028852463


 11%|███████▉                                                                  | 2671/25000 [22:17<15:35:39,  2.51s/it]

reward of final molecule at episode 88 is -1.633980620201147, qed is 0.4927328175987601,sa is 4.87635154626769, dock is -5.718 , irrit is 0.3701466346622587,  corneal is 3.14873698924322, melanin is 0.9774657821434556 , molecule is COCC(=O)C(C)On1on(N(O)O)c1=N
mean loss in episode 88 is 0.061793575063347816


 11%|████████                                                                  | 2733/25000 [23:24<42:33:10,  6.88s/it]

reward of final molecule at episode 90 is -1.3990451626739813, qed is 0.09275527301461868,sa is 4.676015440670037, dock is -6.012 , irrit is 2.5237488996767045e-06,  corneal is 2.9252237426952443, melanin is 0.977782723372176 , molecule is CNC(C)(CCOC)C(OOCN)C(=O)OCC(C)C(=N)N
mean loss in episode 90 is 0.06376066245138645


 11%|████████▍                                                                  | 2793/25000 [23:48<7:15:44,  1.18s/it]

reward of final molecule at episode 92 is -1.154879252442743, qed is 0.3520854386116151,sa is 5.097643211047338, dock is -5.373 , irrit is 0.0024777623715837834,  corneal is 3.427250307801164, melanin is 0.9746554583061078 , molecule is CONC(C)(C(N)C#N)C1(N)OC1=O
mean loss in episode 92 is 0.0596288088709116


 11%|████████▍                                                                 | 2853/25000 [24:22<17:17:40,  2.81s/it]

reward of final molecule at episode 94 is -1.5807181912924502, qed is 0.39459573361355266,sa is 5.300248017611476, dock is -7.291 , irrit is 0.05508221380034148,  corneal is 3.162746657406924, melanin is 0.9440660859925518 , molecule is N#CC(=N)C(O)=C1C=NN=C2ON=NNC21OCC(=O)O
mean loss in episode 94 is 0.031497630290687084


 12%|████████▌                                                                 | 2910/25000 [24:45<10:18:27,  1.68s/it]

reward of final molecule at episode 96 is -1.7250651544342521, qed is 0.1918896447065822,sa is 5.492914562397126, dock is -4.923 , irrit is 0.06506449243297606,  corneal is 3.332642520035574, melanin is 0.9775986535084373 , molecule is NOC1(C=COOCO)CN=N1
mean loss in episode 96 is 0.04709058441221714


 12%|████████▉                                                                  | 2973/25000 [25:09<8:53:55,  1.45s/it]

reward of final molecule at episode 98 is -1.229736509752552, qed is 0.39822309344668777,sa is 5.488843109583405, dock is -6.476 , irrit is 0.10204385263573859,  corneal is 3.9637245188078705, melanin is 0.9123359794161323 , molecule is C=C=NOC(O)C12OCC1C(C)(O)CCC2=O
mean loss in episode 98 is 0.03798791952431202


 12%|████████▉                                                                 | 3033/25000 [25:40<10:30:58,  1.72s/it]

reward of final molecule at episode 100 is -1.1403684314112934, qed is 0.21488702782626826,sa is 4.766075130638236, dock is -5.435 , irrit is 0.009661850172038957,  corneal is 3.277614886066365, melanin is 0.9674895171904876 , molecule is CN(OO)OC(=O)C(O)(O)COC1=C=C1
mean loss in episode 100 is 0.030942938290536404


 12%|█████████▏                                                                | 3093/25000 [26:10<10:36:16,  1.74s/it]

reward of final molecule at episode 102 is -1.2325142853965245, qed is 0.35798709811657475,sa is 4.7977780283724325, dock is -6.209 , irrit is 3.6991141926546205e-05,  corneal is 3.0308638738762363, melanin is 0.9618443558697521 , molecule is CC(C)OC(=N)C(N)NC=C1N=CN=N1
mean loss in episode 102 is 0.036126479506492615


 13%|█████████▎                                                                | 3150/25000 [26:34<13:06:25,  2.16s/it]

reward of final molecule at episode 104 is -1.337304564433002, qed is 0.4465669444860701,sa is 4.324497776047633, dock is -6.091 , irrit is 0.3857809694210505,  corneal is 3.1055675240921228, melanin is 0.9573499304334618 , molecule is CNC(=O)C(=O)OC1=C(On2[nH]cc2O)C1
mean loss in episode 104 is 0.048021065071225166


 13%|█████████▌                                                                | 3213/25000 [27:08<12:14:05,  2.02s/it]

reward of final molecule at episode 106 is -1.7246677949565907, qed is 0.2427527168180609,sa is 5.506570927840357, dock is -5.826 , irrit is 0.000850832383750596,  corneal is 3.182053617128949, melanin is 0.967926697452184 , molecule is N=C(CC=O)OC1(C(OC=O)OCN)N=NO1
mean loss in episode 106 is 0.0439800750464201


 13%|█████████▊                                                                 | 3273/25000 [27:36<9:27:53,  1.57s/it]

reward of final molecule at episode 108 is -0.7285285171700363, qed is 0.19080137943009665,sa is 4.769917480333625, dock is -6.287 , irrit is 0.015519341093396609,  corneal is 3.9042889640542797, melanin is 0.947312691559496 , molecule is CC=C(CC)C(N)OC(=O)C(=C=NC)C#N
mean loss in episode 108 is 0.0334343146532774


 13%|█████████▉                                                                 | 3333/25000 [27:54<4:14:11,  1.42it/s]

reward of final molecule at episode 110 is 3.226261114014295, qed is 0.5759693713530296,sa is 3.9497275145179493, dock is -5.959 , irrit is 0.4836918867343253,  corneal is 5.37284708428398, melanin is 0.9834122614582703 , molecule is CCC(C)C1(O)C2=C1C2C
mean loss in episode 110 is 0.05917879566550255


 14%|██████████▍                                                                  | 3389/25000 [28:03<42:20,  8.51it/s]

reward of final molecule at episode 112 is 2.388921588299709, qed is 0.3841200837982494,sa is 3.0337116580540497, dock is -3.292 , irrit is 0.9994724011641284,  corneal is 5.388968103101906, melanin is 0.9725385716648943 , molecule is o1c2c3c1c23
mean loss in episode 112 is 0.04198596067726612


 14%|██████████▏                                                               | 3453/25000 [28:43<14:57:56,  2.50s/it]

reward of final molecule at episode 114 is -1.7339385522732775, qed is 0.08494799822536156,sa is 5.126534828037095, dock is -5.192 , irrit is 0.24561050698499945,  corneal is 3.4312162339547667, melanin is 0.9716548481624837 , molecule is CC(NN)(ON(N)CO)C(O)(C=O)C(=O)O
mean loss in episode 114 is 0.058798547834157944


 14%|██████████▌                                                                | 3513/25000 [29:06<7:31:11,  1.26s/it]

reward of final molecule at episode 116 is -0.002437824312260458, qed is 0.2640985861446579,sa is 4.002510497711811, dock is -5.244 , irrit is 0.002801706010792341,  corneal is 3.9891366288532852, melanin is 0.9677220020856445 , molecule is CCCOCC(C=O)C(=N)NC
mean loss in episode 116 is 0.07209250330924988


 14%|██████████▋                                                                | 3573/25000 [29:33<8:23:31,  1.41s/it]

reward of final molecule at episode 118 is -0.6781583258988423, qed is 0.709311891488156,sa is 4.374340940721723, dock is -5.572 , irrit is 0.7097154586575795,  corneal is 4.454916523555436, melanin is 0.9418280394810389 , molecule is C=C(OCc1c[nH]o1)N1N=C1CC#N
mean loss in episode 118 is 0.055390264838933945


 15%|██████████▋                                                               | 3630/25000 [29:52<13:41:46,  2.31s/it]

reward of final molecule at episode 120 is -1.9971570118401882, qed is 0.4530459333846423,sa is 6.4983247342105175, dock is -6.67 , irrit is 0.00028014488662221187,  corneal is 3.635134929124738, melanin is 0.940500080693622 , molecule is CNC12N=C3CC(=O)N(O1)C(OO)(C3)O2
mean loss in episode 120 is 0.04965860769152641


 15%|███████████                                                                | 3695/25000 [30:08<1:57:13,  3.03it/s]

reward of final molecule at episode 122 is 2.6624273080020546, qed is 0.474106795831282,sa is 1.7503892021323146, dock is -3.481 , irrit is 0.9357995685917635,  corneal is 5.401014969070133, melanin is 0.9805656037244227 , molecule is CCCOCC
mean loss in episode 122 is 0.04670894332230091


 15%|███████████▎                                                               | 3753/25000 [30:31<7:41:41,  1.30s/it]

reward of final molecule at episode 124 is 3.258019837106251, qed is 0.340844792213851,sa is 3.1938541824510747, dock is -4.735 , irrit is 0.3165382971136323,  corneal is 5.2446396610432435, melanin is 0.980993639821815 , molecule is C=C(CCOC)CCOCN
mean loss in episode 124 is 0.03755421005189419


 15%|███████████▎                                                              | 3813/25000 [31:24<30:39:17,  5.21s/it]

reward of final molecule at episode 126 is -1.6964461667446105, qed is 0.10159684132828234,sa is 5.113701122750516, dock is -5.295 , irrit is 5.838873943716884e-05,  corneal is 2.9698902798310556, melanin is 0.9659382334017731 , molecule is CC(N)COC(=N)C(OO)(C(O)C#N)N(C)C(=O)OO
mean loss in episode 126 is 0.045885154977440834


 15%|███████████▍                                                              | 3871/25000 [31:46<10:34:55,  1.80s/it]

reward of final molecule at episode 128 is -1.0979146782375533, qed is 0.10887563243191249,sa is 4.582390995637889, dock is -4.598 , irrit is 0.15784478427440707,  corneal is 3.552384422397288, melanin is 0.9683673622126222 , molecule is NNN(C(N)N)C(C=O)C=O
mean loss in episode 128 is 0.021768351085484028


 16%|███████████▊                                                               | 3933/25000 [32:08<4:53:30,  1.20it/s]

reward of final molecule at episode 130 is -0.93032169614579, qed is 0.4236300490947667,sa is 4.824440855121461, dock is -5.082 , irrit is 0.029384776484722604,  corneal is 3.4358747964659786, melanin is 0.9773365581257993 , molecule is CC1N2C(N)(CC(=O)O)C12N
mean loss in episode 130 is 0.03706745430827141


 16%|███████████▉                                                               | 3993/25000 [32:28<5:46:17,  1.01it/s]

reward of final molecule at episode 132 is -0.13804344590679102, qed is 0.47397599300849164,sa is 4.854354585634068, dock is -5.932 , irrit is 0.06898592819771587,  corneal is 4.577120538355008, melanin is 0.9697394337297589 , molecule is C#CN1CC(C)(C=C2CC(O)C2)O1
mean loss in episode 132 is 0.03488376270979643


 16%|████████████▏                                                              | 4053/25000 [32:47<4:50:46,  1.20it/s]

reward of final molecule at episode 134 is -0.7289748031135234, qed is 0.4324295613040809,sa is 4.884012637985819, dock is -4.689 , irrit is 0.05518690821762226,  corneal is 3.8495620135601065, melanin is 0.9512631763047107 , molecule is C=NC(COC)=C1NC1NC
mean loss in episode 134 is 0.045208826661109924


 16%|████████████▏                                                             | 4110/25000 [33:10<15:19:14,  2.64s/it]

reward of final molecule at episode 136 is -1.5633369806118402, qed is 0.2351846695408082,sa is 5.420694673659754, dock is -6.134 , irrit is 0.0001052088321565247,  corneal is 3.316933794382272, melanin is 0.9702088926346952 , molecule is CN=C(ON)OC12C(=O)C1COC(=NC)N2N
mean loss in episode 136 is 0.035669014789164066


 17%|████████████▌                                                              | 4174/25000 [33:33<5:47:53,  1.00s/it]

reward of final molecule at episode 138 is -1.2196506933516134, qed is 0.24779428664514064,sa is 5.166224266646353, dock is -5.835 , irrit is 0.005517201543526907,  corneal is 3.5321659760318735, melanin is 0.9656361173512795 , molecule is N=C(OC1OON1C(=O)N=C=O)C(=O)O
mean loss in episode 138 is 0.03167582955211401


 17%|████████████▌                                                             | 4233/25000 [34:13<19:10:45,  3.32s/it]

reward of final molecule at episode 140 is -2.407012350748103, qed is 0.1292868700273858,sa is 6.072803510921197, dock is -4.951 , irrit is 0.013676414982442316,  corneal is 3.0120561329213755, melanin is 0.936517588520928 , molecule is COC(C(=O)C(N=N)NONON)N1CN1O
mean loss in episode 140 is 0.04318300262093544


 17%|████████████▉                                                              | 4293/25000 [34:33<4:54:49,  1.17it/s]

reward of final molecule at episode 142 is -3.6147754608728486, qed is 0.2620112521504365,sa is 7.709836699604474, dock is -5.662 , irrit is 0.2739225385353677,  corneal is 3.360418969019524, melanin is 0.9656274520410136 , molecule is ONC1CN2OC34NC23COC14O
mean loss in episode 142 is 0.037627093493938446


 17%|████████████▉                                                             | 4350/25000 [34:52<13:25:25,  2.34s/it]

reward of final molecule at episode 144 is 3.0332606310694383, qed is 0.5962461076165981,sa is 3.006999479959873, dock is -5.066 , irrit is 0.048159047539786816,  corneal is 4.130350557039205, melanin is 0.9807217282210762 , molecule is COCCC(=O)CN1CCCO1
mean loss in episode 144 is 0.03366250265389681


 18%|█████████████▏                                                             | 4413/25000 [35:21<9:48:13,  1.71s/it]

reward of final molecule at episode 146 is 1.4665130054046351, qed is 0.19773480267553953,sa is 3.7158595330605593, dock is -5.334 , irrit is 0.6495144251925626,  corneal is 3.707044994309098, melanin is 0.9656080569549997 , molecule is CCC(=O)CC(=NNC)OOCCO
mean loss in episode 146 is 0.03672241047024727


 18%|█████████████▍                                                             | 4473/25000 [35:38<2:14:28,  2.54it/s]

reward of final molecule at episode 148 is 2.700060671134788, qed is 0.48753202939562973,sa is 3.80583647542883, dock is -5.527 , irrit is 0.7316069041898678,  corneal is 5.268292895400081, melanin is 0.95878742685255 , molecule is Cc1nc2c(o1)-c1[nH][nH]c1-2
mean loss in episode 148 is 0.04893745668232441


 18%|█████████████▌                                                             | 4533/25000 [35:58<4:38:11,  1.23it/s]

reward of final molecule at episode 150 is 3.62123634020575, qed is 0.5896896085072948,sa is 2.768558653981531, dock is -4.307 , irrit is 0.09933153526741612,  corneal is 5.002525261736194, melanin is 0.9858992651590331 , molecule is CCCC(N)COC
mean loss in episode 150 is 0.03155993949621916
ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда 3.62123634020575


 18%|█████████████▌                                                            | 4590/25000 [36:23<18:36:39,  3.28s/it]

reward of final molecule at episode 152 is -0.880046210598823, qed is 0.3773766087188275,sa is 5.332961394569603, dock is -6.552 , irrit is 0.005820967142527897,  corneal is 4.094852040529209, melanin is 0.9217608797608838 , molecule is C=C1C=C1C=C1OC1C(O)ON(C)CO
mean loss in episode 152 is 0.03776727430522442


 19%|█████████████▉                                                             | 4654/25000 [36:40<1:43:47,  3.27it/s]

reward of final molecule at episode 154 is -1.7407544780702726, qed is 0.49428880186084717,sa is 6.629295748652009, dock is -5.596 , irrit is 0.13392486345883775,  corneal is 4.2739489274501175, melanin is 0.9837226448026867 , molecule is C1C2=C3OC4NC3C1=C24
mean loss in episode 154 is 0.03883007727563381


 19%|██████████████▏                                                            | 4713/25000 [36:56<2:24:06,  2.35it/s]

reward of final molecule at episode 156 is -0.9136242211099341, qed is 0.41770158576755195,sa is 4.846459040862413, dock is 0 , irrit is 0.03244177447231539,  corneal is 3.488716221859609, melanin is 0.9798649014921657 , molecule is CCCCNC12C3=C(CNC)C1OC32O
mean loss in episode 156 is 0.045105105265975


 19%|██████████████▎                                                            | 4773/25000 [37:22<9:52:56,  1.76s/it]

reward of final molecule at episode 158 is 3.6155212300265487, qed is 0.4686290391430532,sa is 2.867705197771473, dock is -5.145 , irrit is 0.03927053585776307,  corneal is 5.133135279699559, melanin is 0.900910569646816 , molecule is CCCOC=C(C)CCCNC
mean loss in episode 158 is 0.03998011723160744


 19%|██████████████▍                                                            | 4832/25000 [37:35<1:28:18,  3.81it/s]

reward of final molecule at episode 160 is 2.5906253386796387, qed is 0.44444083512699417,sa is 1.6827192913792715, dock is -3.646 , irrit is 0.9798939087214658,  corneal is 5.408127751718027, melanin is 0.9851695208257708 , molecule is CCC(C)C
mean loss in episode 160 is 0.044678349047899246


 20%|██████████████▍                                                           | 4892/25000 [38:07<12:15:58,  2.20s/it]

reward of final molecule at episode 162 is -1.8986469520499851, qed is 0.23926482388854603,sa is 5.0653956618020475, dock is -5.255 , irrit is 0.5514664052378799,  corneal is 3.638030156405397, melanin is 0.949503384593265 , molecule is C#COC(C=O)(C(=O)C#N)C(OC=O)C(=O)O
mean loss in episode 162 is 0.03326265420764685


 20%|██████████████▋                                                           | 4953/25000 [38:41<11:27:12,  2.06s/it]

reward of final molecule at episode 164 is -2.268325181657575, qed is 0.12387905400787463,sa is 5.8958117994793735, dock is -5.897 , irrit is 0.00047765855658603095,  corneal is 2.955141274543161, melanin is 0.9674672276770291 , molecule is CNOC(C#CN=C=N)C(O)=NONN
mean loss in episode 164 is 0.03234289959073067


 20%|███████████████                                                            | 5013/25000 [39:04<6:22:56,  1.15s/it]

reward of final molecule at episode 166 is 2.248856138610672, qed is 0.47890435425905037,sa is 3.728721508250585, dock is -5.247 , irrit is 0.04156388859653244,  corneal is 3.251904689277261, melanin is 0.9769444375442257 , molecule is Cn1c(CCN)c(OOO)oc1=O
mean loss in episode 166 is 0.04013737104833126


 20%|███████████████▏                                                           | 5070/25000 [39:24<7:49:15,  1.41s/it]

reward of final molecule at episode 168 is -1.6927754029840476, qed is 0.4162604157128388,sa is 5.372756169150133, dock is -5.917 , irrit is 0.17223237794489732,  corneal is 3.2605614952810833, melanin is 0.9679587239784355 , molecule is N#CCC(=O)C1ON2C(=C=N)N(N)C12
mean loss in episode 168 is 0.04014873318374157


 21%|███████████████▍                                                           | 5134/25000 [39:48<5:25:13,  1.02it/s]

reward of final molecule at episode 170 is 2.1697850932680027, qed is 0.3845773951711409,sa is 3.629322621113384, dock is -5.467 , irrit is 0.09398429865572595,  corneal is 3.3472926331678376, melanin is 0.968240371096171 , molecule is CC=CC(O)(O)CCCN
mean loss in episode 170 is 0.028432572260499


 21%|███████████████▌                                                           | 5193/25000 [40:10<4:48:35,  1.14it/s]

reward of final molecule at episode 172 is 3.4579886752065336, qed is 0.6791644902919668,sa is 3.9550902556693712, dock is -5.532 , irrit is 0.1905288944690606,  corneal is 5.008635875760385, melanin is 0.9757446456604976 , molecule is CC(C)(O)C(C)(C)C(=O)C1C=N1
mean loss in episode 172 is 0.03146407287567854


 21%|███████████████▊                                                           | 5253/25000 [40:32<4:59:35,  1.10it/s]

reward of final molecule at episode 174 is -0.6555087270688298, qed is 0.5744042043902059,sa is 4.872915946696552, dock is -4.877 , irrit is 0.17091438562492717,  corneal is 3.9918817149547703, melanin is 0.9748515859099679 , molecule is CCOC(C#N)C1N(C)N1O
mean loss in episode 174 is 0.03501533344388008


 21%|███████████████▉                                                           | 5311/25000 [40:46<3:08:19,  1.74it/s]

reward of final molecule at episode 176 is -0.5749301045758166, qed is 0.4783427373119974,sa is 4.380415965265491, dock is -5.499 , irrit is 0.534908864593174,  corneal is 4.464865194181542, melanin is 0.9531856540935186 , molecule is C=C1CC(C)C(=O)C1N
mean loss in episode 176 is 0.03768528252840042


 21%|████████████████                                                           | 5374/25000 [41:05<3:25:05,  1.59it/s]

reward of final molecule at episode 178 is 0.36532858185729555, qed is 0.4832589337301299,sa is 4.190718112064525, dock is -4.892 , irrit is 0.267693561422268,  corneal is 4.9588864920738285, melanin is 0.9852313714935157 , molecule is CCC1CC(C(C)OO)O1
mean loss in episode 178 is 0.03954256325960159


 22%|████████████████▎                                                          | 5433/25000 [41:29<7:25:00,  1.36s/it]

reward of final molecule at episode 180 is -2.069150984401427, qed is 0.21755814947081056,sa is 5.5887529568605965, dock is -6.36 , irrit is 0.2007339936638041,  corneal is 3.2176320926644535, melanin is 0.9718320673963973 , molecule is N=C1CC2(O1)C1N2C1(N)NC(=O)C(=O)O
mean loss in episode 180 is 0.04565195366740227


 22%|████████████████▍                                                          | 5493/25000 [41:49<5:11:51,  1.04it/s]

reward of final molecule at episode 182 is 2.955643849156261, qed is 0.5679621520090794,sa is 2.673921462760301, dock is -4.584 , irrit is 0.6632763018701288,  corneal is 5.259296434472222, melanin is 0.9749448987101939 , molecule is CCC(N)CCCOC
mean loss in episode 182 is 0.02923039346933365


 22%|████████████████▍                                                         | 5550/25000 [42:44<69:58:14, 12.95s/it]

reward of final molecule at episode 184 is -2.298301175747952, qed is 0.2559834861925747,sa is 6.3584973088169665, dock is -6.826 , irrit is 0.00036403694270362397,  corneal is 3.2629963566757323, melanin is 0.9554500565193107 , molecule is CCC(N)C1(N)C2(C(=O)OO)C(=O)C3C4(COC4)NOC321
mean loss in episode 184 is 0.02813488431274891


 22%|████████████████▊                                                          | 5613/25000 [43:08<5:53:32,  1.09s/it]

reward of final molecule at episode 186 is -1.7060140363049139, qed is 0.24832200639949803,sa is 4.775832324634887, dock is -5.391 , irrit is 0.6361098483336775,  corneal is 3.742845788403291, melanin is 0.9670173378431446 , molecule is C=C(C)C(N)C(=C)C(OO)OO
mean loss in episode 186 is 0.035288216546177864


 23%|█████████████████                                                          | 5673/25000 [43:29<5:39:46,  1.05s/it]

reward of final molecule at episode 188 is 3.6432384611607085, qed is 0.48322790576431784,sa is 3.278808447004386, dock is -5.433 , irrit is 0.007296557263397047,  corneal is 5.035065323823597, melanin is 0.9663134674683855 , molecule is CCCCC(C)NC(=N)OC
mean loss in episode 188 is 0.023615934886038303
ЛУЧШИЕ ВЕСА СОХРАНЕНЫ, награда 3.6432384611607085


 23%|█████████████████▏                                                         | 5733/25000 [43:54<7:39:04,  1.43s/it]

reward of final molecule at episode 190 is -0.24662932371488133, qed is 0.5292519489709563,sa is 4.70282042987337, dock is -5.916 , irrit is 0.2165483085105373,  corneal is 4.538847658711679, melanin is 0.9562663787701493 , molecule is C=COC(=Cc1no[nH]1)C(CC)NOC
mean loss in episode 190 is 0.03613770008087158


 23%|█████████████████▏                                                        | 5790/25000 [44:37<50:45:41,  9.51s/it]

reward of final molecule at episode 192 is -1.9736031370714455, qed is 0.09782037270905912,sa is 5.563460273588846, dock is -6.293 , irrit is 0.02746042365925245,  corneal is 3.1208322333535965, melanin is 0.9544112852965372 , molecule is C#CC(=O)OOC(O)C(=NO)C1ON(N=CO)C1=O
mean loss in episode 192 is 0.04162379913032055


 23%|█████████████████▌                                                         | 5853/25000 [45:06<9:54:31,  1.86s/it]

reward of final molecule at episode 194 is -1.67362510115883, qed is 0.26179838358499585,sa is 5.604636767177551, dock is -6.178 , irrit is 0.008825862584960988,  corneal is 3.3327996847223003, melanin is 0.9767458431989813 , molecule is N=C(O)COC(=O)C12NC(CO)C3C1C32
mean loss in episode 194 is 0.027222738601267338


 24%|█████████████████▌                                                        | 5913/25000 [45:46<17:07:06,  3.23s/it]

reward of final molecule at episode 196 is -1.5013381674932198, qed is 0.17583094188449544,sa is 4.11680071148429, dock is -5.707 , irrit is 0.5317834985445691,  corneal is 3.2317041807261844, melanin is 0.9621169430792896 , molecule is C=C(C)CC(=O)OOC(=CCN)CNCO
mean loss in episode 196 is 0.03264165669679642


 24%|█████████████████▉                                                         | 5973/25000 [46:04<3:30:37,  1.51it/s]

reward of final molecule at episode 198 is 2.925418884540221, qed is 0.552213853856346,sa is 3.613949539683377, dock is -4.527 , irrit is 0.08846854017811134,  corneal is 4.1646583086702345, melanin is 0.9842199765734256 , molecule is CCCOC1OCC1=O
mean loss in episode 198 is 0.027571342885494232


 24%|█████████████████▊                                                        | 6032/25000 [46:34<13:31:18,  2.57s/it]

reward of final molecule at episode 200 is -0.4038392950183339, qed is 0.3388193851799443,sa is 4.0379080936478156, dock is -5.33 , irrit is 0.0512259929262676,  corneal is 3.4839718175087886, melanin is 0.9810793628141937 , molecule is CCC(NCC=CC(C)O)C(O)O
mean loss in episode 200 is 0.04409034177660942


 24%|██████████████████                                                        | 6093/25000 [47:07<13:15:23,  2.52s/it]

reward of final molecule at episode 202 is -1.1757511552019075, qed is 0.13723794875674028,sa is 4.525475353098565, dock is -6.606 , irrit is 0.005456237802090581,  corneal is 3.0745936568704515, melanin is 0.9567064814622546 , molecule is CC(C)OC1=C(C(N)=NN)C1(OO)C(=N)C(=O)O
mean loss in episode 202 is 0.05531022511422634


 25%|██████████████████▏                                                       | 6152/25000 [48:01<30:43:24,  5.87s/it]

reward of final molecule at episode 204 is -2.6189153603288324, qed is 0.3807127451437829,sa is 7.253287544733991, dock is -6.322 , irrit is 0.00017767157786289739,  corneal is 3.5588064612265704, melanin is 0.9824102103333514 , molecule is CC(O)C(=N)CCC1=NC12C1OC2(N=O)N1ON=O
mean loss in episode 204 is 0.032820213586091995


 25%|██████████████████▍                                                       | 6213/25000 [48:52<26:23:44,  5.06s/it]

reward of final molecule at episode 206 is -0.7077193270029768, qed is 0.30044426476333325,sa is 4.38772907383435, dock is -5.611 , irrit is 0.00021393958680709642,  corneal is 3.386769490078209, melanin is 0.9627855562394779 , molecule is CCCCC(ON(C)NON)C(O)(O)OCCC
mean loss in episode 206 is 0.034371837973594666


 25%|██████████████████▌                                                       | 6270/25000 [49:27<22:03:34,  4.24s/it]

reward of final molecule at episode 208 is -1.746437461436803, qed is 0.1880057597031691,sa is 5.3663315902009625, dock is -5.755 , irrit is 0.0003620653943896895,  corneal is 3.067322796678847, melanin is 0.9671470450143974 , molecule is C#COC(C=O)(C1=NN1NO)N(O)CC(C)(C)O
mean loss in episode 208 is 0.04071790166199207


 25%|██████████████████▉                                                        | 6332/25000 [49:44<2:27:19,  2.11it/s]

reward of final molecule at episode 210 is 2.4433302204914034, qed is 0.5571728864199083,sa is 3.0730206493660255, dock is -4.839 , irrit is 0.9798217228851356,  corneal is 5.237059465371573, melanin is 0.9838987975208043 , molecule is CCOC1(C)CC(=O)C1
mean loss in episode 210 is 0.03064096439629793


 26%|███████████████████▏                                                       | 6393/25000 [50:03<3:00:08,  1.72it/s]

reward of final molecule at episode 212 is 2.6192572782504606, qed is 0.6621194280302631,sa is 3.7553926166064544, dock is -5.256 , irrit is 0.9747020471825278,  corneal is 5.438198187247847, melanin is 0.9770642301153886 , molecule is CCC(C)Cc1c[nH]o1
mean loss in episode 212 is 0.03065971378237009


 26%|███████████████████                                                       | 6453/25000 [50:38<13:34:10,  2.63s/it]

reward of final molecule at episode 214 is 2.831287971383382, qed is 0.35784784108544515,sa is 3.7501229874718343, dock is -5.797 , irrit is 0.0005311235916875847,  corneal is 4.12425835650564, melanin is 0.9477219859074814 , molecule is CCOCCC(=O)NON=CC(=N)CC
mean loss in episode 214 is 0.0350216431543231


 26%|███████████████████▎                                                      | 6512/25000 [51:03<10:22:32,  2.02s/it]

reward of final molecule at episode 216 is 2.079135640761686, qed is 0.5059051579501217,sa is 3.2049067335345747, dock is -5.456 , irrit is 0.37861957721758127,  corneal is 3.6112546152033675, melanin is 0.9809005999393142 , molecule is CC(=O)C(COC(=O)NC(C)C)OO
mean loss in episode 216 is 0.0386694110929966


 26%|███████████████████▍                                                      | 6573/25000 [51:37<13:09:59,  2.57s/it]

reward of final molecule at episode 218 is -1.667068055889446, qed is 0.13950842426149015,sa is 5.3495763807893, dock is -6.096 , irrit is 0.02758163858169681,  corneal is 3.2402531054025308, melanin is 0.9808472868401968 , molecule is CC(=N)OCC(C)C1(C(O)ONO)OC1=N
mean loss in episode 218 is 0.03631945885717869


 27%|███████████████████▋                                                      | 6633/25000 [52:11<10:34:47,  2.07s/it]

reward of final molecule at episode 220 is 1.91940259706353, qed is 0.43501500962046213,sa is 3.821042728184814, dock is -5.181 , irrit is 0.465538234090588,  corneal is 3.6993185538879825, melanin is 0.9824567485662676 , molecule is CCNOC(CC(=O)O)C(=O)OOCC
mean loss in episode 220 is 0.04340638779103756


 27%|████████████████████                                                       | 6692/25000 [52:30<4:46:24,  1.07it/s]

reward of final molecule at episode 222 is -1.0574406320830105, qed is 0.5623025868289165,sa is 5.120155913751505, dock is -6.05 , irrit is 0.9554715212514712,  corneal is 5.277462053712878, melanin is 0.9745211076813536 , molecule is COC1=C2CC3(C)C(=O)C23C1
mean loss in episode 222 is 0.03603288345038891


 27%|████████████████████▎                                                      | 6750/25000 [52:48<4:51:42,  1.04it/s]

reward of final molecule at episode 224 is 1.9380760201619391, qed is 0.4304024682806884,sa is 2.902857532355986, dock is -5.058 , irrit is 0.9033039938247429,  corneal is 4.522728461540149, melanin is 0.9758968890073053 , molecule is CCOC(=O)CC(C)=C=O
mean loss in episode 224 is 0.03228164371103048


 27%|███████████████████                                                   | 6812/25000 [1:16:06<808:34:07, 160.04s/it]

reward of final molecule at episode 226 is -1.4544970543644329, qed is 0.4810483375152063,sa is 4.810703072141196, dock is -5.128 , irrit is 0.9686613430271304,  corneal is 4.53708833925798, melanin is 0.9757042344817421 , molecule is CC12COC(C=O)C1C2
mean loss in episode 226 is 0.02853256370872259


 27%|████████████████████                                                     | 6873/25000 [1:16:23<2:38:50,  1.90it/s]

reward of final molecule at episode 228 is 1.8437761714748249, qed is 0.5613004850577169,sa is 3.270235802842393, dock is -4.523 , irrit is 0.7025444639220901,  corneal is 3.903322060377508, melanin is 0.9733924259130706 , molecule is O=C(O)CC1=CC=N1
mean loss in episode 228 is 0.03278934024274349


 28%|████████████████████▏                                                    | 6934/25000 [1:16:48<5:30:48,  1.10s/it]

reward of final molecule at episode 230 is -0.19780725644214397, qed is 0.46772085413903647,sa is 4.784545683947336, dock is -5.571 , irrit is 0.06683694126303147,  corneal is 4.458866920766283, melanin is 0.9494776077063068 , molecule is CCCCCON1OCC2N(C)N21
mean loss in episode 230 is 0.03512331284582615


 28%|████████████████████▍                                                    | 6992/25000 [1:17:06<4:57:32,  1.01it/s]

reward of final molecule at episode 232 is -0.8938209542627696, qed is 0.5603346124941921,sa is 4.576519506346526, dock is -4.674 , irrit is 0.00215581107152472,  corneal is 3.0637391209034313, melanin is 0.9656013177166405 , molecule is CC(N)C(CO)CN1N=N1
mean loss in episode 232 is 0.03187015280127525


 28%|████████████████████▎                                                   | 7052/25000 [1:17:39<13:33:21,  2.72s/it]

reward of final molecule at episode 234 is -1.2458910094674647, qed is 0.5431524169358682,sa is 5.333423474265125, dock is -5.661 , irrit is 0.0043101114651019085,  corneal is 3.3585628561542444, melanin is 0.9706732952529805 , molecule is CC12N=C(C#N)CC1(OCN)C2O
mean loss in episode 234 is 0.030762660317122936


 28%|████████████████████▍                                                   | 7113/25000 [1:18:11<10:40:56,  2.15s/it]

reward of final molecule at episode 236 is 2.491412675261941, qed is 0.2407871019438651,sa is 3.67333968413954, dock is -6.325 , irrit is 0.05758755236308439,  corneal is 3.8566704815462316, melanin is 0.9687927579045037 , molecule is CCCC1=C2C(=C(C(=O)O)C(O)=NOC)N12
mean loss in episode 236 is 0.030262799002230167


 29%|████████████████████▉                                                    | 7174/25000 [1:18:31<2:31:16,  1.96it/s]

reward of final molecule at episode 238 is -0.8159129650677183, qed is 0.22000018801040658,sa is 4.095218770987847, dock is 0 , irrit is 0.019700555211845185,  corneal is 3.075500749042111, melanin is 0.9573914919866257 , molecule is CCC(=C=C(N)NC(C)=N)C(=O)O
mean loss in episode 238 is 0.03038691356778145


 29%|█████████████████████                                                    | 7232/25000 [1:18:53<7:12:00,  1.46s/it]

reward of final molecule at episode 240 is 0.14554135249851147, qed is 0.5190185990981249,sa is 4.228276843424682, dock is -6.158 , irrit is 0.005425404772626468,  corneal is 4.144342685242195, melanin is 0.9816823594313157 , molecule is CCCC(=C1C(C)C1(O)O)N1OC1=O
mean loss in episode 240 is 0.022883868776261806


 29%|█████████████████████                                                   | 7293/25000 [1:19:30<12:33:00,  2.55s/it]

reward of final molecule at episode 242 is 2.0753603161756815, qed is 0.29096613921166364,sa is 3.9801415151670625, dock is -5.696 , irrit is 0.14450767954283675,  corneal is 3.4283958183508507, melanin is 0.9822296945179256 , molecule is CC(C)NC(C)CC(=O)OOC(O)=C=O
mean loss in episode 242 is 0.02594667673110962


 29%|█████████████████████▍                                                   | 7353/25000 [1:19:51<3:49:34,  1.28it/s]

reward of final molecule at episode 244 is -0.6541517485080891, qed is 0.5921644569740562,sa is 4.045737436297419, dock is -6.858 , irrit is 0.2609957990056432,  corneal is 3.324252769506958, melanin is 0.9779765904617466 , molecule is CC(C)(O)C1C=C(N)C=CC1=O
mean loss in episode 244 is 0.025535881519317627


 30%|█████████████████████▋                                                   | 7413/25000 [1:20:19<8:30:28,  1.74s/it]

reward of final molecule at episode 246 is 2.0606269849763605, qed is 0.19716503723116724,sa is 3.6323250646941405, dock is -5.545 , irrit is 0.05979975123348316,  corneal is 3.3232571206712103, melanin is 0.965445911961105 , molecule is CCCC(=O)C(NN)=C(C)C(N)N
mean loss in episode 246 is 0.022981461137533188


 30%|█████████████████████▊                                                   | 7472/25000 [1:20:39<4:28:54,  1.09it/s]

reward of final molecule at episode 248 is 0.044090061484158415, qed is 0.24525313069066482,sa is 4.35126474528718, dock is -4.759 , irrit is 0.03189149564347786,  corneal is 4.495765778824695, melanin is 0.955756083640769 , molecule is CC(C=O)COOC1C=C1
mean loss in episode 248 is 0.030059633776545525


 30%|█████████████████████▉                                                   | 7534/25000 [1:21:03<4:48:48,  1.01it/s]

reward of final molecule at episode 250 is -0.06212886249351415, qed is 0.6251915149719562,sa is 4.604739229233077, dock is -5.516 , irrit is 0.0026669091514347986,  corneal is 4.124482580029434, melanin is 0.9842713478959041 , molecule is C#CC1CC(OC(C)(C)CN)O1
mean loss in episode 250 is 0.033972613513469696


 30%|██████████████████████▏                                                  | 7592/25000 [1:21:29<7:52:06,  1.63s/it]

reward of final molecule at episode 252 is -1.1665786767552349, qed is 0.5882734329329844,sa is 4.759027703246555, dock is -5.065 , irrit is 0.25403047816090324,  corneal is 3.326055611994443, melanin is 0.9837755838528378 , molecule is COC(N)C(C)(CO)C(C)C#N
mean loss in episode 252 is 0.027348547242581844


 31%|██████████████████████▎                                                  | 7653/25000 [1:21:53<5:01:36,  1.04s/it]

reward of final molecule at episode 254 is 2.464097664277972, qed is 0.34255299306023035,sa is 3.0642344022312953, dock is -4.781 , irrit is 0.11381036520374396,  corneal is 3.7917113022415316, melanin is 0.953993595803488 , molecule is CCN(C)CC(=O)C(=O)OO
mean loss in episode 254 is 0.018074081279337406


 31%|██████████████████████▌                                                  | 7710/25000 [1:22:14<8:38:20,  1.80s/it]

reward of final molecule at episode 256 is 2.0780058175287586, qed is 0.5310181133038439,sa is 3.7903049284761927, dock is -4.537 , irrit is 0.3149532778590541,  corneal is 3.5160777884742904, melanin is 0.9807297236404227 , molecule is COC(N)CCC(C)N
mean loss in episode 256 is 0.016719136387109756


 31%|██████████████████████▍                                                 | 7773/25000 [1:22:49<11:23:24,  2.38s/it]

reward of final molecule at episode 258 is -0.25173287699926766, qed is 0.4780611895266687,sa is 4.489341577810711, dock is -5.213 , irrit is 0.03121101875148247,  corneal is 4.002157027305231, melanin is 0.9541520267930379 , molecule is CC=NC(N)(CCCC(C)N)OC
mean loss in episode 258 is 0.020987090654671192


 31%|██████████████████████▌                                                 | 7833/25000 [1:23:40<20:36:27,  4.32s/it]

reward of final molecule at episode 260 is -1.1037546666613796, qed is 0.2258997948651102,sa is 4.90086197245966, dock is -6.211 , irrit is 0.06277345706211901,  corneal is 3.5354494074171994, melanin is 0.9819300968620225 , molecule is COOC(C)OC(=O)C(=O)C12CC1C(=O)C2C
mean loss in episode 260 is 0.02950931526720524


 32%|███████████████████████                                                  | 7892/25000 [1:24:06<9:52:40,  2.08s/it]

reward of final molecule at episode 262 is 2.752388752228936, qed is 0.60207711919728,sa is 2.6309819050504935, dock is -5.031 , irrit is 0.09004447721422086,  corneal is 3.7839973682225603, melanin is 0.9853756889120232 , molecule is CCC(CCCN)C(=O)O
mean loss in episode 262 is 0.02467042114585638


 32%|██████████████████████▉                                                 | 7952/25000 [1:24:41<16:59:55,  3.59s/it]

reward of final molecule at episode 264 is 2.482093383973993, qed is 0.5159665100263395,sa is 3.9227031327128854, dock is -4.991 , irrit is 0.004785395973807747,  corneal is 3.4834850695480712, melanin is 0.9719522898240925 , molecule is CNCC(CC(CCOC)ON)OC
mean loss in episode 264 is 0.033611828461289406


 32%|███████████████████████                                                 | 8013/25000 [1:25:32<21:03:12,  4.46s/it]

reward of final molecule at episode 266 is -1.4967349578510443, qed is 0.29610308283063547,sa is 5.389106727530856, dock is 0 , irrit is 0.589425963748347,  corneal is 4.514821403768086, melanin is 0.955733380255363 , molecule is C=NC1(NOOCCC)COC2CC21
mean loss in episode 266 is 0.022205770015716553


 32%|███████████████████████▍                                                 | 8016/25000 [1:25:34<3:01:19,  1.56it/s]

KeyboardInterrupt



In [None]:
generated_molecules = []
num_molecules_to_generate = 100
agent.dqn.eval()
eps_threshold = 0.03

for it in range(num_molecules_to_generate):
    done = False
    environment.initialize()
    while not done:
        steps_left = hyp.max_steps_per_episode - environment.num_steps_taken
        valid_actions = list(environment.get_valid_actions())
    
        observations = np.vstack(
            [
                np.append(
                    utils.get_fingerprint(
                        act, hyp.fingerprint_length, hyp.fingerprint_radius
                    ),
                    steps_left,
                )
                for act in valid_actions
            ]
        ) 
    
        observations_tensor = torch.Tensor(observations)
        a = agent.get_action(observations_tensor, eps_threshold)
        action = valid_actions[a]
        result = environment.step(action)
    
        action_fingerprint = np.append(
            utils.get_fingerprint(action, hyp.fingerprint_length, hyp.fingerprint_radius),
            steps_left,
        )
    
        next_state, reward, done = result
        steps_left = hyp.max_steps_per_episode - environment.num_steps_taken
    
        next_state = utils.get_fingerprint(
            next_state, hyp.fingerprint_length, hyp.fingerprint_radius
        )  
    
        action_fingerprints = np.vstack(
            [
                np.append(
                    utils.get_fingerprint(
                        act, hyp.fingerprint_length, hyp.fingerprint_radius
                    ),
                    steps_left,
                )
                for act in environment.get_valid_actions()
            ]
        )
        #print(environment._state)
    
    
    generated_molecules.append(environment._state)
    print(generated_molecules[-1])

In [None]:
generated_molecules

In [None]:
from rdkit import Chem

valid_smiles = []
for smi in generated_molecules:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        valid_smiles.append(smi)

print(f"Сгенерировано валидных молекул: {len(valid_smiles)}/{len(generated_molecules)}")