In [1]:
# !pip install rdkit
# !pip install allennlp
# !pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html -U
# !pip install torch==1.7.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
# !pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
import torch
import pickle as pi
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
import tensorflow as tf

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
import os
# os.chdir('/content/drive/MyDrive/MolGen')

In [5]:
clf = pi.load(open('weights/clf.pkl', 'rb'))

In [6]:
drug = 'CN1C=NC2=C1C(=O)N(C)C(=O)N2C'

In [7]:
generated_example = 'NC(=O)CN1CCCC1=O'

In [8]:
def generate_descriptors(smiles):
  
  descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
  get_descriptors = rdMolDescriptors.Properties(descriptor_names)
  
  molecule_object = Chem.MolFromSmiles(smiles)
  final_descriptors = np.array(get_descriptors.ComputeProperties(molecule_object)).reshape((-1, 43))

  return final_descriptors


def get_clf_input(coformer_smiles, drug_smiles=drug):

  drug_descriptors, coformer_descriptors = generate_descriptors(drug_smiles), generate_descriptors(coformer_smiles)

  final_input = np.concatenate((drug_descriptors, coformer_descriptors), axis=1)

  return final_input


def calculate_clf_error(coformer_smiles, desired_clf_output=1, drug_smiles=drug, classifier=clf):

  clf_input = get_clf_input(coformer_smiles, drug_smiles)
  clf_prediction = classifier.predict_proba(clf_input)[:,desired_clf_output]

  error = tf.keras.metrics.binary_crossentropy(desired_clf_output, 
                                               clf_prediction)

  return float(error)

### Объединенная модель

In [9]:
from model import MolGen

In [10]:
#создание списка SMILES
with open('data/database_cof_100smb_kekule.csv', "r") as file:
  data = [molecule.replace('\n', '') for molecule in file]

In [11]:
data

['CC1(C)C(=O)NC1S(=O)(=O)C1=CC=CC=C1',
 'CC1C(=O)NC1S(=O)(=O)C1=CC=CC=C1',
 'OC1=CC=CC(O)=C1',
 'C(=CC1=CC=NC=C1)C1=CC=NC=C1',
 'C1=NC=CC(C2C(C3=CC=NC=C3)C(C3=CC=NC=C3)C2C2=CC=NC=C2)=C1',
 'COC1=CC(O)=CC(O)=C1',
 'C(=CC1=CC=CC=N1)C1=CC=CC=N1',
 'NC(=O)N1C2=CC=CC=C2C=CC2=CC=CC=C21',
 'OC1=CC=C(O)C=C1',
 'NC(=O)CN1CCCC1=O',
 'O=C1C(O)=C(Cl)C(=O)C(O)=C1Cl',
 'C(#CC1=CC=CC=N1)C1=CC=CC=N1',
 'NC(=O)C1=CC=CN=C1',
 'O=C(O)C1=CC2=CC=CC=C2C=C1O',
 'O=C(O)C1=CC([N+](=O)[O-])=CC([N+](=O)[O-])=C1',
 'C1=CC=C2C(=C1)C=CC1=CC=C3C=CC4=CC=C5C=CC=CC5=C4C3=C12',
 'N#CC(C#N)=C1C=CC(=C(C#N)C#N)C=C1',
 'O=C(O)C1=CC(C(=O)O)=CC([N+](=O)[O-])=C1',
 'C1=CC=C2C(=C1)C=CC1=CC=CC=C12',
 'CC(=O)C=C(C)O',
 'CC1=C(C2=C(C)C=C3C(C(C)C)=C(O)C(O)=C(C=O)C3=C2O)C(O)=C2C(C=O)=C(O)C(O)=C(C(C)C)C2=C1',
 'COC(=O)N1C=NC(C(N)=O)=C1N',
 'CN1N=NC2=C(C(N)=O)N=CN2C1=O',
 'NC1=CC=CC=C1C(=O)O',
 'O=C(O)C(O)C(O)C(=O)O',
 'O=C(O)C(=O)O',
 'O=C(O)C1=CC=CC=C1O',
 'C1=CC2=CC=CC3=C2C(=C1)CC3',
 'O=C1C(Cl)=C(Cl)C(=O)C(Cl)=C1Cl',
 'C1COCCOCCOC

In [12]:
#загрузка предобученной только на ChemBL генеративной модели и ее дообучение
# gan_mol = pi.load(open('weights/gan_mol7_50ksteps_100symb_kekuke.pkl', 'rb'))  # Nina

gan_mol = MolGen(data=data, clf_path='weights/clf.pkl')
pretrained_model = pi.load(open('weights/gan_mol7_50ksteps_100symb_kekuke.pkl', 'rb'))

# загрузчик данных для модели
loader = gan_mol.create_dataloader(data, batch_size=20, shuffle=True, num_workers=10)

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
pretrained_model.parameters

In [None]:
# train model for 10000 steps
pretrained_model.train()

In [None]:
torch.cuda.current_device()

In [None]:
torch.cuda.device(0)

In [None]:
torch.cuda.device_count()

In [14]:
torch.cuda.get_device_name(0)

AssertionError: Torch not compiled with CUDA enabled

In [13]:
torch.cuda.is_available()

False

In [None]:
torch._C._cuda_getCompiledVersion()

In [None]:
import warnings #возникает одна и та же ошибка
warnings.filterwarnings('ignore')

training_steps = 1000
cofomers_training_history = pretrained_model.train_n_steps(loader, max_step=training_steps, evaluate_every=10)

### График измененеие clf loss в процессе обучения

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
x1 = np.arange(training_steps)
x2 = np.arange(training_steps)

# clf_loss_chembl = chembl_training_history['classifier_loss']
clf_loss_cofomers = cofomers_training_history['classifier_loss']

In [None]:
# пока чтобы просто работало, потому что метрики первого обучения не вытащить, судя по всему
clf_loss_chembl = np.sqrt(
  np.array(
  [0.6666666666666666, 0.618421052631579, 0.5135135135135135, 0.4050632911392405, 0.3, 0.34285714285714286, 0.2875, 0.23333333333333334, 0.4574468085106383, 0.5157894736842106, 0.7065217391304348, 0.5952380952380952, 0.5909090909090909, 0.5, 0.4878048780487805, 0.36764705882352944, 0.2830188679245283, 0.375, 0.44, 0.3220338983050847, 0.475, 0.7755102040816326, 0.7727272727272727, 0.66, 0.6428571428571429, 0.48214285714285715, 0.48333333333333334, 0.5625, 0.4069767441860465, 0.4090909090909091, 0.38372093023255816, 0.4880952380952381, 0.5116279069767442, 0.49333333333333335, 0.5945945945945946, 0.3977272727272727, 0.32222222222222224, 0.3333333333333333, 0.3522727272727273, 0.36046511627906974, 0.4431818181818182, 0.38095238095238093, 0.47674418604651164, 0.5851063829787234, 0.6222222222222222, 0.627906976744186, 0.7228915662650602, 0.6046511627906976, 0.6951219512195121, 0.7333333333333333, 0.6477272727272727, 0.7439024390243902, 0.5444444444444444, 0.7093023255813954, 0.5, 0.5, 0.44047619047619047, 0.5227272727272727, 0.40789473684210525, 0.34146341463414637, 0.45555555555555555, 0.4090909090909091, 0.4714285714285714, 0.48717948717948717, 0.5972222222222222, 0.4594594594594595, 0.5294117647058824, 0.5735294117647058, 0.5384615384615384, 0.47560975609756095, 0.4367816091954023, 0.5217391304347826, 0.4186046511627907, 0.36585365853658536, 0.4230769230769231, 0.46153846153846156, 0.47619047619047616, 0.4868421052631579, 0.5606060606060606, 0.5714285714285714, 0.4166666666666667, 0.5217391304347826, 0.38823529411764707, 0.3333333333333333, 0.4186046511627907, 0.43617021276595747, 0.36666666666666664, 0.40476190476190477, 0.45555555555555555, 0.5977011494252874, 0.46987951807228917, 0.4148936170212766, 0.4418604651162791, 0.36666666666666664, 0.3953488372093023, 0.38372093023255816, 0.5760869565217391, 0.47, 0.5416666666666666, 0.4791666666666667]
)
)

In [None]:
# построение графика
plt.figure(figsize=(15,10))
plt.plot(x1, clf_loss_cofomers, label = "Предобученная на коформерах модель")
plt.plot(x2, clf_loss_chembl, label = "Предобученной только на ChemBL модель")
plt.title('Изменение clf loss в процессе обучения')
plt.xlabel('Итерация обучения')
plt.ylabel('clf loss mean')
plt.legend()
plt.show()

In [None]:
# discriminator + generator loss plot
x = np.arange(training_steps)
discr_loss = cofomers_training_history['discriminator_loss']
gen_loss = cofomers_training_history['generator_loss']
plt.plot(x, discr_loss, label='Discriminator loss')
plt.plot(x, gen_loss, label='Generator loss')
plt.title('GAN metrics during training')
plt.xlabel('Steps')
plt.ylabel('Mean loss')

In [None]:
# discriminator + generator + classifier loss plot
x = np.arange(training_steps)
discr_loss = cofomers_training_history['discriminator_loss']
gen_loss = cofomers_training_history['generator_loss']
classifier_loss = cofomers_training_history['classifier_loss']
plt.plot(x, discr_loss, label='Discriminator loss')
plt.plot(x, gen_loss, label='Generator loss')
plt.plot(x, classifier_loss, label='Classifier loss')
plt.title('GAN and classifier metrics during training')
plt.xlabel('Steps')
plt.ylabel('Mean loss')