In [None]:
import os

# Move from the notebook's folder to its parent, where the relative
# "weights/" and "database/" paths used by the following cells are resolved.
# The starting directory is remembered the first time this cell runs so the
# cell is idempotent: a bare os.chdir("..") would climb one directory higher
# on every re-run and silently break all relative paths.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
import pickle as pi
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools

from model import MolGen

In [4]:
# load data
# Alternative sources used previously: "qm9.csv" (SMILES in the second
# comma-separated column) and "database_final_all_100smb_kekule.csv".
#
# The training SMILES are read straight out of the zip archive. Both the
# archive and the member stream are closed as soon as pandas has parsed the
# CSV, instead of leaving the handles open for the life of the kernel.
with zipfile.ZipFile("database/concatenated_smiles.zip", "r") as zf:
    with zf.open("concatenated_smiles.csv") as csv_file:
        data = pd.read_csv(csv_file)
x = data["smiles"]

# Unpickle the classifier that is handed to the GAN.
# SECURITY: pickle can execute arbitrary code while loading; only load
# classifier files that come from a trusted source.
with open("weights/clf.pkl", "rb") as clf_file:
    clf = pi.load(clf_file)

# create model
gan_mol = MolGen(x, classifier=clf, hidden_dim=128, lr=5e-4, device="cpu")

### Pre-train GAN on CHEMBL

In [None]:
# Batch loader over the full SMILES column for the warm-up phase.
loader = gan_mol.create_dataloader(
    x,
    batch_size=64,
    shuffle=True,
    num_workers=4,
)

# Initial training for the discriminator; the returned history is kept so the
# loss curve can be plotted afterwards.
initial_history = gan_mol.initial_train_n_steps(
    loader,
    max_step=2000,
    evaluate_every=50,
)

In [None]:
# Initial training is finished: switch the model out of training mode
# (assumes MolGen follows torch.nn.Module eval() semantics).
gan_mol.eval()
print('ok')

# Write the current model weights to disk.
initial_weights_path = "weights/initial_discr_mol_gan_new.pt"
torch.save(gan_mol.state_dict(), initial_weights_path)

### Plot GAN discriminator initial training loss

In [None]:
# Discriminator loss recorded during the initial training run, one point per
# recorded entry.
disc_loss = initial_history["loss_disc"]
steps = np.arange(len(disc_loss))

fig, ax = plt.subplots()
ax.plot(steps, disc_loss, label="Initial discriminator loss")
ax.legend(loc="upper right")
ax.set_xlabel("steps")
ax.set_ylabel("loss")
ax.grid(True)
plt.show()

In [5]:
# GAN pre-training: run for up to 5000 steps (max_step), with
# evaluate_every=50.

# set GAN to the training mode
# The preceding save cell calls gan_mol.eval(); switch back before training so
# the model is not trained in evaluation mode (assumes MolGen follows
# torch.nn.Module train()/eval() semantics, as the coformer cell also does).
gan_mol.train()

# Optional: restart from the initially trained discriminator weights on disk
# instead of continuing with the in-memory model.
# gan_mol = MolGen(x, clf, hidden_dim=64, lr=1e-4, device="cuda")
# gan_mol.load_state_dict(torch.load("initial_discr_mol_gan.pt"))

# create dataloader
loader = gan_mol.create_dataloader(x, batch_size=128, shuffle=True, num_workers=4)

pretrain_history = gan_mol.train_n_steps(loader, mode="pretrain", max_step=5000, evaluate_every=50)

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

['SF']
valid: 0.01 
 

[]
valid: 0.0 
 

['I']
valid: 0.01 
 

[]
valid: 0.0 
 

['I', 'SP']
valid: 0.02 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

['F']
valid: 0.01 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

['FI', 'O']
valid: 0.02 
 

[]
valid: 0.0 
 

['P']
valid: 0.01 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

[]
valid: 0.0 
 

['B']
valid: 0.01 
 

[]
valid: 0.0 
 



KeyboardInterrupt: 

In [None]:
# Pre-training is finished: switch the model out of training mode
# (assumes MolGen follows torch.nn.Module eval() semantics).
gan_mol.eval()
print('ok')

# Write the pre-trained model weights to disk.
pretrain_weights_path = "weights/pretrain_mol_gan.pt"
torch.save(gan_mol.state_dict(), pretrain_weights_path)

### Generate SMILES molecules

In [None]:
# Draw and PandasTools are imported with the other imports at the top of the
# notebook. The former `from tensorboard.notebook import display` was unused
# and replaced IPython's built-in display() with TensorBoard's launcher.

# After training: sample SMILES strings from the generator.
smiles_list = gan_mol.generate_n(100)

# Parse every sample exactly once and keep only those RDKit can parse
# (Chem.MolFromSmiles returns None for an invalid SMILES string).
# NOTE: despite its name, `valid_smiles` holds RDKit Mol objects, not strings;
# the name is kept because the drawing cell reads it.
parsed_mols = (Chem.MolFromSmiles(smi) for smi in smiles_list)
valid_smiles = [mol for mol in parsed_mols if mol is not None]
# df = pd.DataFrame(valid_smiles, columns=["smiles"])
valid_smiles

In [None]:
# Render the successfully parsed molecules as an image grid, five per row.
Draw.MolsToGridImage(
    valid_smiles,
    molsPerRow=5,
)

### Plot GAN pre-training loss

In [None]:
# Show which series the pre-training history contains (the loss plot that
# follows reads "loss_disc" and "loss_gen" from it).
pretrain_history.keys()

In [None]:
# Compare discriminator and generator loss over the first 1000 recorded
# entries of the pre-training history.
disc_loss = pretrain_history["loss_disc"][:1000]
gen_loss = pretrain_history["loss_gen"][:1000]
steps = np.arange(len(disc_loss))

fig, ax = plt.subplots()
ax.plot(steps, disc_loss, label="discriminator loss")
ax.plot(steps, gen_loss, label="generator loss")
ax.legend(loc="upper right")
ax.set_xlabel("steps")
ax.set_ylabel("loss")
ax.grid(True)
plt.show()

### Train GAN on coformers

In [None]:
import warnings

# Silence all library warnings for the remainder of the notebook session.
warnings.filterwarnings("ignore")

coformer_data = pd.read_csv("database/database_cof_100smb_kekule.csv")
coformer_x = coformer_data["smiles"]

# Same classifier file as the pre-training setup cell ("weights/clf.pkl");
# the handle is closed as soon as the object is unpickled.
# SECURITY: pickle can execute arbitrary code while loading; only load
# classifier files that come from a trusted source.
with open("weights/clf.pkl", "rb") as clf_file:
    clf = pi.load(clf_file)

# hidden_dim matches the value used for pre-training (128): load_state_dict
# requires parameter shapes identical to the checkpoint, so a different width
# would presumably be rejected -- confirm against MolGen if the architecture
# does not depend on hidden_dim.
gan_mol = MolGen(coformer_x, classifier=clf, hidden_dim=128, lr=1e-3, device="cpu")

# Load the checkpoint written by the pre-training save cell
# ("weights/pretrain_mol_gan.pt"). map_location keeps the load working on a
# CPU-only machine even if the checkpoint was saved from a GPU run.
gan_mol.load_state_dict(torch.load("weights/pretrain_mol_gan.pt", map_location="cpu"))

# set GAN to the training mode
gan_mol.train()

coformer_loader = gan_mol.create_dataloader(coformer_x, batch_size=128, shuffle=True, num_workers=4)

coformer_history = gan_mol.train_n_steps_coformer(coformer_loader, max_step=5000, evaluate_every=50)

In [None]:
# Write the weights of the coformer-trained GAN to disk.
coformer_weights_path = "weights/coformer_trained_gan_mol.pt"
torch.save(gan_mol.state_dict(), coformer_weights_path)

### Generate SMILES molecules

In [None]:
# After training: sample a few SMILES strings from the generator.
smiles_list = gan_mol.generate_n(8)

# convert with rdkit (an entry is None when its SMILES string fails to parse)
mol_list = [Chem.MolFromSmiles(m) for m in smiles_list]

# draw
# Use the explicitly imported Draw module: `Chem.Draw` only resolves as an
# attribute when some other cell has already imported rdkit.Chem.Draw, so the
# original call depended on hidden kernel state.
Draw.MolsToGridImage(mol_list, molsPerRow=4, subImgSize=(250, 250), maxMols=10)

### Plot GAN coformer training loss

In [None]:
# NOTE(review): the initial and pre-training histories are read with the key
# "loss_disc"; confirm that train_n_steps_coformer really returns "loss_discr".
disc_loss = coformer_history["loss_discr"]
gen_loss = coformer_history["loss_gen"]

# The x axis must be an array of step indices. Passing the bare length (an
# int) makes matplotlib raise a shape-mismatch error for any history longer
# than one entry.
steps = np.arange(len(disc_loss))

plt.plot(steps, disc_loss, label="discriminator loss")
plt.plot(steps, gen_loss, label="generator loss")
plt.legend(loc="upper right")
plt.xlabel("steps")
plt.ylabel("loss")
plt.grid(True)
plt.show()