In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import gc
import glob
import sys
import random
import string
import tqdm
import json
import time
import sqlite3
import warnings
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger

import codecs
from SmilesPE.pretokenizer import atomwise_tokenizer
from SmilesPE.pretokenizer import kmer_tokenizer
from SmilesPE.spe2vec import Corpus
from SmilesPE.learner import *
from SmilesPE.tokenizer import *

from multiprocessing import Pool

import torch
from fastai.basic_data import load_data
from fastai import *
from fastai.text import *
#from utils import *


import fastai
# Record the installed fastai version in the cell output.
print (fastai.__version__)

# Make the support-script folder importable. The path is relative, so it is
# resolved against the notebook's current working directory.
supp_script_path = '../../supp_scripts/'
sys.path.append(supp_script_path) # path for support scripts folder
import supp_utils as su


# Select the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in the visible cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare last expression: shown as the cell's output.
device,torch.cuda.is_available()

1.0.61
Could not import custom script CNN


(device(type='cuda'), True)

In [2]:
# Ignore warnings
# NOTE(review): the second call installs a blanket "ignore" filter for every
# warning category, which already covers DeprecationWarning; the first call
# looks redundant.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore")
# To remove rdkit warning
# Only messages at CRITICAL level are let through RDKit's logger.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

In [3]:
# Input parameters
# Text file read line by line; the loading cell keeps the second
# whitespace-separated field of each line as the SMILES string.
input_file = "../../data/cid_smiles_sanitized_canonical.txt"
# NOTE(review): neither cutoff is referenced in the visible cells.
lower_label_count_cutoff = 0
upper_label_count_cutoff = 1000000
# "SPE" selects the SPE tokenizer; any other value selects the atomwise one.
tokenization = "SPE"
# Passed to learn_SPE as min_frequency.
spe_min_frequency = 2000

# Augmentation is skipped when this is 0 (the augmentation cell tests `> 0`).
number_of_augmentation = 0
# Forwarded to su.smiles_augmentation as `iteration`.
iteration = 1000000

# Worker count handed to the su helpers and to the fastai Tokenizer (n_cpus).
Number_of_workers = 20

# Number of epochs passed to fit_one_cycle.
epochs = 10
bs = 64 # batch size
# NOTE(review): not referenced in the visible cells; the training cell
# hard-codes its own base rate (3e-3) instead.
learning_rate = 1e-3

# Split fractions. NOTE(review): only the train and valid fractions are passed
# to the split helper; test_percentage is not referenced in the visible cells.
train_percentage = 0.8
valid_percentage = 0.1
test_percentage = 0.1

In [4]:
# Working folder handed to the fastai databunch/load calls: <tokenization>/pretraining.
result_path = Path(tokenization)
name = 'pretraining'
path = result_path/name
path.mkdir(exist_ok=True, parents=True)

# Sub-folder that receives the vocabulary pickle in the saving cell.
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
# NOTE(review): token_path is not referenced in the visible cells; the
# tokenization cell defines and uses its own spe_token_path.
token_path = 'results/tokens.txt'

In [5]:
# Reading input file
# Each line is split on whitespace and the second field is kept as the SMILES
# (presumably "<CID> <SMILES>", judging by the file name -- confirm).
# The file is streamed rather than loaded with readlines(), and lines with
# fewer than two fields (e.g. a trailing blank line) are skipped instead of
# raising IndexError.
with open(input_file,"r") as f:
    smiles_data = [fields[1] for fields in map(str.split, f) if len(fields) > 1]

# Sanitise/canonicalise through the support helper, then drop duplicates.
canonical_smiles = su.sanity_check(smiles_data,output_type = "canonical",Number_of_workers = Number_of_workers)

unique_canonical_smiles = su.remove_duplicates_list(canonical_smiles)

                                                         

In [6]:
# Partition the de-duplicated SMILES into train / valid / test subsets.
# Only the train and valid fractions are passed to the helper.
train, valid, test = su.split_data_without_label(
    unique_canonical_smiles,
    train_percentage=train_percentage,
    valid_percentage=valid_percentage,
)

In [7]:
# Data augmentation
# When enabled, every split goes through the same su.smiles_augmentation call;
# otherwise the splits are used unchanged.
# NOTE(review): data_set_type is "train_data" for all three splits, including
# valid and test -- confirm against su.smiles_augmentation that this is intended.
if number_of_augmentation > 0:
    train_data, valid_data, test_data = [
        su.smiles_augmentation(
            subset,
            N_rounds=number_of_augmentation,
            iteration=iteration,
            data_set_type="train_data",
            Number_of_workers=Number_of_workers,
        )
        for subset in (train, valid, test)
    ]
else:
    train_data, valid_data, test_data = train, valid, test

In [8]:
# Creating train, valid, and test dataframes
def _smiles_frame(smiles):
    """Build a two-column frame: the SMILES strings plus a constant "canonical" flag.

    Args:
        smiles: sequence of SMILES strings (one row per entry).

    Returns:
        DataFrame with columns 'SMILES' and 'canonical' (every row "yes").
    """
    frame = pd.DataFrame(smiles, columns=['SMILES'])
    # A scalar is broadcast to every row, so no per-row list has to be built
    # and the column length can never disagree with the frame.
    # NOTE(review): the flag is "yes" even when augmentation is enabled --
    # confirm that augmented SMILES should be labelled canonical.
    frame["canonical"] = "yes"
    return frame


train_df = _smiles_frame(train_data)
valid_df = _smiles_frame(valid_data)
test_df = _smiles_frame(test_data)

In [9]:
# Tokenization of the data
if tokenization == "SPE":
    spe_token_path = 'pretraining_tokens.txt'
    # Learn the SPE vocabulary from the training split. The context manager
    # closes (and therefore flushes) the file, so the tokenizer built below
    # reads the complete token list and no handle is leaked.
    # 30000 is learn_SPE's third positional argument (presumably the symbol
    # budget -- confirm against SmilesPE).
    with codecs.open(spe_token_path, 'w') as output:
        learn_SPE(train_data, output, 30000, min_frequency=spe_min_frequency, augmentation=0, verbose=False, total_symbols=True)

    MolTokenizer = su.molpmofit.MolTokenizer_spe_sos_eos
    tok = Tokenizer(partial(MolTokenizer,token_path=spe_token_path), n_cpus=Number_of_workers, pre_rules=[], post_rules=[])
else:
    # Any other setting uses the atomwise tokenizer, which needs no token file.
    MolTokenizer = su.molpmofit.MolTokenizer_atomwise_sos_eos
    tok = Tokenizer(partial(MolTokenizer), n_cpus=Number_of_workers, pre_rules=[], post_rules=[])

Counting SMILES...
1620 unique Canonical SMILES
Gettting Pair Statistics


Number of unique characters & Reducing number of merge operations by: 65
Unique characters: {'o', '[CH2-]', '[P@]', 'c', '[nH]', '.', '[NH3+]', '[Os]', '[C@]', '[C@@]', '[I-]', '[Mg+2]', '[As+]', 'S', '[N-]', '[Ca+2]', '[C@@H]', '#', '[47Ca]', 'B', '/', '[125I]', 'C', '[Li+]', '=', '2', '[O-]', '[N+]', '[Na+]', '[Zn]', '[n+]', ')', '-', '4', 'n', '\\', '[Br-]', '5', 'I', '6', '[252Cf]', '[Al+3]', 'Cl', '[Cl+3]', 'P', '[Co+3]', 'Br', '[NH4+]', '[Fe]', '[OH-]', '[Cl-]', '[Cd]', '[Se]', 'F', '3', 'N', '[C@H]', '[18F]', '(', '[Si]', '[NH2+]', 'O', '1', '[NH+]', 's'}
no pair has frequency >= 2000. Stopping


In [10]:
# Databunch of train and valid df
# text_cols=0 selects the first column (SMILES) as the text.
data = TextLMDataBunch.from_df(
    path,
    train_df,
    valid_df,
    bs=bs,
    tokenizer=tok,
    chunksize=50000,
    text_cols=0,
    max_vocab=60000,
    include_bos=False,
)

# Persist the processed databunch so later cells can reload it.
data.save(f'{name}_databunch')
# Cell output: (vocabulary size, number of training items).
(len(data.vocab.itos), len(data.train_ds))

(72, 1620)

In [11]:
# Reload the saved language-model databunch from disk.
data_lm = load_data(
    path,
    f'{name}_databunch',
    bs=bs,
)

In [12]:
# Loading model
# AWD-LSTM language model built without pretrained weights.
learner = language_model_learner(
    data_lm,
    AWD_LSTM,
    drop_mult=1.0,
    pretrained=False,
)

In [14]:
# model training
# Base rate 3e-3, scaled by batch size relative to 48.
# NOTE(review): the `learning_rate` value from the parameter cell is not used
# here; the base rate is hard-coded.
lr = 3e-3 * (bs / 48)

learner.unfreeze()
learner.fit_one_cycle(epochs, lr, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.716894,3.224354,0.079576,00:13
1,3.440904,3.305238,0.090402,00:14
2,3.340073,3.168983,0.080134,00:14
3,3.271028,3.133582,0.105134,00:14
4,3.219376,2.988254,0.180804,00:14
5,3.074159,2.511738,0.29933,00:14
6,2.862222,2.235087,0.350781,00:14
7,2.68716,2.151086,0.363728,00:14
8,2.531613,2.075841,0.387946,00:14
9,2.407015,2.066675,0.393415,00:14


In [15]:
# Saving model
# lm_fns[0] is the weights file stem, lm_fns[1] the vocabulary file stem.
lm_fns = [name + '_wt', name + '_vocab']

# Weights are saved without optimizer state.
learner.save(lm_fns[0], with_opt=False)
# The vocabulary pickle goes into the models sub-folder.
learner.data.vocab.save(mdl_path / f'{lm_fns[1]}.pkl')

### Test set check

In [16]:
# Load model and test set check
spe_token_path = 'pretraining_tokens.txt'

# Rebuild the tokenizer with the same settings as the training run.
if tokenization == "SPE":
    MolTokenizer = su.molpmofit.MolTokenizer_spe_sos_eos
    tok = Tokenizer(partial(MolTokenizer,token_path=spe_token_path), n_cpus=Number_of_workers, pre_rules=[], post_rules=[])
else:
    MolTokenizer = su.molpmofit.MolTokenizer_atomwise_sos_eos
    tok = Tokenizer(partial(MolTokenizer), n_cpus=Number_of_workers, pre_rules=[], post_rules=[])

# NOTE(review): vocab_file is not referenced in the visible cells; the
# vocabulary is taken from the reloaded databunch below.
vocab_file = "pretraining_vocab.pkl"
data_lm = load_data(path, f'{name}_databunch', bs=bs)

# test_df is passed in the validation slot so that it is tokenised and
# numericalised with the training vocabulary.
# NOTE(review): this rebinds `test_data`, which earlier held the list of test
# SMILES; re-running the dataframe cell after this one would pick up the databunch.
test_data = TextLMDataBunch.from_df(path, train_df, test_df, bs=bs, tokenizer=tok, vocab=data_lm.vocab,
                              chunksize=50000, text_cols=0, max_vocab=60000, include_bos=False)
# pretrained=False: the weights are replaced by learner.load() right below, so
# fetching and mapping fastai's stock pretrained weights first would be wasted
# work (and would need network access).
learner = language_model_learner(test_data, AWD_LSTM, drop_mult = 1.,pretrained=False)

# Same file stem the saving cell used (f'{name}_wt').
learner.load(f'{name}_wt', purge=False);

def get_accuracy(yhat,y):
    """Return the mean per-sequence token accuracy.

    Args:
        yhat: iterable of 2-D score tensors, one per sequence; the predicted
            token at each position is the argmax over axis 1.
        y: indexable collection of target tensors; ``y[i]`` holds the expected
            token ids for ``yhat[i]``.

    Returns:
        float: the average, over sequences, of the fraction of positions whose
        predicted token equals the target.

    Raises:
        ZeroDivisionError: if ``yhat`` is empty or a target has length zero.
    """
    accuracy_list = []
    for i,chemical_tensor in enumerate(yhat):
        # argmax is invariant under monotonic transforms, so it is taken on the
        # raw scores. Exponentiating first can overflow to inf (or underflow to
        # 0) in float32, which creates ties and yields the wrong index.
        scores = chemical_tensor.float().cpu().detach().numpy()
        predictions = np.argmax(scores, axis=1)
        y_truth = y[i].cpu().detach().numpy()
        accuracy_check = (y_truth==predictions)
        count = np.count_nonzero(accuracy_check)
        accuracy_list.append(float(count/len(accuracy_check)))

    return float(sum(accuracy_list)/len(accuracy_list))

# Predictions, targets and per-element losses for the learner's evaluation
# split (presumably the validation slot, which holds test_df here -- confirm
# against fastai's get_preds default).
pred,lbl,loss = learner.get_preds(with_loss=True,ordered=True)

accuracy = str(get_accuracy(pred,lbl))

# loss.mean() reduces inside torch; the built-in sum() would iterate over the
# tensor element by element in Python.
print ("Accuracy",accuracy,"\n","Loss",loss.mean())

Accuracy 0.37243303571428527 
 Loss tensor(2.1710)
