In [1]:
#tokenize smi
import re

# Raw string: every backslash below is a regex escape. The original non-raw
# literal relied on Python passing unknown string escapes (\[ \( \. ...)
# through unchanged, which newer interpreters warn about.
# Alternatives, in order: bracket atoms, Br/B, Cl/C, single-letter atoms,
# aromatic atoms, bond/branch/reaction symbols, %NN ring closures, digits.
_SMI_TOKEN_RE = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)


def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction.

    Parameters
    ----------
    smi : str
        SMILES string of a molecule or a reaction.

    Returns
    -------
    str
        The tokens joined by single spaces. The exact token '[C@H]' is
        replaced by 'C'; every other token (including '[C@@H]') is kept.

    Raises
    ------
    AssertionError
        If the tokens do not reassemble into `smi`, i.e. the input contains
        characters that the token pattern does not cover.
    """
    tokens = _SMI_TOKEN_RE.findall(smi)

    # Explicit raise instead of `assert` so the check also runs under
    # `python -O`; the exception type is unchanged for existing callers.
    if smi != ''.join(tokens):
        raise AssertionError(
            "SMILES contains characters not covered by the tokenizer: %r" % (smi,)
        )

    # NOTE(review): only '[C@H]' is rewritten; confirm that leaving the other
    # stereo tokens untouched is intended.
    return ' '.join('C' if token == '[C@H]' else token for token in tokens)

In [2]:
# Tokenizer smoke test: tokenize one reaction SMILES (reactants>>products)
# and display the space-separated tokens as the cell output.
smi="CCOC(=O)Cc1ccc(OC)c(Oc2ccc([N+](=O)[O-])cc2CBr)c1.FC(F)(F)CS>>COc1ccc(CC(=O)O)cc1Oc1ccc([N+](=O)[O-])cc1CBr"
smi_tokenizer(smi)

'C C O C ( = O ) C c 1 c c c ( O C ) c ( O c 2 c c c ( [N+] ( = O ) [O-] ) c c 2 C Br ) c 1 . F C ( F ) ( F ) C S > > C O c 1 c c c ( C C ( = O ) O ) c c 1 O c 1 c c c ( [N+] ( = O ) [O-] ) c c 1 C Br'

In [3]:
def rxn2rtpt(rxn_smi):
    """
    Convert reaction SMILES into reactants and products
    in a way that is compatible with `rdc.rdchiralRunText`
    and not discarding meaningful data.

    Supported layouts:
      * 'reactants>>products'
      * 'rt1>rt2>products' -- the middle part is merged into the reactants
        as 'rt1.rt2'
      * '>rt2>products'    -- empty first part, the middle part is the reactants

    Parameters
    ----------
    rxn_smi : str
        Reaction SMILES.

    Returns
    -------
    tuple of (str, str) or None
        `(reactants, products)`, always as a tuple so every branch has the
        same return type. `None` when the string matches neither layout.
    """
    # for normal reactions
    if rxn_smi.count('>>') == 1:
        # Exactly one '>>' means split() yields exactly two parts.
        reactants, products = rxn_smi.split('>>')
        return reactants, products

    # for reactions with one reactant in middle
    # like 'Cc1ccc(C(=O)O)cc1F>O=C1CCC(=O)N1Br>O=C(O)c1ccc(CBr)c(F)c1'
    if rxn_smi.count('>') == 2:
        rt1, rt2, pt = rxn_smi.split('>')
        # there are possible cases without rt1
        # like '>O=C(O)C1CCN(C(=O)CO)CC1>O'
        if rt1 == '':
            return rt2, pt
        return f'{rt1}.{rt2}', pt

    # Unrecognised layout: made explicit instead of falling off the end.
    return None

In [4]:
# Load the test split and open the two output text files for writing.
# NOTE: `data_dir` holds the path of a CSV file, not of a directory.
data_dir="./data/All_Data/test.csv"
import pandas as pd
data=pd.read_csv(data_dir)
# Mode "w" truncates any existing file. Both handles are left open here on
# purpose: the next cell writes to them and closes them.
src_file=open("./data/All_Data/src_test_binary.txt","w")
tgt_file=open("./data/All_Data/tgt_test_binary.txt","w")

In [5]:
# Write the test split as two line-aligned text files: the tokenized reaction
# SMILES goes to the source file, the matching label to the target file.
# The handles are closed here, so re-run the previous cell before re-running
# this one.
try:
    # Iterate the two columns directly instead of building a row Series
    # twice per row with data.loc[i][...].
    for rxn_smiles, label in zip(data["rxn_smiles"], data["label"]):
        src_file.write(smi_tokenizer(rxn_smiles))
        src_file.write("\n")
        tgt_file.write(str(label))
        tgt_file.write("\n")
finally:
    # Close both files even if the tokenizer rejects a row part-way through,
    # so the lines written so far are flushed and no handle leaks.
    src_file.close()
    tgt_file.close()

In [6]:
# Load the training split and open the two output text files for writing.
# NOTE: `data_dir` holds the path of a CSV file, not of a directory.
data_dir="./data/All_Data/train.csv"
import pandas as pd
data=pd.read_csv(data_dir)
# Mode "w" truncates any existing file. Both handles are left open here on
# purpose: the next cell writes to them and closes them.
src_file=open("./data/All_Data/src_train_binary.txt","w")
tgt_file=open("./data/All_Data/tgt_train_binary.txt","w")

In [7]:
# Write the training split as two line-aligned text files: the tokenized
# reaction SMILES goes to the source file, the matching label to the target
# file. The handles are closed here, so re-run the previous cell before
# re-running this one.
try:
    # Iterate the two columns directly instead of building a row Series
    # twice per row with data.loc[i][...].
    for rxn_smiles, label in zip(data["rxn_smiles"], data["label"]):
        src_file.write(smi_tokenizer(rxn_smiles))
        src_file.write("\n")
        tgt_file.write(str(label))
        tgt_file.write("\n")
finally:
    # Close both files even if the tokenizer rejects a row part-way through,
    # so the lines written so far are flushed and no handle leaks.
    src_file.close()
    tgt_file.close()

In [8]:
# Load the validation split and open the two output text files for writing.
# NOTE: `data_dir` holds the path of a CSV file, not of a directory.
data_dir="./data/All_Data/valid.csv"
import pandas as pd
data=pd.read_csv(data_dir)
# Mode "w" truncates any existing file. Both handles are left open here on
# purpose: the next cell writes to them and closes them.
src_file=open("./data/All_Data/src_val_binary.txt","w")
tgt_file=open("./data/All_Data/tgt_val_binary.txt","w")

In [9]:
# Write the validation split as two line-aligned text files: the tokenized
# reaction SMILES goes to the source file, the matching label to the target
# file. The handles are closed here, so re-run the previous cell before
# re-running this one.
try:
    # Iterate the two columns directly instead of building a row Series
    # twice per row with data.loc[i][...].
    for rxn_smiles, label in zip(data["rxn_smiles"], data["label"]):
        src_file.write(smi_tokenizer(rxn_smiles))
        src_file.write("\n")
        tgt_file.write(str(label))
        tgt_file.write("\n")
finally:
    # Close both files even if the tokenizer rejects a row part-way through,
    # so the lines written so far are flushed and no handle leaks.
    src_file.close()
    tgt_file.close()

In [10]:
# Generate the model input files: run preprocess.py on the train/validation
# source and target text files written by the cells above, saving the result
# under the prefix data/All_Data/All_Data.
# NOTE(review): preprocess.py is not part of this notebook; the option meanings
# (sequence-length and vocabulary-size limits of 1000, shared vocabulary) are
# read off the flag names -- confirm against the script.
!python preprocess.py -train_src data/All_Data/src_train_binary.txt \
                     -train_tgt data/All_Data/tgt_train_binary.txt \
                     -valid_src data/All_Data/src_val_binary.txt \
                     -valid_tgt data/All_Data/tgt_val_binary.txt \
                     -save_data data/All_Data/All_Data\
                     -src_seq_length 1000 -tgt_seq_length 1000 \
                     -src_vocab_size 1000 -tgt_vocab_size 1000 -share_vocab

[2022-05-21 06:20:51,352 INFO] Extracting features...
[2022-05-21 06:20:51,352 INFO]  * number of source features: 0.
[2022-05-21 06:20:51,353 INFO]  * number of target features: 0.
[2022-05-21 06:20:51,353 INFO] Building `Fields` object...
[2022-05-21 06:20:51,353 INFO] Building & saving training data...
[2022-05-21 06:20:51,353 INFO] Reading source and target files: data/All_Data/src_train_binary.txt data/All_Data/tgt_train_binary.txt.
[2022-05-21 06:20:51,421 INFO] Splitting shard 0.
[2022-05-21 06:20:51,468 INFO] Building shard 0.
[2022-05-21 06:21:04,859 INFO]  * saving 0th train data shard to data/All_Data/All_Data.train.0.pt.
[2022-05-21 06:21:10,282 INFO] Building & saving validation data...
[2022-05-21 06:21:10,282 INFO] Reading source and target files: data/All_Data/src_val_binary.txt data/All_Data/tgt_val_binary.txt.
[2022-05-21 06:21:10,288 INFO] Splitting shard 0.
[2022-05-21 06:21:10,292 INFO] Building shard 0.
[2022-05-21 06:21:12,130 INFO]  * saving 0th valid data shard

In [11]:
# Train a model on the preprocessed data with train.py: transformer encoder and
# decoder (4 layers, 8 heads, size 256), 15000 training steps, seed 42, and a
# checkpoint saved every 1000 steps under experiments/checkpoints/All_Data_binary/.
# NOTE(review): train.py is not part of this notebook; the option meanings are
# read off the flag names -- confirm against the script.
!python  train.py -data data/All_Data/All_Data \
                   -save_model experiments/checkpoints/All_Data_binary/All_Data_model \
                   -seed 42 -gpu_ranks 0 -save_checkpoint_steps 1000 -keep_checkpoint 5 \
                   -train_steps 15000 -param_init 0  -param_init_glorot -max_generator_batches 32 \
                   -batch_size 4096 -batch_type tokens -normalization tokens -max_grad_norm 0  -accum_count 4 \
                   -optim adam -adam_beta1 0.9 -adam_beta2 0.998 -decay_method noam -warmup_steps 1000  \
                   -learning_rate 2 -label_smoothing 0.0 -report_every 1000 \
                   -layers 4 -rnn_size 256 -word_vec_size 256 -encoder_type transformer -decoder_type transformer \
                   -dropout 0.1 -position_encoding -share_embeddings \
                   -global_attention general -global_attention_function softmax -self_attn_type scaled-dot \
                   -heads 8 -transformer_ff 2048

[2022-05-21 06:21:16,538 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 06:21:16,540 INFO]  * vocabulary size. source = 1004; target = 1004
[2022-05-21 06:21:16,540 INFO] Building model...
[2022-05-21 06:21:21,013 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(1004, 256, padding_idx=1)
        )
        (pe): PositionalEncoding(
          (dropout): Dropout(p=0.1)
        )
      )
    )
    (transformer): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_keys): Linear(in_features=256, out_features=256, bias=True)
          (linear_values): Linear(in_features=256, out_features=256, bias=True)
          (linear_query): Linear(in_features=256, out_features=256, bias=True)
          (softmax): Softmax()
          (dropout): Dropout(p=0.1)
          

[2022-05-21 06:21:21,759 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 06:22:49,036 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 06:24:14,956 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 06:25:38,701 INFO] Step 1000/15000; acc: 100.00; ppl:  1.00; xent: 0.00; lr: 0.00395; 58974/1490 tok/s;    257 sec
[2022-05-21 06:25:38,704 INFO] Saving checkpoint experiments/checkpoints/All_Data_binary/All_Data_model_step_1000.pt
[2022-05-21 06:25:40,538 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 06:27:06,478 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 06:28:32,490 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 06:29:57,473 INFO] Step 2000/1

[2022-05-21 07:20:43,495 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 07:22:02,545 INFO] Step 14000/15000; acc: 100.00; ppl:  1.00; xent: 0.00; lr: 0.00106; 63179/1565 tok/s;   3641 sec
[2022-05-21 07:22:02,547 INFO] Saving checkpoint experiments/checkpoints/All_Data_binary/All_Data_model_step_14000.pt
[2022-05-21 07:22:10,720 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 07:23:37,549 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 07:25:03,752 INFO] Loading train dataset from data/All_Data/All_Data.train.0.pt, number of examples: 58518
[2022-05-21 07:26:22,244 INFO] Step 15000/15000; acc: 100.00; ppl:  1.00; xent: 0.00; lr: 0.00102; 62748/1535 tok/s;   3900 sec
[2022-05-21 07:26:22,247 INFO] Saving checkpoint experiments/checkpoints/All_Data_binary/All_Data_model_step_15000.pt
[2022-05-21 07:26:23,425 INFO] L

In [14]:
# Make predictions: run translate.py with the step-15000 checkpoint on the
# tokenized test sources and write the output to
# ./experiments/results/All_Data_pred.txt, which the accuracy cell reads.
# NOTE(review): translate.py is not part of this notebook; -n_best 1 presumably
# yields one prediction line per input line -- confirm against the script.
!python translate.py -model ./experiments/checkpoints/All_Data_binary/All_Data_model_step_15000.pt -src ./data/All_Data/src_test_binary.txt -output ./experiments/results/All_Data_pred.txt -n_best 1 -batch_size 128 -gpu 0 -replace_unk -max_length 5 -fast

PRED AVG SCORE: -0.0000, PRED PPL: 1.0000


In [33]:
# Compare the predictions with the ground-truth labels of the test split and
# count the mismatches. The names `false` (number of wrong predictions) and
# `count` (number of test rows) are kept because the next cell divides them.
import pandas as pd

data_dir = "./data/All_Data/test.csv"
data = pd.read_csv(data_dir)

# `with` closes the prediction file as soon as it has been read.
with open("./experiments/results/All_Data_pred.txt") as pred_text:
    lines = pred_text.readlines()

count = len(data)
# The comparison below pairs row i with prediction line i, so the two must
# have the same length; otherwise the error rate would be meaningless.
if len(lines) != count:
    raise ValueError(
        "number of predictions (%d) does not match number of test rows (%d)"
        % (len(lines), count)
    )

false = 0
for gt_product, line in zip(data["label"], lines):
    pred = line.strip("\n")
    try:
        mismatch = int(pred) != gt_product
    except ValueError:
        # A prediction that is not an integer cannot equal the label, so it
        # is counted as wrong instead of aborting the evaluation.
        mismatch = True
    if mismatch:
        false += 1
        

In [35]:
# Error rate on the test split: number of mismatched predictions divided by
# the number of test rows (0.0 means every prediction matched its label).
false/count

0.0