In [13]:
import os
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import torch
import sys
sys.path.append("../")
from utils import read_pickle_from_file, write_pickle_to_file

In [2]:
def split_form(form):
    """Space-separate the element symbols and counts of a chemical formula.

    The formula is cut at every uppercase letter; each chunk is divided into
    its leading non-digit run (the symbol) and whatever remains (the count).
    For example ``"C10H14BrN5S"`` becomes ``"C 10 H 14 Br N 5 S"``.
    Characters before the first uppercase letter are ignored.
    """
    pieces = []
    for chunk in re.findall(r"[A-Z][^A-Z]*", form):
        symbol = re.match(r"\D+", chunk).group()
        count = chunk.replace(symbol, "")
        # A chunk without a count contributes only its symbol.
        pieces.append(symbol if count == "" else f"{symbol} {count}")
    return " ".join(pieces).rstrip(' ')


# Token pattern for an InChI body. Alternatives, in order: a run of digits,
# an element symbol (uppercase letter plus optional lowercase letter), any
# single character that is not a letter, digit or '/', and a layer prefix
# ('/' followed by one lowercase letter, e.g. "/c", "/h").
# Written as a raw string: "\d" inside a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
# The name PATTEN (sic) is kept so existing references keep working.
PATTEN = re.compile(r'(\d+|[A-Z][a-z]?|[^A-Za-z\d/]|/[a-z])')
def l_split(s):
    """Return ``s`` with every token matched by ``PATTEN`` separated by one space.

    Characters matched by none of the alternatives (for example a lowercase
    letter that follows neither an uppercase letter nor '/') are dropped.
    """
    return ' '.join(PATTEN.findall(s))

def split_form3(form):
    """Break every all-digit token of a space-separated string into single digits.

    ``"C 10 H 14"`` becomes ``"C 1 0 H 1 4"``; tokens that are not purely
    digits are passed through unchanged.
    """
    pieces = []
    for token in form.split(' '):
        if token.isdigit():
            # One output token per digit character.
            pieces += list(token)
        else:
            pieces += [token]
    return ' '.join(pieces)

def split_form2(form):
    """Space-separate the layers of an InChI body (the part after the formula).

    The input is cut at every lowercase letter; that letter becomes the layer
    prefix (emitted as ``"/<letter>"``). Within a layer, each run of digits is
    emitted as one token and every non-digit character that follows it as its
    own token. Slashes are removed, and anything preceding the first digit of
    a layer is discarded.

    Example: ``"c1-6-10(11)9/h4H"`` -> ``"/c 1 - 6 - 10 ( 11 ) 9 /h 4 H"``.
    """
    layers = []
    for layer in re.findall(r"[a-z][^a-z]*", form):
        prefix = layer[0]
        body = layer.replace(prefix, "").replace('/', "")
        tokens = ''
        for run in re.findall(r"[0-9]+[^0-9]*", body):
            digit_runs = re.findall(r'\d+', run)
            assert len(digit_runs) == 1, f"len(num_list) != 1"
            digits = digit_runs[0]
            if run != digits:
                # Trailing punctuation/letters: one token per character.
                trailing = run.replace(digits, "")
                tokens += f"{digits} {' '.join(trailing)} "
            else:
                tokens += f"{digits} "
        layers.append(f"/{prefix} {tokens}")
    return ''.join(layers).rstrip(' ')

# ====================================================
# Tokenizer
# ====================================================
data_dir = "../data/"
class YNakamaTokenizer(object):
    """Bidirectional mapping between space-separated tokens and integer ids.

    Attributes:
        stoi: dict mapping token string -> integer id.
        itos: dict mapping integer id -> token string (inverse of ``stoi``).

    The vocabulary always ends with the special tokens ``<sos>``, ``<eos>``
    and ``<pad>`` when it is created by ``build_vocab``.
    """

    def __init__(self, is_load=None):
        """Create an empty tokenizer, or load a saved vocabulary.

        Args:
            is_load: despite the name, this is a file name (joined onto the
                module-level ``data_dir``) holding a saved ``stoi`` mapping,
                or ``None`` to start with an empty vocabulary.
        """
        self.stoi = {}
        self.itos = {}

        if is_load is not None:
            # NOTE(review): read_pickle_from_file presumably unpickles the
            # file; unpickling can execute arbitrary code, so only load
            # vocabulary files from a trusted source.
            self.stoi = read_pickle_from_file(os.path.join(data_dir, is_load))
            self.itos = {index: token for token, index in self.stoi.items()}

    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self.stoi)

    def build_vocab(self, text):
        """Build the vocabulary from an iterable of space-separated strings.

        Tokens are sorted and numbered from 0; ``<sos>``, ``<eos>`` and
        ``<pad>`` are appended after them. Any previous vocabulary is
        discarded.
        """
        vocab = set()
        for t in text:
            vocab.update(t.split(' '))
        vocab = sorted(vocab)
        vocab.extend(['<sos>', '<eos>', '<pad>'])
        # Start from a fresh mapping: writing into the existing dict would
        # keep tokens from an earlier build/load whose ids collide with the
        # newly assigned ones, leaving itos inconsistent with stoi.
        self.stoi = {token: index for index, token in enumerate(vocab)}
        self.itos = {index: token for token, index in self.stoi.items()}

    def one_text_to_sequence(self, text):
        """Encode one space-separated string as ``[<sos>, ids..., <eos>]``.

        Raises:
            KeyError: if a token (or a special token) is not in the vocabulary.
        """
        sequence = [self.stoi['<sos>']]
        sequence.extend(self.stoi[s] for s in text.split(' '))
        sequence.append(self.stoi['<eos>'])
        return sequence

    def one_sequence_to_text(self, sequence):
        """Decode ids to their tokens, concatenated with no separator.

        Special tokens are not filtered out; every id is looked up as-is.
        """
        return ''.join(self.itos[i] for i in sequence)

    def one_predict_to_inchi(self, predict):
        """Decode predicted ids to an InChI string, stopping at ``<eos>``/``<pad>``.

        The result is prefixed with ``'InChI=1S/'``; decoding stops at the
        first ``<eos>`` or ``<pad>`` id, which is not included.
        """
        # Look the stop ids up once instead of on every iteration.
        stop_ids = (self.stoi['<eos>'], self.stoi['<pad>'])
        tokens = []
        for p in predict:
            if p in stop_ids:
                break
            tokens.append(self.itos[p])
        return 'InChI=1S/' + ''.join(tokens)

    # ---
    # Batch variants: apply the corresponding one_* method to each element.
    def text_to_sequence(self, text):
        """Encode each string in ``text``; returns a list of id lists."""
        return [self.one_text_to_sequence(t) for t in text]

    def sequence_to_text(self, sequence):
        """Decode each id sequence in ``sequence``; returns a list of strings."""
        return [self.one_sequence_to_text(s) for s in sequence]

    def predict_to_inchi(self, predict):
        """Decode each prediction in ``predict``; returns a list of InChI strings."""
        return [self.one_predict_to_inchi(p) for p in predict]


In [3]:
# Load the table to tokenise; it must provide an 'InChI' column, which the
# following cells read. To process the training labels instead, swap in the
# commented-out line.
# df = pd.read_csv("../data/train_labels.csv")
df = pd.read_csv("../data/agree_test.csv")

In [4]:
# 'InChI_1': the second '/'-separated field of the InChI string
# (e.g. "C10H14BrN5S" in the preview below).
df['InChI_1'] = df['InChI'].progress_apply(lambda x: x.split('/')[1])
# 'InChI_text': drop the first 9 characters (the length of the "InChI=1S/"
# prefix) and space-separate the remaining tokens with l_split.
# NOTE(review): assumes every row starts with exactly "InChI=1S/" - confirm.
df['InChI_text'] = df['InChI'].apply(lambda x: x[9:]).progress_apply(l_split).values

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1533328.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1533328.0), HTML(value='')))




In [5]:
# Preview the first rows to sanity-check the derived InChI_1 / InChI_text columns.
df.head()

Unnamed: 0,image_id,InChI,InChI_1,InChI_text
0,00000d2a601c,InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-...,C10H14BrN5S,C 10 H 14 Br N 5 S /c 1 - 6 - 10 ( 11 ) 9 ( 16...
1,00001f7fc849,InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18...,C14H18ClN3,C 14 H 18 Cl N 3 /c 1 - 2 - 7 - 16 - 9 - 13 - ...
2,000037687605,InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)1...,C16H13BrN2O,C 16 H 13 Br N 2 O /c 1 - 11 ( 20 ) 12 - 6 - 7...
3,00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-1...",C14H19FN4O,"C 14 H 19 F N 4 O /c 1 - 14 ( 2 , 3 ) 12 - 13 ..."
4,000085dab281,InChI=1S/C20H38O/c1-20(2)18-16-14-12-10-8-6-4-...,C20H38O,C 20 H 38 O /c 1 - 20 ( 2 ) 18 - 16 - 14 - 12 ...


In [6]:
# Work on a copy so the columns added below do not modify df.
train_df = df.copy()
# Uncomment to iterate quickly on the first 500 rows only.
# train_df = train_df[:500]

In [7]:
# 'text': InChI_text with every multi-digit number split into single digits
# (e.g. "10" -> "1 0"), so numbers are encoded digit by digit.
train_df["text"] = train_df.InChI_text.apply(split_form3)

In [8]:
# train_df should have the same number of rows as df and one extra column ('text').
print(train_df.shape)
df.shape

(1533328, 5)


(1533328, 4)

In [14]:
# Load a previously saved token -> id mapping (file name is resolved against
# data_dir inside YNakamaTokenizer) and print it for inspection.
# NOTE(review): the loader presumably unpickles this file - only load
# vocabulary files from a trusted source.
tokenizer = YNakamaTokenizer(is_load = "small_tokenizer.stoi.pickle")
# print('Saved tokenizer')
print(tokenizer.stoi)

{'(': 0, ')': 1, '+': 2, ',': 3, '-': 4, '/b': 5, '/c': 6, '/h': 7, '/i': 8, '/m': 9, '/s': 10, '/t': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, 'B': 22, 'Br': 23, 'C': 24, 'Cl': 25, 'D': 26, 'F': 27, 'H': 28, 'I': 29, 'N': 30, 'O': 31, 'P': 32, 'S': 33, 'Si': 34, 'T': 35, '<sos>': 36, '<eos>': 37, '<pad>': 38}


In [15]:
# Encode every tokenised string in train_df['text'] with the loaded vocabulary.
# Rows containing a token that is missing from the vocabulary cannot be
# encoded; they are kept and marked with the sentinel sequence "None" and
# length -1 so they can be counted and filtered afterwards.
lengths = []
seqs = []
tk0 = tqdm(train_df['text'].values, total=len(train_df))
for text in tk0:
    try:
        seq = tokenizer.one_text_to_sequence(text)
        # Exclude the <sos> and <eos> ids added by the tokenizer.
        length = len(seq) - 2
    except KeyError:
        # Out-of-vocabulary token. Only KeyError is caught: a bare `except`
        # would also swallow KeyboardInterrupt and hide unrelated bugs.
        seq = "None"
        length = -1
    lengths.append(length)
    seqs.append(seq)
train_df['length'] = lengths
train_df['sequence'] = seqs

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1533328.0), HTML(value='')))




In [16]:
# Sanity check: rows that were encoded plus rows carrying the "None" sentinel
# should add up to the total number of rows.
train_df[train_df["sequence"] != "None"].shape[0] + train_df[train_df["sequence"] == "None"].shape[0]

1533328

In [20]:
# Rows that could not be encoded (length == -1 is the failure marker set by the encoding loop).
train_df[train_df["length"] == -1].shape

(397, 7)

In [17]:
# Persist the encoded frame, including the rows marked as failed.
# NOTE(review): the output is written by a pickle helper even though the file
# name contains ".csv"; the (path, object) argument order is as used here -
# confirm against utils.write_pickle_to_file.
write_pickle_to_file("../data/df_test_small.csv.pickle", train_df)