En este notebook, realizaremos el preprocesamiento inicial del output con el fin de crear los tokens, el vocabulario, etc.

Los numeros se han separado de forma integra. Por ejemplo: C4OH10 -> [C, 4, O, H, 10]

# Librerias a usar

In [1]:
import os
import re
import itertools

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
tqdm.pandas()

import torch

In [2]:
PATH = '../input/bms-molecular-translation/'

In [3]:
train = pd.read_csv(PATH + 'train_labels.csv')

In [4]:
def separator(text):
    # Para separar la primera parte del inchi
    sub = list(re.findall(r'[A-Z][a-z]?[0-9]*', text))
    return ' '.join(sum([list(re.split('(\d+)', x))[:-1] for x in sub], []))

In [5]:
def separator2(text):
    string = ''
    for i in re.findall(r"[a-z][^a-z]*", text):
        elem = i[0]
        num = i.replace(elem, "").replace('/', "")
        num_string = ''
        for j in re.findall(r"[0-9]+[^0-9]*", num):
            num_list = list(re.findall(r'\d+', j))
            assert len(num_list) == 1, f"len(num_list) != 1"
            _num = num_list[0]
            if j == _num:
                num_string += f"{_num} "
            else:
                extra = j.replace(_num, "")
                num_string += f"{_num} {' '.join(list(extra))} "
        string += f"/{elem} {num_string}"
    return string.rstrip(' ')

In [6]:
class Tokenizer(object):
    
    def __init__(self):
        self.stoi = {}
        self.itos = {}

    def __len__(self):
        return len(self.stoi)
    
    def fit_on_texts(self, texts):
        vocab = set()
        for text in texts:
            vocab.update(text.split(' '))
        vocab = sorted(vocab)
        vocab.append('<sos>')
        vocab.append('<eos>')
        vocab.append('<pad>')
        for i, s in enumerate(vocab):
            self.stoi[s] = i
        self.itos = {item[1]: item[0] for item in self.stoi.items()}
        
    def text2seq(self, text):
        sequence = []
        sequence.append(self.stoi['<sos>'])
        for s in text.split(' '):
            sequence.append(self.stoi[s])
        sequence.append(self.stoi['<eos>'])
        return sequence
    
    def texts2seqs(self, texts):
        sequences = []
        for text in texts:
            sequence = self.text2seq(text)
            sequences.append(sequence)
        return sequences

    def seq2text(self, sequence):
        return ''.join(list(map(lambda i: self.itos[i], sequence)))
    
    def seqs2texts(self, sequences):
        texts = []
        for sequence in sequences:
            text = self.seq2text(sequence)
            texts.append(text)
        return texts
    
    def predict_caption(self, sequence):
        caption = ''
        for i in sequence:
            if i == self.stoi['<eos>'] or i == self.stoi['<pad>']:
                break
            caption += self.itos[i]
        return caption
    
    def predict_captions(self, sequences):
        captions = []
        for sequence in sequences:
            caption = self.predict_caption(sequence)
            captions.append(caption)
        return captions

In [7]:
#Separamos los elementos para crear los tokens
train['InChI_1'] = train['InChI'].progress_apply(lambda x: x.split('/')[1])
train['InChI_text'] = train['InChI_1'].progress_apply(separator) + ' ' + \
                        train['InChI'].apply(lambda x: '/'.join(x.split('/')[2:])).progress_apply(separator2).values

# Inicializamos el tokenizador
tokenizer = Tokenizer()

# Creamos el vocabulario
tokenizer.fit_on_texts(train['InChI_text'].values)
torch.save(tokenizer, 'tokenizer.pth')
print('Tokenizer guardado')

# Almacenamos los len de cada inchi (lo necesitaremos en el modelo)
lengths = []
for text in tqdm(train['InChI_text'].values, total = len(train)):
    seq = tokenizer.text2seq(text)
    
    #no contamos <eos> ni <sos>
    lengths.append(len(seq) - 2)
    
train['InChI_length'] = lengths
train.to_pickle('train_2.pkl')
print('Train tratado guardado.pkl')

  0%|          | 0/2424186 [00:00<?, ?it/s]

  0%|          | 0/2424186 [00:00<?, ?it/s]

  0%|          | 0/2424186 [00:00<?, ?it/s]

Tokenizer guardado


  0%|          | 0/2424186 [00:00<?, ?it/s]

Train tratado guardado.pkl


In [8]:
train.head()

Unnamed: 0,image_id,InChI,InChI_1,InChI_text,InChI_length
0,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,C13H20OS,C 13 H 20 /c 1 - 9 ( 2 ) 8 - 15 - 13 - 6 - 5 -...,57
1,000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,C21H30O4,C 21 H 30 O 4 /c 1 - 12 ( 22 ) 25 - 14 - 6 - 8...,108
2,0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...,C24H23N5O4,C 24 H 23 N 5 O 4 /c 1 - 14 - 13 - 15 ( 7 - 8 ...,112
3,000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...,C17H24N2O4S,C 17 H 24 N 2 O 4 /c 1 - 12 ( 20 ) 18 - 13 ( 1...,107
4,000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...,C10H19N3O2S,C 10 H 19 N 3 O 2 /c 1 - 15 - 10 ( 14 ) 12 - 8...,71
