In [1]:
import tensorflow as tf
from kme_tokenize import Segmentation, Tokenizer
import numpy as np

In [2]:
# Create one segmenter and one tokenizer; every later cell reuses these two objects.
kme_segment = Segmentation()
tokenizer = Tokenizer()

In [3]:
# Turn one compound name into the array that kme_segment.predict consumes in the next cell.
# The recorded shape is (18, 5): 18 matches the length of `text` in characters; what the
# 5 columns encode is not visible here.
text = 'cyclohexanoic acid'
x_test = kme_segment.preprocessing_text(text)
print(x_test.shape)

(18, 5)


In [4]:
# Raw model output for the features prepared in the previous cell. In the recorded
# output this is a 0/1 vector in which the 1s fall on the first character of each
# token after the first.
predict = kme_segment.predict(x_test)
# End-to-end helper on the same string: returns the token list and a display string
# with the tokens joined by '|'.
pred_text, pred_text_join = kme_segment.word_segmentation(text)

# One value per line, in the same order as three separate print calls.
print(predict, pred_text, pred_text_join, sep='\n')

[0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0]
['cyclo', 'hex', 'an', 'oic acid']
cyclo|hex|an|oic acid


In [10]:
# Segment a few compound names and print the token list for each one.
text_arr = [
    'methyl methanoate',
    'ethane',
    '(hydroxymethylamino)oxy-methoxymethanol',
]
for item in text_arr:
    # word_segmentation returns a pair; only its first element (the token list) is
    # shown. Indexing instead of unpacking into `_` keeps the cell from rebinding
    # `_`, which IPython/Jupyter uses as its last-output variable and stops
    # updating once user code has overwritten it.
    word = kme_segment.word_segmentation(item)[0]
    print(word)

['meth', 'yl', ' ', 'meth', 'an', 'oate']
['eth', 'ane']
['(', 'hydroxy', 'meth', 'yl', 'amino', ')', 'oxy', '-', 'meth', 'oxy', 'meth', 'an', 'ol']


In [5]:
# Same three names as the segmentation demo above; they serve as the fitting corpus here.
text_arr = ['methyl methanoate', 'ethane', '(hydroxymethylamino)oxy-methoxymethanol']

# Builds the vocabulary on the shared `tokenizer` object.
# NOTE(review): presumably fitting accumulates into word2index/word2count, so re-running
# this cell without re-creating the tokenizer may inflate the counts — confirm.
tokenizer.fit_on_text(text_arr)

# method_pad='pre': in the recorded output the shorter rows are left-filled with 0
# (the '<pad>' id) up to the longest sequence, 13 tokens.
train_seq = tokenizer.text_to_sequences(text_arr, method_pad='pre')
print(train_seq)

[[ 0  0  0  0  0  0  0  4  5  6  4  7  8]
 [ 0  0  0  0  0  0  0  0  0  0  0  9 10]
 [11 12  4  5 13 14 15 16  4 15  4  7 17]]


In [6]:
# Token -> integer id mapping after the fit above; in the recorded output ids 0-3 are the
# special tokens <pad>, <start>, <end>, <unk>, and the fitted tokens start at 4.
print(tokenizer.word2index)

{'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3, 'meth': 4, 'yl': 5, ' ': 6, 'an': 7, 'oate': 8, 'eth': 9, 'ane': 10, '(': 11, 'hydroxy': 12, 'amino': 13, ')': 14, 'oxy': 15, '-': 16, 'ol': 17}


In [7]:
# Per-token occurrence counts; in the recorded output 'meth' is 5, matching its five
# occurrences across the three fitted names. <start>/<end> are 3 each — presumably one
# per fitted text; confirm against the Tokenizer implementation.
print(tokenizer.word2count)

{'<pad>': 0, '<start>': 3, '<end>': 3, '<unk>': 0, 'meth': 5, 'yl': 2, ' ': 1, 'an': 2, 'oate': 1, 'eth': 1, 'ane': 1, '(': 1, 'hydroxy': 1, 'amino': 1, ')': 1, 'oxy': 2, '-': 1, 'ol': 1}


In [8]:
# A name containing pieces the tokenizer was not fitted on.
test_arr = ['2-(4-methoxyphenyl)-2-oxoacetic acid']

# Encode with the fitted vocabulary; in the recorded output unseen tokens get id 3 ('<unk>').
# No method_pad is passed here, and the recorded result prints as a plain nested list.
test_seq = tokenizer.text_to_sequences(test_arr)
print(test_seq)

# Decode the ids back to tokens; unknown pieces come back as '<unk>', not the original text.
test_text = tokenizer.sequences_to_text(test_seq)
print(test_text)

[[3, 16, 11, 3, 16, 4, 15, 3, 5, 14, 16, 3, 16, 3, 3, 3]]
[['<unk>', '-', '(', '<unk>', '-', 'meth', 'oxy', '<unk>', 'yl', ')', '-', '<unk>', '-', '<unk>', '<unk>', '<unk>']]
