With a total training data set of 80 sentences, this approach tags everything with a single tag.
As a result, a larger training set is necessary.
With a total training data set of 800 sentences, this approach begins to tag with 2-4 different tags in each sentence, but overwhelmingly uses a single tag as with 80 sentences. This is likely the case because the training with 800 sentences was simply added to the model previously trained with the 80 sentences.

In [2]:
import os
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, InputLayer, Embedding, TimeDistributed, Activation
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
import sqlite3
from itertools import groupby
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
# need to redo this considering word breaks

In [3]:
os.chdir(os.path.expanduser('~/python_workspace/medical_corpus_scripting/corpus/hminterface/static/hminterface'))

In [4]:
conn = sqlite3.Connection('hmcorpus.db')

In [5]:
crsr = conn.cursor()

Retrieve POS tags from database.

In [9]:
# use if only pos tags are desired
sql_query = """SELECT ind, pos_label FROM pos;"""
out = crsr.execute(sql_query).fetchall()

In [10]:
tag2index = {t: i for i, t in out}
tag2index['-PAD-'] = 0

In [6]:
# use if word_loc values are also desired
sql_query = """SELECT DISTINCT loc, pos_label FROM types
JOIN word_loc ON word_loc.ind=types.word_loc
JOIN pos ON pos.ind=types.pos_type;"""
combinations = crsr.execute(sql_query).fetchall()
combinations = ['-'.join(t) for t in combinations]

tag2index = {t: i + 1 for i, t in enumerate(combinations)}
tag2index['-PAD-'] = 0
print(tag2index.items())

dict_items([('B-CL', 1), ('B-NN', 2), ('O-PU', 3), ('B-FW', 4), ('B-VV', 5), ('B-PP', 6), ('I-NN', 7), ('B-QU', 8), ('I-CL', 9), ('B-LC', 10), ('I-VV', 11), ('B-AD', 12), ('B-DT', 13), ('B-CC', 14), ('I-CC', 15), ('B-CV', 16), ('I-AD', 17), ('B-RL', 18), ('B-CS', 19), ('B-PN', 20), ('I-CS', 21), ('I-FW', 22), ('B-NR', 23), ('I-NR', 24), ('I-PU', 25), ('B-PU', 26), ('B-CM', 27), ('B-ON', 28), ('I-QU', 29), ('I-PN', 30), ('B-JJ', 31), ('-PAD-', 0)])


Retrieve word tokens with tags as numerical codes.

In [8]:
# use if only pos tags are desired
sql_query = """SELECT doc_ind, sent_ind, token_form, pos_type FROM tokens
JOIN types ON tokens.word_type_ind=types.ind;"""
words = crsr.execute(sql_query).fetchall()
sentences = []
for k, g in groupby(words, lambda x: (x[0], x[1])):
    sentences.append(list((w[2].lower(), w[3]) for w in g))

In [7]:
# use if word_loc values are also desired
sql_query = """SELECT doc_ind, sent_ind, token_form, loc, pos_label FROM tokens
JOIN types ON tokens.word_type_ind=types.ind
JOIN word_loc ON word_loc.ind=types.word_loc
JOIN pos ON pos.ind=types.pos_type;"""
words = crsr.execute(sql_query).fetchall()
sentences = []
for k, g in groupby(words, lambda x: (x[0], x[1])):
    sentences.append(list((w[2].lower(), tag2index['-'.join(w[3:])]) for w in g))
print(sentences[:10])

[[('tus', 1), ('mob', 2), ('–', 3), ('shigellosis', 4), ('disease', 4), ('fact', 4), ('sheet', 4), ('series', 4), ('tus', 1), ('mob', 2), ('shigellosis', 4), ('zoo', 5), ('li', 6), ('cas', 7), ('?', 3)], [('shigellosis', 4), ('yog', 5), ('ib', 8), ('tug', 9), ('mob', 2), ('los', 5), ('ntawm', 10), ('cov', 1), ('kab', 2), ('mob', 11), ('bacteria', 4), ('los', 5), ('.', 3)], [('muaj', 5), ('txog', 6), ('li', 6), ('300', 8), ('rau', 6), ('400', 8), ('leej', 1), ('neeg', 2), ('tau', 12), ('raug', 5), ('tus', 1), ('mob', 2), ('no', 13), ('txhua', 8), ('xyoo', 1), ('hauv', 10), ('lub', 1), ('xeev', 2), ('wisconsin', 4), ('.', 3)], [('feem', 1), ('ntau', 5), ('muaj', 5), ('tshwm', 5), ('sim', 11), ('rau', 6), ('lub', 1), ('caij', 2), ('ntuj', 7), ('sov', 11), ('thiab', 14), ('lub', 1), ('caij', 2), ('nplooj', 7), ('ntoo', 7), ('zeeg', 11), ('.', 3)], [('nyob', 5), ('nyob', 11), ('mam', 12), ('pom', 5), ('tej', 8), ('tus', 1), ('neeg', 2), ('raug', 5), (',', 3), ('los', 14), ('puas', 15), (','

In [3]:
# for testing with Brown corpus
from nltk.corpus import brown
sentences = brown.tagged_sents()[:10000]

In [8]:
sentences_list = []
tags_list = []
for tagged_sentence in sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences_list.append(sentence)
    tags_list.append(tags)

Split data into training and testing.

In [9]:
(train_sentences,
test_sentences,
train_tags,
test_tags) = train_test_split(sentences_list, tags_list, test_size=0.2)

In [10]:
words = set([])

# change to for s in train_sentences for normal functionality
for s in np.concatenate((train_sentences, test_sentences)):
    for w in s:
        words.add(w.lower())
        
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0
word2index['-OOV-'] = 1

In [11]:
print(word2index.items())

dict_items([('diphtheria', 2), ('fab', 3), ('fact', 4), ('hluav', 5), ('nplooj', 6), ('prevention', 7), ('ces', 8), ('enteric', 9), ('tsim', 10), ('ris', 11), ('ntawd', 12), ('deer', 13), ('for', 14), ('9', 15), ('black', 16), ('txhab', 17), ('mantoux', 18), ('lawv', 19), ('fever', 20), ('ke', 21), ('sid', 22), ('xov', 23), ('nto', 24), ('minnesota', 25), ('up', 26), ('qaum', 27), ('canal', 28), ('cawv', 29), ('carbapenem', 30), ('paratyphoid', 31), ('sin', 32), ('lis', 33), ('deg', 34), ('kheev', 35), ('pleev', 36), ('nraim', 37), ('ntiv', 38), ('laws', 39), ('qho', 40), ('7', 41), ('txav', 42), ('mloog', 43), ('!', 44), ('caij', 45), ('d68', 46), ('“', 47), ('qiv', 48), ('resonance', 49), ('ntxov', 50), ('daj', 51), ('taws', 52), ('europe', 53), ('taig', 54), ('sputum', 55), ('poob', 56), ('doog', 57), ('po', 58), ('nraud', 59), ('ntau', 60), (';', 61), ('tij', 62), ('hnab', 63), ('qub', 64), ('haus', 65), ('hov', 66), ('tshuam', 67), ('anterior', 68), ('million', 69), ('kwv', 70), (

In [74]:
# for testing with Brown corpus
tags = set([])

for s in train_tags:
    for t in s:
        tags.add(t.lower())
        
tag2index = {t: i + 2 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0
tag2index['-OOV-'] = 1

AttributeError: 'int' object has no attribute 'lower'

In [12]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    
    train_sentences_X.append(s_int)
    
for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    
    test_sentences_X.append(s_int)

In [13]:
print(test_sentences_X[7])
print(test_sentences[7])

[241, 56, 753]
('*', 'poob', 'phau')


In [9]:
# for testing with Brown corpus
for s in train_tags:
    s_int = []
    for t in s:
        try:
            s_int.append(tag2index[t.lower()])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
    train_tags_y.append(s_int)
        
for s in test_tags:
    s_int = []
    for t in s:
        try:
            s_int.append(tag2index[t.lower()])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
    test_tags_y.append(s_int)

In [14]:
# variant for use with Hmong data
for s in train_tags:
    train_tags_y.append(s)
    
for s in test_tags:
    test_tags_y.append(s)

In [15]:
# change to len(max(train_sentences_X, key=len)) for normal functionality
MAX_LENGTH = len(max(np.concatenate((train_sentences_X, test_sentences_X)), key=len))

In [16]:
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

In [14]:
# bypass this and use pretrained model below
embedding_model = Word2Vec(sentences_list, workers=10, min_count=1, iter=10, size=150)

In [17]:
import pickle

os.chdir(os.path.expanduser(os.path.join(os.path.join('~', 'python_workspace'), 'hmong_clf_graph')))

f = open('word2vec_model_200.pkl', 'rb')
word2vec_model = pickle.load(f)
f.close()

In [18]:
print(len(train_sentences_X))
print(len(train_tags_y))
#print(type(train_sentences_X))
# use the below only if using full tagged data set to train, otherwise comment out
sentences_X = np.concatenate((train_sentences_X, test_sentences_X), axis=0)
tags_y = np.concatenate((train_tags_y, test_tags_y), axis=0)

323
323


In [19]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [20]:
# this is a test here--it's defined in the actual fit function later below
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))

In [20]:
print(len(train_tags_y[0]))
print(cat_train_tags_y.shape)
print(train_sentences_X.shape)

93


NameError: name 'cat_train_tags_y' is not defined

In [72]:
# this is deprecated
from keras import backend as K

def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_true, axis=-1)
        
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32')
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [21]:
embedding_matrix = np.zeros((len(words) + 2, 150))
for word, i in word2index.items():
#    print((word, i))
    try:
        embedding_vector = word2vec_model.wv[word]
    except KeyError as e:
        print(word)
        embedding_vector = None
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

diphtheria
enteric
mantoux
fever
canal
carbapenem
paratyphoid
d68
resonance
anterior
endocarditis
staphylococcus
encephalitis
norovirus
typhoid
aureus
ltbi
guillain
cerebrospinal
cotton
b2
shigella
consulate
hfrs
creek
antibiotics
hps
ev-d68
carbapenems
nombre
hantaviruses
b3
b1
kui
whooping
lhd
latent
flaccid
magnetic
emg
hantavirus
tetanus
paralysis
bleach
tuberculosis
enterovirus
cytomegalovirus
tdap
typhi
uahauj
cough
enterobacteriaceae
nile
louis
mri
p-01581h
dtap
csf
fluid
electromyography
cre
acute
white-footed
vancomycin
herpesviruses
pertussis
zika
barré
bayou
pulmonary
renal
barr
paratyphi
afm
myelitis
enteroviruses
meningitis
polio
notification
salmonella
x-ray
shigellosis
hemorrhagic
epstein
intermediate
adenoviruses
p-42053
vrsa
nebulization
sanitizers
imaging
aviviruses
bacteremia
cdc
osteomyelitis
epa
-PAD-
-OOV-


In [22]:
embedding_matrix[632]

array([ 0.19163309,  1.69376302,  2.43857241, -1.11453319,  3.79453993,
        4.17252731, -4.68219948,  0.24869762, -6.35372829,  0.31694755,
       -3.84430099, -0.58682597, -0.88593048,  1.53493583,  2.30677724,
        2.23658633,  2.18042707, -0.74518204,  1.31319201, -3.8562367 ,
       -2.16786218,  0.03127109,  0.40597317,  1.38343799,  4.27873802,
       -3.99632454, -3.51169419,  1.30540907, -3.70225096,  1.31717014,
        3.1416502 ,  0.03988991, -0.38707864,  1.74478793, -1.42444181,
       -0.32321543,  3.3821702 , -2.2735436 ,  7.42937231, -1.28889155,
       -4.36491013, -2.23137355, -7.0328064 ,  0.05411424, -0.6318019 ,
       -1.25639653, -2.14141345,  0.24938402,  0.51534641, -1.93796432,
        0.74748939,  1.69639874,  1.69339097,  0.56605494, -0.8465029 ,
        1.59473193, -0.56365103, -1.4284215 ,  1.25055254,  2.94551611,
       -2.44235277, -3.05936575, -4.40176249, -1.89741099,  0.98016882,
       -5.13533545, -3.93402863,  2.18700981,  1.6169908 ,  2.19

In [23]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 150, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 93, 150)           118800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 93, 512)           833536    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 93, 32)            16416     
_________________________________________________________________
activation_1 (Activation)    (None, 93, 32)            0         
Total params: 968,752
Trainable params: 849,952
Non-trainable params: 118,800
_________________________________________________________________


In [24]:
# change sentences_X to train_sentences_X and tags_y to train_tags_y for normal functionality;
# also validation_split=0.2
model.fit(sentences_X, to_categorical(tags_y, len(tag2index)), batch_size=16, epochs=50, validation_split=0)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7efd32767630>

In [28]:
os.chdir(os.path.expanduser(os.path.join('~', 'python_workspace', 'medical_corpus_scripting',\
                                        'pos_tagger_interface')))

model.save("pos_tagging_model_expanded.h5")

In [27]:
os.chdir(os.path.expanduser(os.path.join('~', 'python_workspace', 'medical_corpus_scripting',\
                                        'pos_tagger_interface')))

with open('bio-pos.txt', 'w') as f:
    for item in tag2index.items():
        f.write(item[0] + '\t' + str(item[1]) + '\n')

with open('word_indices.txt', 'w') as f:
    for item in word2index.items():
        f.write(item[0] + '\t' + str(item[1]) + '\n')

In [2]:
from keras.models import load_model

os.chdir(os.path.expanduser(os.path.join(os.path.join('~', 'python_workspace'), 'medical_corpus_scripting')))

new_model = load_model("pos_tagging_model.h5")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [44]:
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

acc: 100.0


In [22]:
test_samples = [
    "tus mob no yuav kis tau .".split(),
    "nws mob plab tab sis lawv mob Shigellosis .".split()
]

In [59]:
from nltk.corpus import PlaintextCorpusReader as PCR

os.chdir(os.path.expanduser(os.path.join(os.path.join('~', 'hmong_medical'), 'raw_processing')))

test_samples = PCR('.', 'p01820h.txt').sents()
print(test_samples[:10])

[['TUS', 'MOB', 'ENTEROTOXIGENIC', 'E', '.'], ['COLI', '(', 'ETEC', ')', 'Escherichia', 'coli', '(', 'E', '.'], ['coli', ')', 'yog', 'cov', 'kab', 'mob', 'uas', 'pom', 'muaj', 'rau', 'ntawm', 'tej', 'chaw', 'nyob', 'ib', 'ncig', 'yus', ',', 'hauv', 'tej', 'khoom', 'noj', ',', 'thiab', 'hauv', 'cov', 'hnyuv', 'ntawm', 'tej', 'tsiaj', 'txhu', 'thiab', 'tib', 'neeg', '.'], ['Hom', 'mob', 'E', '.'], ['coli', 'feem', 'ntau', 'yeej', 'tsis', 'ua', 'teeb', 'meem', 'dab', 'tsi', 'thiab', 'kuj', 'yog', 'ib', 'feem', 'tseem', 'ceeb', 'ntawm', 'txoj', 'kev', 'zom', 'zaub', 'mov', ',', 'tab', 'sis', 'muaj', 'ib', 'txhia', 'kuj', 'ua', 'rau', 'koj', 'mob', '.'], ['Enterotoxigenic', 'E', '.'], ['coli', '(', 'ETEC', ')', 'yog', 'ib', 'hom', 'ntawm', 'cov', 'kab', 'mob', 'E', '.'], ['coli', 'uas', 'ua', 'tau', 'rau', 'koj', 'raws', 'plab', '.'], ['Tsis', 'hais', 'leej', 'twg', 'los', 'yeej', 'kis', 'tau', 'tus', 'mob', 'ETEC', '.'], ['Nws', 'yog', 'ib', 'tus', 'mob', 'raws', 'plab', 'uas', 'kheev', 'p

In [19]:
test_samples = [
    "the county will decide .".split(),
    "they have announced their resignations .".split()
]

In [60]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
    
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')

In [125]:
print(test_samples_X)

[[346  89   1 ...   0   0   0]
 [610 493 151 ...   0   0   0]
 [346  89   1 ...   0   0   0]
 ...
 [  1   1   1 ...   0   0   0]
 [  1 611   0 ...   0   0   0]
 [  1   1 563 ...   0   0   0]]


In [30]:
f = open('pos_tagger.config', 'w')
f.write('maxlen='+str(MAX_LENGTH) + '\n')
f.close()

In [61]:
predictions = model.predict(test_samples_X)
#print(predictions)

In [48]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
        token_sequences.append(token_sequence)
    return token_sequences

In [62]:
tags = logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})

In [63]:
full_output = []
for i, item in enumerate(test_samples[:22]):
    line_output = []
#    print(len(item))
#    print(len([tag for tag in tags[i] if tag != '-PAD-']))
    for j, word in enumerate(item):
        print(i, j, (word, tags[i][j]))
        line_output.append((word, tags[i][j]))
    full_output.append(line_output)

0 0 ('TUS', 'B-CL')
0 1 ('MOB', 'B-NN')
0 2 ('ENTEROTOXIGENIC', 'B-FW')
0 3 ('E', 'B-FW')
0 4 ('.', 'O-PU')
1 0 ('COLI', 'B-FW')
1 1 ('(', 'O-PU')
1 2 ('ETEC', 'B-FW')
1 3 (')', 'O-PU')
1 4 ('Escherichia', 'B-FW')
1 5 ('coli', 'B-FW')
1 6 ('(', 'O-PU')
1 7 ('E', 'B-FW')
1 8 ('.', 'O-PU')
2 0 ('coli', 'B-FW')
2 1 (')', 'O-PU')
2 2 ('yog', 'B-VV')
2 3 ('cov', 'B-CL')
2 4 ('kab', 'B-NN')
2 5 ('mob', 'I-VV')
2 6 ('uas', 'B-CS')
2 7 ('pom', 'B-VV')
2 8 ('muaj', 'B-VV')
2 9 ('rau', 'B-PP')
2 10 ('ntawm', 'B-LC')
2 11 ('tej', 'B-CL')
2 12 ('chaw', 'B-NN')
2 13 ('nyob', 'B-VV')
2 14 ('ib', 'B-QU')
2 15 ('ncig', 'B-CL')
2 16 ('yus', 'B-PN')
2 17 (',', 'O-PU')
2 18 ('hauv', 'B-LC')
2 19 ('tej', 'B-CL')
2 20 ('khoom', 'B-NN')
2 21 ('noj', 'B-VV')
2 22 (',', 'O-PU')
2 23 ('thiab', 'B-CC')
2 24 ('hauv', 'B-LC')
2 25 ('cov', 'B-CL')
2 26 ('hnyuv', 'B-NN')
2 27 ('ntawm', 'B-LC')
2 28 ('tej', 'B-CL')
2 29 ('tsiaj', 'B-NN')
2 30 ('txhu', 'I-NN')
2 31 ('thiab', 'B-CC')
2 32 ('tib', 'B-NN')
2 33 ('neeg',

In [54]:
print([len(l) for l in tags])
print([len(l) for l in test_samples])

[93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93]
[47, 47, 32, 8, 55, 4, 35, 64, 22, 15, 93, 15, 31, 67, 13, 16, 12, 21, 36, 28, 11, 58, 10, 2, 2, 6, 6, 2, 19]


In [51]:
for item in full_output:
    for word in item:
        print(word[0], end=' ')
    print()
    for word in item:
        print(word[1], end=' ')
    print()
    print()

TUS KAB MOB ZIKA : COV LUS POM ZOO SIV RAU COV NEEG UAS TEJ ZAUM TAU RAUG TUS KAB MOB ZIKA 1 . 
B-CL B-NN I-VV B-FW O-PU B-CL B-NN B-VV B-VV B-VV B-PP B-CL B-NN B-CS B-AD I-AD B-AD B-VV B-CL B-NN I-VV B-FW B-FW O-PU 

Nyob twj ywm hauv tsev kom txhob raug yoov tshaj cum tom los yog siv cov tshuaj pleev yoov kom txhob tom ntev li peb lim piam ( weeks ) . 
B-VV B-AD I-AD B-LC B-NN B-CS B-AD B-VV B-NN I-NN I-NN B-LC B-CC I-CC B-VV B-CL B-NN B-FW B-NN B-CS B-AD B-LC B-VV B-PP B-QU B-NN I-NN O-PU B-FW O-PU O-PU 

* Yog koj nyuam qhuav kis tau tus kab mob Zika tsis ntev los no , tej zaum yuav muaj tus kab mob no nyob hauv koj cov ntshav thiab yuav kis tau mus rau ib tus tshaj cum uas tom koj . 
O-PU B-VV B-PN B-AD B-VV B-VV B-VV B-CL B-NN I-VV B-FW B-AD B-VV B-VV B-DT O-PU B-AD I-AD B-AD B-VV B-CL B-NN I-VV B-DT B-VV B-LC B-PN B-CL B-NN B-CC B-AD B-VV B-VV B-VV B-PP B-QU I-CL I-NN I-NN B-CS B-LC B-PN O-PU 

Ces tus tshaj cum uas muaj tus mob yuav kis tau tus kab mob rau lwm tus . 
B-CC B-CL 

In [64]:
#print(full_output[0])
printable_output = [['/'.join(w) for w in line] for line in full_output]
#print(printable_output)
os.chdir(os.path.expanduser('~/hmong_medical/'))
f = open('p01820h.txt', 'w')
f.write('\n '.join([' '.join(line) for line in printable_output]))
f.close()

In [27]:
predictions = model.predict(test_sentences_X)
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))

[['PU', 'VV', 'QU', 'CL', 'VV', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-'], ['PU', 'VV', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', 

In [66]:
print(test_tags_y[:5])

[[ 1  8  9  5  1  2  2  2  5  6  5 14  9  9  5  5  3 14  9  5  7  1  5  5
   6  1  2  5  2  2  5  5  5  5  3  1  2  2  6  6  8  2  5  2  2  3 11 11
   3  1  2 14  5  8  1  2  5  9  9  3  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  2  3  1  2  5 11  1  5  5  6  8  1  5  4 11  1  2  4  9  9  5  5  5
   9  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 9  9  5 11 11  6  1  2  5  2 10  5  5  8  2  2 11 11  2  5  2  3  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 9  5  6  2  9  9  5  5 14  1  2  4 10  9  5  5  3  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  

In [74]:
from collections import Counter
total = 0.0
for item in test_tags_y:
    c = Counter(item)
    total +=((c[0] + c[5]) / len(item))
print(total / len(test_tags_y))
# if all is predicted to be a series of 5s followed by 0s, then 72% accurate!
# this method is essentially worthless!

0.7241379310344827
