In [1]:
from text_gan import cfg, Vocab
from text_gan.features import GloVeReader, NERTagger, PosTagger
from text_gan.utils import MapReduce

import en_core_web_sm
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np
import logging
import os
import gc

In [2]:
# Reader for the pretrained embeddings; it is used below both to load the
# vectors and to supply the START/END/PAD/UNK tokens given to Vocab.
embedding_reader = GloVeReader()

In [3]:
# Load the pretrained vectors from the file configured in cfg.EMBS_FILE.
# The recorded run needed about 2m19s for ~1.67M vectors, so this is the slow
# cell of the notebook.
pretrained_vectors = embedding_reader.read(cfg.EMBS_FILE)

Loading vectors: 1669210it [02:19, 11937.12it/s]


In [4]:
# Vocabulary over the pretrained vectors: the special tokens come from the
# embedding reader, and cfg.CSEQ_LEN / cfg.QSEQ_LEN are passed as the two
# length arguments (presumably context and question lengths — TODO confirm
# against the Vocab signature).
vocab = Vocab(
    embedding_reader.START,
    embedding_reader.END,
    embedding_reader.PAD,
    embedding_reader.UNK,
    cfg.CSEQ_LEN,
    cfg.QSEQ_LEN,
    pretrained_vectors
)
# Both taggers are built from their tag file plus cfg.CSEQ_LEN.
ner = NERTagger(cfg.NER_TAGS_FILE, cfg.CSEQ_LEN)
pos = PosTagger(cfg.POS_TAGS_FILE, cfg.CSEQ_LEN)
# spaCy pipeline used by tokenize_example to tokenise the raw text.
nlp = en_core_web_sm.load()

In [5]:
# Load the SQuAD training split through tensorflow_datasets and keep only the
# first 10 examples (presumably a small sample for trying out the pipeline).
# NOTE(review): data_dir is a hard-coded absolute path and the sample size is a
# magic number — consider moving both into cfg.
train = tfds.load("squad", data_dir="/tf/data/tf_data", split='train').take(10)

In [6]:
def tokenize_example(x):
    """Run the ``nlp`` pipeline over the texts of one example.

    Args:
        x: Mapping holding UTF-8 encoded bytes under ``'context'``,
            ``'question'`` and ``['answers']['text']``.

    Returns:
        A ``(context, question, ans)`` tuple with the objects produced by
        ``nlp.pipe`` for the three decoded texts, in that order.
    """
    context_text = x['context'].decode('utf-8')
    question_text = x['question'].decode('utf-8')
    # Only the first listed answer text is used.
    answer_text = x['answers']['text'][0].decode('utf-8')

    # One pipe call handles the three texts together.
    context, question, ans = nlp.pipe([context_text, question_text, answer_text])
    return (context, question, ans)

def substrSearch(ans, context):
    """Locate the first contiguous occurrence of ``ans`` inside ``context``.

    Tokens are compared by their ``.text`` attribute.

    Args:
        ans: Sequence of tokens to look for.
        context: Sequence of tokens to search in.

    Returns:
        ``(start, length)`` where ``start`` is the index in ``context`` of the
        first token of the match and ``length`` equals ``len(ans)``.
        ``(-1, 0)`` is returned when ``ans`` is empty or does not occur in
        ``context`` as a complete contiguous run.
    """
    needle = [tok.text for tok in ans]
    haystack = [tok.text for tok in context]
    width = len(needle)
    if width == 0:
        return -1, 0

    # Try every possible start position so that a failed partial match never
    # hides a real match beginning inside it (e.g. "a b" within "a a b").
    # The range is empty when the answer is longer than the context.
    for start in range(len(haystack) - width + 1):
        if haystack[start:start + width] == needle:
            return start, width

    # Partial matches (including one cut off by the end of the context) do not
    # count as found.
    return -1, 0

In [7]:
# Tokenise every example by handing tokenize_example and the example iterator
# to the project's MapReduce helper.
# NOTE(review): presumably this runs the function over the examples in
# parallel — confirm against text_gan.utils.MapReduce.
mr = MapReduce()
# as_numpy_iterator yields plain Python/NumPy values, which is why
# tokenize_example decodes bytes.
train_iter = train.as_numpy_iterator()
train_tokenized = mr.process(tokenize_example, train_iter)

In [8]:
# Keep only the examples whose answer was located in the context
# (ans_start != -1) and that pass the length limits. The three lists stay
# index-aligned: entry k of each belongs to the same example.
# NOTE(review): 20 and 250 are presumably the question/context length limits
# from cfg — TODO confirm and replace the literals.
train_context, train_question, train_ans = [], [], []
for context, ques, ans in train_tokenized:
    ans_start, l = substrSearch(ans, context)
    if len(ques) < 20 and ans_start != -1 and ans_start + l < 250:
        train_context.append(context)
        train_question.append(ques)
        train_ans.append((ans_start, l))
len(train_context), len(train_question), len(train_ans)

(10, 10, 10)

In [9]:
# Encode the filtered examples into model inputs.
# Contexts go through the "source" side of the vocabulary, questions through
# the "target" side; the NER and PoS features are both derived from the contexts.
train_cidx = vocab.transform(train_context, "source")
train_ner = ner.transform(train_context)
# The PoS features must come from the PoS tagger, not from the NER tagger.
train_pos = pos.transform(train_context)
train_qidx = vocab.transform(train_question, "target")


In [10]:
# Peek at the first encoded context to eyeball the ids and the padding.
train_cidx[0]

array([    2,    26,  1313,    11,     6,   543,  2564,    15,     6,
         361,     9,     1,    14,     6,   678,    19,   126, 14742,
          18, 27592,  2770,    15,     6,   413,    17,  1425,  6387,
       34515,     5,   152, 15365,   586,     6,  1264,     4, 25729,
       33624, 27592,  4875, 13110,     5,   152,   174,  5182,     6,
        3203, 27074,    15,   130,  5377,    27, 14230, 20683, 33624,
           5,    94,    89,  1414,   165,     6,  3203,  9910,   269,
         138,    11,   101,   837,     4,    32,    11,  1326,  3512,
           4,     6,  1266,  1971, 27074,   129,  3509, 34515,     4,
         170,    32,  8807, 34515,    29,    43,  5103,    18,    23,
         231,  6387,     7,  9084,  2843,    17,    36, 11523, 34515,
           5,     3,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [14]:
# Peek at the first encoded question.
train_qidx[0]

array([   2,  194,   14,   55,  129,   19,   73, 1593,   43, 9910,    8,
       1058, 5377,   11,  823,  951,   29,  478,   41,    3])

In [16]:
# Look up the id stored for '?' in the vocabulary's source mapping.
# NOTE(review): `_source` is a private attribute of Vocab; prefer a public
# lookup if one exists.
vocab._source['?']

41

In [18]:
# Short aliases for the configured sequence lengths; they are used below as the
# fixed tensor shapes of the dataset elements.
cseq = cfg.CSEQ_LEN
qseq = cfg.QSEQ_LEN

def gen():
    """Yield the training examples one at a time as 5-tuples.

    Walks the parallel ``train_*`` collections in lockstep and yields each
    example as ``(cidx, ans, qidx, ner, pos)``.
    """
    # Columns are listed directly in the order of the yielded tuple.
    columns = (train_cidx, train_ans, train_qidx, train_ner, train_pos)
    yield from zip(*columns)

# Wrap the generator in a tf.data pipeline. The dtypes and shapes are listed in
# the order gen() yields its fields: (cidx, ans, qidx, ner, pos).
train_dataset = tf.data.Dataset.from_generator(
    gen,
    output_types=(tf.int32, tf.int32, tf.int32, tf.uint8, tf.uint8),
    output_shapes=(
        tf.TensorShape([cseq]),
        tf.TensorShape([2]),
        tf.TensorShape([qseq]),
        tf.TensorShape([cseq]),
        tf.TensorShape([cseq]),
    ),
)

In [19]:
# Sanity check: iterate the dataset once and count the examples it yields.
# A generator expression keeps the per-element values out of the notebook
# namespace; unpacking them into module-level loop variables named `ner` and
# `pos` would rebind the global tagger objects to tensors and break any later
# re-run of the feature-encoding cell.
i = sum(1 for _ in train_dataset)
print("Total", i)

Total 10
