In [1]:
from text_gan import cfg, Vocab
from text_gan.features import GloVeReader, NERTagger, PosTagger
from text_gan.utils import MapReduce

import en_core_web_sm
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np
import logging
import os
import gc

In [2]:
# Reader for the pretrained embedding file. It also supplies the special
# START / END / PAD / UNK tokens that are handed to Vocab further down.
embedding_reader = GloVeReader()

In [3]:
# Load the pretrained vectors from cfg.EMBS_FILE.
# Slow cell: the recorded run read ~1.67M vectors in about 2m19s.
pretrained_vectors = embedding_reader.read(cfg.EMBS_FILE)

Loading vectors: 1669210it [02:19, 11974.20it/s]


In [4]:
# Vocabulary built around the reader's special tokens and the configured
# context / question sequence lengths.
vocab = Vocab(
    embedding_reader.START,
    embedding_reader.END,
    embedding_reader.PAD,
    embedding_reader.UNK,
    cfg.CSEQ_LEN,
    cfg.QSEQ_LEN
)
# Feature taggers, each configured from its tags file and the context length.
ner = NERTagger(cfg.NER_TAGS_FILE, cfg.CSEQ_LEN)
pos = PosTagger(cfg.POS_TAGS_FILE, cfg.CSEQ_LEN)
# spaCy pipeline (en_core_web_sm) used by tokenize_example below.
nlp = en_core_web_sm.load()

In [5]:
# Load the SQuAD 'train' split via TFDS and keep only the first 10 examples
# (.take(10)), so everything downstream runs on a tiny sample.
# NOTE(review): data_dir is a hard-coded absolute path; it must exist on the
# machine running this notebook.
train = tfds.load("squad", data_dir="/tf/data/tf_data", split='train').take(10)

In [6]:
def tokenize_example(x):
    """Run the ``nlp`` pipeline over the text fields of one example.

    Args:
        x: mapping with byte-string entries ``'context'`` and ``'question'``
            and a nested ``x['answers']['text']`` sequence of byte strings;
            only the first answer text is used.

    Returns:
        A 3-tuple ``(context, question, ans)`` holding the results that
        ``nlp.pipe`` produced for the three decoded strings, in that order.
    """
    # Decode in the same order the fields are later fed to the pipeline.
    raw_context = x['context'].decode('utf-8')
    raw_question = x['question'].decode('utf-8')
    raw_answer = x['answers']['text'][0].decode('utf-8')

    processed = list(nlp.pipe([raw_context, raw_question, raw_answer]))
    # Drop the local reference to the raw example once it has been processed.
    del x

    context, question, ans = processed
    return (context, question, ans)

def substrSearch(ans, context):
    """Locate the answer token sequence inside the context token sequence.

    Tokens are compared by their ``.text`` attribute.

    Fixes over the previous implementation:
      * the returned index is the position of the FIRST matched token (it used
        to be overwritten on every matching token and ended up pointing at the
        last one);
      * a partial match cut off by the end of the context is no longer reported
        as a hit;
      * after a failed partial match the search resumes at the next candidate
        start, so overlapping prefixes (e.g. "a a b" vs "a b") are found.

    Args:
        ans: sequence of tokens (objects with a ``.text`` attribute) to find.
        context: sequence of tokens to search in.

    Returns:
        ``(start, length)`` where ``start`` is the index in ``context`` of the
        first token of the earliest full match and ``length == len(ans)``;
        ``(-1, 0)`` when ``ans`` is empty or does not occur in ``context``.
    """
    ans_texts = [tok.text for tok in ans]
    ans_len = len(ans_texts)
    if ans_len == 0:
        # Nothing to look for; same result the old loop produced for empty input.
        return -1, 0

    context_texts = [tok.text for tok in context]
    # Only starts that leave room for the whole answer can match.
    for start in range(len(context_texts) - ans_len + 1):
        if context_texts[start:start + ans_len] == ans_texts:
            return start, ans_len
    return -1, 0

In [7]:
# Tokenize every example through the project's MapReduce helper.
# NOTE(review): presumably process() applies tokenize_example to each item of
# the iterator (possibly in parallel) - confirm in text_gan.utils.
mr = MapReduce()
# as_numpy_iterator() yields plain Python/NumPy values; tokenize_example
# decodes the text fields as byte strings.
train_iter = train.as_numpy_iterator()
# Consumed below as an iterable of (context, question, answer) triples.
train_tokenized = mr.process(tokenize_example, train_iter)

In [8]:
# Collect the examples that pass the length / answer-found filter.
train_context, train_question, train_ans = [], [], []
for context, ques, ans in train_tokenized:
    ans_start, l = substrSearch(ans, context)
    # Keep an example only if the question is shorter than 20 tokens, the
    # answer was located in the context, and ans_start + l stays below 250.
    # NOTE(review): 20 / 250 presumably mirror cfg.QSEQ_LEN / cfg.CSEQ_LEN - confirm.
    if len(ques) < 20 and ans_start != -1 and ans_start + l < 250:
        train_context.append(context)
        train_question.append(ques)
        train_ans.append((ans_start, l))
# Show how many examples survived in each list.
len(train_context), len(train_question), len(train_ans)

(10, 10, 10)

In [9]:
# Fit the vocabulary on the filtered contexts and questions using the
# pretrained vectors.
# NOTE(review): the meaning of the two trailing 0 arguments is defined by
# Vocab.fit - confirm there.
vocab.fit(
    train_context,
    train_question,
    pretrained_vectors,
    0, 0)
# Encode contexts ("source") and questions ("target") as index arrays.
train_cidx = vocab.transform(train_context, "source")
train_qidx = vocab.transform(train_question, "target")
# Per-token features for the contexts.
train_ner = ner.transform(train_context)
# Fix: POS features must come from the POS tagger. This previously called
# ner.transform again, so train_pos was just a copy of the NER features and
# the `pos` tagger was never used.
train_pos = pos.transform(train_context)


In [10]:
# Display the first encoded context (output of vocab.transform(..., "source")).
train_cidx[0]

array([  2,  18, 168,   9,   4, 169, 170,  22,   4, 171,   7, 172,  16,
         4, 173,  23,  26, 174,  11,  76, 175,  22,   4, 176,  12, 177,
        77,  35,   6,  46, 178, 179,   4, 180,   5, 181,  78,  76, 182,
       183,   6,  46, 184, 185,   4,  79,  80,  22,  47, 186,  48, 187,
       188,  78,   6,  36,  81, 189, 190,   4,  79, 191, 192,  82,   9,
       193, 194,   5,  14,   9, 195, 196,   5,   4, 197,  83,  80, 198,
       199,  35,   5, 200,  14, 201,  35, 202,  37, 203,  11,  15,  38,
        77,   8, 204,  84,  12,  49, 205,  35,   6,   3,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [11]:
# Display the first encoded question (output of vocab.transform(..., "target")).
train_qidx[0]

array([ 2,  5,  6, 15, 16, 17, 18, 19, 20, 21,  8, 22, 23,  7, 24, 25, 26,
       27,  4,  3])

In [15]:
# Sanity check: look up '?' in the vocab's private `_target` mapping. The
# recorded result (4) matches the second-to-last id of train_qidx[0] above.
# NOTE(review): this reaches into a private attribute, and the cell's execution
# count (15) is out of order - re-run the notebook top to bottom before sharing.
vocab._target['?']

4

In [13]:
# Short aliases for the configured context / question sequence lengths; used
# for the tensor shapes of train_dataset below.
cseq = cfg.CSEQ_LEN
qseq = cfg.QSEQ_LEN

def gen():
    """Yield the training examples one at a time.

    Reads the module-level ``train_cidx``, ``train_ner``, ``train_pos``,
    ``train_qidx`` and ``train_ans`` sequences in lockstep (stopping at the
    shortest) and yields tuples ordered as
    ``(context ids, answer span, question ids, NER features, POS features)``.
    """
    rows = zip(train_cidx, train_ner, train_pos, train_qidx, train_ans)
    for context_ids, ner_tags, pos_tags, question_ids, answer_span in rows:
        # Output order differs from the zip order: the answer span comes second.
        yield (context_ids, answer_span, question_ids, ner_tags, pos_tags)

# Element layout matches what gen() yields:
# (context ids, answer span, question ids, NER features, POS features).
_element_types = (tf.int32, tf.int32, tf.int32, tf.uint8, tf.uint8)
_element_shapes = (
    tf.TensorShape([cseq]),  # context ids
    tf.TensorShape([2]),     # answer span pair
    tf.TensorShape([qseq]),  # question ids
    tf.TensorShape([cseq]),  # NER features
    tf.TensorShape([cseq]),  # POS features
)
train_dataset = tf.data.Dataset.from_generator(
    gen,
    output_types=_element_types,
    output_shapes=_element_shapes,
)

In [14]:
# Iterate the dataset once to verify it can be consumed end to end and to
# count its elements.
# Fix: the loop targets used to be named `ner` and `pos`. At notebook top level
# that overwrote the NERTagger / PosTagger objects with tensors, so re-running
# the feature-extraction cell afterwards would fail. Distinct names keep the
# taggers intact.
i = 0
for cidx, ans, qidx, ner_tags, pos_tags in train_dataset:
    i += 1
print("Total", i)

Total 10
