In [1]:
from text_gan import cfg
from text_gan.features import GloVe, NERTagger, PosTagger
from text_gan.utils import MapReduce

import en_core_web_sm
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np
import logging
import os
import gc

In [22]:
# Set the sequence lengths on the shared cfg module. Later cells pass
# cfg.CSEQ_LEN to the context-side helpers (GloVe, NERTagger, PosTagger) and
# cfg.QSEQ_LEN to the question-side GloVe helper.
# Presumably these are lengths in tokens -- confirm against text_gan.features.
cfg.CSEQ_LEN = 100
cfg.QSEQ_LEN = 10

In [23]:
# Context-side embedding helper, built from the embeddings file and the
# context sequence length.
cembs = GloVe(cfg.EMBS_FILE, cfg.CSEQ_LEN)
# Question-side helper. cembs.data is handed over as a third argument,
# presumably so the embedding data loaded above is reused instead of being
# read from disk a second time -- confirm against the GloVe constructor.
qembs = GloVe(cfg.EMBS_FILE, cfg.QSEQ_LEN, cembs.data)

In [24]:
# NER and POS taggers, each constructed from its tag file and the context
# sequence length.
ner = NERTagger(cfg.NER_TAGS_FILE, cfg.CSEQ_LEN)
pos = PosTagger(cfg.POS_TAGS_FILE, cfg.CSEQ_LEN)
# Pipeline object returned by en_core_web_sm.load(); the tokenize_* helpers
# below call it as nlp(text).
nlp = en_core_web_sm.load()

In [25]:
# Root of the TFDS download/cache directory. Defaults to the path this
# notebook was originally written against; export TFDS_DATA_DIR to run the
# notebook on a machine with a different layout.
TFDS_DATA_DIR = os.environ.get("TFDS_DATA_DIR", "/tf/data/tf_data")
# Number of examples pulled from the train split for this walkthrough.
N_TRAIN_EXAMPLES = 10

train = tfds.load("squad", data_dir=TFDS_DATA_DIR, split='train').take(N_TRAIN_EXAMPLES)

In [26]:
def tokenize_context(x):
    """Run the notebook-level ``nlp`` pipeline over one example's context.

    Args:
        x: Mapping whose ``'context'`` entry is a UTF-8 encoded bytes object.

    Returns:
        Whatever ``nlp`` returns for the decoded context text.
    """
    text = x['context'].decode('utf-8')
    return nlp(text)

def tokenize_question(x):
    """Run the notebook-level ``nlp`` pipeline over one example's question.

    Args:
        x: Mapping whose ``'question'`` entry is a UTF-8 encoded bytes object.

    Returns:
        Whatever ``nlp`` returns for the decoded question text.
    """
    text = x['question'].decode('utf-8')
    return nlp(text)

def tokenize_answer(x):
    """Run the notebook-level ``nlp`` pipeline over an example's first answer.

    Only the first entry of ``x['answers']['text']`` is used; any further
    answer texts are ignored.

    Args:
        x: Mapping where ``x['answers']['text']`` is an indexable sequence of
            UTF-8 encoded bytes objects.

    Returns:
        Whatever ``nlp`` returns for the decoded first answer text.
    """
    first_answer = x['answers']['text'][0]
    return nlp(first_answer.decode('utf-8'))

def tag_answer(inp):
    """Mark where the answer token ids occur inside the context token ids.

    The context is scanned with a sliding window as long as the answer; the
    first window equal to the answer is tagged with ones.

    Args:
        inp: ``(cidx, aidx)`` pair. ``cidx`` is a 1-D NumPy integer array of
            context token ids; ``aidx`` is a sequence (list or array) of
            answer token ids.

    Returns:
        ``np.int32`` array with the shape of ``cidx``: 1 at the positions of
        the first occurrence of ``aidx`` in ``cidx``, 0 everywhere else. All
        zeros when the answer is empty, longer than the context, or absent.
    """
    cidx, aidx = inp
    # Keep the full integer range of the ids. Casting to uint8 wraps every id
    # above 255, so real answers are missed and wrapped ids can match the
    # wrong context tokens.
    aidx = np.asarray(aidx, dtype=np.int64)
    tags = np.zeros(cidx.shape, dtype=np.int32)
    size = aidx.shape[0]
    # No window can match an empty answer or one longer than the context; the
    # latter would also give as_strided a negative dimension.
    if size == 0 or size > cidx.shape[-1]:
        return tags
    # Overlapping length-`size` windows over cidx, built as a read-only view.
    shape = cidx.shape[:-1] + (cidx.shape[-1] - size + 1, size)
    strides = cidx.strides + (cidx.strides[-1],)
    windows = np.lib.stride_tricks.as_strided(
        cidx, shape=shape, strides=strides, writeable=False)
    starts = np.all(windows == aidx, axis=1).nonzero()[0]
    if starts.shape[0] != 0:
        # Only the first occurrence is tagged.
        start_index = starts[0]
        tags[start_index:start_index + size] = 1
    return tags

In [18]:
# Shared MapReduce helper; later cells reuse `mr` as well.
mr = MapReduce()
# Tokenize the context of every training example. The dataset iterator is
# passed straight to mr.process rather than being stored under the name that
# is rebound on the next line anyway.
train_context = mr.process(tokenize_context, train.as_numpy_iterator())
# Fit the context-side GloVe helper on the tokenized contexts, then encode
# them with it.
cembs.fit(train_context, min_freq=None)
train_cembs = cembs.transform(train_context)

In [19]:
# Tokenize the first answer of every training example.
train_answer = mr.process(tokenize_answer, train.as_numpy_iterator())
# Encode the answers with the context-side helper (pad=False, end=False), so
# they share the id space of train_cembs.
train_aembs = cembs.transform(train_answer, pad=False, end=False)
# Pair each encoded context with its encoded answer and turn the pair into a
# 0/1 mask over the context positions.
train_aembs = mr.process(tag_answer, zip(train_cembs, train_aembs))
# Stack the per-example masks into a single uint8 array.
train_aembs = np.array(train_aembs, dtype=np.uint8)

In [32]:
# Tokenize the question of every training example, feeding the dataset
# iterator directly into mr.process.
train_question = mr.process(tokenize_question, train.as_numpy_iterator())
# Fit the question-side GloVe helper on the tokenized questions, then encode
# them with it.
qembs.fit(train_question, min_freq=None)
train_qembs = qembs.transform(train_question)

In [34]:
# Display the tokenized questions for the loaded examples.
train_question

[What is one use that would require an antenna to receive signals in various ways at once?,
 About how many counts existed in the Carolingian Empire?,
 How can climate changes be determined from soil?,
 What is Tucson's Fox station?,
 What is the size of New York City in square miles?,
 Who wrote William Tell Overture?,
 How do asset prices generally move in relation to interest rates?,
 When did the Cubs announce a move of their top class A affiliate from Daytona to Myrtle Beach?,
 What was Schleiden's profession?,
 What is :74 used for?]

In [35]:
# Encoded form of the question at index 3 ("What is Tucson's Fox station?" in
# the listing above); the recorded row has 10 entries, matching cfg.QSEQ_LEN.
train_qembs[3]

array([ 4,  5, 38, 39, 40, 41, 21,  3,  0,  0], dtype=int32)

In [36]:
# Answer mask over the context positions for example 3.
# NOTE(review): the recorded output for this row is all zeros, i.e. no answer
# span was located in the context -- confirm whether that is expected.
train_aembs[3]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=uint8)

In [37]:
# Encoded context for example 3.
# NOTE(review): the recorded output has 50 entries although cfg.CSEQ_LEN is set
# to 100 above. The execution counts suggest the context cells ran before that
# setting was applied -- restart the kernel and run all cells to refresh it.
train_cembs[3]

array([  4, 266, 267,  97,  14, 175, 100, 268, 173, 269, 270,  69,  14,
         7, 271, 272, 273, 274,  97,  19, 275,  23,   6,   7, 276,  67,
       277, 278,  19, 279, 280,  12,   7, 281, 276,  23,  27, 103,  14,
       282, 117,   7, 283, 167,  12, 284, 285,  19, 286,   3], dtype=int32)