In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry

gin.parse_config_file('pretrained_models_base_operative_config.gin')
vocab = 'sp10m.cased.t5.model'
sp = spm.SentencePieceProcessor()
sp.Load(vocab)

True

In [2]:
# import sentencepiece as spm
# vocab = 'sp10m.cased.t5.model'
# sp = spm.SentencePieceProcessor()
# sp.Load(vocab)

In [3]:
def cnn_dataset(split, shuffle_files = False):
    del shuffle_files
    ds = tf.data.TextLineDataset(glob('t5-data/cnn-summarization-*.tsv'))

    ds = ds.map(
        functools.partial(
            tf.io.decode_csv,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        ),
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))
    return ds


def cnn_preprocessor(ds):
    def to_inputs_and_targets(ex):
        return {
            'inputs': tf.strings.join(['ringkasan: ', ex['question']]),
            'targets': ex['answer'],
        }

    return ds.map(
        to_inputs_and_targets,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )


t5.data.TaskRegistry.remove('cnn_dataset')
t5.data.TaskRegistry.add(
    'cnn_dataset',
    dataset_fn = cnn_dataset,
    splits = ['train'],
    text_preprocessor = [cnn_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)


def multinews_dataset(split, shuffle_files = False):
    del shuffle_files
    ds = tf.data.TextLineDataset(glob('t5-data/multinews-summarization-*.tsv'))

    ds = ds.map(
        functools.partial(
            tf.io.decode_csv,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        ),
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))
    return ds


def multinews_preprocessor(ds):
    def to_inputs_and_targets(ex):
        return {
            'inputs': tf.strings.join(['ringkasan: ', ex['question']]),
            'targets': ex['answer'],
        }

    return ds.map(
        to_inputs_and_targets,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )


t5.data.TaskRegistry.remove('multinews_dataset')
t5.data.TaskRegistry.add(
    'multinews_dataset',
    dataset_fn = multinews_dataset,
    splits = ['train'],
    text_preprocessor = [multinews_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

def news_dataset(split, shuffle_files = False):
    del shuffle_files
    ds = tf.data.TextLineDataset(glob('t5-data/news-title-*.tsv'))

    ds = ds.map(
        functools.partial(
            tf.io.decode_csv,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        ),
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))
    return ds


def news_preprocessor(ds):
    def to_inputs_and_targets(ex):
        return {
            'inputs': tf.strings.join(['tajuk: ', ex['question']]),
            'targets': ex['answer'],
        }

    return ds.map(
        to_inputs_and_targets,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )


t5.data.TaskRegistry.remove('news_dataset')
t5.data.TaskRegistry.add(
    'news_dataset',
    dataset_fn = news_dataset,
    splits = ['train'],
    text_preprocessor = [news_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

def ringkasan_dataset(split, shuffle_files = False):
    del shuffle_files
    ds = tf.data.TextLineDataset(
        ['summary.tsv']
    )

    ds = ds.map(
        functools.partial(
            tf.io.decode_csv,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        ),
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))
    return ds

def ringkasan_preprocessor(ds):
    def to_inputs_and_targets(ex):
        return {
            'inputs': tf.strings.join(['ringkasan: ', ex['question']]),
            'targets': ex['answer'],
        }

    return ds.map(
        to_inputs_and_targets,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )


t5.data.TaskRegistry.remove('ringkasan_dataset')
t5.data.TaskRegistry.add(
    'ringkasan_dataset',
    dataset_fn = ringkasan_dataset,
    splits = ['train'],
    text_preprocessor = [ringkasan_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

def sentiment_ringkasan_dataset(split, shuffle_files = False):
    del shuffle_files
    ds = tf.data.TextLineDataset(glob('t5-data/*-summarization*.tsv.sentiment.tsv'))

    ds = ds.map(
        functools.partial(
            tf.io.decode_csv,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        ),
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))
    return ds

def sentiment_ringkasan_preprocessor(ds):
    def to_inputs_and_targets(ex):
        return {
            'inputs': tf.strings.join(['ringkasan: ', ex['question']]),
            'targets': ex['answer'],
        }

    return ds.map(
        to_inputs_and_targets,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )


t5.data.TaskRegistry.remove('sentiment_ringkasan_dataset')
t5.data.TaskRegistry.add(
    'sentiment_ringkasan_dataset',
    dataset_fn = sentiment_ringkasan_dataset,
    splits = ['train'],
    text_preprocessor = [sentiment_ringkasan_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

def sentiment_newstitle_dataset(split, shuffle_files = False):
    del shuffle_files
    ds = tf.data.TextLineDataset(glob('t5-data/news-title-*.tsv.sentiment.tsv'))

    ds = ds.map(
        functools.partial(
            tf.io.decode_csv,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        ),
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))
    return ds

def sentiment_newstitle_preprocessor(ds):
    def to_inputs_and_targets(ex):
        return {
            'inputs': tf.strings.join(['tajuk: ', ex['question']]),
            'targets': ex['answer'],
        }

    return ds.map(
        to_inputs_and_targets,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )


t5.data.TaskRegistry.remove('sentiment_newstitle_dataset')
t5.data.TaskRegistry.add(
    'sentiment_newstitle_dataset',
    dataset_fn = sentiment_newstitle_dataset,
    splits = ['train'],
    text_preprocessor = [sentiment_newstitle_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

  "get_sentencepiece_model_path is deprecated. Please pass the mixture or "


In [4]:
nq_task = t5.data.TaskRegistry.get("ringkasan_dataset")
ds = nq_task.get_dataset(split='qa.tsv', sequence_length={"inputs": 768, "targets": 768})
t = tfds.as_numpy(ds)

In [5]:
from tqdm import tqdm

@registry.register_problem
class Seq2Seq(text_problems.Text2TextProblem):

    @property
    def approx_vocab_size(self):
        return 32100
    
    @property
    def is_generate_per_split(self):
        return False
    
    @property
    def dataset_splits(self):
        return [{
            "split": problem.DatasetSplit.TRAIN,
            "shards": 500,
        }, {
            "split": problem.DatasetSplit.EVAL,
            "shards": 1,
        }]
    
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        del data_dir
        del tmp_dir
        del dataset_split
        
        nq_task = t5.data.TaskRegistry.get("cnn_dataset")
        ds = nq_task.get_dataset(split='qa.tsv', sequence_length={"inputs": 768, "targets": 768})
        
        for ex in tqdm(tfds.as_numpy(ds)):
            yield ex
            
        nq_task = t5.data.TaskRegistry.get("multinews_dataset")
        ds = nq_task.get_dataset(split='qa.tsv', sequence_length={"inputs": 768, "targets": 768})
        
        for ex in tqdm(tfds.as_numpy(ds)):
            yield ex
        
        nq_task = t5.data.TaskRegistry.get("news_dataset")
        ds = nq_task.get_dataset(split='qa.tsv', sequence_length={"inputs": 768, "targets": 768})
        
        for ex in tqdm(tfds.as_numpy(ds)):
            if len(ex['targets']) > 4:
                yield ex
        
        nq_task = t5.data.TaskRegistry.get("ringkasan_dataset")
        ds = nq_task.get_dataset(split='qa.tsv', sequence_length={"inputs": 768, "targets": 768})
        
        for ex in tqdm(tfds.as_numpy(ds)):
            yield ex
            
        nq_task = t5.data.TaskRegistry.get("sentiment_ringkasan_dataset")
        ds = nq_task.get_dataset(split='qa.tsv', sequence_length={"inputs": 768, "targets": 768})
        
        for ex in tqdm(tfds.as_numpy(ds)):
            yield ex
            
        nq_task = t5.data.TaskRegistry.get("sentiment_newstitle_dataset")
        ds = nq_task.get_dataset(split='qa.tsv', sequence_length={"inputs": 768, "targets": 768})
        
        for ex in tqdm(tfds.as_numpy(ds)):
            if len(ex['targets']) > 4:
                yield ex
                    
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        
        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
        for sample in generator:
            sample["inputs"] = sample['inputs'].tolist()
            sample["targets"] = sample['targets'].tolist()
            yield sample

In [6]:
!rm -rf t2t-summarization/data

In [7]:
DATA_DIR = os.path.expanduser("t2t-summarization/data")
TMP_DIR = os.path.expanduser("t2t-summarization/tmp")
TRAIN_DIR = os.path.expanduser("t2t-summarization/train")
EXPORT_DIR = os.path.expanduser("t2t-summarization/export")
TRANSLATIONS_DIR = os.path.expanduser("t2t-summarization/translation")
EVENT_DIR = os.path.expanduser("t2t-summarization/event")
USR_DIR = os.path.expanduser("t2t-summarization/user")
 
tf.gfile.MakeDirs(DATA_DIR)
tf.gfile.MakeDirs(TMP_DIR)
tf.gfile.MakeDirs(TRAIN_DIR)
tf.gfile.MakeDirs(EXPORT_DIR)
tf.gfile.MakeDirs(TRANSLATIONS_DIR)
tf.gfile.MakeDirs(EVENT_DIR)
tf.gfile.MakeDirs(USR_DIR)

In [8]:
from tensor2tensor.utils import registry
from tensor2tensor import problems

PROBLEM = 'seq2_seq'
t2t_problem = problems.problem(PROBLEM)
t2t_problem.generate_data(DATA_DIR, TMP_DIR)

0it [00:00, ?it/s]

INFO:tensorflow:Generating case 0.


INFO:tensorflow:Generating case 0.
99975it [02:50, 588.19it/s]

INFO:tensorflow:Generating case 100000.


INFO:tensorflow:Generating case 100000.
140382it [03:58, 587.92it/s]
59618it [05:35, 88.20it/s] 

INFO:tensorflow:Generating case 200000.


INFO:tensorflow:Generating case 200000.
101574it [09:31, 177.81it/s]
61630it [01:38, 687.27it/s]

INFO:tensorflow:Generating case 300000.


INFO:tensorflow:Generating case 300000.
168207it [04:23, 759.43it/s] 

INFO:tensorflow:Generating case 400000.


INFO:tensorflow:Generating case 400000.
274195it [06:46, 565.69it/s] 

INFO:tensorflow:Generating case 500000.


INFO:tensorflow:Generating case 500000.
293443it [07:15, 673.23it/s]
81855it [02:44, 653.52it/s] 

INFO:tensorflow:Generating case 600000.


INFO:tensorflow:Generating case 600000.
107472it [03:18, 541.97it/s]
74413it [02:14, 321.63it/s]

INFO:tensorflow:Generating case 700000.


INFO:tensorflow:Generating case 700000.
120973it [05:58, 337.84it/s]
56947it [01:14, 593.42it/s] 

INFO:tensorflow:Generating case 800000.


INFO:tensorflow:Generating case 800000.
146716it [03:15, 751.19it/s]

INFO:tensorflow:Generated 884601 Examples



INFO:tensorflow:Generated 884601 Examples


INFO:tensorflow:Shuffling data...


INFO:tensorflow:Shuffling data...


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


INFO:tensorflow:Data shuffled.


INFO:tensorflow:Data shuffled.
