In [None]:
# Install Trax into the environment of the running kernel. `%pip` targets the
# kernel's interpreter, whereas `!pip` runs whichever pip is first on PATH.
# TODO: pin the Trax version this notebook was written against (trax==<version>).
%pip install trax

In [1]:
import os
import shutil
import sys

import numpy as np

import textwrap
wrapper = textwrap.TextWrapper(width=70)

import trax
from trax import layers as tl
from trax.fastmath import numpy as jnp
from trax import models as tm
from trax.supervised import training

# Raise NumPy's summarization threshold to the largest int so arrays are
# printed in full instead of being elided.
np.set_printoptions(threshold=sys.maxsize)

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 


In [None]:
# Build the train and eval stream factories in one pass: both read the
# 'cnn_dailymail' dataset from 'data/' and select the ('article', 'highlights')
# keys; they differ only in the `train` flag.
train_stream_fn, eval_stream_fn = (
    trax.data.TFDS(
        'cnn_dailymail',
        data_dir='data/',
        keys=('article', 'highlights'),
        train=is_train,
    )
    for is_train in (True, False)
)

In [None]:
# Copy the en_32k.subword vocabulary file into vocab_dir/, where the
# tokenize/detokenize calls in this notebook look for it.
!gsutil cp gs://trax-ml/vocabs/en_32k.subword vocab_dir/en_32k.subword

In [None]:
# Show the first two lines of the downloaded vocabulary file as a quick check.
!head -n 2 vocab_dir/en_32k.subword

In [None]:
# Special tokens
SEP = 0 # Padding or separator token
EOS = 1 # End of sentence token


def preprocess(stream):
    """Turn tokenized (article, summary) pairs into language-model examples.

    Each pair is concatenated into a single sequence
    ``article + [EOS, SEP] + summary + [EOS]`` (0 is the separator), which is
    yielded as both the input and the target. The accompanying mask is 0 over
    the article and its EOS/SEP tokens and 1 over the summary and its final EOS.

    Args:
        stream: iterable of ``(article, summary)`` pairs, each side being an
            iterable of token ids (arrays, lists or one-shot iterators).

    Yields:
        ``(joint, joint, mask)`` where ``joint`` and ``mask`` are 1-D NumPy
        arrays of the same length.
    """
    for article, summary in stream:
        # Materialize each side exactly once: calling list() a second time on a
        # one-shot iterator would return [] and produce a mask shorter than joint.
        article_ids = list(article)
        summary_ids = list(summary)
        joint = np.array(article_ids + [EOS, SEP] + summary_ids + [EOS])
        # +2 accounts for the EOS and SEP after the article, +1 for the final EOS.
        mask = [0] * (len(article_ids) + 2) + [1] * (len(summary_ids) + 1)
        yield joint, joint, np.array(mask)

# Shared input pipeline, applied in order: subword-tokenize with the
# en_32k.subword vocabulary, join article and summary via `preprocess`,
# shuffle, filter by length (max_length=2048), bucket by length into batches,
# then add loss weights.
data_pipeline = trax.data.Serial(
    trax.data.Tokenize(vocab_dir='vocab_dir', vocab_file='en_32k.subword'),
    preprocess,
    trax.data.Shuffle(),
    trax.data.FilterByLength(max_length=2048),
    # One more batch size than boundaries: the last entry (1) is for the
    # bucket beyond the final boundary.
    trax.data.BucketByLength(
        batch_sizes=[512, 128,  32,    8, 1],
        boundaries=[  32, 128, 512, 2048],
    ),
    trax.data.AddLossWeights(),
)

# Instantiate each raw stream and run it through the same pipeline
# (train first, then eval).
train_batches_stream, eval_batches_stream = (
    data_pipeline(make_stream())
    for make_stream in (train_stream_fn, eval_stream_fn)
)

In [None]:
def detokenize(integers):
    """Decode token ids to text and wrap it for display.

    The ids are decoded with the ``en_32k.subword`` vocabulary from
    ``vocab_dir``; the resulting string is then re-flowed by the notebook-level
    ``wrapper`` (a ``textwrap.TextWrapper``).
    """
    decoded_text = trax.data.detokenize(
        integers, vocab_dir='vocab_dir', vocab_file='en_32k.subword'
    )
    return wrapper.fill(decoded_text)

In [None]:
# Deliberately tiny language model (1 layer, 2 heads, d_model=4) so a few
# training steps run quickly; built in 'train' mode for the training loop.
# NOTE(review): vocab_size=33300 is presumably chosen to cover the
# en_32k.subword vocabulary — confirm.
model = tm.TransformerLM(
    vocab_size=33300, d_model=4, d_ff=16, n_layers=1, n_heads=2, mode='train'
)

In [6]:
# Show the working directory that the relative 'data/' and 'vocab_dir' paths
# resolve against. The explicit magic does not depend on automagic.
%pwd

'/Users/louis.guitton/workspace/papers/moocs/coursera-nlp-specialization/course-4-attention-models'

In [None]:
# Training task: batches from the train pipeline, cross-entropy loss, Adam with
# a warmup + rsqrt-decay learning-rate schedule, checkpointing every 10 steps.
train_task = training.TrainTask(
    labeled_data=train_batches_stream,
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    lr_schedule=trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000, max_value=0.01),
    n_steps_per_checkpoint=10,
)

# Evaluation task: cross-entropy and accuracy on the eval pipeline.
eval_task = training.EvalTask(
    labeled_data=eval_batches_stream,
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

# Training loop saves checkpoints to output_dir.
output_dir = os.path.expanduser('~/output-dir/')
# Remove any previous run's directory before training. Done in pure Python
# rather than `!rm -rf {output_dir}`: no shell interpolation, portable, and a
# missing directory is silently ignored just as with `rm -rf`.
shutil.rmtree(output_dir, ignore_errors=True)
training_loop = training.Loop(
    model,
    train_task,
    eval_tasks=[eval_task],
    output_dir=output_dir,
)

# Run 10 steps (batches).
training_loop.run(10)

   `Realistic (pretrained) model: `                                 
                                       
    TransformerLM(vocab_size=33300, d_model=512, d_ff=2048, n_layers=6, n_heads=8, 
                   dropout=0.1, max_len=4096, ff_activation=tl.Relu)
                   
   `This model:`
   
    TransformerLM(d_model=4, d_ff=16, n_layers=1, n_heads=2)

In [None]:
# Rebuild the same architecture as the training model, this time in 'eval' mode.
# Note that this rebinds `model`, replacing the training-mode instance.
# NOTE(review): the decoding cell passes this model to
# trax.supervised.decoding.autoregressive_sample; check the Trax docs for the
# mode that helper expects (it may require 'predict') — TODO confirm.
model = tm.TransformerLM(
    vocab_size=33300,
    d_model=4,
    d_ff=16,
    n_layers=1,
    n_heads=2,
    mode='eval'
)

# Load the weights checkpointed by the training loop into output_dir.
# os.path.join keeps the path valid whether or not output_dir ends with a slash.
model.init_from_file(os.path.join(output_dir, "model.pkl.gz"), weights_only=True)

In [None]:
# Tokenize a sentence.
article = "It’s the posing craze sweeping the U.S. after being brought to fame by skier Lindsey Vonn, soccer star Omar Cummings, baseball player Albert Pujols - and even Republican politician Rick Perry. But now four students at Riverhead High School on Long Island, New York, have been suspended for dropping to a knee and taking up a prayer pose to mimic Denver Broncos quarterback Tim Tebow. Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll were all suspended for one day because the ‘Tebowing’ craze was blocking the hallway and presenting a safety hazard to students. Scroll down for video. Banned: Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll (all pictured left) were all suspended for one day by Riverhead High School on Long Island, New York, for their tribute to Broncos quarterback Tim Tebow. Issue: Four of the pupils were suspended for one day because they allegedly did not heed to warnings that the 'Tebowing' craze at the school was blocking the hallway and presenting a safety hazard to students."
# trax.data.tokenize operates on streams, so the article is wrapped in a
# one-item iterator and the single tokenized result is unpacked from it.
(tokenized,) = trax.data.tokenize(
    iter([article]),
    vocab_dir='vocab_dir',
    vocab_file='en_32k.subword',
)

# Decode from the Transformer.
tokenized = tokenized[np.newaxis, :]  # Add batch dimension.
tokenized_summary = trax.supervised.decoding.autoregressive_sample(
    model,
    tokenized,
    temperature=0.0,  # Higher temperature: more diverse results.
)

# De-tokenize.
# NOTE(review): dropping the last id assumes sampling stopped on EOS; if it
# stopped for another reason (e.g. a length limit) this removes a real token —
# TODO confirm.
tokenized_summary = tokenized_summary[0][:-1]  # Remove batch and EOS.
summary = trax.data.detokenize(
    tokenized_summary,
    vocab_dir='vocab_dir',
    vocab_file='en_32k.subword',
)
print(summary)