In [None]:
!pip install trax

In [1]:
import os
import shutil
import sys
import textwrap

import numpy as np

import trax
from trax import layers as tl
from trax import models as tm
from trax.fastmath import numpy as jnp
from trax.supervised import training

wrapper = textwrap.TextWrapper(width=70)

# to print the entire np array
np.set_printoptions(threshold=sys.maxsize)

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 


In [None]:
# 'cnn_dailymail' dataset, stored under data/. Each function is called later
# (with no arguments) to obtain a stream of (article, highlights) pairs.
train_stream_fn = trax.data.TFDS(
    'cnn_dailymail',
    keys=('article', 'highlights'),
    data_dir='data/',
    train=True,
)

# Same dataset, directory and keys, but with train=False.
eval_stream_fn = trax.data.TFDS(
    'cnn_dailymail',
    keys=('article', 'highlights'),
    data_dir='data/',
    train=False,
)

In [None]:
# Copy the en_32k.subword vocabulary file from the trax-ml bucket into
# vocab_dir/, where the tokenizer in the data pipeline looks for it.
!gsutil cp gs://trax-ml/vocabs/en_32k.subword vocab_dir/en_32k.subword

In [None]:
# Sanity check: show the first two lines of the downloaded vocabulary file.
!head -n 2 vocab_dir/en_32k.subword

In [None]:
# Special tokens
SEP = 0  # Padding or separator token
EOS = 1  # End of sentence token


def preprocess(stream):
    """Join each (article, summary) pair into a single token sequence.

    Args:
        stream: iterable of (article, summary) pairs; each element of a pair
            is an iterable of token ids.

    Yields:
        tuple: `(joint, joint, mask)`. `joint` is the array
            `article + [EOS, SEP] + summary + [EOS]`; the same array object
            is yielded twice (presumably as model inputs and as targets).
            `mask` is an array of the same length that is 0 over the article
            and the `[EOS, SEP]` separator and 1 over the summary and its
            closing EOS.
    """
    for article, summary in stream:
        # Materialize each side exactly once. Besides avoiding a second
        # conversion, this keeps `mask` consistent with `joint` when the
        # inputs are one-shot iterators that cannot be traversed twice.
        article_ids = list(article)
        summary_ids = list(summary)

        # Concatenate tokenized inputs and targets using 0 as separator.
        joint = np.array(article_ids + [EOS, SEP] + summary_ids + [EOS])

        # +2 accounts for the EOS and SEP that follow the article,
        # +1 for the EOS that closes the summary.
        mask = [0] * (len(article_ids) + 2) + [1] * (len(summary_ids) + 1)
        yield joint, joint, np.array(mask)

# Input pipeline shared by the training and evaluation streams:
# tokenize -> join article and summary -> shuffle -> length filter ->
# bucket into batches -> add loss weights.
data_pipeline = trax.data.Serial(
    trax.data.Tokenize(vocab_dir='vocab_dir', vocab_file='en_32k.subword'),
    preprocess,
    trax.data.Shuffle(),
    trax.data.FilterByLength(max_length=2048),
    # batch_sizes has one more entry than boundaries.
    trax.data.BucketByLength(
        boundaries=[32, 128, 512, 2048],
        batch_sizes=[512, 128, 32, 8, 1],
    ),
    trax.data.AddLossWeights(),
)

# Instantiate one batch stream per split from the stream factories.
train_batches_stream = data_pipeline(train_stream_fn())
eval_batches_stream = data_pipeline(eval_stream_fn())

In [None]:
def detokenize(integers):
    """Decode token ids into line-wrapped text.

    Args:
        integers: token ids, decoded with the `en_32k.subword` vocabulary
            found in `vocab_dir`.

    Returns:
        str: the decoded text after being passed through `wrapper.fill`.
    """
    decoded_text = trax.data.detokenize(
        integers,
        vocab_dir='vocab_dir',
        vocab_file='en_32k.subword',
    )
    return wrapper.fill(decoded_text)

In [None]:
# Small TransformerLM in training mode: one layer, two heads, d_model=4.
model = tm.TransformerLM(
    vocab_size=33300,
    n_layers=1,
    n_heads=2,
    d_model=4,
    d_ff=16,
    mode='train',
)

In [None]:
# Training task: cross-entropy loss optimized with Adam, using a warm-up
# followed by rsqrt-decay learning-rate schedule.
train_task = training.TrainTask(
    labeled_data=train_batches_stream,
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    lr_schedule=trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000, max_value=0.01),
    n_steps_per_checkpoint=10,
)

# Evaluation task: reports cross-entropy loss and accuracy on the eval stream.
eval_task = training.EvalTask(
    labeled_data=eval_batches_stream,
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

# Training loop saves checkpoints to output_dir.
output_dir = os.path.expanduser('~/output-dir/')

# Start from an empty output directory. Done in Python rather than with
# `!rm -rf {output_dir}` so that the path is never interpolated unquoted into
# a shell command; ignore_errors=True keeps the "-f" behaviour of not failing
# when the directory does not exist yet.
shutil.rmtree(output_dir, ignore_errors=True)

training_loop = training.Loop(
    model,
    train_task,
    eval_tasks=[eval_task],
    output_dir=output_dir,
)

# Run 10 steps (batches).
training_loop.run(10)

**Realistic (pretrained) model:**

    TransformerLM(vocab_size=33300, d_model=512, d_ff=2048, n_layers=6, n_heads=8,
                  dropout=0.1, max_len=4096, ff_activation=tl.Relu)

**This model (the small one trained above):**

    TransformerLM(vocab_size=33300, d_model=4, d_ff=16, n_layers=1, n_heads=2)

In [41]:
# Get the model architecture.
# This rebinds `model` to a 6-layer, 8-head TransformerLM in 'predict' mode.
model = tm.TransformerLM(
    vocab_size=33300,
    n_layers=6,
    n_heads=8,
    d_model=512,
    d_ff=2048,
    mode='predict',
)

# Load the pre-trained weights (weights only) from a local file.
model.init_from_file("pretrained_weights.pkl.gz", weights_only=True)

# Tokenize a sentence.
article = "It’s the posing craze sweeping the U.S. after being brought to fame by skier Lindsey Vonn, soccer star Omar Cummings, baseball player Albert Pujols - and even Republican politician Rick Perry. But now four students at Riverhead High School on Long Island, New York, have been suspended for dropping to a knee and taking up a prayer pose to mimic Denver Broncos quarterback Tim Tebow. Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll were all suspended for one day because the ‘Tebowing’ craze was blocking the hallway and presenting a safety hazard to students. Scroll down for video. Banned: Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll (all pictured left) were all suspended for one day by Riverhead High School on Long Island, New York, for their tribute to Broncos quarterback Tim Tebow. Issue: Four of the pupils were suspended for one day because they allegedly did not heed to warnings that the 'Tebowing' craze at the school was blocking the hallway and presenting a safety hazard to students."
# trax.data.tokenize operates on streams, so wrap the article in a
# one-element iterator and take the single tokenized result back out.
tokenized = next(
    trax.data.tokenize(
        iter([article]),
        vocab_dir='vocab_dir',
        vocab_file='summarize32k.subword',
    )
)

# Decode from the Transformer.
tokenized = tokenized[None, :]  # Add a leading batch axis.
tokenized_summary = trax.supervised.decoding.autoregressive_sample(
    model,
    tokenized,
    temperature=0.0,
)

# De-tokenize.
tokenized_summary = tokenized_summary[0][:-1]  # Remove batch and EOS.
summary = trax.data.detokenize(
    tokenized_summary,
    vocab_dir='vocab_dir',
    vocab_file='summarize32k.subword',
)
print(summary)




In [42]:
def next_symbol(cur_output_tokens, model):
    """Predict the token that follows a given token sequence.

    Args:
        cur_output_tokens (list): token ids so far (tokenized sentence with
            EOS and PAD tokens at the end).
        model (trax.layers.combinators.Serial): The transformer model. It is
            called with a pair of identical batched arrays and must return a
            pair whose first element is indexable as
            `[batch, position, vocab]`.

    Returns:
        int: id of the highest-scoring token for the next position.
    """
    n_tokens = len(cur_output_tokens)

    # Smallest power of two strictly greater than n_tokens, i.e. the same
    # value as 2 ** ceil(log2(n_tokens + 1)), computed with integer
    # arithmetic. It always leaves at least one slot after the last token.
    padded_length = 1 << n_tokens.bit_length()

    # Right-pad with zeros up to padded_length, then add a batch axis.
    n_padding = padded_length - n_tokens
    batch = np.array(cur_output_tokens + [0] * n_padding)[None, :]

    # The model receives a tuple of two padded arrays (with batch); only the
    # first returned element, of shape (1, padded_length, vocab_size), is used.
    output, _ = model((batch, batch))

    # Scores at position n_tokens, the slot right after the last given token.
    next_token_scores = output[0, n_tokens, :]
    return int(np.argmax(next_token_scores))

def tokenize(input_str, EOS=1):
    """Convert a string into a list of token ids terminated by EOS.

    Args:
        input_str (str): text to tokenize with the `summarize32k.subword`
            vocabulary in `vocab_dir/`.
        EOS (int): id appended to mark the end of the sentence.

    Returns:
        list: token ids of `input_str` followed by `EOS`.
    """
    # trax.data.tokenize maps streams to streams, so hand it a one-element
    # stream and pull the single result back out with next().
    token_stream = trax.data.tokenize(
        iter([input_str]),
        vocab_dir='vocab_dir/',
        vocab_file='summarize32k.subword',
    )
    token_ids = list(next(token_stream))

    # Mark the end of the sentence with EOS.
    token_ids.append(EOS)
    return token_ids

def detokenize(integers):
    """Decode token ids into line-wrapped text.

    NOTE: this redefines the `detokenize` declared earlier in the notebook;
    this version reads the `summarize32k.subword` vocabulary instead of
    `en_32k.subword`.

    Args:
        integers: token ids to decode.

    Returns:
        str: the decoded text after being passed through `wrapper.fill`.
    """
    decoded_text = trax.data.detokenize(
        integers,
        vocab_file='summarize32k.subword',
        vocab_dir='vocab_dir/',
    )
    return wrapper.fill(decoded_text)

# UNQ_C10
# Decoding functions.
def greedy_decode(input_sentence, model, max_new_tokens=None):
    """Greedy decode function.

    Repeatedly asks `next_symbol` for the most likely next token and appends
    it to the running sequence until the EOS token (id 1) is produced.

    Args:
        input_sentence (string): a sentence or article.
        model (trax.layers.combinators.Serial): Transformer model.
        max_new_tokens (int, optional): upper bound on the number of tokens
            to generate. `None` (the default) keeps the original behaviour
            of decoding until EOS is produced, however long that takes.

    Returns:
        string: summary of the input. The generated tokens, including the
            final EOS when one was produced, are passed to `detokenize`.
    """
    EOS = 1

    # Prompt layout: the tokenized input (tokenize() already appends EOS)
    # followed by 0, after which the model's own output begins.
    cur_output_tokens = tokenize(input_sentence) + [0]
    generated_output = []
    cur_output = 0

    while cur_output != EOS:
        # Safety valve: stop when the optional budget is used up, so that a
        # model which never emits EOS cannot keep this loop running forever.
        if max_new_tokens is not None and len(generated_output) >= max_new_tokens:
            break

        # Get next symbol
        cur_output = next_symbol(cur_output_tokens, model)
        # Append next symbol to original sentence
        cur_output_tokens.append(cur_output)
        # Append next symbol to generated sentence
        generated_output.append(cur_output)

    return detokenize(generated_output)

In [43]:
# Get the model architecture.
# This rebinds `model` to a 6-layer, 8-head TransformerLM in 'eval' mode,
# which is the model handed to greedy_decode in this cell.
model = tm.TransformerLM(
    vocab_size=33300,
    n_layers=6,
    n_heads=8,
    d_model=512,
    d_ff=2048,
    mode='eval',
)

# Load the pre-trained weights (weights only) from a local file.
model.init_from_file("pretrained_weights.pkl.gz", weights_only=True)

# Test it out with a whole article!
article = "It’s the posing craze sweeping the U.S. after being brought to fame by skier Lindsey Vonn, soccer star Omar Cummings, baseball player Albert Pujols - and even Republican politician Rick Perry. But now four students at Riverhead High School on Long Island, New York, have been suspended for dropping to a knee and taking up a prayer pose to mimic Denver Broncos quarterback Tim Tebow. Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll were all suspended for one day because the ‘Tebowing’ craze was blocking the hallway and presenting a safety hazard to students. Scroll down for video. Banned: Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll (all pictured left) were all suspended for one day by Riverhead High School on Long Island, New York, for their tribute to Broncos quarterback Tim Tebow. Issue: Four of the pupils were suspended for one day because they allegedly did not heed to warnings that the 'Tebowing' craze at the school was blocking the hallway and presenting a safety hazard to students."
# Show the line-wrapped article followed by a blank line, then the summary
# produced by greedy decoding with the model loaded above.
print(wrapper.fill(article), '\n')
print(greedy_decode(article, model))

It’s the posing craze sweeping the U.S. after being brought to fame by
skier Lindsey Vonn, soccer star Omar Cummings, baseball player Albert
Pujols - and even Republican politician Rick Perry. But now four
students at Riverhead High School on Long Island, New York, have been
suspended for dropping to a knee and taking up a prayer pose to mimic
Denver Broncos quarterback Tim Tebow. Jordan Fulcoly, Wayne Drexel,
Tyler Carroll and Connor Carroll were all suspended for one day
because the ‘Tebowing’ craze was blocking the hallway and presenting a
safety hazard to students. Scroll down for video. Banned: Jordan
Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll (all pictured
left) were all suspended for one day by Riverhead High School on Long
Island, New York, for their tribute to Broncos quarterback Tim Tebow.
Issue: Four of the pupils were suspended for one day because they
school was blocking the hallway and presenting a safety hazard to
students. 

Jordan Fulcoly, Wayne Drexel, Ty

In [44]:
article = """🦁 Premier League Player of the Week - The never-ending story

Presumably the film would have told the inspiring, feel-good story of Vardy being plucked from the obscurity of non-league football before going on to lift the Premier League title with Leicester in fairytale fashion.
 Even at the ripe old age of 33 he is still one of Europe’s deadliest strikers, as he showed once again in Leicester’s sensational 5-2 victory away at Manchester City on Sunday.
 Vardy is the most dangerous player around when it comes to hanging off the shoulder of the last man and if you play with a high line against the Foxes, you’re asking for trouble.
 Pep Guardiola took that risk in this game, and he would have paid the price much sooner had some of Leicester’s final balls been better on the counter-attack.
 Vardy eventually did make his first big mark on the match when he ghosted past Kyle Walker and left the defender with little choice but to bring him down for a penalty. Up stepped the 2019/20 Golden Boot winner to fire past Ederson for his third goal of the new season.
 Then in the second half came one of the most brilliant moments of Vardy’s career, when Timothy Castagne was played in down the right and his low cross into the box found the striker lurking.
 In his post-match interview, Vardy described what he did next as a “Johan Dink”, and who are we to argue?
 To put the icing on the cake, he soon completed his hat-trick with another penalty, taking his tally to five goals for the season already – the most in the Premier League.
 He has now scored 108 goals in 214 Premier League matches. Not a bad haul for a player who didn’t make his debut in the top flight until he was 27.
 He has also netted 21 penalties since his Premier League debut – more than any other player during that period – and he’s won 18 of those spot-kicks himself.
 And what’s more, he has now scored nine times against Guardiola’s Manchester City – more than any other player since the Catalan’s arrival in England."""

# Same display as for the previous article: wrapped input text first, then
# the greedily decoded summary. `model` is whatever the last model cell bound.
print(wrapper.fill(article), '\n')
print(greedy_decode(article, model))

🦁 Premier League Player of the Week - The never-ending story
Presumably the film would have told the inspiring, feel-good story of
Vardy being plucked from the obscurity of non-league football before
going on to lift the Premier League title with Leicester in fairytale
fashion.  Even at the ripe old age of 33 he is still one of Europe’s
deadliest strikers, as he showed once again in Leicester’s sensational
5-2 victory away at Manchester City on Sunday.  Vardy is the most
dangerous player around when it comes to hanging off the shoulder of
the last man and if you play with a high line against the Foxes,
you’re asking for trouble.  Pep Guardiola took that risk in this game,
and he would have paid the price much sooner had some of Leicester’s
final balls been better on the counter-attack.  Vardy eventually did
make his first big mark on the match when he ghosted past Kyle Walker
and left the defender with little choice but to bring him down for a
penalty. Up stepped the 2019/20 Golden Boo