
<a href="https://colab.research.google.com/github/google-research/text-to-text-transfer-transformer/blob/master/notebooks/t5-trivia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install SentencePiece, which later cells use to train and load a custom vocabulary.
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 3.3MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.91


<h3>Train on TPU</h3>




In [2]:
# Environment setup: select the TensorFlow 2.x runtime via the line magic and
# install the `t5` package that is imported just below.
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Root GCS location for this run; the trailing `#@param` annotation is kept on
# the same line as the assignment.
BASE_DIR = "gs://fiery-lcm-000001" #@param { type: "string" }
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
# Data files and model checkpoints live in separate subdirectories of BASE_DIR.
DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models2")
# Gates the GCS/TPU setup below as well as logging tweaks and model-size checks
# in later cells.
ON_CLOUD = True


if ON_CLOUD:
  # Cloud setup: locate the TPU, authenticate this Colab session, and configure
  # GCS access for both the notebook and the TPU.
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise a regular exception type (instead of a bare BaseException) and
    # chain the resolver's error so the root cause stays in the traceback.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  # Set credentials for GCS reading/writing from Colab and TPU.
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# `tf` is `tensorflow.compat.v1` (imported above); turn off TF2 behavior for it.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Stop TensorFlow's logger from propagating records to the root logger, and
  # set the root logger's level to INFO.
  tf.get_logger().propagate = False
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily switch the TensorFlow logging verbosity.

  Args:
    level: value handed to `tf.logging.set_verbosity` for the duration of the
      `with` block.

  Yields:
    Nothing; the context manager is used only for its side effect.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Restore the previous level even if the body raised, so a failing cell
    # does not leave the verbosity permanently changed.
    tf.logging.set_verbosity(og_level)

Installing dependencies...
[K     |████████████████████████████████| 153kB 3.4MB/s 
[K     |████████████████████████████████| 3.3MB 43.7MB/s 
[K     |████████████████████████████████| 665kB 41.7MB/s 
[K     |████████████████████████████████| 61kB 6.6MB/s 
[K     |████████████████████████████████| 296kB 51.0MB/s 
[K     |████████████████████████████████| 3.0MB 43.7MB/s 
[K     |████████████████████████████████| 890kB 46.9MB/s 
[K     |████████████████████████████████| 3.8MB 37.6MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: google-colab 1.0.0 has requirement six~=1.12.0, but you'll have six 1.15.0 which is incompatible.[0m
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.[0m
Setting up GCS access...
Running on TPU: grpc://10.28.69.242:8470
Instruct

Loading IMDB dataset.

In [4]:
# Extract the IMDB reviews archive from the working directory (no cell shown
# in this notebook downloads it, so it has to be placed there beforehand).
!unzip ./imdb-dataset-of-50k-movie-reviews

Archive:  ./imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


# Train a SentencePiece vocabulary on the IMDB reviews

In [0]:
import csv
import pandas as pd
import re
from numpy.random import RandomState
import numpy as np
from bs4 import BeautifulSoup

def remove_html_tags(input):
    """Return the text content of `input` with HTML markup removed.

    Args:
        input: a string (or bytes) that may contain HTML tags.

    Returns:
        The text extracted by BeautifulSoup's `get_text()`.
    """
    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so the extracted
    # text could differ from one environment to another.
    soup = BeautifulSoup(input, "html.parser")
    return soup.get_text()

# Build the plain-text corpus (`data.txt`) that the SentencePiece trainer reads.
df = pd.read_csv('./IMDB Dataset.csv')
print(df.head())
# Only the review text goes into the corpus, so strip HTML from that column
# alone instead of running BeautifulSoup over every cell of the frame.
df = df["review"].map(remove_html_tags)
# header=False keeps the column name "review" from being written as the first
# line of the corpus, where it would be treated as a training sentence.
df.to_csv('data.txt', sep='\t', index=False, header=False)


After removing the HTML tags:

In [0]:
# `df` was reduced to the cleaned "review" column in the previous cell.
print(df.head())

In [0]:
import sentencepiece as spm
# Train a 32k-piece model on the cleaned reviews; output files are m.model and
# m.vocab. The special-token ids are set explicitly because T5's
# SentencePieceVocabulary assumes pad=0, EOS=1, UNK=2 and no BOS token, whereas
# SentencePiece's defaults are unk=0, bos=1, eos=2 with padding disabled.
spm.SentencePieceTrainer.Train(
    '--input=./data.txt --model_prefix=m --vocab_size=32000 '
    '--pad_id=0 --eos_id=1 --unk_id=2 --bos_id=-1')

In [86]:
# Load the model file produced by the trainer (`--model_prefix=m` -> m.model).
sp = spm.SentencePieceProcessor()
sp.Load("./m.model")

True

Quick sanity checks to confirm that the SentencePiece model trained correctly.

In [89]:
# Report the vocabulary size, then encode one label word both as pieces and as
# ids; the cell's displayed value is the piece stored at id 73.
probe_word = "Positive"
print('Vocabulary size : {}'.format(sp.GetPieceSize()))
for encode in (sp.EncodeAsPieces, sp.EncodeAsIds):
  print(encode(probe_word))
sp.IdToPiece(73)

Vocabulary size : 64000
['▁Positive']
[48974]


'ion'

In [0]:
def tab_to_space(x):
    """Replace every run of tab characters in `x` with a single space.

    Tabs are removed because the data is later stored as tab-separated
    values, where a tab inside a field would be read as a column break.

    Args:
        x: the string to clean.

    Returns:
        `x` with each run of one or more tabs collapsed to one space.
    """
    # The replacement must be a literal space: "\s" is a pattern-side character
    # class and is rejected (or inserted literally) as a replacement template.
    return re.sub(r'\t+', ' ', x)

In [0]:
import csv
import pandas as pd
import re
from numpy.random import RandomState


# Fixed seed so that the 90/10 train/test split is identical on every run.
SPLIT_SEED = 42

df = pd.read_csv('./IMDB Dataset.csv')
print(df.head())
rng = RandomState(SPLIT_SEED)

# Sample 90% of the rows for training; the remaining rows form the test set.
train = df.sample(frac=0.9, random_state=rng)
test = df.loc[~df.index.isin(train.index)]
assert len(train) + len(test) == len(df), "train/test split lost or duplicated rows"

print(train.head())
print(test.head())

# Remove tabs first (the files end up tab-separated), then strip HTML markup.
train = train.applymap(tab_to_space)
test = test.applymap(tab_to_space)

train = train.applymap(remove_html_tags)
test = test.applymap(remove_html_tags)

print(train.head())
print(test.head())

# Written without index or header: each row is just "<review>,<sentiment>".
train.to_csv("imdb-train.csv",index=False, header=False)
test.to_csv("imdb-test.csv",index=False, header=False)



In [0]:
def conv_csv_to_tsv(filename):
  """Convert `<filename>.csv` into an unquoted tab-separated `<filename>.tsv`.

  The TSV is later parsed line by line with quote handling disabled
  (`use_quote_delim=False`), so fields are written raw: no surrounding quotes
  and no doubled quote characters. Tabs and line breaks inside a field are
  replaced by spaces so every record stays on one line with a fixed number of
  columns. Lines end with "\n".

  Args:
    filename: path without extension; `.csv` is read and `.tsv` is written.
  """
  # newline='' leaves line-ending handling to the csv module on read and keeps
  # the written "\n" untranslated; the explicit encoding avoids depending on
  # the platform default.
  with open(filename + '.csv', 'r', newline='', encoding='utf-8') as csv_file, \
       open(filename + '.tsv', 'w', newline='', encoding='utf-8') as tsv_file:
    for row in csv.reader(csv_file):
      if not row:
        # Skip blank input lines rather than emitting empty records.
        continue
      fields = [
          field.replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
          for field in row
      ]
      tsv_file.write('\t'.join(fields) + '\n')
# Convert both splits, train first and then test.
for split_basename in ('imdb-train', 'imdb-test'):
  conv_csv_to_tsv(split_basename)

In [100]:
# !gsutil cp ./imdb-train.tsv gs://fiery-lcm-000001/data/imdb-train.tsv
# !gsutil cp ./imdb-test.tsv gs://fiery-lcm-000001/data/imdb-test.tsv
!gsutil cp ./m.model gs://fiery-lcm-000001/data/spm-imdb.model
!gsutil cp ./m.vocab gs://fiery-lcm-000001/data/spm-imdb.vocab

Copying file://./m.model [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/785.4 KiB.                                    
Copying file://./m.vocab [Content-Type=application/octet-stream]...
/ [1 files][580.9 KiB/580.9 KiB]                                                
Operation completed over 1 objects/580.9 KiB.                                    


In [0]:
import os
# Same value as the DATA_DIR computed in the setup cell.
DATA_DIR = os.path.join(BASE_DIR, "data")
# Location of the tab-separated file for each split. Despite the `nq_` prefix,
# these paths point at the IMDB TSV files.
nq_tsv_path = {
    "train": os.path.join(DATA_DIR, "imdb-train.tsv"),
    "validation": os.path.join(DATA_DIR, "imdb-test.tsv")
}
def imdb_dataset_fn(split, shuffle_files=False):
  """Build a dataset of {"review": ..., "sentiment": ...} examples for a split.

  Args:
    split: key into `nq_tsv_path` ("train" or "validation").
    shuffle_files: accepted for interface compatibility and ignored, because
      each split is stored in a single file.

  Returns:
    A `tf.data.Dataset` whose elements are dicts with string "review" and
    "sentiment" entries, one per line of the split's TSV file.
  """
  del shuffle_files  # Only one file per split, so there is nothing to shuffle.

  def _parse_line(line):
    # Each line is "<review>\t<sentiment>"; quote characters are kept as-is.
    review, sentiment = tf.io.decode_csv(
        line, record_defaults=["", ""],
        field_delim="\t", use_quote_delim=False)
    return {"review": review, "sentiment": sentiment}

  lines = tf.data.TextLineDataset(nq_tsv_path[split])
  return lines.map(
      _parse_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Preview the first ten parsed training examples. This notebook defines
# `imdb_dataset_fn`; the previously referenced `nq_dataset_fn` does not exist
# here and raised a NameError on a fresh kernel.
for ex in tfds.as_numpy(imdb_dataset_fn("train").take(10)):
  print(ex)

Removing HTML tags using BeautifulSoup.

In [0]:
from bs4 import BeautifulSoup
# Show the first ten training reviews with HTML markup stripped. The dataset
# function defined in this notebook is `imdb_dataset_fn` (`nq_dataset_fn` is
# undefined here), and the parser is named explicitly so the output does not
# depend on which HTML parsers happen to be installed.
for ex in tfds.as_numpy(imdb_dataset_fn("train").take(10)):
  soup = BeautifulSoup(ex['review'], "html.parser")
  print(soup.get_text())

In [0]:
def imdb_preprocessor(ds):
  """Convert parsed IMDB examples into text-to-text "inputs"/"targets" form.

  Args:
    ds: dataset of dicts with string "review" and "sentiment" entries.

  Returns:
    The result of mapping each example to
    {"inputs": "sentiment: <normalized review>",
     "targets": <normalized sentiment>}.
  """
  def normalize_text(text):
    """Lowercase and remove quotes from a TensorFlow string."""
    text = tf.strings.lower(text)
    # Greedy match: drops the outermost pair of single quotes and keeps
    # everything that was between them.
    text = tf.strings.regex_replace(text, r"'(.*)'", r"\1")
    # Collapse a remaining pair of quotes separated only by whitespace into a
    # single space. The rewrite has to be a literal space: "\s" is a
    # pattern-side character class, not a valid replacement string.
    text = tf.strings.regex_replace(text, r"'\s*'", " ")
    return text

  def to_inputs_and_targets(ex):
    """Map {"review": ..., "sentiment": ...}->{"inputs": ..., "targets": ...}."""
    return {
        "inputs":
             tf.strings.join(
                 ["sentiment: ", normalize_text(ex["review"])]),
        "targets": normalize_text(ex["sentiment"])
    }
  return ds.map(to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [0]:
import t5.data
from t5.data import postprocessors as t5_postprocessors
from t5.evaluation import metrics as t5_metrics
from t5.data.utils import Feature
from t5.data import sentencepiece_vocabulary

TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask

# Explicit per-feature specification built on the custom SentencePiece model.
# The registrations below pass a single Feature instead of this dict.
OUTPUT_FEATURES = {
    feature_name: Feature(
        vocabulary=sentencepiece_vocabulary.SentencePieceVocabulary(
            os.path.join(DATA_DIR, "spm-imdb.model"), 100),
        add_eos=True)
    for feature_name in ("inputs", "targets")
}

# Register two identically configured tasks under different names.
for task_name in ("imdb_custom1", "imdb_custom2"):
  # Remove any existing registration under this name before adding it.
  TaskRegistry.remove(task_name)
  t5.data.TaskRegistry.add(
      task_name,
      # Function returning the raw tf.data.Dataset for a split.
      dataset_fn=imdb_dataset_fn,
      splits=["train", "validation"],
      # Turns raw examples into "inputs"/"targets" text.
      text_preprocessor=[imdb_preprocessor],
      # Vocabulary comes from the custom SentencePiece model stored in DATA_DIR.
      output_features=t5.data.Feature(
          vocabulary=t5.data.SentencePieceVocabulary(
              os.path.join(DATA_DIR, "spm-imdb.model"))),
      # Lowercase targets before computing metrics.
      postprocess_fn=t5.data.postprocessors.lower_text,
      # Accuracy is the evaluation metric.
      metric_fns=[t5.evaluation.metrics.accuracy]
  )


In [0]:
# (Re)register the "imdb_all" mixture: both IMDB tasks, each paired with the
# value 45000 as the second element of its entry.
IMDB_MIXTURE_NAME = "imdb_all"
t5.data.MixtureRegistry.remove(IMDB_MIXTURE_NAME)
t5.data.MixtureRegistry.add(
    IMDB_MIXTURE_NAME,
    [(task_name, 45000) for task_name in ("imdb_custom1", "imdb_custom2")],
)

## Define Model

In [0]:
MODEL_SIZE = "small" #@param["small", "base", "large", "3B", "11B"]
# Public GCS path for T5 pre-trained model checkpoints
BASE_PRETRAINED_DIR = "gs://t5-data/pretrained_models"
PRETRAINED_DIR = os.path.join(BASE_PRETRAINED_DIR, MODEL_SIZE)
MODEL_DIR = os.path.join(MODELS_DIR, MODEL_SIZE)

# Size checks apply only when running on cloud: warn for 3B, refuse 11B.
if ON_CLOUD:
  if MODEL_SIZE == "3B":
    tf.logging.warn(
        "The `3B` model is too large to use with the 5GB GCS free tier. "
        "Make sure you have at least 25GB on GCS before continuing."
    )
  elif MODEL_SIZE == "11B":
    raise ValueError(
        "The `11B` parameter is too large to fine-tune on the `v2-8` TPU "
        "provided by Colab. Please comment out this Error if you're running "
        "on a larger TPU."
    )

# Per-size settings as (model_parallelism, train_batch_size,
# keep_checkpoint_max): parallelism and batch size are chosen to fit a v2-8
# TPU, the checkpoint cap to fit within 5GB (if possible).
_SETTINGS_BY_SIZE = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1),
}
model_parallelism, train_batch_size, keep_checkpoint_max = (
    _SETTINGS_BY_SIZE[MODEL_SIZE])

tf.io.gfile.makedirs(MODEL_DIR)
# The models from the T5 paper are based on the Mesh TensorFlow Transformer.
# `keep_checkpoint_max` is computed above but currently not passed to the model
# (the argument is left commented out).
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 128, "targets": 32},
    learning_rate_schedule=0.003,
    save_checkpoints_steps=500,
    #keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)

In [0]:
# Authenticate the Colab user again; the setup cell already does this when
# ON_CLOUD is set.
from google.colab import auth
auth.authenticate_user()

Before we continue, let's load a [TensorBoard](https://www.tensorflow.org/tensorboard) visualizer so that we can monitor our progress. The page should automatically update as fine-tuning and evaluation proceed.

In [0]:
if ON_CLOUD:
  %reload_ext tensorboard
  import tensorboard as tb
tb.notebook.start("--logdir " + MODELS_DIR)

## Fine-tune

In [0]:
import gin
# Parse the operative config of the pre-trained model that matches MODEL_SIZE.
# The path was previously fixed to the "base" model, which disagrees with the
# batch size, parallelism and model directory chosen from MODEL_SIZE.
with gin.unlock_config():
  gin.parse_config_file(os.path.join(PRETRAINED_DIR, "operative_config.gin"))

In [0]:
# Number of steps passed to `model.train`; the `#@param` annotation is kept on
# the same line as the assignment.
FINETUNE_STEPS = 500 #@param {type: "integer"}
# NOTE(review): `transformers` is imported here but not referenced anywhere in
# this cell — confirm whether the import is needed.
import transformers
# Train on the "train" split of the "imdb_all" mixture.
model.train(
    mixture_or_task_name="imdb_all",
    steps=FINETUNE_STEPS,
    split="train",
)

## Evaluate

We now evaluate on the validation sets of the tasks in our mixture. Accuracy results will be logged and added to the TensorBoard above.

In [104]:
# Use a larger batch size for evaluation, which requires less memory.
model.batch_size = train_batch_size * 4
# Evaluate the "imdb_all" mixture on its "validation" split;
# checkpoint_steps="all" presumably selects every saved checkpoint — confirm
# against the t5 API.
model.eval(
    mixture_or_task_name="imdb_all",
    checkpoint_steps="all",
    split="validation"
)

INFO:tensorflow:Using config: {'_model_dir': 'gs://fiery-lcm-000001/models2/small', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.28.69.242:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.28.69.242:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.28.69.242:8470', '_evaluation_master': 'grpc://10.28.69.242:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replica