**MIT License**

Copyright (c) [2019] [Chinh Ngo]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

# Install dependencies.

* `tensor2tensor`: a library with all necessary tools to perform training/inference of Transformers.

In [1]:
# Select TensorFlow 1.x with the Colab magic before TensorFlow is imported.
%tensorflow_version 1.x

import tensorflow as tf

# Install/upgrade tensor2tensor quietly, then pin tensorflow-datasets to 3.2.1.
!pip install -q -U tensor2tensor
!pip install tensorflow-datasets==3.2.1

print('All done.')


TensorFlow 1.x selected.
[K     |████████████████████████████████| 1.5MB 6.3MB/s 
[K     |████████████████████████████████| 358kB 21.1MB/s 
[?25hCollecting tensorflow-datasets==3.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/ca/c9/d97bdf931edbae9aebc767633d088bd674136d5fe7587ef693b7cb6a1883/tensorflow_datasets-3.2.1-py3-none-any.whl (3.4MB)
[K     |████████████████████████████████| 3.4MB 6.0MB/s 
Installing collected packages: tensorflow-datasets
  Found existing installation: tensorflow-datasets 4.0.1
    Uninstalling tensorflow-datasets-4.0.1:
      Successfully uninstalled tensorflow-datasets-4.0.1
Successfully installed tensorflow-datasets-3.2.1
All done.


# Set up some options.

In [37]:
import numpy as np
import os
from os import path
import collections
import json
import pprint

#@markdown 1. The problem name (e.g. `translate_vien_iwslt32k` or `translate_envi_iwslt32k`) tells tensor2tensor how to properly set up the training/testing data pipeline. The value below is a custom problem name. NOTE(review): presumably registered by `problems.py` in the cloned source repo -- confirm.

problem = 'fb_wiki_and_book_m45_translate_envi_iwslt32k'  # @param

#@markdown 2. The hparams set name tells tensor2tensor to pick the corresponding Transformer architecture. NOTE(review): this text used to describe the tiny/small setting, but the value below is `transformer_tall9` -- confirm which architecture that is.

hparams_set = 'transformer_tall9'  # @param

#@markdown 3. Next we specify the directory where all data involving this colab will be stored (training data, checkpoints, decoded text etc.)

#@markdown * For GPU we use Google Drive Storage (free for everyone with a Google account, no need to install any payment method). This will create a directory in your Google Drive with the specified name.

#@markdown * With TPU, unfortunately only Google Cloud Storage is usable (free trial with a payment method required). Here we specify a Storage bucket.

google_cloud_bucket = 'ntkchinh_public'  # @param

#@markdown Please note that only one of the two options above will be used depending on which runtime setting you are using.

#@markdown 4. Now we specify all sub-directories:

#@markdown * Data tfrecords (train/valid) is a special data format used by tensor2tensor for optimized reading/loading. They will be generated from raw texts files and stored into:
data_dir = 'translation_envi'  # @param

#@markdown * Training Machine Learning model generally takes a long time. We want to frequently save intermediate results (half-trained models or 'checkpoints') to:
logdir = 'translation_envi'  # @param

#@markdown * The temporary dir to store all the temp files during tfrecords data generation (e.g. downloads from the internet).

tmp_dir = 'translation_envi'  # @param

In [35]:
# Manual switch between the TPU and GPU setup paths used further down in this
# cell. It is a form field, not an automatic check of the runtime.
use_tpu = True  # @param{type:"boolean"}
# use_tpu = False

  
def setup_tpu():
  from google.colab import auth
  auth.authenticate_user()

  # Mount the bucket to colab, so that python package os can access to it.
  # First we install gcsfuse to be able to mount Google Cloud Storage with Colab.
  print('\nInstalling gcsfuse')
  !echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
  !curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
  !apt -qq update
  !apt -qq install gcsfuse

  bucket = google_cloud_bucket
  print('Mounting bucket {} to local.'.format(bucket))
  mount_point = '/content/{}'.format(bucket)
  if not os.path.exists(mount_point):
    tf.gfile.MakeDirs(mount_point)
  
  !fusermount -u $mount_point
  !gcsfuse --implicit-dirs $bucket $mount_point
  print('\nMount point content:')
  !ls $mount_point

  # First we Connect to the TPU pod.
  tpu_address = ''
  if 'COLAB_TPU_ADDR' in os.environ:
    tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
    print ('TPU address is', tpu_address)
    with tf.Session(tpu_address) as session:
      devices = session.list_devices()
      # Upload credentials to TPU.
      with open('/content/adc.json', 'r') as f:
        auth_info = json.load(f)
      tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

    print('TPU devices:')
    pprint.pprint(devices)

  return mount_point, tpu_address


# Run the setup that matches the `use_tpu` switch and record where the data is
# mounted and, for TPU, the address to connect to.
if use_tpu:
  mount_point, tpu_address = setup_tpu()
else:
  # No `setup_gpu` is defined in the visible part of this notebook. Fail with
  # an actionable message instead of a bare NameError when it is missing; if
  # the user has defined one, it is called exactly as before.
  if 'setup_gpu' not in globals():
    raise NotImplementedError(
        'use_tpu is False but setup_gpu() is not defined. Define setup_gpu() '
        'so that it returns the mount point holding the data, or set '
        'use_tpu = True.')
  mount_point = setup_gpu()
  tpu_address = ''

print('\nMount point: {}'.format(mount_point))
print('TPU address: {}'.format(tpu_address))


Installing gcsfuse
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1974  100  1974    0     0  53351      0 --:--:-- --:--:-- --:--:-- 53351
OK
20 packages can be upgraded. Run 'apt list --upgradable' to see them.
gcsfuse is already the newest version (0.33.1).
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.
Mounting bucket ntkchinh_public to local.
Using mount point: /content/ntkchinh_public
Opening GCS connection...
Mounting file system...
File system has been successfully mounted.

Mount point content:
translation_envi
TPU address is grpc://10.25.25.42:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 7974801949009978256),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 14799849929165045029),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179

Now we build the absolute paths of all the directories.

In [38]:
# Prefix each configured sub-directory with the mount point so that every
# path is absolute.
logdir, data_dir, tmp_dir = [
    os.path.join(mount_point, sub_dir)
    for sub_dir in (logdir, data_dir, tmp_dir)
]

# The run writes straight into the log directory.
run_logdir = logdir

print('log dir: {}'.format(run_logdir))
print('data dir: {}'.format(data_dir))
print('temp dir: {}'.format(tmp_dir))

log dir: /content/ntkchinh_public/translation_envi
data dir: /content/ntkchinh_public/translation_envi
temp dir: /content/ntkchinh_public/translation_envi


# Clone or pull source code from our GitHub repo `ntkchinh/translation_vien`



In [44]:
# Local checkout of the project's source code.
src = '/content/translation_vien'
if not os.path.exists(src):
  # No checkout yet: clone it into /content. This branch leaves the working
  # directory at /content.
  os.chdir('/content')
  ! git clone https://github.com/ntkchinh/translation_vien.git
else:
  # Checkout already present: pull from inside it, then move to the
  # filesystem root. This branch leaves the working directory at /.
  % cd $src
  ! git pull
  % cd /

# List the files of the checkout.
print('\n Source code:')
%ls $src

Cloning into 'translation_vien'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 4 (delta 0), reused 4 (delta 0), pack-reused 0[K
Unpacking objects: 100% (4/4), done.

 Source code:
problems.py  t2t_decoder.py


In [31]:
# List the contents of the translation_envi directory under the mount point.
!ls  $mount_point/translation_envi  #uhmok

model.ckpt.data-00000-of-00001
model.ckpt.index
model.ckpt.meta
tst2013.en
tst2013.vi
vocab.fb_wiki_and_book_m45_translate_envi_iwslt32k.32768.subwords
vocab.subwords


In [None]:
decode_from_file = os.path.join(tmp_dir, 'tst2013.en')
decode_to_file = 'tst2013.en2vi'  # no longer write to cloud bucket.
ref_file = os.path.join(tmp_dir, 'tst2013.vi')

if use_tpu:
  # TPU wants the paths to begin with gs://
  ckpt_dir = logdir.replace(mount_point, 'gs://{}'.format(google_cloud_bucket))
ckpt_path = os.path.join(ckpt_dir, 'model.ckpt')

print('Decode to file {}'.format(decode_to_file))
!python $src/t2t_decoder.py \
--data_dir=$data_dir --problem=$problem \
--hparams_set=$hparams_set \
--model=transformer \
--decode_hparams="beam_size=4,alpha=0.6"  \
--decode_from_file=$decode_from_file \
--decode_to_file=$decode_to_file  \
--checkpoint_path=$ckpt_path  \
--output_dir=$ckpt_dir  \
--use_tpu=$use_tpu \
--cloud_tpu_name=$tpu_address


Decode to file tst2013.en2vi








W0208 16:17:23.384145 140020228740992 estimator.py:1994] Estimator's model_fn (<function T2TModel.make_estimator_model_fn.<locals>.wrapping_model_fn at 0x7f587ec16d08>) includes params argument, but params are not passed to Estimator.
INFO:tensorflow:Using config: {'_model_dir': 'gs://ntkchinh_public/translation_envi', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.95
}
allow_soft_placement: true
graph_options {
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.25.25.42:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 20, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs':

In [40]:
# Show the working directory and its contents, then preview the first 10
# lines of the decoded output file.
!pwd
!ls
!head -10 $decode_to_file

/
bin	 dev   lib32  opt   sbin   tensorflow-1.15.2  usr
boot	 etc   lib64  proc  srv    tmp		      var
content  home  media  root  swift  tools
datalab  lib   mnt    run   sys    tst2013.en2vi
Khi còn nhỏ , tôi nghĩ đất nước mình là tốt nhất trên hành tinh này , và tôi lớn lên hát một bài hát tên là &quot; Không có gì để ghen tị . &quot;
Và tôi đã rất tự hào .
Ở trường , chúng tôi dành rất nhiều thời gian nghiên cứu lịch sử của Kim Il-Sung , nhưng chúng tôi chưa bao giờ biết nhiều về thế giới bên ngoài , ngoại trừ Mỹ , Hàn Quốc , Nhật Bản là kẻ thù .
Mặc dù tôi thường băn khoăn về thế giới bên ngoài , tôi nghĩ rằng mình sẽ dành cả cuộc đời mình ở Bắc Triều Tiên , cho đến khi mọi thứ đột nhiên thay đổi .
Khi tôi 7 tuổi , tôi chứng kiến lần hành quyết trước công chúng đầu tiên , nhưng tôi nghĩ cuộc sống của tôi ở Bắc Triều Tiên là bình thường .
Gia đình tôi không nghèo , và bản thân tôi , tôi chưa bao giờ trải qua cơn đói .
Nhưng một ngày , vào năm 1995 , mẹ tôi mang về nhà một lá thư từ

In [41]:
# Score the decoded translations against the reference file with t2t-bleu.
print('\nCompare {} with reference {}'.format(decode_to_file, ref_file))
!t2t-bleu --translation=$decode_to_file --reference=$ref_file


Compare tst2013.en2vi with reference /content/ntkchinh_public/translation_envi/tst2013.vi



BLEU_uncased =  37.49
BLEU_cased =  36.63
