# Pre-training ALBERT

Code referenced from the official ALBERT repository:
https://github.com/google-research/albert

In [None]:
#Setting up TPU
# Colab magic that selects the TensorFlow 1.x runtime; kept ahead of `import tensorflow` below.
%tensorflow_version 1.x
import os
import pprint
import json
import tensorflow as tf

# Fail fast when COLAB_TPU_ADDR is missing, i.e. the runtime is not connected to a TPU.
assert "COLAB_TPU_ADDR" in os.environ, "ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!"
# gRPC endpoint of the TPU for this session, built from the environment variable.
TPU_ADDRESS = "grpc://" + os.environ["COLAB_TPU_ADDR"] 
# NOTE(review): TPU_TOPOLOGY is not referenced by any other cell visible here — confirm it is still needed.
TPU_TOPOLOGY = "2x2"
print("TPU address is", TPU_ADDRESS)

# Authenticate the Colab user before touching credentials.
# NOTE(review): presumably this call is what creates /content/adc.json read below — TODO confirm.
from google.colab import auth
auth.authenticate_user()
# Open a TF1 session against the TPU endpoint, list its devices, then hand it the credentials.
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TensorFlow 1.x selected.
TPU address is grpc://10.15.73.98:8470
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 12787477634464250745),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 13264972788616172421),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 815571450894141198),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 4663239132975362837),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 538891274723644301),
 _DeviceAttributes(/job:tpu_wo

In [None]:
# Mount Google Drive at /content/drive; later cells read and write under "/content/drive/My Drive".
# NOTE(review): a later cell rebinds the name `drive` to a PyDrive `GoogleDrive` client,
# so after that cell runs `drive.mount` is no longer reachable through this name.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Create sentencepiece file

In [None]:
# Install a pinned TensorFlow 1.15.0.
# NOTE(review): the recorded output of the later `pip install -r /content/ALBERT/requirements.txt` cell
# shows tensorflow==1.15.2 being installed over the existing version — confirm which pin is intended.
!pip install tensorflow==1.15.0

In [None]:
# loading google drive to colaboratory
# Install PyDrive quietly, then build an authenticated Drive client from the Colab user's
# application-default credentials.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which an earlier cell imported as the `google.colab.drive`
# module; from here on `drive` is the PyDrive client used by the download cell.
drive = GoogleDrive(gauth)

In [None]:
# Corpus used to build the vocabulary:
# https://drive.google.com/file/d/1OkbkUULazQoS4m58PtdCYUyeQwRdLoJ1/view?usp=sharing
# The Drive file id is stored in `file_id` rather than `id`, so the built-in id() is not
# shadowed for the rest of the kernel session.
file_id = '1OkbkUULazQoS4m58PtdCYUyeQwRdLoJ1'
# `drive` is the PyDrive client created in the authentication cell.
downloaded = drive.CreateFile({'id': file_id})
# Save the file's content into the current working directory under this name.
downloaded.GetContentFile('combineV2_book10per.txt')

In [None]:
#Cloning google research ALBERT github code
# Install the SentencePiece Python bindings and clone the ALBERT sources into ./ALBERT.
# NOTE(review): neither the package version nor the git revision is pinned, so a re-run may
# fetch different code than the recorded outputs reflect.
!pip install sentencepiece
!git clone https://github.com/google-research/ALBERT

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 2.7MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.94
Cloning into 'ALBERT'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 356 (delta 0), reused 0 (delta 0), pack-reused 353[K
Receiving objects: 100% (356/356), 233.79 KiB | 3.00 MiB/s, done.
Resolving deltas: 100% (233/233), done.


In [None]:
import sentencepiece as spm

# Trainer flags, one per entry; they are joined with single spaces into the exact
# command string the trainer receives.
spm_train_flags = [
    '--input=combineV2_book10per.txt',
    '--vocab_size=30000',
    '--model_prefix=30k-clean',
    '--pad_id=0',
    '--unk_id=1',
    '--pad_piece=<pad>',
    '--unk_piece=<unk>',
    '--bos_id=-1',
    '--eos_id=-1',
    '--control_symbols=[CLS],[SEP],[MASK],[UNK],<pad>',
    '--user_defined_symbols="(,),",-,.,–,£,€"',
]
spm.SentencePieceTrainer.Train(' '.join(spm_train_flags))

In [None]:
# Load the trained model and collect every piece string in ascending id order.
sp_user = spm.SentencePieceProcessor()
sp_user.load('30k-clean.model')
vocabs = [
    sp_user.IdToPiece(piece_id)
    for piece_id in range(sp_user.GetPieceSize())
]

In [None]:
# Show the vocabulary size and a short preview instead of dumping every piece into the
# cell output, which bloats the saved notebook and hides the surrounding cells.
print(len(vocabs))
print(vocabs[:50])



In [None]:
# Write the vocabulary to Drive, one piece per line.
# A single "w" open inside `with` replaces the original "x" + "w" pair: the "x" open raised
# FileExistsError on every re-run, the first handle was leaked, and the second was never
# closed, so buffered lines were not guaranteed to be flushed. UTF-8 is set explicitly
# because the pieces include non-ASCII symbols (e.g. –, £, €).
with open("/content/drive/My Drive/Capstone/AlbertVocab/vocab.txt", "w", encoding="utf-8") as vocab_file:
  for vocab in vocabs:
    vocab_file.write(vocab)
    vocab_file.write('\n')

In [None]:
# Sanity-check the tokenizer on a short sample; the returned id list is shown as the cell output.
# NOTE(review): the recorded output contains id 1, which the training flags assign to <unk>
# (--unk_id=1) — worth checking which part of the sample is not covered by the vocabulary.
sp_user.EncodeAsIds('Hello world')

[245, 1, 157, 10577, 232]

In [None]:
# Install the `albert-tensorflow` package from PyPI (version not pinned).
# NOTE(review): the ALBERT repository is also cloned in an earlier cell; the recorded pre-training
# logs reference dist-packages/albert/..., so the pip-installed copy appears to be the one
# imported — confirm that is intended.
!pip3 install albert-tensorflow

Collecting albert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/ba/1e/e776bb23e6f89a1f1d7d33b50d0bd9c2c7b24b39aa548f041827a9c00d73/albert_tensorflow-1.1-py3-none-any.whl (81kB)
[K     |████                            | 10kB 9.9MB/s eta 0:00:01[K     |████████                        | 20kB 1.4MB/s eta 0:00:01[K     |████████████                    | 30kB 2.1MB/s eta 0:00:01[K     |████████████████                | 40kB 2.6MB/s eta 0:00:01[K     |████████████████████            | 51kB 2.2MB/s eta 0:00:01[K     |████████████████████████        | 61kB 2.3MB/s eta 0:00:01[K     |████████████████████████████▏   | 71kB 2.6MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.2MB/s 
Installing collected packages: albert-tensorflow
Successfully installed albert-tensorflow-1.1


In [None]:
# Install the dependencies listed by the cloned ALBERT repository.
# NOTE(review): per the recorded output this replaces the installed TensorFlow with 1.15.2 and
# tensorflow-hub with 0.7.0; a kernel that has already imported TensorFlow presumably needs a
# restart to pick the change up — TODO confirm.
!pip install -r /content/ALBERT/requirements.txt

Collecting tensorflow==1.15.2
[?25l  Downloading https://files.pythonhosted.org/packages/9a/d9/fd234c7bf68638423fb8e7f44af7fcfce3bcaf416b51e6d902391e47ec43/tensorflow-1.15.2-cp36-cp36m-manylinux2010_x86_64.whl (110.5MB)
[K     |████████████████████████████████| 110.5MB 36kB/s 
[?25hCollecting tensorflow_hub==0.7
[?25l  Downloading https://files.pythonhosted.org/packages/00/0e/a91780d07592b1abf9c91344ce459472cc19db3b67fdf3a61dca6ebb2f5c/tensorflow_hub-0.7.0-py2.py3-none-any.whl (89kB)
[K     |████████████████████████████████| 92kB 7.3MB/s 
Installing collected packages: tensorflow, tensorflow-hub
  Found existing installation: tensorflow 2.3.0
    Uninstalling tensorflow-2.3.0:
      Successfully uninstalled tensorflow-2.3.0
  Found existing installation: tensorflow-hub 0.9.0
    Uninstalling tensorflow-hub-0.9.0:
      Successfully uninstalled tensorflow-hub-0.9.0
Successfully installed tensorflow-1.15.2 tensorflow-hub-0.7.0


In [None]:
cd ./ALBERT

/content/ALBERT


## Create Pre-training Data

In [None]:
# NOTE(review): this installs a TensorFlow 2.0 alpha GPU build into a runtime where earlier cells
# select and install TensorFlow 1.15; the following cell repeats the same code with the install
# commented out — confirm which environment is actually intended.
!pip install tensorflow-gpu==2.0.0-alpha0

import tensorflow as tf
import os
import sys

# gRPC endpoint of the Colab TPU for this session and the job name it is registered under.
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
WORKER_NAME='tpu_worker'

# Register the TPU host with the local runtime under WORKER_NAME.
tf.config.experimental_connect_to_host(TPU_WORKER, WORKER_NAME) 

# Resolve the TPU, initialise it, and build the distribution strategy referenced by later cells.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(TPU_WORKER)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
# Print the devices known to the runtime, one per line.
devices=tf.config.experimental_list_devices()
print(*devices,sep="\n")

Collecting tensorflow-gpu==2.0.0-alpha0
[?25l  Downloading https://files.pythonhosted.org/packages/1a/66/32cffad095253219d53f6b6c2a436637bbe45ac4e7be0244557210dc3918/tensorflow_gpu-2.0.0a0-cp36-cp36m-manylinux1_x86_64.whl (332.1MB)
[K     |████████████████████████████████| 332.1MB 53kB/s 
[?25hCollecting tf-estimator-nightly<1.14.0.dev2019030116,>=1.14.0.dev2019030115
[?25l  Downloading https://files.pythonhosted.org/packages/13/82/f16063b4eed210dc2ab057930ac1da4fbe1e91b7b051a6c8370b401e6ae7/tf_estimator_nightly-1.14.0.dev2019030115-py2.py3-none-any.whl (411kB)
[K     |████████████████████████████████| 419kB 36.1MB/s 
Collecting tb-nightly<1.14.0a20190302,>=1.14.0a20190301
[?25l  Downloading https://files.pythonhosted.org/packages/a9/51/aa1d756644bf4624c03844115e4ac4058eff77acd786b26315f051a4b195/tb_nightly-1.14.0a20190301-py3-none-any.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 44.5MB/s 
Installing collected packages: tf-estimator-nightly, tb-nightly, tensorflow

INFO:tensorflow:Initializing the TPU system: 10.15.73.98:8470
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Querying Tensorflow master (grpc://10.15.73.98:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 12787477634464250745)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 815571450894141198)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 4663239132975362837)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 538891274723644301)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TP

In [None]:
# NOTE(review): this cell repeats the TPU initialisation of the previous cell without the
# pip install; only one of the two should be needed in a clean run.
# !pip install tensorflow-gpu==1.15.0
import tensorflow as tf
import os
import sys

# gRPC endpoint of the Colab TPU for this session and the job name it is registered under.
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
WORKER_NAME='tpu_worker'

# Register the TPU host with the local runtime under WORKER_NAME.
tf.config.experimental_connect_to_host(TPU_WORKER, WORKER_NAME) 

# Resolve the TPU, initialise it, and build the distribution strategy referenced by later cells.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(TPU_WORKER)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
# Print the devices known to the runtime, one per line.
devices=tf.config.experimental_list_devices()
print(*devices,sep="\n")

INFO:tensorflow:Initializing the TPU system: 10.15.73.98:8470
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Querying Tensorflow master (grpc://10.15.73.98:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 12787477634464250745)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 815571450894141198)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 4663239132975362837)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 538891274723644301)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TP

In [None]:
with strategy.scope():
  !python create_pretraining_data.py --input_file "/content/combineV2_book10per.txt" --output_file "/content/drive/My Drive/Capstone/CreatePre/out_comb_v1" --vocab_file "/content/30k-clean.vocab" --spm_model_file "/content/30k-clean.model" --max_seq_length=256 --dupe_factor 1


W1030 12:48:04.059408 140316732458880 module_wrapper.py:139] From /usr/local/lib/python3.6/dist-packages/albert/tokenization.py:240: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

INFO:tensorflow:loading sentence piece model
I1030 12:48:04.059681 140316732458880 tokenization.py:240] loading sentence piece model
INFO:tensorflow:*** Reading from input files ***
I1030 12:48:04.179522 140316732458880 create_pretraining_data.py:631] *** Reading from input files ***
INFO:tensorflow:  /content/combineV2_book10per.txt
I1030 12:48:04.179761 140316732458880 create_pretraining_data.py:633]   /content/combineV2_book10per.txt

next
exit
quit
^C


In [None]:
# Install Hugging Face Transformers (imported by the checkpoint-conversion cell) and tfrecord.
# NOTE(review): neither version is pinned, and `tfrecord` is not imported by any cell visible
# here — confirm it is still needed.
!pip install transformers
!pip install tfrecord

## Run Pre-training

In [None]:
!python run_pretraining.py \
    --input_file="gs://capstonecs9-1/combinedPre" \
    --output_dir="gs://capstonecs9-1/models_recc2" \
    --albert_config_file="/content/drive/My Drive/Colab Notebooks/Capstone/albert_config.json" \
    --do_train \
    --do_eval \
    --train_batch_size=256 \
    --eval_batch_size=64 \
    --max_seq_length=512 \
    --max_predictions_per_seq=20 \
    --optimizer='lamb' \
    --learning_rate=.00176 \
    --num_train_steps=10000 \
    --num_warmup_steps=3025 \
    --use_tpu=True \
    --tpu_name='grpc://10.14.28.106:8470' \
    --save_checkpoints_steps=50



W1102 01:58:31.670285 140134263179136 module_wrapper.py:139] From /usr/local/lib/python3.6/dist-packages/albert/modeling.py:116: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

INFO:tensorflow:*** Input Files ***
I1102 01:58:32.952018 140134263179136 run_pretraining.py:484] *** Input Files ***
INFO:tensorflow:  gs://capstonecs9-1/combinedPre
I1102 01:58:32.952295 140134263179136 run_pretraining.py:486]   gs://capstonecs9-1/combinedPre
W1102 01:58:33.958949 140134263179136 estimator.py:1994] Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7f7326cca0d0>) includes params argument, but params are not passed to Estimator.
INFO:tensorflow:Using config: {'_model_dir': 'gs://capstonecs9-1/models_recc2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 50, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value

### Make Bin File

In [None]:
import logging
import torch
# from transformers import AlbertConfig, AlbertModel
from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert


# Configure the root logger at INFO level for the conversion below.
logging.basicConfig(level=logging.INFO)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    """Convert a TensorFlow ALBERT checkpoint into a PyTorch state-dict file.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint. Either the checkpoint
            prefix (``.../model.ckpt-123``) or its ``.index`` file may be given; a
            trailing ``.index`` is stripped so the prefix is what gets loaded.
        albert_config_file: Path of the ALBERT JSON configuration describing the model.
        pytorch_dump_path: Destination file for the saved ``state_dict``. Its parent
            directory is created when it does not exist yet.

    Returns:
        None. The converted weights are written to ``pytorch_dump_path``.
    """
    import os  # local import keeps this cell self-contained

    # TensorFlow addresses a checkpoint by its prefix, not by the ".index" file that
    # sits next to the data shards, so normalise the argument before loading.
    index_suffix = ".index"
    if tf_checkpoint_path.endswith(index_suffix):
        tf_checkpoint_path = tf_checkpoint_path[:-len(index_suffix)]

    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    # torch.save does not create missing directories; make sure the target one exists.
    dump_dir = os.path.dirname(pytorch_dump_path)
    if dump_dir:
        os.makedirs(dump_dir, exist_ok=True)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)



# Convert the best fine-tuned checkpoint stored on Drive into a PyTorch weights file.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="/content/drive/My Drive/Reuters_Dataset/Reuters_model/model.ckpt-best.index",
    albert_config_file="/content/drive/My Drive/Reuters_Dataset/Reuters_model/albert_config.json",
    pytorch_dump_path="/content/drive/My Drive/Reuters_Dataset/Reuters_model/model-fine/pytorch_model.bin",
)

Building PyTorch model from configuration: AlbertConfig {
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 20000
}

Save PyTorch model to /content/drive/My Drive/Reuters_Dataset/Reuters_model/model-fine/pytorch_model.bin
