In [None]:
#!git clone https://github.com/google-research/electra.git
!pip install tensorflow-gpu==1.15
#!pip install transformers==3.3.0


In [None]:
import os
import json
from transformers import AutoTokenizer

In [None]:
from tokenizers import BertWordPieceTokenizer
# Train a WordPiece vocabulary from scratch on the raw corpus.
# NOTE(review): this path is relative, while CORPUS_DIR in the config cell is an
# absolute /media/... path -- confirm both point at the intended data.
paths = ["../../media/data/ociftci/corpus.txt"]

tokenizer = BertWordPieceTokenizer()
tokenizer.train(
    files=paths,
    vocab_size=64_000,   # target vocabulary size
    min_frequency=500,   # minimum frequency passed to the trainer
)

In [None]:
# Persist the trained vocabulary in the directory the rest of the notebook reads
# from (TOKENIZER_DIR -> "./med-electra-tokenizer-64kvocab-500freq"); the name
# also matches the vocab_size / min_frequency used for training.
# os.makedirs(..., exist_ok=True) keeps the cell re-runnable, unlike `!mkdir`.
tokenizer_out_dir = "med-electra-tokenizer-64kvocab-500freq"
os.makedirs(tokenizer_out_dir, exist_ok=True)
tokenizer.save_model(tokenizer_out_dir)

In [None]:
# --- Notebook-wide configuration -------------------------------------------
# Name passed to run_pretraining.py via --model-name.
MODEL_NAME = "med-electra"
# Working directory passed to the ELECTRA scripts (--data-dir / --output-dir root).
DATA_DIR = "./data"
# Directory containing the raw text corpus (--corpus-dir).
CORPUS_DIR = "/media/data/ociftci/data/"
# Directory whose vocab.txt is handed to the dataset builder (--vocab-file).
TOKENIZER_DIR = "./med-electra-tokenizer-64kvocab-500freq"
# NOTE(review): not referenced anywhere else in the visible notebook.
TRAIN_SIZE = 1_000_000

In [None]:
# Run ELECTRA's dataset builder on the corpus in CORPUS_DIR, tokenizing with
# TOKENIZER_DIR/vocab.txt and writing the result under DATA_DIR/pretrain_tfrecords.
# Requires the ELECTRA repository to be checked out in ./electra.
# --no-lower-case keeps the text cased; sequences are capped at 128 tokens.
!python3 electra/build_pretraining_dataset.py \
  --corpus-dir $CORPUS_DIR \
  --vocab-file $TOKENIZER_DIR/vocab.txt \
  --output-dir $DATA_DIR/pretrain_tfrecords \
  --max-seq-length 128 \
  --blanks-separate-docs True \
  --no-lower-case \
  --num-processes 5

In [None]:
import json

In [None]:
import tensorflow as tf
# Sanity check: how many GPUs does TensorFlow see?
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
from tensorflow.python.client import device_lib
def get_available_devices():
    """Return the names of every local device reported by TensorFlow's device_lib."""
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names
print(get_available_devices())

In [None]:
# Hyperparameter overrides written to hparams.json and passed to
# electra/run_pretraining.py through --hparams.
#
# The on/off switches must be real booleans: json.dump serializes them as JSON
# true/false. The previous string values ("true"/"false") are both non-empty
# strings and therefore truthy in Python, which would leave do_eval and
# do_lower_case effectively enabled.
hparams = {
    "do_train": True,
    "do_eval": False,
    "model_size": "small",
    "do_lower_case": False,          # corpus was built with --no-lower-case
    "vocab_size": 64000,             # must match the trained tokenizer vocabulary
    "num_train_steps": 1000000,
    "save_checkpoints_steps": 50000,
    "train_batch_size": 128,
    "electra_objective": True,
}

with open("hparams.json", "w") as f:
    json.dump(hparams, f)

In [None]:
# Launch ELECTRA pre-training with the overrides stored in hparams.json.
# Requires the ELECTRA repository in ./electra and the hparams.json file written
# by the previous cell.
# NOTE(review): later cells read the checkpoint from "data/models/med-electra/";
# presumably run_pretraining.py writes to <data-dir>/models/<model-name> -- confirm.
!python3 electra/run_pretraining.py \
  --data-dir $DATA_DIR \
  --model-name $MODEL_NAME \
  --hparams "hparams.json"

In [None]:
# Fetch the third-party repository that provides
# electra_pytorch/convert_electra_tf_checkpoint_to_pytorch.py, used below.
# NOTE(review): git refuses to clone into an existing non-empty directory, so
# re-running this cell prints an error (the notebook continues regardless).
!git clone https://github.com/lonePatient/electra_pytorch.git

In [None]:
# Directory that holds the pre-trained checkpoint; config.json is written into it.
MODEL_DIR = "data/models/med-electra/"

# Model description serialized to config.json for the checkpoint converter and
# for from_pretrained(). Keys and values are written exactly as listed.
config = dict(
    vocab_size=64000,
    embedding_size=128,
    hidden_size=256,
    num_hidden_layers=12,
    num_attention_heads=4,
    intermediate_size=1024,
    generator_size="0.25",  # kept as a string, as in the original config
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
)

config_path = MODEL_DIR + "config.json"
with open(config_path, "w") as f:
    json.dump(config, f)

In [None]:
# Install PyTorch for the checkpoint conversion and the inference cells below.
# NOTE(review): the version is not pinned, so results depend on install date.
!pip install torch

In [None]:
#import numpy
# Upgrade transformers to the latest release before loading the converted model.
# NOTE(review): transformers was already imported in an earlier cell; a kernel
# restart is likely needed for the upgraded version to take effect -- confirm.
!pip install --upgrade transformers

In [None]:
# Convert the TensorFlow checkpoint in MODEL_DIR to a PyTorch weights file,
# using the config.json written above; output goes to MODEL_DIR/pytorch_model.bin.
# MODEL_DIR already ends with "/", so the expanded paths contain "//", which the
# filesystem treats as a single separator.
!python electra_pytorch/convert_electra_tf_checkpoint_to_pytorch.py \
    --tf_checkpoint_path=$MODEL_DIR \
    --electra_config_file=$MODEL_DIR/config.json \
    --pytorch_dump_path=$MODEL_DIR/pytorch_model.bin

In [None]:
import torch
import tensorflow as tf
from transformers import ElectraForPreTraining, ElectraTokenizerFast

In [None]:
# Load the discriminator from the PyTorch weights produced by the conversion
# cell (MODEL_DIR/pytorch_model.bin). from_tf=True is not used: the checkpoint
# has already been converted, and the masked-LM cell below loads the same
# directory without it.
discriminator = ElectraForPreTraining.from_pretrained(MODEL_DIR)
# The vocabulary lives in TOKENIZER_DIR (the same vocab.txt used to build the
# pre-training data), not in DATA_DIR. The model is cased, so do not lower-case.
tokenizer = ElectraTokenizerFast.from_pretrained(TOKENIZER_DIR, do_lower_case=False)

In [None]:
# Replaced-token detection demo: feed a sentence with one substituted word and
# print, per token, whether the discriminator flags it (1) or not (0).
sentence = "The birds are singing"
fake_sentence = "The birds are speaking"

fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True)
fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
discriminator_outputs = discriminator(fake_inputs)
# A positive logit is read as "this token was replaced".
predictions = discriminator_outputs[0] > 0

# Depending on the transformers version the logits may keep a leading batch
# axis (shape (1, seq_len)); iterating .tolist() directly would then yield a
# list and int(...) would raise TypeError. Flattening yields scalars either way.
flat_predictions = predictions.reshape(-1).tolist()

for token in fake_tokens:
    print("%7s" % token, end="")
print("\n")
for prediction in flat_predictions:
    print("%7s" % int(prediction), end="")

In [None]:
from transformers import ElectraTokenizer, ElectraForMaskedLM

In [None]:
# Use the masked-LM head imported above: ElectraModel is never imported in this
# notebook (NameError), and the prediction cell below takes an argmax over the
# output to obtain a vocabulary id, which needs per-token vocabulary scores.
# NOTE(review): confirm the converted checkpoint actually contains generator /
# LM-head weights; otherwise that head is freshly initialized.
model = ElectraForMaskedLM.from_pretrained(MODEL_DIR)
# Vocabulary comes from TOKENIZER_DIR (not DATA_DIR), and the model was trained
# on cased text, so lower-casing is disabled. `return_dict` is a model option,
# not a tokenizer option, and has been dropped.
tokenizer = ElectraTokenizer.from_pretrained(TOKENIZER_DIR, do_lower_case=False)

In [None]:
# Example sentence and the position (in the tokenized sequence) to hide.
text = "Hello my dog is cute."
masked_index = 5

# Tokenize, then overwrite the chosen position with the mask token.
# NOTE(review): which word sits at index 5 depends on how the custom vocabulary
# splits the sentence -- check the displayed tokens in the next cell.
tokenized_text = tokenizer.tokenize(text)
tokenized_text[masked_index] = '[MASK]'

In [None]:
# Display the token list to verify which position was replaced by [MASK].
tokenized_text

In [None]:
# Look up the vocabulary id of every token.
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
# One segment id per token; all zeros because there is a single sentence.
token_type_ids = [0 for _ in token_ids]

print(token_ids)
print(token_type_ids)  # segment_ids

# Build batch-of-one tensors (shape (1, n_tokens)) and move them to the GPU.
token_ids_tensor = torch.tensor(token_ids).unsqueeze(0).to('cuda')
token_type_ids_tensor = torch.tensor(token_type_ids).unsqueeze(0).to('cuda')

In [None]:
# Put the model in evaluation mode before running inference.
model.eval()

In [None]:
# Move the model to the GPU so it sits on the same device as the input tensors;
# this requires a CUDA-capable device.
model.to('cuda')

In [None]:
# Forward pass without gradient tracking (inference only).
with torch.no_grad():
    outputs = model(
        token_ids_tensor,
        token_type_ids=token_type_ids_tensor,
    )
    # First element of the model output; the argmax below treats its last axis
    # as scores over vocabulary ids.
    predictions = outputs[0]
print(predictions)

# Highest-scoring id at the masked position, mapped back to its token string.
predicted_index = predictions[0, masked_index].argmax().item()
(predicted_token,) = tokenizer.convert_ids_to_tokens([predicted_index])
print('[MASK] =>', predicted_token)