In [1]:
from google.colab import drive

drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [2]:
!pip install tokenizers
!pip install transformers

Collecting tokenizers
  Downloading tokenizers-0.11.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 5.1 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.2
Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 62.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 460 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading token

In [3]:
import tensorflow as tf
import os

import tokenizers
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

from pathlib import Path


In [16]:
class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        # self.tokenizer.normalizer = Sequence([
        #     NFKC()
        # ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths):
        trainer = BpeTrainer(vocab_size=15000, show_progress=True, inital_alphabet=ByteLevel.alphabet(), special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>"
        ])
        self.tokenizer.train(trainer = trainer, files = paths)

    def save_tokenizer(self, location, prefix=None):
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)

In [17]:

# the folder 'text' contains all the files
paths = [str(x) for x in Path("./gdrive/MyDrive/project/data/").glob("**/*processed.txt")]


In [18]:
tokenizer = BPE_token()
# train the tokenizer model
tokenizer.bpe_train(paths)
# saving the tokenized data in our specified folder 
save_path = 'gdrive/MyDrive/project/model'
tokenizer.save_tokenizer(save_path)

In [19]:
# loading tokenizer from the saved model path
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)
# creating the model
model = TFGPT2LMHeadModel(config)

file gdrive/MyDrive/project/model/config.json not found


In [20]:
single_string = ''
for filename in paths:
  with open(filename, "r", encoding='utf-8') as f:
   x = f.read()
  single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)

In [21]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []
for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [22]:
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [None]:
num_epoch = 2
history = model.fit(dataset, epochs=num_epoch)

Epoch 1/2
 42/344 [==>...........................] - ETA: 1:12:19 - loss: 8.5826 - logits_loss: 8.5826 - logits_accuracy: 0.0442 - past_key_values_1_accuracy: 0.0014 - past_key_values_2_accuracy: 0.0012 - past_key_values_3_accuracy: 0.0026 - past_key_values_4_accuracy: 0.0011 - past_key_values_5_accuracy: 0.0014 - past_key_values_6_accuracy: 0.0017 - past_key_values_7_accuracy: 4.1169e-04 - past_key_values_8_accuracy: 4.5344e-04 - past_key_values_9_accuracy: 0.0015 - past_key_values_10_accuracy: 7.3570e-04 - past_key_values_11_accuracy: 6.5386e-04 - past_key_values_12_accuracy: 0.0019

In [None]:
text = "Μια φορά και έναν καιρό, "
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')

# getting out output
beam_output = model.generate(
  input_ids,
  max_length = 100,
  num_beams = 5,
  temperature = 0.7,
  no_repeat_ngram_size=1,
  num_return_sequences=10,
  repetition_penalty=1.5,
  skip_special_tokens = True,
  clean_up_tokenization = True,
  early_stopping = True
)

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


In [None]:
print(tokenizer.decode(beam_output[0]))

Μια φορά και έναν καιρό, 
- «« είπε που να το βασιλιάς. Ο είναι ο μικρός θα τα είχε δεν η πριγκίπισσα από την πρίγκιπας του της στο σπίτι με τον ένα' σε μια πολύ μου». Η Δεν! Το τη ήταν οι αυτό στην δυο; Και στον μικρό κι τι: Τα όταν ότι για τις ώρα στα;» Τι μας τους...Ο μεγάλο σας στη τόσο σαν αλλά πως σου!» Οι κάθε πιο είχαν καλά των ένας αν γιατί’ έτσι», πάνω μέσα όμως όλα αυτά λίγο


In [None]:
text = "Σε μία μακρινή "
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
beam_output = model.generate(
  input_ids,
  max_length = 100,
  num_beams = 10,
  #temperature = 0.1,
  no_repeat_ngram_size=1,
  num_return_sequences=8,
  repetition_penalty=1.5,
  skip_special_tokens = True,
  clean_up_tokenization = True,
  early_stopping = True
)

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


In [None]:
print(tokenizer.decode(beam_output[0]))

In [None]:
model.save_pretrained(save_directory = save_path, save_config = True)
