In [2]:
from google.colab import drive

drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [3]:
!pip install tokenizers
!pip install transformers

Collecting tokenizers
  Downloading tokenizers-0.11.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 5.4 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.2
Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloadin

In [4]:
import tensorflow as tf
import os

import tokenizers
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

from pathlib import Path


In [5]:
class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        # self.tokenizer.normalizer = Sequence([
        #     NFKC()
        # ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths):
        trainer = BpeTrainer(vocab_size=25000, show_progress=True, inital_alphabet=ByteLevel.alphabet(), special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>"
        ])
        self.tokenizer.train(trainer = trainer, files = paths)

    def save_tokenizer(self, location, prefix=None):
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)

In [6]:

# the folder 'text' contains all the files
paths = [str(x) for x in Path("./gdrive/MyDrive/project/data/").glob("**/*processed.txt")]

paths

['gdrive/MyDrive/project/data/corpus_processed.txt',
 'gdrive/MyDrive/project/data/paidika_paramythia_processed.txt',
 'gdrive/MyDrive/project/data/paramithia_processed.txt',
 'gdrive/MyDrive/project/data/paramithia2_processed.txt']

In [7]:
tokenizer = BPE_token()
# train the tokenizer model
tokenizer.bpe_train(paths)
# saving the tokenized data in our specified folder 
save_path = 'gdrive/MyDrive/project/model'
tokenizer.save_tokenizer(save_path)

In [8]:
# loading tokenizer from the saved model path
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)
# creating the model
model = TFGPT2LMHeadModel(config)

file gdrive/MyDrive/project/model/config.json not found


In [9]:
single_string = ''
for filename in paths:
  with open(filename, "r", encoding='utf-8') as f:
   x = f.read()
  single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)

In [10]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []
for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [11]:
# defining our optimizer

# clipnorm:
# Gradient norm scaling involves changing the derivatives of the loss function 
# to have a given vector norm when the L2 vector norm (sum of the squared values) 
# of the gradient vector exceeds a threshold value. For example, we could specify 
# a norm of 1.0, meaning that if the vector norm for a gradient exceeds 1.0, then
#  the values in the vector will be rescaled so that the norm of the vector equals 1.0.

# epsilon: avoid zero division

optimizer = tf.keras.optimizers.Adam(learning_rate=6e-5, epsilon=1e-08, clipnorm=1.0)

# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [11]:
# from keras.callbacks import ModelCheckpoint

# checkpoint = ModelCheckpoint("best_model.hdf5", monitor='loss', verbose=1,
#     save_best_only=True, mode='auto', period=1)

In [12]:
# num_epoch = 30
# history = model.fit(dataset, epochs=num_epoch)

# save_path = 'gdrive/MyDrive/project/model'
# model.save_pretrained(save_directory = save_path, save_config = True)

In [12]:
num_epoch = 30
checkpoint_filepath = 'gdrive/MyDrive/project/model/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath)

# Model weights are saved at the end of every epoch, if it's the best seen
# so far.
# history = model.fit(dataset, epochs=num_epoch, callbacks=[model_checkpoint_callback])

# The model weights (that are considered the best) are loaded into the model.
#model.load_weights(checkpoint_filepath)


#save_path = 'gdrive/MyDrive/project/model'
#model.save_pretrained(save_directory = save_path, save_config = True)

In [13]:
loaded_model = model.load_weights(checkpoint_filepath) #trained for 12 epochs

In [None]:
num_epoch = 30
checkpoint_filepath = 'gdrive/MyDrive/project/model/checkpoint'

history2 = model.fit(dataset, epochs=num_epoch, callbacks=[model_checkpoint_callback])
save_path = 'gdrive/MyDrive/project/model'
model.save_pretrained(save_directory = save_path, save_config = True)

Epoch 1/30
 56/322 [====>.........................] - ETA: 1:13:21 - loss: 4.3249 - logits_loss: 4.3249 - logits_accuracy: 0.3006 - past_key_values_1_accuracy: 0.0015 - past_key_values_2_accuracy: 0.0018 - past_key_values_3_accuracy: 0.0019 - past_key_values_4_accuracy: 0.0016 - past_key_values_5_accuracy: 0.0018 - past_key_values_6_accuracy: 0.0017 - past_key_values_7_accuracy: 0.0018 - past_key_values_8_accuracy: 0.0015 - past_key_values_9_accuracy: 0.0015 - past_key_values_10_accuracy: 0.0017 - past_key_values_11_accuracy: 0.0019 - past_key_values_12_accuracy: 0.0018

In [None]:
print(history2)

In [None]:
# sess = gpt2.start_tf_sess()

# gpt2.finetune(sess,
#     file_name,
#     model_name=model_name,
#     checkpoint_dir=checkpoint_dir,
#     run_name=run_name,
#     steps=25,
# )

In [15]:
text = "Μια φορά και έναν καιρό, "
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')

# getting out output
beam_output = model.generate(
  input_ids,
  max_length = 100,
  num_beams = 10,
  temperature = 0.7,
  no_repeat_ngram_size=1,
  num_return_sequences=10,
  repetition_penalty=1.5,
  skip_special_tokens = True,
  clean_up_tokenization = True,
  early_stopping = True
)

print(tokenizer.decode(beam_output[0]))

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


Μια φορά και έναν καιρό,  ένας βασιλιάς που είχε έρθει ο πατέρας του.Ο πρίγκιπας ήταν πολύ μακριά από το σπίτι της είπε: Δεν είναι ότι δεν μπορούσε να τον πατέρα μου;Και τι θα σε μια μέρα η αλεπού για την άλλη ημέρα με τα μάτια σου! Θα γίνει καλά κι εμείς καλύτερα... Το βράδυ στο παλάτι των παιδιών μας πει ένα μικρό πρίγκιπα βασιλιά στον κήπο τους έλεγε πως οι άνθρωποι αλλά αν είμαι τόσο όμορφα χρόνια μαζί σας πω αυτό όμως δε πρέπει ν’ όλα αυτά τις δυο παιδιά


In [None]:
text = "Σε μία μακρινή "
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
beam_output = model.generate(
  input_ids,
  max_length = 500,
  num_beams = 10,
  #temperature = 0.1,
  no_repeat_ngram_size=1,
  num_return_sequences=8,
  repetition_penalty=1.5,
  skip_special_tokens = True,
  clean_up_tokenization = True,
  early_stopping = True
)

print(tokenizer.decode(beam_output[0]))

In [None]:
text = "Ο νεαρός βοσκός ήταν πολύ στεναχωρημένος"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
beam_output = model.generate(
  input_ids,
  max_length = 500,
  num_beams = 10,
  #temperature = 0.1,
  no_repeat_ngram_size=1,
  num_return_sequences=10,
  repetition_penalty=1.5,
  skip_special_tokens = True,
  clean_up_tokenization = True,
  early_stopping = True
)

print(tokenizer.decode(beam_output[0]))