In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the notebook reads and writes files under this mount.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/gdrive


## Install the necessary libraries

In [None]:
!pip install tokenizers
!pip install transformers

Collecting tokenizers
  Downloading tokenizers-0.11.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 4.8 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.2
Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 71.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |███████████████████████

## import libraries

In [None]:
import tensorflow as tf
import os

import tokenizers
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

from pathlib import Path


## class with all the tokenization steps to process all the input files

In [None]:
class BPE_token(object):
    """Thin wrapper around a byte-level BPE `Tokenizer`: build, train, save.

    The wrapped tokenizer uses a `BPE` model with a `ByteLevel` pre-tokenizer
    and a matching `ByteLevel` decoder. No normalizer is configured.
    """

    # Special tokens registered with the trainer, in this order.
    SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    def __init__(self):
        """Create an untrained byte-level BPE tokenizer."""
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths, vocab_size=25000):
        """Train the BPE model on the given text files.

        Args:
            paths: list of paths (strings) to the training text files.
            vocab_size: target vocabulary size passed to the trainer
                (defaults to 25000).
        """
        trainer = BpeTrainer(
            vocab_size=vocab_size,
            show_progress=True,
            # The keyword must be spelled `initial_alphabet`; the previous
            # misspelling meant the byte-level alphabet was not passed under
            # the name the trainer expects.
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=list(self.SPECIAL_TOKENS),
        )
        self.tokenizer.train(files=paths, trainer=trainer)

    def save_tokenizer(self, location, prefix=None):
        """Save the trained BPE model files into the `location` directory.

        Args:
            location: target directory; created (with parents) if missing.
            prefix: optional prefix forwarded to the model's `save` call.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(location, exist_ok=True)
        self.tokenizer.model.save(location, prefix)

## read all the input files 

In [None]:

# Collect every file whose name ends in "processed.txt" under the Drive data
# folder (searched recursively). The list is sorted because glob order depends
# on the filesystem, and the corpus order should be the same on every run.
paths = sorted(str(x) for x in Path("./gdrive/MyDrive/project/data/").glob("**/*processed.txt"))

paths

['gdrive/MyDrive/project/data/corpus_processed.txt',
 'gdrive/MyDrive/project/data/paidika_paramythia_processed.txt',
 'gdrive/MyDrive/project/data/paramithia_processed.txt',
 'gdrive/MyDrive/project/data/paramithia2_processed.txt']

## perform tokenization 

In [None]:
# Use a dedicated name for the BPE trainer wrapper: `tokenizer` is bound to a
# GPT2Tokenizer later in the notebook, which is a different kind of object.
bpe_tokenizer = BPE_token()
# train the tokenizer model on the collected text files
bpe_tokenizer.bpe_train(paths)
# save the trained tokenizer model files into the model folder
save_path = 'gdrive/MyDrive/project/model'
bpe_tokenizer.save_tokenizer(save_path)

## load and config GPT-2

In [None]:
# Load the tokenizer files saved under `save_path` as a GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
# Tell the tokenizer which strings play the special-token roles.
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# Build the model configuration.
# - len(tokenizer) counts the base vocabulary plus any added tokens, whereas
#   tokenizer.vocab_size excludes added tokens; the embedding matrix must cover
#   every id the tokenizer can emit.
# - pad_token_id is set explicitly so generation does not have to fall back to
#   the eos token id for padding.
config = GPT2Config(
  vocab_size=len(tokenizer),
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id,
  pad_token_id=tokenizer.pad_token_id
)
# Create a freshly initialised (untrained) model from the configuration.
model = TFGPT2LMHeadModel(config)

file gdrive/MyDrive/project/model/config.json not found


## Merge all the fairy tales into one string

In [None]:
# Read every corpus file as UTF-8 and append the tokenizer's end-of-sequence
# token after each one, then concatenate everything into a single string.
single_string = ''.join(
    Path(corpus_file).read_text(encoding='utf-8') + tokenizer.eos_token
    for corpus_file in paths
)
# Encode the whole corpus into one flat list of token ids.
string_tokenized = tokenizer.encode(single_string)

## create the input dataset for the model

In [None]:
block_size = 100    # number of token ids per training example
BATCH_SIZE = 12
BUFFER_SIZE = 1000  # shuffle buffer size

# Cut the token stream into consecutive, non-overlapping blocks of
# `block_size` ids; a trailing partial block is dropped.
examples = [
    string_tokenized[start:start + block_size]
    for start in range(0, len(string_tokenized) - block_size + 1, block_size)
]

# Next-token prediction: the labels are the inputs shifted left by one position.
inputs = [block[:-1] for block in examples]
labels = [block[1:] for block in examples]

dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

## define the training hyperparameters

In [None]:
# Optimizer.
# clipnorm: gradient-norm clipping. When the L2 norm of a gradient (the square
#   root of the sum of its squared values) exceeds the threshold, the gradient
#   is rescaled so that its norm equals the threshold (1.0 here). Keras applies
#   this to the gradient of each weight individually.
# epsilon: small constant that guards against division by zero in Adam's update.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=6e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)

# Loss function; from_logits=True because it is applied to raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Metric reported during training.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# One loss entry per model output: the real loss for the first output and None
# for `n_layer` further outputs (presumably the per-layer cached states
# returned alongside the logits; verify against the installed transformers version).
per_output_losses = [loss] + [None] * model.config.n_layer

# Compile the model.
model.compile(optimizer=optimizer, loss=per_output_losses, metrics=[metric])

## train the model

In [None]:
num_epoch = 35
checkpoint_filepath = 'gdrive/MyDrive/project/model/checkpoint'

# Save only the model weights at the end of an epoch, and only when the
# training loss is the best seen so far. There is no validation split, so the
# training loss is the monitored quantity. Without save_weights_only /
# save_best_only the callback would write the full model after every epoch.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='loss',
    mode='min',
    save_best_only=True)

history = model.fit(dataset, epochs=num_epoch, callbacks=[model_checkpoint_callback])

# To restore the best weights recorded by the checkpoint callback, run:
# model.load_weights(checkpoint_filepath)

# Export the trained model in Hugging Face format into the model folder
# (`save_path` is the folder that already holds the tokenizer files).
model.save_pretrained(save_directory = save_path, save_config = True)

## load weights and continue the training process

In [None]:
 
# Define the paths before they are used so this cell does not depend on the
# training cell above having defined them in the same session.
checkpoint_filepath = 'gdrive/MyDrive/project/model/checkpoint'
save_path = 'gdrive/MyDrive/project/model'
num_epoch = 5

# load_weights restores the weights into `model` in place; it does not return
# a model, so its return value is not kept.
model.load_weights(checkpoint_filepath)

# Continue training for a few more epochs, checkpointing with the same callback.
history2 = model.fit(dataset, epochs=num_epoch, callbacks=[model_checkpoint_callback])

# Export the updated model in Hugging Face format.
model.save_pretrained(save_directory = save_path, save_config = True)

## generate short stories

In [None]:
text = "Ο νεαρός βοσκός ήταν πολύ στεναχωρημένος"
# Encode the prompt as a TensorFlow tensor of token ids.
input_ids = tokenizer.encode(text, return_tensors='tf')

# Beam-search generation. Only generation options are passed here; the
# decoding options (skip_special_tokens, clean-up of tokenization spaces)
# belong to tokenizer.decode and are applied below.
# NOTE(review): no_repeat_ngram_size=1 forbids any token from appearing twice
# in a sequence, which is very restrictive for 500 tokens; consider 2 or 3.
beam_output = model.generate(
  input_ids,
  max_length = 500,
  num_beams = 10,
  no_repeat_ngram_size=1,
  num_return_sequences=10,
  repetition_penalty=1.5,
  early_stopping = True
)

# Print the first returned sequence, with special tokens stripped.
print(tokenizer.decode(
  beam_output[0],
  skip_special_tokens=True,
  clean_up_tokenization_spaces=True
))

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


Ο νεαρός βοσκός ήταν πολύ στεναχωρημένος και η τρομάρα του είπαν ότι είχε έρθει καλά από τη χαρά με εδώ!Σαν έγινε ο Παπουτσωμένος Γάτος, στο μεταξύ τους δέχτηκε.Όταν κάποιος βασιλιάς μπήκε στην πόρτα αυτή την τύχη αυτό να μπει μέσα σ’ ένα λιβάδι δίπλα σε έναν μεγάλο διάδρομο όπου ζούσε ένας κακός άνθρωπος τις τρεις γιούς της κόρης."Τι ψάχνεις; ρώτησαν μαζί μας;.Τα ποντίκια απάντησαν: Να μου τα μάγια κοιτάχτηκαν γύρω μ΄ όλο τον κόσμο ή εσύ!, είπε αποφασιστικά το κορίτσι που θα έλεγες αν δούμε τι κάνεις για πες σας δώσω μια φοβερή έκπληξη!!Την ίδια όμως ακόμη μεγαλύτερο γιο σου;""Όχι απάντησε ειρωνικά.– Αν θέλεις!.Ναι λοιπόν ποιο δώρο μην πάω εκεί κοντά στη μέση ενός κήπου δεν πειράζει όσο πιο πέρα δώθε κάτω – όπως κάνουν μία φορά δε γυρεύεις πάρα πολλά χρόνια αλλά όταν έχει αξία...Έτσι ήρθε μόνο αυτός έμεινε κλεισμένος στον πύργο μακριά κι άλλες χώρες....Μια μέρα ξέρεις πόσο ευτυχισμένη είναι ανίκανος είμαι μοναχούφιας λέει· γιατί είσαι πλούσιος κύριος Γιώργος μεγάλωσε μπροστά στους άλλ