In [2]:
from google.colab import drive

drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
!pip install tokenizers
!pip install transformers



In [3]:
import tensorflow as tf
import os

import tokenizers
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer


In [5]:
class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        # self.tokenizer.normalizer = Sequence([
        #     NFKC()
        # ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths):
        trainer = BpeTrainer(vocab_size=5000, show_progress=True, inital_alphabet=ByteLevel.alphabet(), special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>"
        ])
        self.tokenizer.train(trainer = trainer, files = paths)

    def save_tokenizer(self, location, prefix=None):
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)

In [6]:
from pathlib import Path
import os

# the folder 'text' contains all the files
paths = [str(x) for x in Path("./gdrive/MyDrive/project/data/").glob("**/*.txt")]


In [8]:
tokenizer = BPE_token()
# train the tokenizer model
tokenizer.bpe_train(paths)
# saving the tokenized data in our specified folder 
save_path = 'gdrive/MyDrive/project/data/tokenizer'
tokenizer.save_tokenizer(save_path)

In [11]:

# loading tokenizer from the saved model path
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)
# creating the model
model = TFGPT2LMHeadModel(config)

file gdrive/MyDrive/project/data/tokenizer/config.json not found


In [12]:
single_string = ''
for filename in paths:
  with open(filename, "r", encoding='utf-8') as f:
   x = f.read()
  single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)

In [13]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []
for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [14]:
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [15]:
num_epoch = 1
history = model.fit(dataset, epochs=num_epoch)



In [35]:
text = "Μια φορά και έναν καιρό, "
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
beam_output = model.generate(
  input_ids,
  max_length = 100,
  num_beams = 10,
  temperature = 0.7,
  no_repeat_ngram_size=1,
  num_return_sequences=10
)

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


In [36]:
print(tokenizer.decode(beam_output[0]))

Μια φορά και έναν καιρό, 
- ««Π να τον είπε ο μικρός θα το σπίτι του. Ο βασιλιάς που δεν είναι η πριγκίπισσα μου με τα κα από την αλεπού! Δεν' ένα μικρό ήταν είχε σε μια σου της; Η πρίγκιπας στο βράδυ για τις κερα στην αιο». Το μην τους κι αν αυτό οι παιδιά: Θα’ τι πιο κάθε παρνγτείάκπυκασε στον κάνεις στα γίνει πολύ... Ετο μας


In [47]:
text = "Σε μία μακρινή"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
beam_output = model.generate(
  input_ids,
  max_length = 100,
  num_beams = 10,
  #temperature = 0.1,
  no_repeat_ngram_size=1,
  num_return_sequences=8,
  repetition_penalty=1.5
)

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


In [48]:
print(tokenizer.decode(beam_output[0]))

Σε μία μακρινή.
- «Ο μικρός, ο βασιλιάς και το σπίτι του είπε η πριγκίπισσα με την κα από τον βράδυ που θα τα παιδιά να ένα μικρό είχε μια ήταν δεν είναι στο κ μου της! Ο κυρ στην Αερρα για τις γίνει σε αυτό οι μην τους: Το κάνει σου; Η Εινκπγυάσε στον ακαίλα». Δεν'’ πολύ τη διαηο στα αλεπούειτεόής ότι αλλά τι αν κι αυτά


In [None]:
# Preprocessing ideas
# 1. remove « , - , '
# 2. remove empty lines
# 3. remove line change [\r\n]+
#         import re
#         s = """cat
#                dog"""
#  
#         re.match(r'cat\ndog',s,re.M)