<div style="text-align:center;font-size:40pt"> imports </div>

In [1]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


<div style="text-align:center;font-size:40pt"> Load PreTrianed Model </div>

model : https://huggingface.co/akhooli/poetry2023

In [2]:
model_name = 'akhooli/poetry2023'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = TFGPT2LMHeadModel.from_pretrained(model_name, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.17.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.21.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'lm_head.weight', 'transformer.h.22.attn.masked_bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.23.attn.masked_bias', 'transformer.h.20.attn.masked_bias', 'transformer.h.18.attn.masked_bias']
-

<div style="text-align:center;font-size:40pt"> Load Dataset </div>

dataset: https://www.kaggle.com/ultrajack/modern-renaissance-poetry

In [3]:
df = pd.read_csv('Arabic_poetry_dataset.csv')
df = df.sample(frac=1)
df = df.sample(200)
lines = df['poem_text'].values.tolist()
lines = "\n".join(lines)
lines = lines.split("\n")
len(lines)

4432

<div style="text-align:center;font-size:40pt"> Remove Tashkeel </div>

In [4]:
import pyarabic.araby as araby
lines = [araby.strip_tashkeel(line) for line in lines]

<div style="text-align:center;font-size:40pt"> Tokenize the Dataset </div>

In [5]:
tokenizer.pad_token = tokenizer.eos_token
max_length = max([len(tokenizer.encode(line)) for line in lines])
print(f"max-len = {max_length}")

# Tokenize the lines
tokenized_lines = tokenizer(
    lines,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    add_special_tokens=True
)['input_ids']
print(f"original line: {lines[0]}")
print(f"tokenized line: {tokenized_lines[0]}")

max-len = 78
original line: مسئلة الدار غدت
tokenized line: [508, 3689, 7061, 53057, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


<div style="text-align:center;font-size:40pt"> Create Sequences and labels  </div>

In [6]:
# Flatten the tokenized lines
input_sequences = [line[:-1] for line in tokenized_lines]
labels = [line[1:] for line in tokenized_lines]
print(f"original line: {lines[0]}")
print(f"input sequence: {input_sequences[0]}")
print(f"input sequence as text: {tokenizer.decode(input_sequences[0])}")
print(f"label: {labels[0]}")
print(f"label as text: {tokenizer.decode(labels[0])}")

original line: مسئلة الدار غدت
input sequence: [508, 3689, 7061, 53057, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
input sequence as text: مسئلة الدار غدت<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endof

<div style="text-align:center;font-size:40pt"> Compile the model </div>

In [7]:
# Fine-tuning parameters
epochs = 5
learning_rate = 1e-5

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))


<div style="text-align:center;font-size:40pt"> Train the model </div>

In [10]:
# Fine-tuning
validation_interval = 1  # Epoch interval for calculating perplexity
train_data = list(zip(input_sequences, labels))

# Pre-training perplexity calculation
pretrain_loss = model.evaluate(input_sequences, labels, verbose=0)
pretrain_perplexity = tf.exp(pretrain_loss)

# Comprehension table for perplexity
perplexity_table = pd.DataFrame(columns=['Epoch', 'Perplexity'])
perplexity_table.loc[0] = [0, pretrain_perplexity]


In [11]:

print(f"Pretraining Perplexity: {pretrain_perplexity}")


Pretraining Perplexity: 33586.4140625


In [16]:

for epoch in range(1, epochs + 1):
    print(f"Epoch {epoch}/{epochs}")
    history = model.fit(input_sequences,labels, batch_size=8, epochs=1)
    
    # Calculate perplexity at validation interval epochs
    if epoch % validation_interval == 0:
        loss = model.evaluate(input_sequences, labels, verbose=0)
        perplexity = tf.exp(loss)
        perplexity_table.loc[epoch] = [epoch, perplexity]
        print(f"Perplexity at Epoch {epoch}: {perplexity}")


Epoch 1/5


TypeError: 'NoneType' object is not callable

<div style="text-align:center;font-size:40pt"> Test the model output </div>

In [11]:
seed_test = "الشتاء"
input_ids = tokenizer.encode(seed_test, return_tensors='tf')

sample_outputs = model.generate(
    input_ids, # The input sequence encoded as token IDs.
    do_sample=True,
    max_length=100,  # The maximum length of the generated output.
    top_k=0,
    top_p=0.9,
    temperature=1,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id
)

output = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
output = output.replace('-', '\n')
print(output)


In [27]:
# create validation sequences
df = pd.read_csv('Arabic_poetry_dataset.csv')
df = df.sample(frac=1)
df = df.sample(30)
lines = df['poem_text'].values.tolist()
lines = "\n".join(lines)
lines = lines.split("\n")
len(lines)

1238

In [28]:
tokenizer.pad_token = tokenizer.eos_token
max_length = max([len(tokenizer.encode(line)) for line in lines])
print(f"max-len = {max_length}")
# Tokenize the lines
tokenized_lines = tokenizer(
    lines,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    add_special_tokens=True
)['input_ids']

max-len = 56


In [29]:
# Flatten the tokenized lines
validation_sequences = [line[:-1] for line in tokenized_lines]
validation_labels = [line[1:] for line in tokenized_lines]

In [31]:
# Calculate perplexity
loss = model.evaluate(validation_sequences, validation_labels, verbose=0)
perplexity = tf.exp(loss)
print(f"Perplexity: {perplexity}")

Perplexity: 18.51316833496094


In [12]:
# # save model
# model.save_pretrained('models/arabic/gpt2-poems.ar')
# # save tokenizer
# tokenizer.save_pretrained('models/arabic/gpt2-poems-tokenizer.ar')