<div style="text-align:center;font-size:40pt"> imports </div>

In [1]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


<div style="text-align:center;font-size:40pt"> Load PreTrianed Model </div>

In [2]:
# Load pretrained BART model and tokenizer
model_name = 'ismaelfaro/gpt2-poems.en'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = TFGPT2LMHeadModel.from_pretrained(model_name,from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.11.attn.masked_bias', 'lm_head.weight', 'transformer.h.0.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.4.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica

<div style="text-align:center;font-size:40pt"> Load Dataset </div>

In [3]:
df = pd.read_csv('merged_data.csv')
df = df.sample(frac=1)
df = df.sample(100)
lines = df['Verse'].values.tolist()

<div style="text-align:center;font-size:40pt"> Tokenize the Dataset </div>

In [4]:
tokenizer.pad_token = tokenizer.eos_token
max_length = max([len(tokenizer.encode(line)) for line in lines])
print(f"max-len = {max_length}")
# Tokenize the lines
tokenized_lines = tokenizer(
    lines,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    add_special_tokens=True
)['input_ids']

max-len = 14


In [5]:
print(f"original line: {lines[0]}")
print(f"tokenized line: {tokenized_lines[0]}")

original line: a good man seen though silent counsel gives
tokenized line: [64, 922, 582, 1775, 996, 10574, 7739, 3607, 50256, 50256, 50256, 50256, 50256, 50256]


<div style="text-align:center;font-size:40pt"> Create Sequences and labels  </div>

In [6]:
# Flatten the tokenized lines
input_sequences = [line[:-1] for line in tokenized_lines]
labels = [line[1:] for line in tokenized_lines]

In [7]:
print(f"original line: {lines[0]}")
print(f"input sequence : {input_sequences[0]}")
print(f"input sequence as text: {tokenizer.decode(input_sequences[0])}")
print(f"label: {labels[0]}")
print(f"label as text: {tokenizer.decode(labels[0])}")


original line: a good man seen though silent counsel gives
input sequence : [64, 922, 582, 1775, 996, 10574, 7739, 3607, 50256, 50256, 50256, 50256, 50256]
input sequence as text: a good man seen though silent counsel gives<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
label: [922, 582, 1775, 996, 10574, 7739, 3607, 50256, 50256, 50256, 50256, 50256, 50256]
label as text:  good man seen though silent counsel gives<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


<div style="text-align:center;font-size:40pt"> Compile the model </div>

In [8]:
# Fine-tuning parameters
epochs = 5
learning_rate = 1e-5
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

<div style="text-align:center;font-size:40pt"> Train the model </div>

In [9]:
# Fine-tuning
model.fit(input_sequences,labels,batch_size = 8, epochs=epochs)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2be8debb788>

<div style="text-align:center;font-size:40pt"> Test the model output </div>

In [10]:
seed_test = "sadness"
input_ids = tokenizer.encode(seed_test, return_tensors='tf')

sample_outputs = model.generate(
    input_ids, # The input sequence encoded as token IDs.
    do_sample=True,
    max_length=100,  # The maximum length of the generated output.
    top_k=0,
    top_p=0.9,
    temperature=1,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id
)

output = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
output = output.replace('-', '\n')
print(output)


sadnessIs dilapidationDespite your rush, despite your witThe weightless whisper of the snail, The powerless thought of the coal, The lossless peace of the ocean breeze, The hopeless weightless agony of your soul on this stone yo
west.By Sidney 
 July 15,2008so often my sadness sends you to sleep  \nso very often my fingers walk over you with tears  \nyour eyes so gently squeezed as the sun remembers your childhood  \nI


In [11]:
# create validation sequences
df = pd.read_csv('merged_data.csv')
df = df.sample(frac=1)
df = df.sample(2000)
lines = df['Verse'].values.tolist()

In [12]:
tokenizer.pad_token = tokenizer.eos_token
max_length = max([len(tokenizer.encode(line)) for line in lines])
print(f"max-len = {max_length}")
# Tokenize the lines
tokenized_lines = tokenizer(
    lines,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    add_special_tokens=True
)['input_ids']

max-len = 18


In [13]:
# Flatten the tokenized lines
validation_sequences = [line[:-1] for line in tokenized_lines]
validation_labels = [line[1:] for line in tokenized_lines]

In [17]:
# Calculate perplexity
loss = model.evaluate(validation_sequences, validation_labels, verbose=0)
perplexity = tf.exp(loss)
print(f"Perplexity: {perplexity}")

Perplexity: 22.884700775146484


In [16]:
# # save model
# model.save_pretrained('models/english/gpt2-poems.en')
# # save tokenizer
# tokenizer.save_pretrained('models/english/gpt2-poems-tokenizer.en')

('models/english/gpt2-poems-tokenizer.en\\tokenizer_config.json',
 'models/english/gpt2-poems-tokenizer.en\\special_tokens_map.json',
 'models/english/gpt2-poems-tokenizer.en\\vocab.json',
 'models/english/gpt2-poems-tokenizer.en\\merges.txt',
 'models/english/gpt2-poems-tokenizer.en\\added_tokens.json')