In [9]:
!pip install transformers -U
!pip install accelerate -U
!pip install torch==2.1.0

^C




In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

In [None]:
mod_path = "/content/drive/MyDrive/my_model_folder/generator_model"
tok_path = "/content/drive/MyDrive/my_model_folder/gen_tokenizer"

model = GPT2LMHeadModel.from_pretrained(mod_path)
tokenizer = GPT2Tokenizer.from_pretrained(tok_path)

In [None]:
# Load original dataset
dataset_path = "/content/drive/MyDrive/Colab Notebooks/04-Machine-Learning/dying_earth_corpus.txt"
with open(dataset_path, 'r', encoding='utf-8') as file:
    original_lines = file.readlines()

In [None]:
print(original_lines[0])

TURJAN SAT in his workroom, legs sprawled out from the stool, back against and elbows on the bench. Across the room was a cage; into this Turjan gazed with rueful vexation. The creature in the cage returned the scrutiny with emotions beyond conjecture.<|endoftext|>



In [None]:
# Data collator used for dynamic padding and combining batch data
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)


In [None]:
train_dataset = TextDataset(
    tokenizer=tokenizer, file_path=dataset_path, block_size=400
)



In [None]:
# Assuming train_dataset is your training dataset instance
train_dataloader = DataLoader(
    train_dataset,
    batch_size=1,  # or whatever batch size you are using
    shuffle=False,  # This ensures data is shuffled each epoch
)


In [None]:

training_args = TrainingArguments(
    output_dir="./gpt2-dying-earth",  # Output directory for model checkpoints
    overwrite_output_dir=True,  # Overwrite the content of the output directory
    num_train_epochs=20,  # Number of epochs (will be overridden by max_steps)
    per_device_train_batch_size=1,  # Batch size per device
    save_steps=1_000,  # Saving frequency for model checkpoints
    save_total_limit=5,  # Max number of saved checkpoints
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=250,  # Logging frequency
    load_best_model_at_end=False,  # Load the best model at the end of training
    metric_for_best_model='loss',  # Metric for evaluating the best model
    greater_is_better=False,  # Lower loss indicates a better model
    fp16=True,  # Use mixed precision training if GPU supports it
    learning_rate=1e-5,  # Learning rate
    weight_decay=0.01,  # Weight decay for L2 regularization
)

# Create a new instance of Trainer or reset the existing one
trainer = Trainer(
    model=model,  # assuming 'model' is your pre-trained GPT-2 model
    args=training_args,  # 'training_args' should be your TrainingArguments instance
    data_collator=data_collator,  # 'data_collator' for handling batching
    train_dataset=train_dataset,  # your training dataset
)

# Start training
trainer.train()

# Train the model
trainer.train()

Step,Training Loss
250,1.6522
500,1.6424
750,1.6407
1000,1.6498
1250,1.6009
1500,1.5723
1750,1.5851
2000,1.6082
2250,1.5406
2500,1.5635


Step,Training Loss
250,1.232
500,1.2682
750,1.2704
1000,1.272
1250,1.224
1500,1.2289
1750,1.2457
2000,1.2224
2250,1.2036
2500,1.2137


TrainOutput(global_step=20440, training_loss=1.0714999936098222, metrics={'train_runtime': 1177.1575, 'train_samples_per_second': 17.364, 'train_steps_per_second': 17.364, 'total_flos': 4172507136000000.0, 'train_loss': 1.0714999936098222, 'epoch': 20.0})

In [None]:
# Creating a list of 100 prompt starters following the specified parameters

prompt_starters = []

# Adding prompts focused on nature themes
nature_themed_prompts = [
    "Trees whispered in",
    "Sunlight bathed the",
    "Moonlight shone on",
    "Stars twinkled above",
    "Winds howled through",
    "Rivers murmured in",
    "Leaves rustled under",
    "Mountains loomed over",
    "Fog enveloped the",
    "Rain pattered on",
    "Snow blanketed the",
    "Clouds drifted over",
    "Dew glistened on",
    "Thunder echoed in",
    "Birds sang in",
    "Flowers bloomed in",
    "Vines climbed the",
    "Mist shrouded the",
    "Fireflies glowed in",
    "Crickets chirped in",
    "Frost covered the",
    "Breezes caressed the",
    "Waves crashed against",
    "Seas roared around"
]

# Adding mystical prompts
mystical_prompts = [
    "Magic swirled around",
    "Spirits danced in",
    "Auras glimmered in",
    "Portals opened in",
    "Visions appeared in",
    "Ghosts roamed the",
    "Witches chanted in",
    "Dragons soared over",
    "Enchantments wove through",
    "Sorcerers summoned in",
    "Runes glowed on",
    "Curses lingered in",
    "Golems trudged through",
    "Phantoms floated over",
    "Alchemy transformed the",
    "Crystals hummed in",
    "Fairies flitted around",
    "Griffins perched on",
    "Elixirs bubbled in",
    "Charms activated in"
]

# Adding prompts that are the beginning of a scenic descriptive passage
scenic_prompts = [
    "Valleys stretched below",
    "Sunsets painted the",
    "Dawns broke over",
    "Horizons merged at",
    "Skylines dominated the",
    "Landscapes changed beyond",
    "Sceneries shifted near",
    "Views expanded towards",
    "Panoramas unfolded across",
    "Tableaus formed in"
]

# Adding prompts containing references to the Dying Earth series
dying_earth_prompts = [
    "Cugel navigated through",
    "Rhialto pondered in",
    "Iucounu smirked at",
    "T'sais wandered in",
    "Turjan studied in",
    "Liane gazed upon",
    "Guyal sought answers in",
    "Firx scampered across",
    "Deodarr roamed the",
    "Ildefonse plotted in",
    "Zaraide meditated in",
    "Faide pursued through",
    "Pharesm guarded the",
    "Pelgrane flew over",
    "Baleful stars glinted above",
    "Scaum river flowed by",
    "Vermoulian schemed in",
    "Derwe Coreme admired",
    "Grue lurked in",
    "Sandestins worked in",
    "Spell-Wrights chanted in",
    "Mazirian's magic wove around",
    "Dolm's curse lingered in",
    "Dying Earth's sun set on",
    "Vancian mysteries unfolded in"
]

# Combining all the prompts
prompt_starters.extend(nature_themed_prompts)
prompt_starters.extend(mystical_prompts)
prompt_starters.extend(scenic_prompts)
prompt_starters.extend(dying_earth_prompts)

# Ensuring we have 100 unique prompts
unique_prompt_starters = list(set(prompt_starters))

# Expanding the existing lists and adding new prompts
expanded_nature_themed_prompts = [
    "Trees whispered in", "Sunlight bathed the", "Moonlight shone on",
    "Stars twinkled above", "Winds howled through", "Rivers murmured in",
    "Leaves rustled under", "Mountains loomed over", "Fog enveloped the",
    "Rain pattered on", "Snow blanketed the", "Clouds drifted over",
    "Dew glistened on", "Thunder echoed in", "Birds sang in", "Flowers bloomed in",
    "Vines climbed the", "Mist shrouded the", "Fireflies glowed in",
    "Crickets chirped in", "Frost covered the", "Breezes caressed the",
    "Waves crashed against", "Seas roared around", "Brooks babbled through",
    "Hills rolled under", "Valleys yawned below", "Glaciers crept over",
    "Oceans stretched beyond", "Storms raged over", "Forests stood around",
    "Meadows spread beneath", "Lakes shimmered under", "Streams trickled through",
    "Deserts baked under", "Canyons echoed with", "Prairies swayed in",
    "Islands emerged from", "Volcanoes erupted over", "Marshes squelched under"
]

expanded_mystical_prompts = [
    "Magic swirled around", "Spirits danced in", "Auras glimmered in",
    "Portals opened in", "Visions appeared in", "Ghosts roamed the",
    "Witches chanted in", "Dragons soared over", "Enchantments wove through",
    "Sorcerers summoned in", "Runes glowed on", "Curses lingered in",
    "Golems trudged through", "Phantoms floated over", "Alchemy transformed the",
    "Crystals hummed in", "Fairies flitted around", "Griffins perched on",
    "Elixirs bubbled in", "Charms activated in", "Oracles foretold in",
    "Necromancers conjured in", "Demons lurked in", "Angels descended on",
    "Totems stood in", "Amulets shone under", "Wizards pondered in",
    "Spells echoed through", "Shamans communicated with", "Chimeras roamed the"
]

expanded_scenic_prompts = [
    "Valleys stretched below", "Sunsets painted the", "Dawns broke over",
    "Horizons merged at", "Skylines dominated the", "Landscapes changed beyond",
    "Sceneries shifted near", "Views expanded towards", "Panoramas unfolded across",
    "Tableaus formed in", "Vistas revealed the", "Sunrises lit up",
    "Twilights settled over", "Tundras lay under", "Savannas spread across",
    "Oases appeared in", "Fjords cut through", "Plateaus rose above",
    "Badlands sprawled under", "Steppes swept across"
]

expanded_dying_earth_prompts = [
    "Cugel navigated through", "Rhialto pondered in", "Iucounu smirked at",
    "T'sais wandered in", "Turjan studied in", "Liane gazed upon",
    "Guyal sought answers in", "Firx scampered across", "Deodarr roamed the",
    "Ildefonse plotted in", "Zaraide meditated in", "Faide pursued through",
    "Pharesm guarded the", "Pelgrane flew over", "Baleful stars glinted above",
    "Scaum river flowed by", "Vermoulian schemed in", "Derwe Coreme admired",
    "Grue lurked in", "Sandestins worked in", "Spell-Wrights chanted in",
    "Mazirian's magic wove around", "Dolm's curse lingered in", "Dying Earth's sun set on",
    "Vancian mysteries unfolded in", "Zamilon plotted beneath", "Klarkash-Ton brooded over",
    "Blikdak delved into", "Pilgrims traveled to", "Mummers acted in"
]

# Combining all the prompts into a single list
all_prompts = (expanded_nature_themed_prompts + expanded_mystical_prompts +
               expanded_scenic_prompts + expanded_dying_earth_prompts)

# Ensuring uniqueness and sufficient number of prompts
unique_prompt_starters = list(set(all_prompts))
while len(unique_prompt_starters) < 100:
    unique_prompt_starters.append(f"New unique prompt {len(unique_prompt_starters)+1}")

# Checking the total number of prompts and displaying a few examples
len(unique_prompt_starters), unique_prompt_starters[:5]


In [None]:
import pickle
with open('prompt_starters_list.pickle', 'wb') as file:
    pickle.dump(prompt_starters, file)

In [None]:
prompt = random.choice(prompt_starters)
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Move input_ids to the same device as the model
input_ids = input_ids.to(model.device)

max_length = len(input_ids.tolist()[0]) + 600  # Increase max length for longer outputs

# Generate and decode text
from transformers import set_seed

# Optional: Set a seed for reproducibility

# Adjusting generation parameters
output = model.generate(
    input_ids,
    max_length=max_length,  # Updated max length
    do_sample=True,        # Enable sampling
    temperature=0.5,       # Adjust the temperature if needed
    top_k=60,              # Use top-k sampling
    top_p=0.75,            # Use top-p (nucleus) sampling
    pad_token_id=tokenizer.eos_token_id,
    attention_mask=input_ids.new_ones(input_ids.shape)
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


NameError: name 'random' is not defined