In [1]:
from pathlib import Path

import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from transformers import set_seed

In [2]:


# Load the pretrained GPT-2 tokenizer and LM-head model from one shared checkpoint name.
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

In [3]:
def pandas_df(csv, base_dir=r"../../Resources/Cleaned/"):
    """Read ``<base_dir>/<csv>.csv`` into a DataFrame.

    Parameters
    ----------
    csv : str
        File name without the ``.csv`` extension.
    base_dir : str or path-like, optional
        Directory holding the CSV. Defaults to the notebook's cleaned-resources
        folder, so existing single-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    # Build the path with pathlib instead of string concatenation so the
    # directory works with or without a trailing separator.
    csv_path = Path(base_dir) / f"{csv}.csv"
    return pd.read_csv(csv_path)

def add_eos_to_examples(example):
    """Tokenize ``example['Text']`` and terminate it with the EOS token id.

    Parameters
    ----------
    example : mapping
        Row-like object exposing the text under the ``'Text'`` key.

    Returns
    -------
    dict
        ``{'input_ids': [...]}`` where the final id is always
        ``tokenizer.eos_token_id``.
    """
    # Reserve one position for EOS. Appending EOS to the text *before*
    # truncating lets the truncation cut the EOS marker off long examples.
    max_text_tokens = tokenizer.model_max_length - 1
    token_ids = tokenizer.encode(
        example['Text'], truncation=True, max_length=max_text_tokens
    )
    return {'input_ids': token_ids + [tokenizer.eos_token_id]}

In [4]:
# Load the cleaned corpus: pandas_df reads "<name>.csv" from the cleaned-resources folder.
corpus = pandas_df("dying_earth_corpus")

In [5]:
# Preview the first rows of the loaded corpus.
corpus.head()

Unnamed: 0,Title,Text,Is_Dying_Earth
0,The Dying Earth,"TURJAN SAT in his workroom, legs sprawled out ...",1
1,The Dying Earth,It was a thing to arouse pity—a great head on ...,1
2,The Dying Earth,"Turjan stood up, found a bowl of pap. With a l...",1
3,The Dying Earth,"Turjan put down the bowl, stood back and slowl...",1
4,The Dying Earth,Turjan sighed and left the room. He mounted wi...,1


In [6]:
# Remove leading/trailing whitespace with the vectorized string accessor.
# Unlike a per-row lambda calling x.strip(), this does not raise on missing
# (NaN) cells; it leaves them as NaN.
corpus['Text'] = corpus['Text'].str.strip()

In [7]:
# Store the token-id list itself in the column. Applying add_eos_to_examples
# row-wise would instead store its whole return dict ({'input_ids': [...]})
# in every cell of a column that is already named 'input_ids'.
corpus['input_ids'] = corpus['Text'].map(
    lambda text: add_eos_to_examples({'Text': text})['input_ids']
)

In [8]:
# Dump the corpus to a plain-text file, one passage per line, each terminated
# by the tokenizer's EOS marker so passage boundaries survive tokenization.
texts = corpus['Text'].tolist()  # Convert the 'Text' column to a list
# Write UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
# Windows) may fail on, or mis-encode, non-ASCII characters in the text.
# NOTE(review): TextDataset is expected to read this file back as UTF-8 — confirm.
with open('dying_earth_corpus.txt', 'w', encoding='utf-8') as file:
    file.writelines(text + tokenizer.eos_token + '\n' for text in texts)

In [9]:
# Build the training dataset from the text dump, using a block_size of 128.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='dying_earth_corpus.txt',
    block_size=128,
)

# Collator configured with mlm=False (no masked-language-model objective).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)



In [10]:
# Fine-tuning configuration: 3 epochs, batch size 4 per device; a checkpoint
# is written every 10,000 steps and at most 2 checkpoints are kept.
training_args = TrainingArguments(
    output_dir="./gpt2-dying-earth",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=10_000,
)

# Wire the model, data and configuration together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  0%|          | 0/2397 [00:00<?, ?it/s]

{'loss': 3.7975, 'learning_rate': 3.9570296203587817e-05, 'epoch': 0.63}
{'loss': 3.4838, 'learning_rate': 2.9140592407175638e-05, 'epoch': 1.25}
{'loss': 3.3241, 'learning_rate': 1.8710888610763455e-05, 'epoch': 1.88}
{'loss': 3.1913, 'learning_rate': 8.281184814351273e-06, 'epoch': 2.5}
{'train_runtime': 11669.2066, 'train_samples_per_second': 0.821, 'train_steps_per_second': 0.205, 'train_loss': 3.402296694109825, 'epoch': 3.0}


TrainOutput(global_step=2397, training_loss=3.402296694109825, metrics={'train_runtime': 11669.2066, 'train_samples_per_second': 0.821, 'train_steps_per_second': 0.205, 'train_loss': 3.402296694109825, 'epoch': 3.0})

In [11]:
# Persist the fine-tuned weights and the tokenizer files to separate folders.
model.save_pretrained("generator_model")
# Kept as the last expression so the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained("gen_tokenizer")


('gen_tokenizer\\tokenizer_config.json',
 'gen_tokenizer\\special_tokens_map.json',
 'gen_tokenizer\\vocab.json',
 'gen_tokenizer\\merges.txt',
 'gen_tokenizer\\added_tokens.json')

In [27]:
# Text Generation
prompt = "Ascolais"
# Keep the prompt tensor on the same device as the model so generate() also
# works when the model was trained on, or moved to, a GPU.
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
max_length = input_ids.shape[-1] + 200  # prompt tokens + up to 200 generated tokens

# Training leaves the model in training mode; switch to eval mode so dropout
# is disabled while sampling.
model.eval()

# Optional: set a seed so the sampled continuation is reproducible.
set_seed(42)

# Sample a continuation with temperature, top-k and nucleus (top-p) filtering.
output = model.generate(
    input_ids,
    max_length=max_length,
    do_sample=True,        # Enable sampling
    temperature=0.7,       # Adjust the temperature
    top_k=50,              # Use top-k sampling
    top_p=0.95,            # Use top-p (nucleus) sampling
    pad_token_id=tokenizer.eos_token_id,  # reuse the EOS id as the padding id
    attention_mask=input_ids.new_ones(input_ids.shape),  # attend to every prompt token
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)



Ascolais, your conduct is not unusual," said Ildefonse. "I am not sure how to assess the gravity of the matter."
