In [None]:
from pathlib import Path

import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from transformers import set_seed

In [None]:


# Tokenizer and language model are both loaded from the same pretrained checkpoint name.
PRETRAINED_CHECKPOINT = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
def pandas_df(csv, data_dir="../../Resources/Cleaned/"):
    """Load a CSV file from the cleaned-resources directory into a DataFrame.

    Args:
        csv: File name without the ``.csv`` extension.
        data_dir: Directory that contains the file. Defaults to the
            notebook-relative ``../../Resources/Cleaned/`` folder, so existing
            single-argument calls keep reading from the same location.

    Returns:
        pandas.DataFrame with the parsed contents of ``<data_dir>/<csv>.csv``.
    """
    # Build the path with pathlib rather than string concatenation so a
    # directory given with or without a trailing separator both work.
    csv_path = Path(data_dir) / f"{csv}.csv"
    return pd.read_csv(csv_path)

def add_eos_to_examples(example):
    """Tokenize one corpus row and terminate it with the end-of-sequence token.

    Args:
        example: Mapping-like row with a ``'Text'`` string entry.

    Returns:
        dict with a single key ``'input_ids'`` holding the list of token ids
        for the text followed by the tokenizer's EOS token id.
    """
    # Appending the EOS string before truncating lets truncation cut the EOS
    # off over-long texts. Truncate the text alone, leaving one slot, and
    # append the EOS id afterwards so it is always the final token.
    max_text_tokens = tokenizer.model_max_length - 1
    text_ids = tokenizer.encode(
        example['Text'], truncation=True, max_length=max_text_tokens
    )
    return {'input_ids': text_ids + [tokenizer.eos_token_id]}

In [None]:
# Load the corpus CSV (identified by its file stem) through the loader helper.
CORPUS_NAME = "dying_earth_corpus"
corpus = pandas_df(CORPUS_NAME)

In [None]:
# Preview the first five rows of the loaded corpus.
corpus.head(n=5)

In [None]:
# Remove leading/trailing whitespace using the vectorized string accessor.
# Unlike a per-row lambda calling x.strip(), missing values pass through as NaN
# instead of raising AttributeError.
corpus['Text'] = corpus['Text'].str.strip()

In [None]:
# add_eos_to_examples returns {'input_ids': [...]}; unwrap that dict so the
# column stores the token-id lists themselves rather than one-key dicts.
corpus['input_ids'] = corpus.apply(
    lambda row: add_eos_to_examples(row)['input_ids'],
    axis=1,
)

In [None]:
texts = corpus['Text'].tolist()  # Convert the 'Text' column to a list

# Write one document per line, each terminated by the EOS token.
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# default encoding and non-ASCII characters are written consistently.
with open('dying_earth_corpus.txt', 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(text + tokenizer.eos_token + '\n' for text in texts)

In [None]:
# Build the training dataset from the exported corpus file.
# NOTE(review): TextDataset is reported as deprecated upstream — confirm against
# the installed transformers version before relying on it long-term.
CORPUS_FILE = 'dying_earth_corpus.txt'
BLOCK_SIZE = 128
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=CORPUS_FILE,
    block_size=BLOCK_SIZE,
)

# mlm=False: the collator is configured without the masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# Fine-tuning settings, gathered in one mapping so they are easy to scan and tweak
training_config = dict(
    output_dir="./gpt2-dying-earth",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)
training_args = TrainingArguments(**training_config)

# Wire the model, the data, and the settings together
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run fine-tuning; the returned training summary is shown as the cell output
trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer, each to its own directory.
MODEL_DIR = "generator_model"
TOKENIZER_DIR = "gen_tokenizer"
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(TOKENIZER_DIR)


In [None]:
# Text Generation
prompt = "Ascolais"

# Optional: fix the seed so the sampled continuation is reproducible
set_seed(42)

# The model may still be in training mode after the Trainer run; switch to
# eval mode so dropout does not perturb the sampled output.
model.eval()

# Encode the prompt and place it on the model's device. Training can leave the
# model on an accelerator, and a CPU tensor would then cause a device mismatch.
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
max_length = input_ids.shape[-1] + 200  # prompt length + 200 tokens; adjust as needed

# Sample a continuation
output = model.generate(
    input_ids,
    max_length=max_length,
    do_sample=True,        # Enable sampling
    temperature=0.7,       # Adjust the temperature
    top_k=50,              # Use top-k sampling
    top_p=0.95,            # Use top-p (nucleus) sampling
    pad_token_id=tokenizer.eos_token_id,
    # Single unpadded prompt: every position is attended to
    attention_mask=input_ids.new_ones(input_ids.shape),
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

