In [None]:
## import all libraries

import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import os


In [2]:
def single_text(csv_path, text_column, output_file):
    """Merge one column of a CSV file into a single plain-text corpus file.

    Rows whose ``text_column`` value is null are dropped. The remaining values
    are joined with a ``###`` separator (surrounded by blank lines) and written
    to ``output_file`` as UTF-8, overwriting any existing file.

    Args:
        csv_path: Path of the CSV file to read.
        text_column: Name of the column that holds the documents.
        output_file: Path of the text file to write.

    Raises:
        ValueError: If ``text_column`` is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in CSV. Available: {df.columns.tolist()}")

    # Cast to str before joining: pandas parses an all-numeric column as
    # int/float, and str.join() raises TypeError on non-string items.
    documents = df.loc[df[text_column].notnull(), text_column].astype(str).tolist()

    separator = "\n\n###\n\n"  # Optional: helps model see doc boundaries
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(separator.join(documents))
    print(f"Combined corpus saved to '{output_file}' — Total stories: {len(documents)}")


In [None]:
# Load the raw table from dream_token.csv and show it as this cell's output.
dream_file = pd.read_csv(filepath_or_buffer="dream_token.csv")
dream_file


In [None]:
# Build dream_corpus.txt from the "report" column of dream_token.csv.
single_text(
    csv_path="dream_token.csv",
    text_column="report",
    output_file="dream_corpus.txt",
)

In [4]:

def train_gpt2_on_corpus(corpus_file,output_dir,model_name,num_train_epochs,block_size=128,fp16=True):
    """Fine-tune a GPT-2 model on a plain-text corpus, resuming when possible.

    The corpus is loaded one example per line, blank lines are discarded, and
    each remaining line is truncated/padded to ``block_size`` tokens. If
    ``output_dir`` already contains ``checkpoint-<step>`` directories, training
    resumes from the one with the highest step number. The final model and
    tokenizer are saved to ``output_dir``.

    Args:
        corpus_file: Path of the text file to train on.
        output_dir: Directory for checkpoints and the final model/tokenizer.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        num_train_epochs: Number of training epochs.
        block_size: Token length every example is truncated/padded to.
        fp16: Whether to request fp16 mixed-precision training. Defaults to
            True (the previous hard-coded value); pass False on hardware
            without fp16 support.
    """
    # Imported locally so this helper is in scope without editing the
    # notebook's import cell.
    from transformers.trainer_utils import get_last_checkpoint

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token so padding="max_length" below can pad.
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained(model_name)

    dataset = load_dataset("text", data_files={"train": corpus_file})
    # Drop blank/whitespace-only lines. They would become examples made only
    # of padding, which the collator masks out of the labels, so they add
    # training steps without any training signal.
    dataset = dataset.filter(
        lambda batch: [bool(text.strip()) for text in batch["text"]],
        batched=True,
    )

    def tokenize_function(examples):
        return tokenizer(examples["text"],truncation=True,max_length=block_size,padding="max_length")

    tokenized_dataset = dataset.map(tokenize_function,batched=True,remove_columns=["text"])

    # Data collator (mlm=False -> causal language modelling)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )

    # Training config and checkpointing
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=False,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=2,
        save_steps=500,
        save_total_limit=2,
        logging_steps=50,
        prediction_loss_only=True,
        fp16=fp16,)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        data_collator=data_collator,
        tokenizer=tokenizer,)

    # Check for existing checkpoints. get_last_checkpoint picks the
    # checkpoint-<step> directory with the highest step number, which stays
    # correct even when directory mtimes were changed by a copy or sync.
    last_checkpoint = None
    if os.path.isdir(output_dir):
        last_checkpoint = get_last_checkpoint(output_dir)

    if last_checkpoint:
        print(f" Resuming from: {last_checkpoint}")
        trainer.train(resume_from_checkpoint=last_checkpoint)
    else:
        print("Starting fresh")
        trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Final model {output_dir}")


In [None]:
# run model

# Corpus to fine-tune on. "email" trains on email_corpus.txt into ./email_gpt2;
# switch to "dream" to train on dream_corpus.txt into ./dream_gpt2 instead.
# NOTE(review): no cell visible here creates email_corpus.txt — confirm the
# file exists before running with CORPUS_NAME = "email".
CORPUS_NAME = "email"

train_gpt2_on_corpus(
    corpus_file=f"{CORPUS_NAME}_corpus.txt",
    output_dir=f"./{CORPUS_NAME}_gpt2",
    model_name="gpt2",
    num_train_epochs=3
)



Map:   0%|          | 0/16957624 [00:00<?, ? examples/s]