In [4]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [8]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Check current working directory
files = r"C:/Project1/ForkNatNew/data/processed/"
print(f"Current working directory: {os.getcwd()}")

# Load preprocessed data
cleaned_data = pd.read_csv(r"C:/Project1/ForkNatNew/data/processed/cleaned_data.csv")

# Load GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Current working directory: c:\Project1


In [9]:
# Set pad token to eos token
tokenizer.pad_token = tokenizer.eos_token # This line sets the padding token to the end-of-sequence token, which resolves the issue of the tokenizer not having a padding token.

# Example input (you would use cleaned data for actual training)
sample_text = " ".join(cleaned_data['clean_text'].iloc)
inputs = tokenizer(sample_text, return_tensors="pt", padding=True, truncation=True, max_length=512) # This line tokenizes the input text with padding and truncation, ensuring that the input length does not exceed the specified max_length (512 in this case).

In [10]:
# Train the model (expand this section to include full training loop)
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_new_tokens=100) # This line generates the output from the model, using the attention_mask to handle padding correctly. The max_new_tokens parameter specifies the maximum number of new tokens to generate.

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [11]:
# Save the trained model
model.save_pretrained('models/trained_gpt2_model')
tokenizer.save_pretrained('models/trained_gpt2_tokenizer')

print("Model and tokenizer saved in 'models/' directory.")

Model and tokenizer saved in 'models/' directory.


In [14]:
# To train data in a loop

# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup

# Load preprocessed data
cleaned_data = pd.read_csv(r"C:/Project1/ForkNatNew/data/processed/cleaned_data.csv")

# Load GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [15]:
# Set pad token to eos token
tokenizer.pad_token = tokenizer.eos_token

# Define a custom dataset class
class EmailDataset(Dataset): # This class inherits from torch.utils.data.Dataset - used to prepare data for training. It tokenizes the text data and returns the input tensors.
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts 
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=self.max_length)
        inputs['labels'] = inputs.input_ids.clone()
        return inputs

In [16]:
# Prepare the dataset and dataloader
texts = [" ".join(tokens) for tokens in cleaned_data['clean_text']] # texts: Combines the cleaned tokens into text strings.
dataset = EmailDataset(texts, tokenizer, max_length=512) # dataset: Creates an instance of the EmailDataset class.
dataloader = DataLoader(dataset, batch_size=2, shuffle=True) # dataloader: Creates a DataLoader to handle batching and shuffling of the data.

In [17]:
# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5) # optimizer: Uses the AdamW optimizer with a learning rate of 5e-5.
total_steps = len(dataloader) * 3  # Assuming 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps) # scheduler: Uses a linear learning rate scheduler with warmup.



In [18]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [19]:
# Training loop
# The training loop runs for a specified number of epochs (3 in this example).
# For each batch, it performs the following steps: Zeroes the gradients.
# Moves the input tensors to the GPU if available.
# Computes the model outputs and loss.
# Backpropagates the loss.
# Updates the model parameters.
# Adjusts the learning rate using the scheduler.
# Prints the average loss for each epoch.

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].squeeze().to(device)
        attention_mask = batch['attention_mask'].squeeze().to(device)
        labels = batch['labels'].squeeze().to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()
    
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")

Epoch 1/3, Loss: 2.2062025666236877
Epoch 2/3, Loss: 1.521947205066681
Epoch 3/3, Loss: 1.4388502836227417


In [20]:
# Save the trained model and tokenizer to the 'models/' directory.
model.save_pretrained('models/trained_gpt2_model')
tokenizer.save_pretrained('models/trained_gpt2_tokenizer')

print("Model and tokenizer saved in 'models/' directory.")

Model and tokenizer saved in 'models/' directory.
