In [None]:
pip install pytorch-transformers

In [1]:
#Importing modules
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from datasets import load_dataset

In [None]:
# Read the articles CSV into a dataset and keep a handle on the raw article bodies
dataset = load_dataset(
    "csv",
    data_files="/kaggle/input/all-the-news/articles1.csv",
)
train_split = dataset["train"]
text_samples = train_split["content"]

In [3]:
# Display the loaded dataset object to inspect what was read from the CSV
dataset

In [4]:
# Set up the pretrained GPT-2 tokenizer and language-model head.
# The tokenizer's end-of-sequence token is reused as its padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Turn the article text into fixed-length token sequences
def tokenize_function(example):
    """Tokenize the ``content`` field of ``example``.

    Sequences are truncated to at most 512 tokens and padded up to that
    same length, so every encoded example has identical size.
    """
    encoding_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    return tokenizer(example["content"], **encoding_options)


tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)

In [None]:
def collate_fn(batch):
    """Collate tokenized examples into one batch of model-ready tensors.

    Args:
        batch: Sequence of examples, each a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries (lists of ints or 1-D tensors).
            Examples may differ in length.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each
        an int64 tensor of shape ``(len(batch), longest_sequence_in_batch)``.
        ``labels`` equals ``input_ids`` except at positions where the
        attention mask is 0, which are set to -100.
    """
    # Build one 1-D tensor per example. Stacking the raw lists first (as the
    # previous version did) fails on ragged input and makes padding a no-op.
    input_ids = [
        torch.as_tensor(item["input_ids"], dtype=torch.long) for item in batch
    ]
    attention_masks = [
        torch.as_tensor(item["attention_mask"], dtype=torch.long) for item in batch
    ]

    # Pad to the longest sequence in the batch. The fill value for input_ids
    # is arbitrary: padded positions carry attention_mask == 0.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        input_ids, batch_first=True, padding_value=0
    )
    attention_masks = torch.nn.utils.rnn.pad_sequence(
        attention_masks, batch_first=True, padding_value=0
    )

    # -100 is the ignore index of the language-modelling loss, so padded
    # positions do not contribute to the training signal.
    labels = input_ids.masked_fill(attention_masks == 0, -100)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels,
    }

In [5]:
# Wrap the tokenized training split in a shuffled mini-batch loader
train_dataset = tokenized_dataset["train"]
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Set up the training parameters
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in its
# favour. eps and weight_decay are spelled out to keep the hyperparameters the
# deprecated class used by default (PyTorch's defaults differ).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-6,
    weight_decay=0.0,
)

In [6]:
# Sanity check: print the first batch produced by the loader, then stop iterating
for batch in train_dataloader:
    print(batch)
    break

In [None]:
# Training loop
model.train()
num_epochs = 1
log_every = 400  # print the loss once every this many optimisation steps
for epoch in range(num_epochs):
    for step, batch in enumerate(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Take the labels from the batch instead of re-deriving them from
        # the inputs, so whatever the collate function prepared is honoured.
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Progress logging only. The previous version also had a `break`
        # here, which fired at step 0 (0 % 400 == 0) and left the loop before
        # a single backward pass or optimizer step had run.
        if step % log_every == 0:
            print("Step-{},Loss-{}".format(step, loss.item()))

In [None]:
# Write the model's weights (its state dict) to the working directory
output_path = '/kaggle/working/GPT2-model.pth'
model_weights = model.state_dict()
torch.save(model_weights, output_path)

In [None]:
# Load the trained model
model_path = '/kaggle/working/GPT2-model.pth'
model = GPT2LMHeadModel.from_pretrained('gpt2')
# map_location="cpu" lets the checkpoint load even if it was saved from a GPU
# and no GPU is available in the current session.
state_dict = torch.load(model_path, map_location="cpu")
model.load_state_dict(state_dict)

In [None]:
# Choose the compute device, move the model onto it and switch to eval mode
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device).eval()
# Reload the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
# Generate text
prompt = "Once upon a time"
# Calling the tokenizer (rather than `encode`) also returns the attention
# mask, which is passed to `generate` explicitly below.
encoded = tokenizer(prompt, return_tensors='pt').to(device)
input_ids = encoded["input_ids"]
output = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    max_length=100,
    num_return_sequences=1,
    # Name the padding token explicitly (EOS) instead of relying on
    # generate()'s implicit fallback.
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Turn each generated token sequence back into text and print it, numbered from 1
for rank, sequence in enumerate(output, start=1):
    text = tokenizer.decode(sequence, skip_special_tokens=True)
    print(f"Generated text {rank}: {text}")