## Import - dataset

In [1]:
%%capture
!pip install torch transformers datasets

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
model_name = "gpt2"  # Can use "gpt2-medium", "gpt2-large", "gpt-j-6b" etc.
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Inference first

In [3]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# Generate text
outputs = model.generate(**inputs, max_length=50, num_return_sequences=1)
outputs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[15496,    11,   616,  3290,   318, 13779,    13,   314,  1101,   407,
          1654,   611,   673,   338,   257, 26188,   393,   407,    13,   314,
          1101,   407,  1654,   611,   673,   338,   257,  3290,   393,   407,
            13,   314,  1101,   407,  1654,   611,   673,   338,   257,  3290,
           393,   407,    13,   198,   198,    40,  1101,   407,  1654,   611]])

In [4]:
# Decode and print result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Hello, my dog is cute. I'm not sure if she's a puppy or not. I'm not sure if she's a dog or not. I'm not sure if she's a dog or not.

I'm not sure if


In [8]:
def generate_gpt2(sentence, max_length=100,device = "cuda"):
    inputs = tokenizer(sentence, return_tensors="pt").to(device)
    model.to(device)
    # Generate text
    outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1)
    # Decode
    # Decode and print result
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [10]:
generate_gpt2("Hello, my dog is cute")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Hello, my dog is cute. I'm not sure if she's a puppy or not. I'm not sure if she's a dog or not. I'm not sure if she's a dog or not.\n\nI'm not sure if she's a puppy or not. I'm not sure if she's a dog or not.\n\nI'm not sure if she's a puppy or not. I'm not sure if she's a dog or not.\n\nI'm not"

## Test print loss with model

In [13]:
import pandas as pd
data = pd.read_csv("/kaggle/input/pytorch-finetuning-gpt2/data.csv")
data.head(2)

Unnamed: 0,Bad_Practices,Good_Practices
0,<table alt=header>Title</table>,<table alt='header'>Title</table>
1,<tr>Content,<tr>Content</tr>


In [15]:
bad_1 = data.iloc[0,0]
good_1 = data.iloc[0,1]
bad_1, good_1

('<table alt=header>Title</table>', "<table alt='header'>Title</table>")

In [18]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [20]:
input_ids_bad = tokenizer.encode(bad_1, return_tensors="pt")
input_ids_good = tokenizer.encode(good_1, return_tensors="pt")
# model(input_ids=input_ids_bad.to(device), labels=input_ids_good.to(device))

## Create dataset English - Germany

In [22]:
# convert df to DataDict
from datasets import Dataset, DatasetDict


# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(data)
dataset

Dataset({
    features: ['Bad_Practices', 'Good_Practices'],
    num_rows: 6712
})

In [31]:
# Create a DatasetDict (you can split it into train/test/validation if needed)
dataset_dict = DatasetDict({
    "train": dataset.select(range(100)),
    "test":dataset.select(range(100))
})

# Print dataset structure
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['Bad_Practices', 'Good_Practices'],
        num_rows: 100
    })
    test: Dataset({
        features: ['Bad_Practices', 'Good_Practices'],
        num_rows: 100
    })
})


## Tokenizer

In [32]:
# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(examples["Bad_Practices"], padding="max_length", truncation=True, max_length=100)
    labels = tokenizer(examples["Good_Practices"], padding="max_length", truncation=True, max_length=100)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [33]:
# Set the pad_token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token
# Apply tokenization
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [34]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Bad_Practices', 'Good_Practices', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['Bad_Practices', 'Good_Practices', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [35]:
tokenized_datasets = tokenized_datasets.remove_columns(['Bad_Practices'])
tokenized_datasets = tokenized_datasets.remove_columns(['Good_Practices'])
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

## Train

In [36]:
from torch.utils.data import DataLoader
tokenized_datasets.set_format("torch")

In [37]:
# Create dataloaders
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=8, shuffle=True)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8, shuffle=False)

In [38]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [40]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

epochs = 30  # Number of training epochs

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_dataloader)}")


cuda
Epoch 1 - Loss: 0.2811214465361375
Epoch 2 - Loss: 0.23510802479890677
Epoch 3 - Loss: 0.21153342494597802
Epoch 4 - Loss: 0.194926639015858
Epoch 5 - Loss: 0.17669604260187882
Epoch 6 - Loss: 0.16691776766226843
Epoch 7 - Loss: 0.15628014275660881
Epoch 8 - Loss: 0.14997939536204705
Epoch 9 - Loss: 0.134268031670497
Epoch 10 - Loss: 0.13625521671313506
Epoch 11 - Loss: 0.13161552009674218
Epoch 12 - Loss: 0.13065327646640632
Epoch 13 - Loss: 0.12390476923722488
Epoch 14 - Loss: 0.12142727409417813
Epoch 15 - Loss: 0.11999335541174962
Epoch 16 - Loss: 0.12361065699503972
Epoch 17 - Loss: 0.11714659573940131
Epoch 18 - Loss: 0.11037066177679943
Epoch 19 - Loss: 0.11250142409251286
Epoch 20 - Loss: 0.10904796593464337
Epoch 21 - Loss: 0.11188264076526348
Epoch 22 - Loss: 0.10407132426133522
Epoch 23 - Loss: 0.1107972780099282
Epoch 24 - Loss: 0.1060666235593649
Epoch 25 - Loss: 0.10425036515180881
Epoch 26 - Loss: 0.10559802330457248
Epoch 27 - Loss: 0.10096166340204385
Epoch 28 - L

## Test

In [42]:
test = "<table"
generate_gpt2(test)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"<table style='description'>List Item</table>"