In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset from the Excel file. The first column is used as the
# sentence text and the second column as the class name ('a1' ... 'a5').
file_path = 'dataset.xlsx'
df = pd.read_excel(file_path, engine='openpyxl')

# Class name -> integer class id.
label_to_id = {'a1': 0, 'a2': 1, 'a3': 2, 'a4': 3, 'a5': 4}

# Series.map yields NaN for every value that is not a key of the mapping, so
# notna() marks exactly the rows whose label is one of the known class names.
mapped_labels = df.iloc[:, 1].map(label_to_id)
has_known_label = mapped_labels.notna()

# Rows with an unrecognised label are dropped from BOTH columns. Skipping only
# the label (and keeping its sentence) would leave the two lists misaligned
# and of different lengths.
n_dropped = int((~has_known_label).sum())
if n_dropped:
    unknown_values = sorted(set(map(str, df.iloc[:, 1][~has_known_label])))
    print(f"Dropping {n_dropped} row(s) with an unrecognised label: {unknown_values}")

sentences = df.iloc[:, 0][has_known_label].tolist()
labels = mapped_labels[has_known_label].astype(int).tolist()
# Same list under the older name, in case another cell still refers to it.
labels2 = labels

In [2]:
# Hold out 20% of the examples for testing (a 4:1 train/test ratio).
# The fixed random_state makes the shuffle-and-split repeatable.
(
    train_sentences,
    test_sentences,
    train_labels,
    test_labels,
) = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

# Define the dataset: one (token ids, label) pair per sentence
class FewShotDataset(Dataset):
    """Map-style dataset that yields one tokenized sentence and its label.

    No N-way/K-shot episode sampling is performed here; every index maps to
    exactly one sentence of the list passed in.

    Args:
        sentences: Sequence of sentences. Entries that are not ``str`` are
            converted with ``str()`` before tokenization.
        labels: Sequence of integer class ids, aligned with ``sentences``.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            that returns a list of integer token ids.
        n_shot: Accepted for backward compatibility and stored on the
            instance; it does not influence which items are returned.
        max_length: Optional cap on the number of token ids returned per
            sentence. ``None`` (the default) disables truncation.
    """

    def __init__(self, sentences, labels, tokenizer, n_shot=5, max_length=None):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.n_shot = n_shot
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx) -> tuple[torch.Tensor, torch.Tensor]:
        """Return ``(token_ids, label)`` for the sentence at position ``idx``.

        ``token_ids`` is a 1-D ``torch.long`` tensor holding the unpadded
        encoding of the sentence; ``label`` is a 0-D ``torch.long`` tensor.
        """
        sentence = self.sentences[idx]

        # Spreadsheet cells can hold numbers or NaN; the tokenizer needs text.
        if not isinstance(sentence, str):
            sentence = str(sentence)

        token_ids = self.tokenizer.encode(sentence, add_special_tokens=True)
        if self.max_length is not None:
            token_ids = token_ids[:self.max_length]

        # An explicit dtype keeps the ids integral even for an empty encoding,
        # where torch.tensor([]) would otherwise default to float32.
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )
    

In [3]:
# Load pre-trained GPT-2 model and tokenizer (smaller version) and add a padding token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=5)

# '[PAD]' is a new vocabulary entry, so the embedding matrix must grow to
# cover its id; otherwise any input containing the pad token would index past
# the end of the embedding table.
model.resize_token_embeddings(len(tokenizer))

# The sequence-classification head needs the pad id to locate the last real
# token of each sequence when inputs are padded.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Prepare the train and test datasets and dataloaders.
# batch_size=1 because FewShotDataset returns unpadded token tensors whose
# lengths differ from sentence to sentence, and no padding collate function
# is supplied here.
train_dataset = FewShotDataset(train_sentences, train_labels, tokenizer, n_shot=5)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

test_dataset = FewShotDataset(test_sentences, test_labels, tokenizer, n_shot=5)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)


# Training loop (fine-tuning for few-shot learning)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 10
for epoch in range(num_epochs):
    # from_pretrained() returns the model in eval mode (dropout disabled), so
    # training mode has to be switched on explicitly before fine-tuning.
    model.train()

    total_loss = 0.0
    batches_trained = 0
    for batch in train_dataloader:
        try:
            inputs = batch[0].to(device)
            # Named batch_labels so the module-level `labels` list built when
            # the dataset was loaded is not overwritten by a tensor.
            batch_labels = batch[1].to(device)
        except Exception as e:
            print("Error occurred:", e)
            print("Batch contents:", batch)
            continue  # Skip this batch and continue with the next one

        optimizer.zero_grad()
        outputs = model(inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        batches_trained += 1

    # Average only over batches that actually contributed a loss; skipped
    # batches (or an empty dataloader) must not distort or break the mean.
    average_loss = total_loss / batches_trained if batches_trained else float('nan')
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss:.4f}')


Epoch 1/10, Loss: 1.4636
Epoch 2/10, Loss: 0.8776
Epoch 3/10, Loss: 0.5109
Epoch 4/10, Loss: 0.2134
Epoch 5/10, Loss: 0.1302
Epoch 6/10, Loss: 0.0671
Epoch 7/10, Loss: 0.0681
Epoch 8/10, Loss: 0.0305
Epoch 9/10, Loss: 0.0296
Epoch 10/10, Loss: 0.0262
