# A2 GPT-2

This is an example of how to apply GPT-2, specifically the distilgpt2 model using `GPT2ForSequenceClassification`, to the 3-way sentiment analysis task from assignment 2. Much of this code is copied from the BERT example (`a2climatesentimentGPT.ipynb`).

As with the BERT example, results vary somewhat from run to run, but results on the test set are usually better than those of all the methods considered in assignment 2.

In [None]:
import csv

def get_texts_and_labels(fname):
    """Read a labelled CSV file and return its texts and integer labels.

    The first row is treated as a header and skipped. Every following row
    must have exactly three columns: an identifier (ignored), the text, and
    an integer label.

    Args:
        fname: Path of the CSV file to read.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists: the text column as
        strings and the label column converted to ``int``. Both lists are
        empty when the file has no data rows (or is completely empty).

    Raises:
        ValueError: If a row does not have exactly three columns or its
            label cannot be converted to ``int``.
    """
    texts = []
    labels = []
    # newline='' is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly; the context manager
    # guarantees the file handle is closed once reading is done.
    with open(fname, newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        # Ignore header row; the default keeps a completely empty file from
        # raising StopIteration.
        next(csv_reader, None)
        for _row_id, text, label in csv_reader:
            texts.append(text)
            labels.append(int(label))
    return texts, labels

# Load the training, validation (dev) and test splits. Each call returns a
# (texts, labels) pair; the paths are relative to the working directory.
train_texts, train_labels = get_texts_and_labels('data/train-sample.csv')
val_texts, val_labels = get_texts_and_labels('data/dev.csv')
test_texts,test_labels = get_texts_and_labels('data/test.csv')


In [None]:
from transformers import GPT2Tokenizer
# Load the pretrained tokenizer that matches the distilgpt2 checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# Reuse the end-of-sequence token as the padding token so the padding=True
# calls below have a token to pad with.
# NOTE(review): presumably needed because the GPT-2 tokenizer defines no
# padding token of its own — confirm against the tokenizer docs.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize each split in a single call with truncation and padding enabled.
# NOTE(review): padding=True is expected to pad to the longest sequence within
# each call, so the three splits may end up with different padded lengths —
# confirm against the tokenizer docs.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)


In [None]:
import torch

class A2Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping-like object whose values can be indexed by example
            position (one entry per example for every key).
        labels: Sequence of labels, parallel to the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding key plus a ``'labels'`` entry
        with the example's label.
        """
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each split's encodings and labels in a dataset object for the Trainer.
train_dataset = A2Dataset(train_encodings, train_labels)
val_dataset = A2Dataset(val_encodings, val_labels)
test_dataset = A2Dataset(test_encodings, test_labels)

In [None]:
from transformers import GPT2ForSequenceClassification, Trainer, TrainingArguments

# Fine-tuning configuration; everything except output_dir is left at the
# library defaults. output_dir is passed explicitly because older transformers
# releases require it and reject a bare TrainingArguments().
# NOTE(review): 'trainer_output' is meant to match the default that newer
# releases fall back to, so behaviour there should be unchanged — confirm
# against the installed transformers version.
training_args = TrainingArguments(
    output_dir='trainer_output',     # directory for model predictions and checkpoints
    #num_train_epochs=3,              # total number of training epochs
)

# distilgpt2 with a sequence-classification head sized for the three labels.
model = GPT2ForSequenceClassification.from_pretrained("distilgpt2", num_labels=3)
# Tell the model which token id is padding.
# NOTE(review): presumably the classification head uses this to locate the
# last non-padding token of each sequence — confirm against the
# GPT2ForSequenceClassification docs.
model.config.pad_token_id = tokenizer.pad_token_id

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Fine-tune the model on the training split.
trainer.train()

In [None]:
import numpy as np
# Run the fine-tuned model over the test split.
test_predictions = trainer.predict(test_dataset)
# Take the index of the highest score along axis 1 as the predicted label;
# assumes predictions is (num_examples, num_labels) — TODO confirm.
test_predicted_labels = np.argmax(test_predictions.predictions, axis=1)

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def print_results(gold_labels, predicted_labels):
    """Print macro-averaged precision, recall and F1, followed by accuracy.

    Uses the implementations of the evaluation metrics from sklearn.
    ``zero_division=0`` is passed to the precision/recall/F1 computation.
    A blank line is printed after the four scores.

    Args:
        gold_labels: The reference labels.
        predicted_labels: The predicted labels, parallel to ``gold_labels``.
    """
    precision, recall, f1, _support = precision_recall_fscore_support(
        gold_labels,
        predicted_labels,
        average='macro',
        zero_division=0,
    )
    accuracy = accuracy_score(gold_labels, predicted_labels)

    report = (
        ("Precision: ", precision),
        ("Recall: ", recall),
        ("F1: ", f1),
        ("Accuracy: ", accuracy),
    )
    for heading, score in report:
        print(heading, score)
    print()


In [None]:
# Report the test-set scores, reusing the predicted labels already derived
# from the test predictions instead of recomputing the argmax.
print_results(test_labels, test_predicted_labels)