## Project 3: Using Large Language Models

In this project, I use the GPT-2 model via the Hugging Face `transformers` API to perform autoregressive language modelling, as well as prompt-based sentiment classification.

In [None]:
import math
import numpy as np
import random
import pdb
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import tqdm
import torch
from torch import nn
import torch.nn.functional as F
import torchtext.legacy as torchtext

from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Download and load the WikiText2 splits, then build a single vocabulary
# covering the train, validation and test data.
text_field = torchtext.data.Field()
wikitext_splits = torchtext.datasets.WikiText2.splits(root='.', text_field=text_field)
train_dataset, validation_dataset, test_dataset = wikitext_splits
text_field.build_vocab(*wikitext_splits)

vocab = text_field.vocab
vocab_size = len(vocab)

# examples[0].text is a list of tokens (strings)
train_text = train_dataset.examples[0].text
validation_text = validation_dataset.examples[0].text

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the pretrained "gpt2-large" language model (moved onto `device`)
# together with its matching fast tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2-large")
model = model.to(device)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-large")

### Language Modeling with GPT-2

In the first part, we use GPT-2 to predict word probabilities.

Start by tokenizing the vocabulary.

In [None]:
# Invert the tokenizer vocabulary (token string -> id) into an
# id -> token string lookup table, with a progress bar.
vocab = tokenizer.vocab
vocab_map = {idx: token for token, idx in tqdm.tqdm(vocab.items())}

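The GPT-2 tokenizer splits text into subwords, so we reconstruct whole words using the word-start marker "Ġ": a subword token beginning with "Ġ" starts a new word, and any other token continues the current one. The log-probability of a word is obtained by adding the log-probabilities of its subword tokens.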



In [None]:
def get_gpt_probs(sentence):
    '''Outputs word probabilities given an input string as a sentence

    Words are rebuilt from subword tokens: a token starting with the
    word-start marker "Ġ" opens a new word, any other token is appended to the
    current word. A word's log-probability is the sum of the log-probabilities
    of its subword tokens.

    sentence: string
    returns: list of 2-tuples with a word and its negative log-probability
        (an empty list if the sentence produces no tokens)
    '''
    # prepend the sentence start token and tokenize the sentence
    encodings = tokenizer('<|endoftext|>' + sentence, return_tensors='pt').to(device)
    input_ids = encodings['input_ids']

    # every token after the start token is a prediction target
    targets = input_ids[0, 1:]
    if targets.numel() == 0:
        return []
    subwords = [vocab_map[int(idx)] for idx in targets]

    # inference only: no_grad avoids building an autograd graph
    with torch.no_grad():
        # the logits at position t score token t+1, so the last position
        # (which has no following token) is dropped
        logits = model(input_ids).logits.squeeze(0)[:-1]
        log_probs = F.log_softmax(logits, dim=-1)
        subprobs = log_probs.gather(-1, targets.unsqueeze(-1)).flatten().tolist()

    # turn tokens into words, paired with their log-probabilities
    words = []
    probs = []
    current_word = subwords[0]
    current_prob = subprobs[0]
    for subword, subprob in zip(subwords[1:], subprobs[1:]):
        if subword.startswith('Ġ'):
            # a new word begins: flush the one collected so far
            words.append(current_word)
            probs.append(current_prob)
            current_word = subword
            current_prob = subprob
        else:
            current_word += subword
            current_prob += subprob

    # flush the final word
    words.append(current_word)
    probs.append(current_prob)

    return [(word, -1*prob) for word, prob in zip(words, probs)]
            


In [None]:
# Sanity check: per-word negative log-probabilities for a short example sentence.
get_gpt_probs("The florist sent the flowers was pleased.")

[('The', 2.4790914058685303),
 ('Ġflorist', 13.309796899557114),
 ('Ġsent', 8.14240837097168),
 ('Ġthe', 2.366187334060669),
 ('Ġflowers', 1.405614972114563),
 ('Ġwas', 9.130631446838379),
 ('Ġpleased.', 9.91234540939331)]

We now implement a modification to allow for efficient modeling of long text documents: using a strided window, rather than a standard 1-token sliding window. The model uses a context of 1024 and stride of 512. Only the probabilities generated for a word the first time are used in the output.

In [None]:
# Step size, in tokens, between consecutive scoring windows.
stride = 512
# Report the maximum context length stored in the model's config.
print(f"Context length: {model.config.n_positions}")

Context length: 1024


In [None]:
def get_gpt_probs(document, stride=512, no_tqdm=False):
    '''Outputs word probabilities given an input document string, scoring it
    in windows of at most `model.config.n_positions` tokens that advance
    `stride` tokens at a time

    Each window scores only the predictions made at its last `stride`
    positions; the earlier tokens of the window serve purely as context.

    document: string
    stride: number of new positions scored per window; must be between 1 and
        the model's context length
    no_tqdm: if True, the progress bar is disabled
    returns: list of 2-tuples with a word and its negative log-probability
        (an empty list if the document produces no tokens)
    '''
    # `import tqdm` on its own does not load the notebook submodule
    from tqdm.notebook import tqdm as notebook_tqdm

    context = model.config.n_positions
    if not 0 < stride <= context:
        raise ValueError(
            "stride must be between 1 and the context length ({}), got {}".format(context, stride))

    # pad the front with `stride` start tokens so the first document token is
    # itself a prediction target; the slice below assumes each '<|endoftext|>'
    # is encoded as exactly one token
    encodings = tokenizer('<|endoftext|>'*stride+document, return_tensors='pt').to(device)
    input_ids = encodings['input_ids']
    n_tokens = input_ids.shape[1]
    subwords = [vocab_map[int(idx)] for idx in input_ids[0, stride:]]
    if not subwords:
        return []

    # subprobs[k] is the log-probability of token k+1 given the tokens up to k
    subprobs = []
    # the last token has no successor, so positions stop at n_tokens - 2
    for start in notebook_tqdm(range(0, n_tokens - 1, stride), disable=no_tqdm):
        right = min(start + stride, n_tokens)
        left = max(0, start + stride - context)
        window = input_ids[:, left:right]
        # the logits at position t score token t+1, so the targets are shifted
        # by one; the slice is clamped at the end of the document
        targets = input_ids[0, start+1:right+1]
        # index, inside the window, of the first position scored here
        first = start - left

        with torch.no_grad():
            logits = model(window).logits.squeeze(0)
            logits = logits[first:first + targets.shape[0]]
            log_probs = F.log_softmax(logits, dim=-1)
            subprobs += log_probs.gather(-1, targets.unsqueeze(-1)).flatten().tolist()

    # drop the predictions for the padding tokens so that word_probs[j] is the
    # log-probability of subwords[j]
    word_probs = subprobs[stride-1:]

    # turn tokens into words, paired with their log-probabilities
    words = []
    probs = []
    current_word = subwords[0]
    current_prob = word_probs[0]
    for subword, subprob in zip(subwords[1:], word_probs[1:]):
        if subword.startswith('Ġ'):
            # a new word begins: flush the one collected so far
            words.append(current_word)
            probs.append(current_prob)
            current_word = subword
            current_prob = subprob
        else:
            current_word += subword
            current_prob += subprob

    # flush the final word
    words.append(current_word)
    probs.append(current_prob)

    return [(word, -1*prob) for word, prob in zip(words, probs)]

This function can be used to generate probabilities for long texts, such as *Wuthering Heights*.

In [None]:
# Read the whole text file into memory as a single string.
with open("wuthering.txt") as text_file:
    text = text_file.read()

In [None]:
# Score every word of the text, then keep only the negative log-probabilities
# as an array.
output = get_gpt_probs(text, stride=512, no_tqdm=False)
probs = np.asarray([neg_log_prob for _word, neg_log_prob in output])

### Prompting

In this part, we experiment with adapting GPT-2 for sentiment classification via prompting. We use the SST-2 task from the GLUE benchmark.

In [None]:
# Load the "sst2" configuration of the "glue" dataset via the `datasets` library.
from datasets import load_dataset
dataset = load_dataset("glue", "sst2")

Sentiment classification is done by prompting using examples from the training dataset, and then comparing the probabilities of answering yes or no to a question on the sentiment of an input.

We implement a thresholding function that adjusts for bias in the prompt: due to the high variance in outputs arising from the specific details of the prompt structure, we find it useful to manually correct for bias in the prompt. The correction is determined by the probabilities obtained from a neutral input.

In [None]:
# Few-shot prompt: five "Opinion: ... Positive? Yes/No." demonstrations that
# are prepended to every query.
prompt = "Opinion: I love this book! Positive? Yes. Opinion: I like having fun. Positive? Yes. Opinion: This food is terrible. Positive? No. Opinion: I hate you. Positive? No. Opinion: Funny and whimsical. Positive? Yes."

def bias():
    '''Estimates the prompt's bias using the neutral opinion "N/A"

    returns: negative log-probability of the final word when the answer is
        "Yes." minus the same quantity when the answer is "No."
    '''
    neutral_query = prompt + ' Opinion: ' + 'N/A'
    # get_gpt_probs returns (word, negative log-probability) pairs; the last
    # pair belongs to the answer word
    yes_nll = get_gpt_probs(neutral_query + '. Positive? Yes.')[-1][1]
    no_nll = get_gpt_probs(neutral_query + '. Positive? No.')[-1][1]
    return yes_nll - no_nll

def predict_sentiment(sentence, bias):
    '''Classifies the sentiment of a sentence by comparing the answers "Yes."
    and "No." to the prompt's "Positive?" question

    sentence: string
    bias: number, the prompt bias measured on a neutral input as
        (negative log-probability of "Yes.") - (negative log-probability of "No.")
    returns: 1 if the sentence is predicted positive, 0 otherwise
    '''
    pos_input = prompt + ' Opinion: ' + sentence + '. Positive? Yes.'
    neg_input = prompt + ' Opinion: ' + sentence + '. Positive? No.'
    # get_gpt_probs returns negative log-likelihoods; the last entry belongs
    # to the answer word
    pos_nll = get_gpt_probs(pos_input)[-1][1]
    neg_nll = get_gpt_probs(neg_input)[-1][1]

    # Cancel the prompt's own preference: answer "positive" only when the gap
    # (pos_nll - neg_nll) is at most the gap measured on the neutral input.
    # Subtracting `bias` from neg_nll instead would amplify that preference.
    if pos_nll <= neg_nll + bias:
        return 1
    else:
        return 0


Checking performance on the training dataset.

In [None]:
# Evaluate the prompt-based classifier on the first `num_examples` training
# examples, counting correct predictions, false positives and false negatives.
num_examples = 1000
num_correct = 0
FP = 0
FN = 0

# `bias` is a function: evaluate it once here and pass the resulting number
# on, since predict_sentiment does arithmetic with it.
prompt_bias = bias()

for idx in tqdm.notebook.tqdm(range(num_examples)):
    example = dataset["train"][idx]
    predicted_label = predict_sentiment(example["sentence"], prompt_bias)
    if predicted_label == example["label"]:
        num_correct += 1
    elif predicted_label > example["label"]:
        # predicted 1 where the label is 0
        FP += 1
    else:
        # predicted 0 where the label is 1
        FN += 1

print(f"Accuracy: {num_correct / num_examples}")
print('FP: ', FP)
print('FN: ', FN)