<a href="https://colab.research.google.com/github/patrickabadi/python-pytorch-notebooks/blob/main/NLP101_DaRMoD_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing an LLM: GPT2-XL

**Change the runtime type to a GPU before downloading gpt2**

**Note: We are running a quantized version (8-bit integer) of the model so that we can fit it into a free Google Colab instance (T4). See https://huggingface.co/docs/transformers/main_classes/quantization and https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4#scrollTo=W8tQtyjp75O for more details.**

In [None]:
!mkdir -p /content/llm/gpt2-xl/
llm_dir = "/content/llm/gpt2-xl/"

!export LC_ALL=C.UTF-8 # So that gdown works for data download
!export LANG=C.UTF-8

In [None]:
# Download the required files from Hugging Face (https://huggingface.co/gpt2-xl/tree/main):
# weights, model config and the tokenizer files, all stored side by side in the
# directory that the model-loading cell reads from.
# Possible ETA ~ 10 min for the 6G model; usually < 1 min
!wget -P /content/llm/gpt2-xl/ https://huggingface.co/gpt2-xl/resolve/main/pytorch_model.bin
!wget -P /content/llm/gpt2-xl/ https://huggingface.co/gpt2-xl/resolve/main/config.json
!wget -P /content/llm/gpt2-xl/ https://huggingface.co/gpt2-xl/resolve/main/merges.txt
!wget -P /content/llm/gpt2-xl/ https://huggingface.co/gpt2-xl/resolve/main/tokenizer.json
!wget -P /content/llm/gpt2-xl/ https://huggingface.co/gpt2-xl/resolve/main/vocab.json

--2023-07-11 14:55:51--  https://huggingface.co/gpt2-xl/resolve/main/pytorch_model.bin
Resolving huggingface.co (huggingface.co)... 18.172.134.88, 18.172.134.124, 18.172.134.4, ...
Connecting to huggingface.co (huggingface.co)|18.172.134.88|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/gpt2-xl/cd2a29e31040ef64d9362cb96801969c9f67b9e0bdbd6e00b9dda57cdbe17435?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1689346551&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY4OTM0NjU1MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9ncHQyLXhsL2NkMmEyOWUzMTA0MGVmNjRkOTM2MmNiOTY4MDE5NjljOWY2N2I5ZTBiZGJkNmUwMGI5ZGRhNTdjZGJlMTc0MzU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=wYQ0PL5OZVnTQBemUijX6bbD%7ESHvd

In [None]:
# Install HF transformers library:
!pip -q install transformers
# Accelerate and bitsandbytes for 8bit models version
!pip -q install accelerate
!pip -q install bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount the model
import json
import random
from datetime import datetime
from time import sleep
import logging
import argparse
from tqdm.notebook import tqdm
import csv
import os

import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Local directory holding the downloaded GPT2-XL files (same path as the
# download cell, repeated so this cell can run on its own).
llm_dir = "/content/llm/gpt2-xl/"

# Device used for the input tensors of the first generation demo below.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Slow (pure Python) tokenizer loaded from the local files.
tokenizer = AutoTokenizer.from_pretrained(llm_dir, use_fast=False)
# set pad token ids for batched inference cus gpt2 does not have one
# (the end-of-sequence token is reused as padding, applied on the left)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# The config is kept because its vocab_size is used later as the dimension of
# the anchor store.
model_config = AutoConfig.from_pretrained(llm_dir)
# NOTE(review): the saved output of this cell warns that no linear modules were
# found (GPT-2 uses Conv1D layers), so the 8-bit quantization may not actually
# be applied to this architecture -- confirm.
model = AutoModelForCausalLM.from_pretrained(llm_dir, load_in_8bit=True) # Load 8bit int model to fit in Gcolab memory
# Explicit device placement is left disabled; presumably placement is handled
# by the 8-bit loading path -- TODO confirm.
#model.to(device)
# Switch to inference mode (disables dropout).
model.eval()


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
You are loading your model in 8bit or 4bit but no linear modules were found in your model. this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers. Please double check your model architecture, or submit an issue on github if you think this is a bug.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

In [None]:
# Test and play a little bit with gpt2-XL

# Define the input text
input_text = "Once upon a time, a venture founder was taking a class about ML where" # Feel free to modify!

# Tokenize the input text. Calling the tokenizer (instead of `encode`) also
# returns the attention mask, which `generate` needs for reliable results.
# The tensors go to the model's own device so they always match the weights.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
input_ids = inputs["input_ids"]

# Generate text from the model. The attention mask and pad token id are passed
# explicitly so `generate` does not have to guess them.
output = model.generate(
    input_ids=input_ids,
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_length=200,  # Can play with max_length of tokens to generate smaller/longer outputs
    do_sample=True,
    temperature=0.7,
)

# Decode the output IDs to text
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(output_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, a venture founder was taking a class about ML where he learned about the Data Mining library. He didn't have a big enough data set for a data mining task and he was curious about how to get started. He knew that the only way to do data mining was to do it in python. He also knew that data mining would be a relatively slow process. So that's what he did!

The data was not very large but it was enough for him to learn the basics of data mining. But he had no idea how to scale it up.

After a couple of months of coding, he had some ideas of how to scale his python data mining job and he wrote a blog post about it. The post got a lot of traction and that's what brought him to the idea of Data Science jobs. Here's what he writes:

"After a couple of months of coding, I had some ideas of how to scale my python data mining job


## Basic Prompt Engineering (In-Context Learning (ICL)) and KNN Prompting

In [None]:
# Download some test data:
!gdown --id 1Yh2blPkJvMtdm5xWKoHr2fLp2i2Bn5Ir
!unzip data.zip

In [None]:
# Above fails from time to time ... Can try to restart runtime and run:
!export LC_ALL=C.UTF-8 # So that gdown works for data download
!export LANG=C.UTF-8

In [None]:
# This colab is extensively reusing stuff from https://github.com/BenfengXu/KNNPrompting/tree/main (https://openreview.net/pdf?id=fe2S7736sNS)
# Let's install the repo locally
!git clone https://github.com/BenfengXu/KNNPrompting.git
import sys
sys.path.append('/content/KNNPrompting')


Note: The cell below, which builds the embeddings used by the kNN classifier, takes about 13 minutes to execute on a T4 instance.

In [None]:
from utils.dataset import *
from utils.anchor import AnchorStore
from utils.template import *

# We will play with sst2 dataset: https://huggingface.co/datasets/sst2
datadir = "/content/data/sst2/"
AutoDataset = SST2Dataset

# Train split -> source of the in-context demonstrations; dev split -> evaluation set.
train_data = AutoDataset(datadir, mode='train')
dev_data = AutoDataset(datadir, mode='dev')

# Second, independent copy of the train split, used below to draw the anchor examples.
anchor_data = AutoDataset(datadir, mode='train')

knn=3 # number of k nearest neighbors to look for classification
max_context_len = 1024  # prompts longer than this many tokens are truncated in llm_gen
n_demo_shot = 32  # shot count passed to subsamplebyshot for the demonstrations
# NOTE(review): the anchor shot count is derived from the context length
# (1024 - 32 = 992), which mixes a token budget with an example count --
# presumably a "total train shots" value of 1024 was intended; confirm.
n_anchor_shot = max_context_len - n_demo_shot
seed = 43  # seed passed to subsamplebyshot

def llm_gen(model, prompt, tokenizer, max_context_len):
    """Return the model's next-token logits for ``prompt``.

    Args:
        model: causal language model; must expose ``.device`` and return an
            object with a ``.logits`` attribute when called.
        prompt: text to condition on.
        tokenizer: tokenizer providing ``encode_plus``.
        max_context_len: maximum number of tokens fed to the model. Longer
            prompts are truncated from the left, i.e. only the last
            ``max_context_len`` tokens are kept.

    Returns:
        CPU tensor holding the logits at the last input position
        (shape [1, 50257] for a single prompt with GPT2-XL).
    """
    inputs = tokenizer.encode_plus(prompt, return_tensors="pt", padding=True).to(device=model.device)
    # Keep the most recent tokens only; for prompts that already fit, the slice
    # is a no-op.
    input_ids = inputs['input_ids'][:, -max_context_len:]
    attention_mask = inputs['attention_mask'][:, -max_context_len:]
    with torch.no_grad():
        # Call the module itself rather than `.forward`, the standard PyTorch
        # entry point.
        logits = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       return_dict=True).logits
    # the output prob is shifted by -1, so we should use the output at the last input token position
    # Slice BEFORE moving to the CPU: only one position is needed, so there is
    # no reason to copy the full [1, seq_len, vocab] tensor off the device.
    gen_logits = logits[:, -1, :].detach().cpu()

    return gen_logits

# Stage 1 (meta test): fill the anchor_store datastore with one next-token
# probability distribution per anchor example, together with its label id.
train_data.subsamplebyshot(n_demo_shot, seed)  # (demo_shots, seed)
prompt_prefix = make_prompt(train_data, "sst2", mode='train')
# The demonstration examples are passed as `exclude` when sampling the anchors.
anchor_data.subsamplebyshot(n_anchor_shot, seed, exclude=train_data.data)
label2id = dev_data.label2id
id2verb = train_data.id2verb

n_anchors = len(anchor_data)
anchor_store = AnchorStore(
    K=n_anchors,
    dim=model_config.vocab_size,
    knn=knn,
    n_class=len(label2id),
)

# This could be parallelize on GPU; left as an exercise ;-)
for ins in tqdm(anchor_data.data, total=n_anchors):
    labels = label2id[ins['label']]
    # Same demonstrations for every anchor, followed by the anchor's own query.
    prompt = prompt_prefix + make_prompt(ins, 'sst2', mode='inference')
    gen_logits = llm_gen(model, prompt, tokenizer, max_context_len)
    anchor_probs = torch.softmax(gen_logits.float(), dim=-1)
    anchor_store.enqueue(anchor_probs, torch.tensor(labels))

In [None]:
# That took a long time to generate the datastore! Let's pickle it for later usage
import pickle

# Written relative to the current working directory; the "load it back" cell
# reads the same relative path.
with open('anchor_store.pkl', 'wb') as f:
    pickle.dump(anchor_store, f)

In [None]:
# To load it back:
# `pickle` is imported here as well, so this cell also works after a runtime
# restart when the (slow) build-and-dump cells are skipped.
import pickle

# NOTE: only unpickle files you created yourself -- pickle.load can execute
# arbitrary code from an untrusted file.
with open('anchor_store.pkl', 'rb') as f:
    anchor_store = pickle.load(f)

In [None]:
# Can explore the dataset
# The variables built here (example_ins, example_label, example_prompt) are
# reused by the prompting cells further down.

# Take data instance you want by modifying index
index=1
example_ins = dev_data.data[index]

# Make an example label
# (integer id looked up from the instance's raw label)
example_label = label2id[example_ins['label']]

# Make query part of the example prompt
example_prompt_query = make_prompt(example_ins, 'sst2', mode='inference')

# Append context for full example prompt (for ICL)
# i.e. the demonstrations in prompt_prefix followed by the query
example_prompt = prompt_prefix + example_prompt_query

print(f"Example to classify: \n{example_prompt_query}")
print()
print(f"Example true label: {example_label} (1 for positive / 0 for negative)")

Example to classify: 
Review: it 's worth seeing just on the basis of the wisdom , and at times , the startling optimism , of the children . 
Sentiment:

Example true label: 1 (1 for positive / 0 for negative)


### Let's compare vanilla prompting to ICL

In [None]:
# Let's see what basic prompting gives as an answer:

# Make the basic prompt
base_prompt = "Is the following review expressing a positive or negative sentiment? \n" + make_prompt(example_ins, 'sst2', mode='inference')

inputs = tokenizer.encode_plus(base_prompt, return_tensors="pt", padding=True).to(device=model.device)
# Keep only the last max_context_len tokens if the prompt is too long.
if inputs['input_ids'].shape[1] > max_context_len:
    inputs['input_ids'] = inputs['input_ids'][:, -max_context_len:]
    inputs['attention_mask'] = inputs['attention_mask'][:, -max_context_len:]

# `max_length` counts the prompt tokens too, so `max_length=1` is shorter than
# the prompt itself. `max_new_tokens=1` states the intent directly: generate
# exactly one token after the prompt.
output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_new_tokens=1,  # Can play with the number of new tokens to generate smaller/longer outputs
    do_sample=False,
    num_beams=1,
    pad_token_id=tokenizer.eos_token_id,
)
# The decoded text contains the prompt followed by the generated token.
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

print()
print("model output: \n" + output_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 45, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.



model output: 
Is the following review expressing a positive or negative sentiment? 
Review: it's worth seeing just on the basis of the wisdom, and at times, the startling optimism, of the children. 
Sentiment: 


In [None]:
# Can we do better with ICL ?
print("#################")
print('Model answer')
print("#################")
inputs = tokenizer.encode_plus(example_prompt, return_tensors="pt", padding=True).to(device=model.device)
# Keep only the last max_context_len tokens if the prompt is too long.
if inputs['input_ids'].shape[1] > max_context_len:
    inputs['input_ids'] = inputs['input_ids'][:, -max_context_len:]
    inputs['attention_mask'] = inputs['attention_mask'][:, -max_context_len:]
prompt_len = inputs['input_ids'].shape[1]

# `max_length` counts the prompt tokens too, so `max_length=1` is shorter than
# the prompt itself. `max_new_tokens=1` states the intent directly: generate
# exactly one token after the prompt.
output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_new_tokens=1,  # Can play with the number of new tokens to generate smaller/longer outputs
    do_sample=False,
    num_beams=1,
    pad_token_id=tokenizer.eos_token_id,
)
# Decode only the newly generated token: the prompt is up to max_context_len
# tokens long, and echoing it back buries the model's answer.
output_text = tokenizer.decode(output[0, prompt_len:], skip_special_tokens=True)

print()
print("model output:", output_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 1024, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


#################
Model answer
#################

model output:  positive

Review: '' is a sweet, honest, and enjoyable comedy-drama about a young woman who wants many things in life, but fears she 'll become her mother before she gets to fulfill her dreams. 
Sentiment: positive

Review: inane and unimaginative 
Sentiment: negative

Review: and not in a good way 
Sentiment: negative

Review: personal low 
Sentiment: negative

Review: '' has the right stuff for silly summer entertainment and has enough laughs to sustain interest to the end. 
Sentiment: positive

Review: close to losing my lunch 
Sentiment: negative

Review: memorable zingers 
Sentiment: positive

Review: a graceful, moving tribute to the courage of new york's finest and a nicely understated expression of the grief 
Sentiment: positive

Review: cheesy b-movie playing 
Sentiment: negative

Review: that would make it the darling of many a kids-and-family-oriented cable channel 
Sentiment: positive

Review: wasted. 
Sentime

### What can we do with KNN prompting?

In [None]:
# Train a KNN classifier with the anchors embeddings, using scikit-learn.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def kl_divergence(p, q):
    """Smoothed KL-style divergence between two probability vectors.

    Computes ``mean(p * (log(p) - log(q)))`` after adding machine epsilon to
    both inputs. This is the KL divergence divided by the vector length
    (mean instead of sum), which gives the same nearest-neighbour ordering.

    Args:
        p: array-like of probabilities (the first distribution).
        q: array-like of probabilities, same shape as ``p``.

    Returns:
        The divergence as a float scalar.
    """
    epsilon = np.finfo(float).eps # Add a small constant to avoid division by zero or taking log(0)

    # Build new float64 arrays instead of using `+=`: the in-place form
    # modified the caller's arrays (e.g. the rows handed over by scikit-learn)
    # on every call.
    p = np.asarray(p, dtype=float) + epsilon
    q = np.asarray(q, dtype=float) + epsilon
    return np.mean(p * (np.log(p) - np.log(q)))

# Instantiate the knn classifier with kl as a metric
# KL divergence is neither symmetric nor does it satisfy the triangle
# inequality, so it is not a true metric. The exact brute-force search is
# therefore requested explicitly instead of relying on algorithm='auto'.
knn_classifier = KNeighborsClassifier(n_neighbors=knn, metric=kl_divergence, algorithm='brute')

# Fit the model to the data in the datastore
# (anchor probability vectors as features, anchor label ids as targets)
knn_classifier.fit(anchor_store.queue_anchor.cpu().numpy(), anchor_store.queue_label.cpu().numpy())

In [None]:
# We can compare ICL predictions to kNN Prompting prediction

def knn_predict(prompt):
    """Classify ``prompt`` with the fitted kNN classifier.

    The prompt is run through the LLM, the next-token logits are turned into a
    probability distribution, and that distribution is the query handed to
    ``knn_classifier``.

    Returns:
        The array of predicted label ids returned by ``knn_classifier.predict``
        (one entry for the single prompt).
    """
    next_token_logits = llm_gen(model, prompt, tokenizer, max_context_len)
    next_token_probs = torch.softmax(next_token_logits.float(), dim=-1)
    return knn_classifier.predict(next_token_probs.cpu().numpy())

# Classify the example selected in the "explore the dataset" cell, using the
# full ICL prompt (demonstrations + query) as input, and compare with its label.
example_knn_prediction = knn_predict(example_prompt)
print(f"Predicted label: {example_knn_prediction[0]} / True label {example_label}")
# id2verb maps a label id to the word used for it in the prompts.
print(f"Id to label mapping: 0 = {id2verb[0]} / 1 = {id2verb[1]} ")

Predicted label: 1 / True label 1
Id to label mapping: 0 = negative / 1 = positive 


In [None]:
from tqdm.notebook import tqdm as tqdm_not

# Evaluate across the whole test dataset
# Gold label ids and kNN predictions are collected in parallel lists.
dev_labels = []
dev_pred = []
for ins in tqdm_not(dev_data.data, total=len(dev_data)):
    dev_labels.append(label2id[ins['label']])
    prompt = prompt_prefix + make_prompt(ins, 'sst2', mode='inference')
    # knn_predict returns an array with a single prediction for the prompt.
    dev_pred.extend(knn_predict(prompt))

# 1 where the prediction matches the gold label, 0 otherwise.
dev_correct = [int(gold == pred) for gold, pred in zip(dev_labels, dev_pred)]
acc = sum(dev_correct) / len(dev_labels)
print(f"Prediction accuracy of KNN Prompting across the validation set: {acc*100:.1f}%")

  0%|          | 0/256 [00:00<?, ?it/s]

Prediction accuracy of KNN Prompting across the validation set: 89.8%


#### Let's compare with ICL: (Reusing code from https://github.com/BenfengXu/KNNPrompting/blob/main/icl.py)

In [None]:
from utils.dataset import *
from utils.anchor import AnchorStore
from utils.template import *

# Still using sst2 dataset: https://huggingface.co/datasets/sst2
# (alias used below to build the train and dev splits)
AutoDataset = SST2Dataset

def llm_gen(model, prompt, tokenizer, max_context_len):
    """Return the model's next-token logits for ``prompt``.

    Args:
        model: causal language model; must expose ``.device`` and return an
            object with a ``.logits`` attribute when called.
        prompt: text to condition on.
        tokenizer: tokenizer providing ``encode_plus``.
        max_context_len: maximum number of tokens fed to the model. Longer
            prompts are truncated from the left, i.e. only the last
            ``max_context_len`` tokens are kept.

    Returns:
        CPU tensor holding the logits at the last input position
        (shape [1, 50257] for a single prompt with GPT2-XL).
    """
    inputs = tokenizer.encode_plus(prompt, return_tensors="pt", padding=True).to(device=model.device)
    # Keep the most recent tokens only; for prompts that already fit, the slice
    # is a no-op.
    input_ids = inputs['input_ids'][:, -max_context_len:]
    attention_mask = inputs['attention_mask'][:, -max_context_len:]
    with torch.no_grad():
        # Call the module itself rather than `.forward`, the standard PyTorch
        # entry point.
        logits = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       return_dict=True).logits
    # the output prob is shifted by -1, so we should use the output at the last input token position
    # Slice BEFORE moving to the CPU: only one position is needed, so there is
    # no reason to copy the full [1, seq_len, vocab] tensor off the device.
    gen_logits = logits[:, -1, :].detach().cpu()

    return gen_logits


def parse_response(gen_logits, tokenizer, id2verb):
    """Pick the class whose label word is most probable as the next token.

    Args:
        gen_logits: next-token logits, one row per prompt.
        tokenizer: tokenizer providing ``encode``; the last token id of each
            encoded label word is the one looked up.
        id2verb: iterable of label words, ordered by class id.

    Returns:
        Index (into ``id2verb``) of the most probable label word.
    """
    next_token_probs = torch.softmax(gen_logits.float(), dim=-1)
    # note the space before label word
    verb_token_ids = [tokenizer.encode(' ' + verb)[-1] for verb in id2verb]
    class_probs = torch.cat(
        [next_token_probs[:, token_id] for token_id in verb_token_ids], dim=0
    )
    return torch.argmax(class_probs).tolist()


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Fresh copies of the train / dev splits for the ICL baseline.
dataset_dir = "/content/data/sst2/"
train_data = AutoDataset(dataset_dir, mode='train')
dev_data = AutoDataset(dataset_dir, mode='dev')

# Same context length, demonstration count and seed as in the kNN prompting run.
max_context_len = 1024
n_demo_shot = 32
seed = 43

# inference
train_data.subsamplebyshot(n_demo_shot, seed)
prompt_prefix = make_prompt(train_data, "sst2", mode='train')
label2id = dev_data.label2id
id2verb = train_data.id2verb

# Gold label ids and ICL predictions are collected in parallel lists.
dev_labels = []
dev_pred = []
for ins in tqdm(dev_data.data, total=len(dev_data)):
    dev_labels.append(label2id[ins['label']])
    prompt = prompt_prefix + make_prompt(ins, "sst2", mode='inference')
    gen_logits = llm_gen(model, prompt, tokenizer, max_context_len)
    # The predicted class is the label word with the highest next-token probability.
    dev_pred.append(parse_response(gen_logits, tokenizer, id2verb))

# 1 where the prediction matches the gold label, 0 otherwise.
dev_correct = [int(gold == pred) for gold, pred in zip(dev_labels, dev_pred)]
acc = sum(dev_correct) / len(dev_labels)
print(f"Prediction accuracy for ICL across the validation set: {acc*100:.1f}%")

  0%|          | 0/256 [00:00<?, ?it/s]

Prediction accuracy for ICL across the validation set: 77.7%
