# Install the Necessary Libraries
1. Install the libraries
	1. Transformers
	2. bitsandbytes
	3. accelerate
	4. datasets
	5. prompt source

In [1]:
# Check if the copmuter is on google colab
import sys
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    !pip install rich
    !pip install "accelerate>=0.16.0,<1" 
    !pip install "torch>=1.13.1"
    !pip install "transformers[torch]>=4.28.1,<5" 
    !pip install "datasets>=1.14.0,<2"
    !pip install bitsandbytes
    !pip install datasets
    !pip install sentencepiece
    !pip install triton
    !pip install einops
    !pip install safetensors
    !pip install langchain
    !pip install gradio
else:
    print("Not running on Google Colab")
from rich import print

Not running on Google Colab


# Check the GPU

In [2]:
import torch
from rich import print
if torch.cuda.is_available():
    !nvidia-smi
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print("Cuda capability: ", torch.cuda.get_device_capability(0))
    '''
    On pre-ampere hardware bf16 works, but doesn't provide speed-ups compared to fp32 matmul operations, and some matmul operations are failing outright, so this check is more like "guaranteed to work and be performant" than "works somehow".  https://github.com/pytorch/pytorch/issues/75427
    '''
    print(f"bfloat16 support: { torch.cuda.is_bf16_supported()}") # 

# We are selecting one nlp text classification dataset from the papers
	1. GLUE cola
	2. Newsprop
	3. AG News
	4. IMDB
	5. DBPEDIA
	6. Emotion
	7. Tweet offensive
To load a different dataset, replace `"ag_news"` in the `load_dataset` call below with the appropriate dataset name (plus a configuration name where one is required, as with GLUE). The dataset names for the options listed above are:

- "GLUE cola": `"glue", "cola"`
- "Newsprop": `"newsprop"`
- "AG News": `"ag_news"`
- "IMDB": `"imdb"`
- "DBPEDIA": `"dbpedia"`
- "Emotion": `"emotion"`
- "Tweet offensive": `"twt_offensive"`

In [3]:
from datasets import load_dataset 
from rich import print
# Fetch the text-classification dataset used for the rest of the notebook.
dataset = load_dataset("ag_news")
# Show the dataset object first, then the feature schema of its training split.
print(dataset)
train_schema = dataset['train'].features
print(train_schema)

Found cached dataset ag_news (/home/null/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

# The Seed Prompt

In [3]:
# Hand-written seed prompt; the meta prompts ask the model to paraphrase this sentence.
seed_prompt: str = (
    "What label best describes this news article?"
)

# List of Meta Prompt

In [4]:
from rich import print
# Build one meta prompt per paraphrasing template by substituting the seed prompt into it.
list_of_meta_prompt = [
    template.format(seed_prompt=seed_prompt)
    for template in (
        "Write a paraphrase for the following sentence: {seed_prompt} Paraphrase:",
        "{seed_prompt} Paraphrase:",
        "Write a likely paraphrase of the text: {seed_prompt} Paraphrase:",
        "Write a similar sentence similar to the following one: {seed_prompt} Paraphrase:",
        "Paraphrase the following sentence: {seed_prompt} Paraphrase:",
        "Write a variation of this sentence: {seed_prompt}",
        "How would you say the following sentence in a different way? {seed_prompt}",
    )
]
# Disabled experiment, kept as a bare string literal (the cell therefore still echoes it):
# it would prepend one worked paraphrase example to every meta prompt.
'''
for index in range(0 , len(list_of_meta_prompt)):
    list_of_meta_prompt[index] = "Write a paraphrase for the following sentence: What label best describes this news article? Paraphrase: Which term accurately characterizes this news article?\n"+list_of_meta_prompt[index]
print(list_of_meta_prompt)
'''

'\nfor index in range(0 , len(list_of_meta_prompt)):\n    list_of_meta_prompt[index] = "Write a paraphrase for the following sentence: What label best describes this news article? Paraphrase: Which term accurately characterizes this news article?\n"+list_of_meta_prompt[index]\nprint(list_of_meta_prompt)\n'

# Download the Model and Tokenizer that will be used for paraphrasing
5. The paper recommends using GPT-3 because of its impressive paraphrasing ability, and to ensure there is a separation between the models. I will be using multiple models: a long-context instruction model, a regular instruction model, and Dolly.
	1. List of model to select from
		1. Flan T5
			1. 11 billion or xxl which you need 88 Gigabytes in fp 32 , if load_8_bits you need bit above 16 bits.
			2. 3 billion xl which will be around
		2. MPT
			1. MPT-7b ( mosaicml/mpt-7b )
				1. Architecture
					1. [[ALiBi]]
					2. [[FasterTransformers]]
					3. [[Flash Attention]]
	2. Inference Notes on Github 
		1. https://github.com/huggingface/transformers/pull/10956
		2. https://github.com/huggingface/transformers/pull/20878
		3. https://github.com/huggingface/transformers/pull/20760
		4. https://github.com/huggingface/transformers/pull/20683
		5. https://github.com/huggingface/transformers/pull/19468

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration , AutoTokenizer
import time
import torch
# Checkpoint used throughout the notebook. It is defined once so that the GPU and CPU
# paths cannot drift apart (the original repeated it in both branches).
MODEL  = "databricks/dolly-v2-3b"

if torch.cuda.is_available():
    # The Ampere hardware uses a data type called tf32. It has the same numerical range as
    # fp32 (8-bits), but instead of 23 bits precision it has only 10 bits (same as fp16).
    # In total it uses only 19 bits. Normal fp32 code can use it simply by enabling the
    # flag below, for up to 3x throughput improvement.
    #
    # The check uses the compute capability (major version 8 or higher) rather than an exact
    # device-name match: the name reported by the driver varies (for example with a vendor
    # prefix), which made the original name comparison miss the GPUs it was meant for.
    if torch.cuda.get_device_capability(0)[0] >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True

# Load (and time) the tokenizer that matches MODEL; this is identical on GPU and CPU.
start_time_to_download_tokenizer = time.time()
tokenizer = AutoTokenizer.from_pretrained(MODEL)
print(f"Time to download tokenizer: {time.time() - start_time_to_download_tokenizer}")
print("Using AutoTokenizer")

In [None]:
## Load the 11B encoder and decoder parameter model which will take a while which will be 80 GB in fp 32
from transformers import AutoModelForSeq2SeqLM , AutoModelForCausalLM , AutoConfig
import time
# Load the checkpoint named by MODEL, choosing dtype, 8-bit quantisation and device
# placement per model family, and report how long the load took.
start_time_to_download_model = time.time()
model = None

# Encoder-decoder checkpoints, loaded with AutoModelForSeq2SeqLM.
SEQ2SEQ_MODELS = [
    "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
    "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
    "google/flan-t5-xl", "google/flan-t5-xxl",
]
# Decoder-only checkpoints, loaded with AutoModelForCausalLM.
# "databricks/dolly-v2-7b" is listed here because the per-model branch below handles it;
# it was previously unreachable.
CAUSAL_LM_MODELS = [
    "mosaicml/mpt-7b", "distilgpt2",
    "databricks/dolly-v2-12b", "databricks/dolly-v2-7b", "databricks/dolly-v2-3b",
]

cuda_available = torch.cuda.is_available()
# Only query bf16 support when a CUDA device exists.
bf16_available = cuda_available and torch.cuda.is_bf16_supported()

if MODEL in SEQ2SEQ_MODELS:
    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL,  # load the selected checkpoint (the original always loaded "google/flan-t5-xxl" here)
        device_map="auto",  # When passing a device_map, low_cpu_mem_usage is automatically set to True
        load_in_8bit=True,
        torch_dtype=torch.float16,  # due to requirements of `bitsandbytes` to enable model loading in mixed int8
    )
elif MODEL in CAUSAL_LM_MODELS:
    config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=MODEL,
        # MPT uses a custom model architecture that is not yet part of the Hugging Face
        # transformers package, so remote code has to be trusted.
        trust_remote_code=True,
    )
    # Defaults shared by every causal LM; the per-model branches override them. Defining
    # them up front keeps the "distilgpt2" path from hitting undefined names.
    load_in_8bit = cuda_available
    trust_remote_code = True
    device_map = None
    torch_data_type = None  # None lets from_pretrained use its own default dtype
    if MODEL == "mosaicml/mpt-7b":
        torch_data_type = torch.bfloat16 if bf16_available else torch.float16
        print(f"torch_data_type: {torch_data_type}")
        # triton FlashAttention goes with bfloat16, the "flash" implementation with float16.
        config.attn_config['attn_impl'] = 'triton' if bf16_available else 'flash'
        # Although the model was trained with a sequence length of 2048, ALiBi enables users
        # to increase the maximum sequence length during finetuning and/or inference.
        config.update({"max_seq_len": 4096})
        load_in_8bit = False
        device_map = None  # ValueError: MPTForCausalLM does not support `device_map='auto'` yet.
    elif MODEL == "databricks/dolly-v2-3b":
        # A model loaded in half precision has to run on a GPU; the operations are not
        # implemented in half on the CPU.
        torch_data_type = torch.bfloat16
        device_map = "auto"
        load_in_8bit = False
    elif MODEL in ["databricks/dolly-v2-12b", "databricks/dolly-v2-7b"]:
        if bf16_available:
            torch_data_type = torch.bfloat16
        elif cuda_available:
            torch_data_type = torch.float16
        else:
            torch_data_type = torch.float32
        assert torch_data_type != torch.float16 , "torch_data_type should not be torch.float16"
        device_map = "auto"
        load_in_8bit = True
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=MODEL,
        config=config,
        load_in_8bit=load_in_8bit,
        trust_remote_code=trust_remote_code,
        device_map=device_map,
        torch_dtype=torch_data_type,
        low_cpu_mem_usage=True,  # device map is set to auto then low_cpu_mem_usage is automatically set to True
    )
    # Eval mode disables dropout, which is important for reproducible inference results.
    model.eval()
else:
    # The original built this exception without raising it, so an unsupported MODEL went unnoticed.
    raise ValueError(f"MODEL: {MODEL} is not supported")
end_time_to_download_model = time.time()
print(f"Time to download model: {end_time_to_download_model - start_time_to_download_model}")

# Pipeline

## Instruction Pipeline

In [None]:
import logging
import re
from typing import List

import numpy as np
from transformers import Pipeline, PreTrainedTokenizer


logger = logging.getLogger(__name__)

# Marker strings that delimit the sections of a prompt / response.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# Prompt used when generating with an already trained model. It finishes with the response
# key (followed by a newline), so the model's completion is the response itself. The literal
# "{instruction}" placeholder is left in place to be filled in later with str.format.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    (
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
        "",
    )
)


def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
    """Gets the token ID for a given string that has been added to the tokenizer as a special token.
    When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
    treated specially and converted to a single, new token.  This retrieves the token ID each of these keys map to.
    Args:
        tokenizer (PreTrainedTokenizer): the tokenizer
        key (str): the key to convert to a single token
    Raises:
        ValueError: if the key does not encode to exactly one token ID
    Returns:
        int: the token ID for the given key
    """
    token_ids = tokenizer.encode(key)
    # Reject an empty encoding as well as a multi-token one, so callers that catch
    # ValueError never see an IndexError from the lookup below.
    if len(token_ids) != 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]


class InstructionTextGenerationPipeline(Pipeline):
    """Pipeline that wraps an instruction in ``PROMPT_FOR_GENERATION_FORMAT``, generates a
    completion with the model, and returns the text found after the response key.
    """

    def __init__(
        self, *args, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs
    ):
        """Initialize the pipeline
        Args:
            do_sample (bool, optional): Whether or not to use sampling. Defaults to True.
            max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 256.
            top_p (float, optional): If set to float < 1, only the smallest set of most probable tokens with
                probabilities that add up to top_p or higher are kept for generation. Defaults to 0.92.
            top_k (int, optional): The number of highest probability vocabulary tokens to keep for top-k-filtering.
                Defaults to 0.
        """
        super().__init__(*args, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k,
                         **kwargs)

    def _sanitize_parameters(self,
                             return_full_text: bool = None,
                             **generate_kwargs):
        """Split the call-time keyword arguments into preprocess / forward / postprocess parameters.

        Args:
            return_full_text (bool, optional): forwarded to ``postprocess`` when not None.
            **generate_kwargs: passed on to ``_forward`` (and from there to ``model.generate``).
        Returns:
            tuple: ``(preprocess_params, forward_params, postprocess_params)``.
        """
        preprocess_params = {}

        # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
        # append a newline to yield a single token.  find whatever token is configured for the response key.
        tokenizer_response_key = next(
            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
        )

        response_key_token_id = None
        end_key_token_id = None
        if tokenizer_response_key:
            try:
                response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)

                # Ensure generation stops once it generates "### End"
                generate_kwargs["eos_token_id"] = end_key_token_id
            except ValueError:
                # Deliberate best effort: when a key is not a single token, postprocess
                # falls back to locating the response with a regex on the decoded text.
                pass

        forward_params = generate_kwargs
        postprocess_params = {
            "response_key_token_id": response_key_token_id,
            "end_key_token_id": end_key_token_id
        }

        if return_full_text is not None:
            postprocess_params["return_full_text"] = return_full_text

        return preprocess_params, forward_params, postprocess_params

    def preprocess(self, instruction_text, **generate_kwargs):
        """Format the instruction into the generation prompt and tokenize it.

        Returns:
            The tokenizer output (PyTorch tensors) with the extra keys ``prompt_text`` and
            ``instruction_text`` attached.
        """
        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text)
        inputs = self.tokenizer(
            prompt_text,
            return_tensors="pt",
        )
        inputs["prompt_text"] = prompt_text
        inputs["instruction_text"] = instruction_text
        return inputs

    def _forward(self, model_inputs, **generate_kwargs):
        """Run ``model.generate`` on the tokenized prompt.

        Returns:
            dict: ``generated_sequence`` (reshaped to ``(in_b, out_b // in_b, ...)``),
            ``input_ids`` and ``instruction_text``.
        """
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs.get("attention_mask", None)

        if input_ids.shape[1] == 0:
            input_ids = None
            attention_mask = None
            in_b = 1
        else:
            in_b = input_ids.shape[0]

        generated_sequence = self.model.generate(
            # input_ids is None for an empty prompt (see above), so guard the device move.
            input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
            attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
            pad_token_id=self.tokenizer.pad_token_id,
            **generate_kwargs,
        )

        out_b = generated_sequence.shape[0]
        if self.framework == "pt":
            generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
        elif self.framework == "tf":
            # NOTE(review): `tf` is not imported anywhere in the visible notebook, so this
            # branch would raise NameError if it were ever reached.
            generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))

        instruction_text = model_inputs.pop("instruction_text")
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

    def postprocess(self, model_outputs, response_key_token_id, end_key_token_id, return_full_text: bool = False):
        """Decode the generated token IDs and extract the response text.

        Args:
            model_outputs: the dict returned by ``_forward``.
            response_key_token_id: token ID of the response key, or None if unknown.
            end_key_token_id: token ID of the end key, or None if unknown.
            return_full_text (bool): when True, prefix the response with the instruction text.
        Returns:
            list: one ``{"generated_text": ...}`` record per generated sequence.
        """

        generated_sequence = model_outputs["generated_sequence"][0]
        instruction_text = model_outputs["instruction_text"]

        generated_sequence: List[List[int]] = generated_sequence.numpy().tolist()
        records = []
        for sequence in generated_sequence:

            # The response will be set to this variable if we can identify it.
            decoded = None

            # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
            # Compare against None explicitly so that a token ID of 0 is not mistaken for "missing".
            if response_key_token_id is not None and end_key_token_id is not None:
                # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
                # prompt, we should definitely find it.  We will return the tokens found after this token.
                try:
                    response_pos = sequence.index(response_key_token_id)
                except ValueError:
                    logger.warning(f"Could not find response key {response_key_token_id} in: {sequence}")
                    response_pos = None

                # Position 0 is a valid match, hence the explicit None check.
                if response_pos is not None:
                    # Next find where "### End" is located.  The model has been trained to end its responses with this
                    # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
                    # this token, as the response could be truncated.  If we don't find it then just return everything
                    # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
                    try:
                        end_pos = sequence.index(end_key_token_id)
                    except ValueError:
                        end_pos = None

                    decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()

            if not decoded:
                # Otherwise we'll decode everything and use a regex to find the response and end.

                fully_decoded = self.tokenizer.decode(sequence)

                # The response appears after "### Response:".  The model has been trained to append "### End" at the
                # end.
                m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)

                if m:
                    decoded = m.group(1).strip()
                else:
                    # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
                    # return everything after "### Response:".
                    m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
                    if m:
                        decoded = m.group(1).strip()
                    else:
                        logger.warning(f"Failed to find response in:\n{fully_decoded}")

            # If the full text is requested, then append the decoded text to the original instruction.
            # This technically isn't the full text, as we format the instruction in the prompt the model has been
            # trained on, but to the client it will appear to be the full text.
            if return_full_text:
                decoded = f"{instruction_text}\n{decoded}"

            rec = {"generated_text": decoded}

            records.append(rec)

        return records

In [None]:
# Wrap the loaded model and tokenizer in the instruction pipeline and show its task name.
generate_text = InstructionTextGenerationPipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
)
print(generate_text.task)

# Langchain Prompt with Hugging Face Pipeline

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline

# Prompt template for an instruction that comes without additional input.
prompt = PromptTemplate(
    template="{instruction}",
    input_variables=["instruction"],
)

# Prompt template for an instruction that is accompanied by a context passage.
prompt_with_context = PromptTemplate(
    template="{instruction}\n\nInput:\n{context}",
    input_variables=["instruction", "context"],
)

# Expose the instruction pipeline to LangChain and build one chain per template.
hf_pipeline = HuggingFacePipeline(pipeline=generate_text)
llm_chain = LLMChain(prompt=prompt, llm=hf_pipeline)
llm_context_chain = LLMChain(prompt=prompt_with_context, llm=hf_pipeline)

# Try the instruction-only chain.
fission_answer = llm_chain.predict(instruction="Explain to me the difference between nuclear fission and fusion.")
print(fission_answer.lstrip())

context = """George Washington (February 22, 1732[b] – December 14, 1799) was an American military officer, statesman,
and Founding Father who served as the first president of the United States from 1789 to 1797."""

# Try the chain that also receives a context passage.
washington_answer = llm_context_chain.predict(instruction="When was George Washington president?", context=context)
print(washington_answer.lstrip())


# Generation Configuration

In [None]:
from transformers import AutoModelForCausalLM, GenerationConfig
# Decoding settings, gathered in a dict and expanded into a GenerationConfig.
generation_settings = {
    "max_new_tokens": 256,  # maximum number of tokens to generate, ignoring the prompt tokens
    "num_beams": 2,  # 1 would mean greedy search instead of beam search
    "temperature": .3,  # manipulates the model output logits
    "top_p": 0.95,  # manipulates the model output logits
    "do_sample": True,  # sample the next token instead of always taking the most probable one
}
generation_config = GenerationConfig(**generation_settings)

# Device Mapping for the Model

In [None]:
# Print the model's `hf_device_map` when it has one. Only the "attribute is missing" case is
# handled; the original bare `except:` also hid unrelated failures such as `model` never
# having been defined.
device_map_report = getattr(model, "hf_device_map", None)
if device_map_report is not None:
    print(device_map_report)
else:
    print("model.hf_device_map is not available")

#  Paraphrase the Prompt

## Paraphrase the prompt with the model and tokenizer

In [1]:
# Disabled alternative to the pipeline-based paraphrasing: the code is wrapped in a bare
# string literal, so none of it runs and the cell only echoes the text. It tokenizes each
# meta prompt, calls `model.generate` with `generation_config`, and decodes the first output.
'''
from typing import TypedDict
from torch import LongTensor
class TokenizerOutput(TypedDict):
  input_ids: LongTensor
  attention_mask: LongTensor
potential_prompts = list()
for meta_prompt in list_of_meta_prompt:
    tokenized_prompts: TokenizerOutput = tokenizer( meta_prompt, return_tensors='pt', padding=False, add_special_tokens=False)
    outputs = model.generate(
        input_ids = tokenized_prompts.input_ids.to(model.device) , # model.device is the device where the model is loaded (cpu or gpu
        attention_mask = tokenized_prompts.attention_mask.to(model.device) , # model.device is the device where the model is loaded (cpu or gpu
        generation_config = generation_config,
        do_sample = generation_config.temperature > 0
    )
    potential_prompts.append(tokenizer.decode(outputs[0] , skip_special_tokens=True))
'''

"\nfrom typing import TypedDict\nfrom torch import LongTensor\nclass TokenizerOutput(TypedDict):\n  input_ids: LongTensor\n  attention_mask: LongTensor\npotential_prompts = list()\nfor meta_prompt in list_of_meta_prompt:\n    tokenized_prompts: TokenizerOutput = tokenizer( meta_prompt, return_tensors='pt', padding=False, add_special_tokens=False)\n    outputs = model.generate(\n        input_ids = tokenized_prompts.input_ids.to(model.device) , # model.device is the device where the model is loaded (cpu or gpu\n        attention_mask = tokenized_prompts.attention_mask.to(model.device) , # model.device is the device where the model is loaded (cpu or gpu\n        generation_config = generation_config,\n        do_sample = generation_config.temperature > 0\n    )\n    potential_prompts.append(tokenizer.decode(outputs[0] , skip_special_tokens=True))\n"

## Paraphrase the prompt with the Pipeline

In [None]:
# Ask the LLM chain for one paraphrase per meta prompt and time the whole batch.
start_time = time.time()
potential_prompt = [
    llm_chain.predict(instruction=meta_prompt).lstrip()
    for meta_prompt in list_of_meta_prompt
]
end_time = time.time()
# Report the real number of prompts (the original message hard-coded "10").
print(f"Time to generate {len(potential_prompt)} prompts: {end_time - start_time}")

In [6]:
print(potential_prompt)
# Keep the seed prompt itself as a candidate. The membership test makes the cell safe to
# re-run without appending a duplicate each time.
if seed_prompt not in potential_prompt:
    potential_prompt.append(seed_prompt)
# Remove all empty strings (and None entries). The original called `filter("", ...)`, whose
# first argument is not callable, so it raised a TypeError that the bare `except:` swallowed
# and nothing was ever filtered.
potential_prompt = [candidate for candidate in potential_prompt if candidate]

NameError: name 'potential_prompt' is not defined

# Use semantic search with Sentence Transformers on the CPU to compare which prompts are good

# BackTranslation

In [None]:
# Back-translation: send every candidate prompt English -> German -> English and add the
# round-tripped versions to the candidate list.
pp_temp = len(potential_prompt)  # number of prompts before any back-translation is added
eng_german = [None] * pp_temp
back_translated = [None] * pp_temp
# Iterate over the ORIGINAL prompts only. The original loop ran over the already-extended
# list, which indexed past the end of `eng_german` and tried to translate None padding.
for index in range(pp_temp):
    eng_german[index] = llm_chain.predict(instruction= f"translate English to German: {potential_prompt[index]}").lstrip()
    back_translated[index] = llm_chain.predict(instruction= f"translate German to English: {eng_german[index]}").lstrip()
# Extend only after every translation succeeded, so a failure leaves the list untouched.
potential_prompt += back_translated
    

# Gradio Demo: text_generation
### This text generation demo takes in input text and returns generated text. It uses the Transformers library to set up the model and has two examples.
        

In [None]:
import gradio as gr

def generate(text):
    """Run `text` through `llm_chain` as the instruction and return the prediction
    with leading whitespace removed."""
    prediction = llm_chain.predict(instruction=text)
    return prediction.lstrip()

# Example inputs offered in the demo UI (one single-element row per example).
examples = [
    ["The Moon's orbit around Earth has"],
    ["The smooth Borealis basin in the Northern Hemisphere covers 40%"],
]

# Build the text-in / text-out interface. The top-level `gr.Textbox` component is used for
# both sides because the legacy `gr.inputs` / `gr.outputs` namespaces no longer exist in
# current Gradio releases, and the notebook installs gradio without a version pin.
demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs=gr.Textbox(label="Generated Text"),
    examples=examples,
)

demo.launch()


# Perplexity

In [None]:
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2LMHeadModel, GPT2TokenizerFast

# Sliding-window perplexity of a text under a causal language model.
device = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU instead of failing
model_id = "databricks/dolly-v2-3b"
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
# Load the tokenizer that ships with the checkpoint rather than forcing the GPT-2 tokenizer class.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# GPT-2 style configs call the context length `n_positions`; other configs call it
# `max_position_embeddings`. Accept either so the cell does not depend on the config class.
max_length = getattr(model.config, "n_positions", None) or model.config.max_position_embeddings
stride = 512

# NOTE(review): the original cell used `encodings` without ever defining it. Scoring the
# candidate prompts (joined into one text) is an assumption; replace `perplexity_text`
# with the intended evaluation text if that is not what was meant.
perplexity_text = "\n\n".join(candidate for candidate in potential_prompt if candidate)
encodings = tokenizer(perplexity_text, return_tensors="pt")
seq_len = encodings.input_ids.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    # Mask everything except the last `trg_len` tokens so that tokens already scored in a
    # previous window only serve as context.
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity is the exponential of the mean negative log-likelihood over all windows.
ppl = torch.exp(torch.stack(nlls).mean())