In [1]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq torch==2.1.2 --progress-bar off
!pip install -qqq transformers==4.36.2 --progress-bar off
!pip install -qqq einops==0.7.0 --progress-bar off
!pip install -qqq accelerate==0.25.0 --progress-bar off

[0m

In [2]:
from inspect import cleandoc # The inspect.cleandoc function from the Python inspect module is a handy tool for working with multi-line docstrings or code blocks.



In [3]:



import torch
from transformers import AutoModelForCausalLM

# Hugging Face Hub repository id of the checkpoint; the same constant is reused
# further down to load the matching tokenizer and generation defaults.
MODEL_NAME = "microsoft/phi-2"


# Build the causal language model from the pretrained checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto", # Per the Transformers docs, "auto" takes the dtype from the checkpoint (its config / stored weights) instead of the float32 default; it does not inspect hardware or free memory.
    # flash_attn=True, # These enable Flash Attention and Flash Rotary functions for potential performance improvements.
    # flash_rotary=True, # These enable Flash Attention and Flash Rotary functions for potential performance improvements.
    # fused_dense=True, # Merges certain operations for potential speedups.
    device_map="auto", # Lets Accelerate decide which available device(s) hold the weights; relies on the accelerate package installed in the first cell.
    trust_remote_code=True, # Permits Python modeling code shipped in the model repository to run on this machine; only enable for repositories you trust.
)



# """
# Key points to note

# The AutoModelForCausalLM class is designed for loading and using pre-trained causal language models.
# The from_pretrained() method simplifies the loading process and provides configuration options.
# The options torch_dtype, device_map, and trust_remote_code are related to performance and security.

# """

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the same checkpoint as the model so that
# token ids produced here match the model's vocabulary.
#   - MODEL_NAME: Hub repository id (or local path) of the checkpoint to load from.
#   - trust_remote_code=True: permits tokenizer code shipped in that repository
#     to be executed locally; only enable for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from transformers import GenerationConfig # Holds and manages settings for generating text with models trained for tasks like summarization, translation, and open-ended text creation.

# Start from the generation defaults published with the checkpoint, then
# override only what this notebook needs.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)

# Upper bound on the number of newly generated tokens (the prompt's tokens are
# not counted). Tokens are sub-word units, so this is not a word count.
generation_config.max_new_tokens = 1024

# Greedy decoding: always pick the most likely next token, so the same prompt
# yields the same completion on every run. This replaces the previous
# "do_sample=True with temperature=0.0001" workaround, which approximated greedy
# decoding but still sampled, so unseeded runs were not guaranteed to repeat.
# Temperature only affects sampling, so it is left at its loaded value.
generation_config.do_sample = False

In [6]:
from transformers import TextStreamer # The TextStreamer object is designed to stream generated text in real-time, as it's produced by the model.

# Stream the completion to stdout piece by piece while the model is still
# generating, instead of waiting for the whole answer.
streamer = TextStreamer(
    tokenizer,  # used to decode generated token ids back into text
    skip_prompt=True,  # do not echo the input prompt, only the new text
    skip_special_tokens=True,  # leave special tokens out of the printed text
)

In [12]:
from transformers import pipeline # This function enables you to easily create pipelines for various NLP tasks, including text generation. Think of it as a pre-built tool you can use without diving into the complexities of the underlying model and tokenizer.


# Wrap the already-loaded model and tokenizer in a text-generation pipeline so a
# prompt string can be passed in directly. eos_token_id / pad_token_id are not
# passed, so the library defaults apply.
llm = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,  # decoding settings configured above
    streamer=streamer,  # prints the completion while it is being generated
    num_return_sequences=1,  # produce a single completion per prompt
    return_full_text=True,  # returned text is prompt + completion (the documented default, made explicit)
)

In [13]:
# Default instruction prepended to every user prompt.
SYSTEM_PROMPT = """
You're helpful assistant that always answers truthfully.
""".strip()

def create_prompt(prompt: str, system_prompt: str = SYSTEM_PROMPT) -> str:
  """Format a request in the two-line "Instruct: ... / Output:" layout.

  Args:
    prompt: The user's question or instruction. May span several lines.
    system_prompt: Text placed before ``prompt`` on the "Instruct:" line,
      separated by a single space. Pass an empty string (or None) to omit it.

  Returns:
    ``"Instruct: <system_prompt> <prompt>\\nOutput:"``, or
    ``"Instruct: <prompt>\\nOutput:"`` when ``system_prompt`` is empty.
  """
  # The string is assembled directly instead of dedenting an indented f-string
  # template: dedenting after interpolation left "Output:" indented whenever
  # the prompt itself contained unindented line breaks.
  instruction = f"{system_prompt} {prompt}" if system_prompt else prompt
  return f"Instruct: {instruction}\nOutput:"

# Build a sample prompt with the default system prompt and print it to show the
# exact text that would be sent to the model.
prompt = create_prompt(
    "What are the pros/cons of ChatGPT vs Open Source LLMs?"
)
print(prompt)

Instruct: You're helpful assistant that always answers truthfully. What are the pros/cons of ChatGPT vs Open Source LLMs?
Output: 


In [15]:
%%time
# output = llm(create_prompt("What are the pros/cons of ChatGPT vs Open Source LLMs?"))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs
