In [12]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq torch==2.1.2 --progress-bar off
!pip install -qqq transformers==4.36.2 --progress-bar off
!pip install -qqq einops==0.7.0 --progress-bar off
!pip install -qqq accelerate==0.25.0 --progress-bar off

[0m

In [13]:
from inspect import cleandoc # The inspect.cleandoc function from the Python inspect module is a handy tool for working with multi-line docstrings or code blocks.



In [18]:
import torch
from transformers import AutoModelForCausalLM

MODEL_NAME = "microsoft/phi-2"


# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     torch_dtype="auto", #Automatically determines the optimal data type for tensors based on the hardware and available memory.
#     flash_attn=True, # These enable Flash Attention and Flash Rotary functions for potential performance improvements.
#     flash_rotary=True, # These enable Flash Attention and Flash Rotary functions for potential performance improvements.
#     fused_dense=True, # Merges certain operations for potential speedups.
#     device_map="auto", # Automatically determines the best device placement for computations.
#     trust_remote_code=True, #This allows for remote code execution, but should be used with extreme care.
# )

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    # flash_attn=True,
    # flash_rotary=True,
    fused_dense=True,
    device_map="auto",
    trust_remote_code=True
)



# """
# Key points to note

# The AutoModelForCausal LM class is designed for loading and using pre-trained causal language models.
# The from_pretrained() method simplifies the loading process and provides configuration options.
# The options torch_dtype, device_map, and trust_remote_code are related to performance and security.

# """

TypeError: PhiForCausalLM.__init__() got an unexpected keyword argument 'fused_dense'

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)


# """
# Key Components:

# tokenizer = AutoTokenizer.from_pretrained(...): This part creates a tokenizer object using the AutoTokenizer class and loads it from a pre-trained source.
# MODEL_NAME: This placeholder represents the name or path of the pre-trained model or tokenizer you want to load.
# trust_remote_code=True: This optional parameter allows for remote code execution when loading the tokenizer.
# """

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


'\nKey Components:\n\ntokenizer = AutoTokenizer.from_pretrained(...): This part creates a tokenizer object using the AutoTokenizer class and loads it from a pre-trained source.\nMODEL_NAME: This placeholder represents the name or path of the pre-trained model or tokenizer you want to load.\ntrust_remote_code=True: This optional parameter allows for remote code execution when loading the tokenizer.\n'

In [7]:
from transformers import GenerationConfig # Holds and manages settings for generating text with models trained for tasks like summarization, translation, and open-ended text creation.

generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1024 # Limits the length of the generated text to a maximum of 1024 words or subwords.
generation_config.temperature = 0.0001 # Produces highly deterministic and less creative text, as the model will strongly favor the most likely words at each step.
generation_config.do_sample = True # Introduces some randomness, even with the low temperature, so there might be slight variations in the generated text despite the deterministic tendency.



# """
# Customize configuration:

# generation_config.max_new_tokens = 1024
# Sets the maximum number of new tokens to generate (excluding the tokens in the input prompt). In this case, it's set to 1024, meaning the generated text will contain a maximum of 1024 new words or subwords.
# generation_config.temperature = 0.0001
# Controls the randomness of the generated text. Lower values (like 0.0001) make the output more deterministic and repetitive, while higher values introduce more creativity and variation.
# generation_config.do_sample = True
# Enables random sampling during generation, allowing for more diverse and unpredictable text sequences.

# """

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

"\nCustomize configuration:\n\ngeneration_config.max_new_tokens = 1024\nSets the maximum number of new tokens to generate (excluding the tokens in the input prompt). In this case, it's set to 1024, meaning the generated text will contain a maximum of 1024 new words or subwords.\ngeneration_config.temperature = 0.0001\nControls the randomness of the generated text. Lower values (like 0.0001) make the output more deterministic and repetitive, while higher values introduce more creativity and variation.\ngeneration_config.do_sample = True\nEnables random sampling during generation, allowing for more diverse and unpredictable text sequences.\n\n"

In [None]:
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# """
# tokenizer: This is the tokenizer object associated with the model you'll be using for text generation. It's essential for decoding generated tokens into text.
# skip_prompt=True: This option tells the streamer to not include the input prompt in the streamed output. It will only stream the newly generated text.
# skip_special_tokens=True: This option instructs the streamer to ignore special tokens (like those used for padding or marking sentence boundaries) when streaming text. This ensures a cleaner and more readable output.
# """