In [1]:
	
#!pip install pypdf
#!pip install -q transformers einops accelerate langchain bitsandbytes
#!pip install sentence-transformers
#!pip install quanto
#!pip install llama_index
#%pip install llama-index-embeddings-langchain
#%pip install llama-index-llms-huggingface

In [2]:
import torch
import os
from typing import List, Optional

from dotenv import load_dotenv
# Read key=value pairs from a local .env file into the process environment so that
# os.getenv(...) in the following cell can see them (the notebook expects the file
# to provide HUGGING_FACE_TOKEN).
load_dotenv()

Python-dotenv could not parse statement starting at line 5
Python-dotenv could not parse statement starting at line 6
Python-dotenv could not parse statement starting at line 7


True

In [3]:
# Hugging Face access token, read from the environment.
# HF_TOKEN is always bound (None when the variable is missing) so that later cells
# passing token=HF_TOKEN do not crash with a NameError; gated models will then fail
# with an authentication error from the Hub instead.
api_key = os.getenv("HUGGING_FACE_TOKEN")
HF_TOKEN: Optional[str] = api_key

if HF_TOKEN is None:
    print("HUGGING_FACE_TOKEN environment variable not set.")


# Pick the best available accelerator: CUDA (Windows/Linux GPUs) first,
# then Apple-silicon MPS, and finally plain CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: mps


In [4]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

In [5]:
# Read the source documents from the local ./data directory.
# (When running in Colab the data lives under /content/data instead.)
documents = SimpleDirectoryReader(input_dir="./data").load_data()

In [6]:
## Create the system prompt
# Instructions that steer the model for every query. The text is wrapped in the
# Llama 3 Instruct "system" turn markers so the model reads it as a system message
# rather than as plain text.
# NOTE(review): this relies on HuggingFaceLLM prepending system_prompt to the wrapped
# query before tokenizing, and on the tokenizer adding <|begin_of_text|> itself --
# confirm against the installed llama_index / transformers versions.

system_prompt = (
    "<|start_header_id|>system<|end_header_id|>\n\n"
    "You are a pirate Q&A assistant who always responds in pirate speak!\n"
    "Your goal is to answer questions as accurately as\n"
    "possible based on the instructions and context provided. If you do not know\n"
    "the answer you can say that you dont know, do not try to make up an answer."
    "<|eot_id|>"
)

## Llama 3 Instruct turn format
# Wraps the user's query in a "user" turn and opens the "assistant" turn so the model
# starts generating its answer. The previous <|USER|>/<|ASSISTANT|> markers are not
# part of the Llama 3 prompt format, so the model treated them as ordinary text.

query_wrapper_prompt = PromptTemplate(
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "{query_str}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)

In [7]:
import glob
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model name that we want to load from HF
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Local directory used as a cache for the model weights and the tokenizer.
cache_dir = "./model/llama3_8b/"

# Look for previously saved artifacts. os.path.join keeps the patterns valid
# whether or not cache_dir ends with a path separator.
model_files = glob.glob(os.path.join(cache_dir, "*.safetensors"))
tokenizer_files = glob.glob(os.path.join(cache_dir, "tokenizer.json"))

# Loading options shared by BOTH branches below, so a model restored from the local
# cache is set up exactly like a freshly downloaded one:
#   - float16 halves memory compared with a float32 load (MPS does not support
#     bfloat16, hence float16);
#   - device_map="auto" lets accelerate place the weights on the available device.
# Previously the local branch passed neither option, so the cached model was loaded
# with library defaults instead of these settings.
model_load_kwargs = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}

if model_files:
    print("--Model already exists in the directory. Loading from local directory")
    model = AutoModelForCausalLM.from_pretrained(cache_dir, **model_load_kwargs)
else:
    # Not cached yet: download from the Hugging Face Hub (gated model, needs a token).
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=HF_TOKEN,
        **model_load_kwargs,
    )


if tokenizer_files:
    print("\n--Tokenizer already exists in the directory. Loading from local directory")
    tokenizer = AutoTokenizer.from_pretrained(cache_dir)
else:
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        token=HF_TOKEN,
    )
    


--Model already exists in the directory. Loading from local directory


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


--Tokenizer already exists in the directory. Loading from local directory


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Save the model and tokenizer to a local directory to avoid downloading them every
# time. They can then be loaded directly from this or other notebooks.
#
# Only write what the loading cell did not find on disk: re-serializing a model that
# was just read from cache_dir would rewrite all weight shards for no benefit.
if not model_files:
    model.save_pretrained(cache_dir)
if not tokenizer_files:
    tokenizer.save_pretrained(cache_dir)

In [11]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

# Sentence-transformers checkpoint used to embed document chunks and queries.
embedding_model_id = "sentence-transformers/all-mpnet-base-v2"

# Build the LangChain embedding object first, then wrap it in the llama_index
# adapter so it can be handed to llama_index as embed_model.
lc_embed_model = HuggingFaceEmbeddings(model_name=embedding_model_id)
embed_model = LangchainEmbedding(lc_embed_model)

In [None]:
# Bundle the embedding model and the locally loaded Llama 3 LLM into one service
# context that the index uses for chunking, embedding and answer generation.
# NOTE(review): ServiceContext is deprecated in recent llama_index releases in favour
# of `Settings` -- confirm against the installed version before upgrading.

# Stop generating at the tokenizer's EOS token or at Llama 3's end-of-turn token,
# matching the terminators used for direct model.generate calls in this notebook.
stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    embed_model=embed_model,
    llm=HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=256,
        # Greedy decoding. `temperature` only applies when do_sample=True, so the
        # former temperature=0.3 had no effect and only produced a warning.
        generate_kwargs={"do_sample": False},
        stopping_ids=stopping_ids,
        model=model,
        tokenizer=tokenizer,
        system_prompt=system_prompt,
        query_wrapper_prompt=query_wrapper_prompt,
    ),
)

In [None]:
# Build the vector-store index over the loaded documents, using the embedding model
# and LLM configured in service_context.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# Direct smoke test of the model (no retrieval): a system turn plus one user turn.
messages = [
    dict(role="system", content="You are a pirate chatbot who always responds in pirate speak!"),
    dict(role="user", content="Who are you?"),
]

print("Generating Input ID's and applying chat template")
# Render the chat with the tokenizer's template (add_generation_prompt=True requests
# the generation prompt), then move the tensor onto the model's device.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
input_ids = input_ids.to(model.device)

# Token ids that end generation: the tokenizer's EOS and the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

print("Generating Output")
outputs = model.generate(
    input_ids,
    eos_token_id=terminators,
    max_new_tokens=256,
    # Sampling settings (non-deterministic output).
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

# Drop the prompt tokens so only the newly generated continuation is decoded.
response = outputs[0][input_ids.shape[-1]:]
print("\n\n Response:")
print(tokenizer.decode(response, skip_special_tokens=True))