Special version for Apple Silicon chips with GPU (Metal) acceleration (tested and working on a MacBook Air M2, 2022)

In [2]:
# Import necessary libraries.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains.question_answering import load_qa_chain

In [3]:
# Build the embedding function from HuggingFace's "sentence-transformers/all-MiniLM-L6-v2" model.
# Background: https://huggingface.co/blog/getting-started-with-embeddings
# Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [4]:
# Open the Chroma vector store persisted under ./chroma_db/, passing `embeddings` as its embedding function.
db = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db/",
)

In [5]:
# Run a similarity search for a sample question and show the content of the best match.
inquiry = "Who is the author of this book?"
docs = db.similarity_search(inquiry)

# Guard against an empty result list (e.g. ./chroma_db/ has not been populated yet);
# indexing docs[0] unconditionally would otherwise fail with a bare IndexError.
if docs:
    print(docs[0].page_content)
else:
    print("No matching documents found. Check that ./chroma_db/ contains the ingested book.")

58 Building a superhero team - Get your teammates to read this

Congratulations on finishing this book!

In Chapter 2, we talked about how this book can help you become the superhero of your team.

The only thing better than being a superhero is being part of a superhero team. I hope you’ll give copies of this book to your friends and teammates and help create other superheroes!

Page 118

Machine Learning Yearning-Draft

Andrew Ng


In [1]:
# Install the llama-cpp-python library, built with Metal enabled, to use llama.cpp on Apple Silicon.
# https://python.langchain.com/docs/integrations/llms/llamacpp
# One-time setup: uncomment and run the line below before initializing the LlamaCpp model.
# !CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.6.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Obtaining dependency information for typing-extensions>=4.5.0 from https://files.pythonhosted.org/packages/24/21/7d397a4b7934ff4028987914ac1044d3b7d52712f30e2ac7a2ae5bc86dd0/typing_extensions-4.8.0-py3-none-any.whl.metadata
  Downloading typing_extensions-4.8.0-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Obtaining dependency information for numpy>=1.20.0 from https://files.pythonhosted.org/packages/35/21/9e150d654da358beb29fe216f339dc17f2b2ac

In [6]:
# Llama 2 13B chat model in GGUF format: the quantized weights file and the HuggingFace repo hosting it.
# https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF
model_basename = "llama-2-13b-chat.Q4_K_M.gguf"
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"

In [7]:
# Download the model file from the HuggingFace Hub and keep the local path that is returned.
model_path = hf_hub_download(
    filename=model_basename,
    repo_id=model_name_or_path,
)

In [8]:
# Show the local filesystem path of the downloaded model file.
print(model_path)

/Users/easonlai/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGUF/snapshots/245bc5104d85dcc9a11a0e0a9ae6de38dfae536f/llama-2-13b-chat.Q4_K_M.gguf


In [9]:
# Token-by-token streaming: route generated tokens to stdout through a callback handler.
callback_manager = CallbackManager(
    [StreamingStdOutCallbackHandler()],
)

In [14]:
# GPU (Metal) settings, following https://python.langchain.com/docs/integrations/llms/llamacpp#metal.
n_batch = 512  # Keep between 1 and n_ctx; size it to the RAM available on your Apple Silicon chip.
n_gpu_layers = 1  # For Metal, 1 is enough.

# Load the Llama 2 model through llama.cpp.
llm = LlamaCpp(
    model_path=model_path,
    # Context size and generation length.
    n_ctx=1024,
    max_tokens=256,
    # Metal offloading and batching.
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST be True, otherwise problems appear after a couple of calls.
    # Token streaming and logging.
    callback_manager=callback_manager,
    verbose=False,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/easonlai/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGUF/snapshots/245bc5104d85dcc9a11a0e0a9ae6de38dfae536f/llama-2-13b-chat.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  5120

In [16]:
# Build the question-answering chain on top of `llm`, using the "stuff" chain type.
chain = load_qa_chain(
    llm,
    chain_type="stuff",
)

In [17]:
# First sample question: retrieve the matching documents, then ask the chain.
inquiry = "Who is the author of this book?"
docs = db.similarity_search(inquiry)
chain.run(
    question=inquiry,
    input_documents=docs,
)

 The author of this book is Andrew Ng.

' The author of this book is Andrew Ng.'

In [19]:
# Second sample question: retrieve the matching documents, then ask the chain for a summary.
inquiry = "Please tell me the key summary of this book."
docs = db.similarity_search(inquiry)
chain.run(
    question=inquiry,
    input_documents=docs,
)

 This book is a guide to help you use Machine Learning for real-world projects, especially those that involve deep learning and scaling up your models. The book covers how to set technical direction and avoid common mistakes.

' This book is a guide to help you use Machine Learning for real-world projects, especially those that involve deep learning and scaling up your models. The book covers how to set technical direction and avoid common mistakes.'