In [1]:
# Install the packages listed in requirements.txt via the %pip line magic.
# The note printed below says a kernel restart may be needed afterwards.
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model shared by the rest of the notebook: it is used later both to
# embed node text before storage and to embed query strings for retrieval.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

In [1]:
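# Optional one-time download of the quantized Mistral-7B weights into ./models.
# Uncomment and run if models/mistral-7b-v0.1.Q4_K_M.gguf is not already present.
#from huggingface_hub import hf_hub_download
#hf_hub_download(repo_id='TheBloke/Mistral-7B-v0.1-GGUF', filename='mistral-7b-v0.1.Q4_K_M.gguf', local_dir='./models', local_dir_use_symlinks=False)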

In [5]:
from pathlib import Path

from llama_index.llms.llama_cpp import LlamaCPP

# Remote copy of the quantized weights. LlamaCPP downloads from this URL only
# when no model_path is supplied, so it serves as the fallback source.
# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"
model_url = "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf"

# Pre-downloaded weights, relative to the notebook's working directory.
model_path = "models/mistral-7b-v0.1.Q4_K_M.gguf"

# Prefer the local file; fall back to the URL so a fresh checkout (where the
# weights have not been downloaded yet) can still run this cell.
use_local_weights = Path(model_path).exists()

llm = LlamaCPP(
    # URL of a GGUF model to download automatically (ignored when a local path is given)
    model_url=None if use_local_weights else model_url,
    # path to a pre-downloaded model, used instead of model_url when the file exists
    model_path=model_path if use_local_weights else None,
    temperature=0.1,
    max_new_tokens=256,
    # This GGUF's metadata reports llama.context_length = 32768; 3900 is a
    # deliberately conservative cap carried over from the Llama 2 setup.
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set n_gpu_layers to at least 1 to use GPU; 0 keeps inference on CPU
    model_kwargs={"n_gpu_layers": 0},
    verbose=True,
)


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from models/mistral-7b-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_

llm_load_vocab: special tokens definition check successful ( 259/32000 ).
llm_load_print_meta: format           = GGUF V2
llm_load_print_meta: arch             = llama
llm_load_print_meta: vocab type       = SPM
llm_load_print_meta: n_vocab          = 32000
llm_load_print_meta: n_merges         = 0
llm_load_print_meta: n_ctx_train      = 32768
llm_load_print_meta: n_embd           = 4096
llm_load_print_meta: n_head           = 32
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_layer          = 32
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 4
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     = 1024
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff

In [4]:
import psycopg2

# Disabled scaffold: connects to the server's "postgres" maintenance database
# and drops/recreates `db_name`. Uncomment only if the database must be reset.
# db_name = "vector_db"
# host = "localhost"
# password = "postgres"
# port = "5432"
# user = "postgres"
# # conn = psycopg2.connect(connection_string)
# conn = psycopg2.connect(
#     dbname="postgres",
#     host=host,
#     password=password,
#     port=port,
#     user=user,
# )
# conn.autocommit = True

# with conn.cursor() as c:
#     c.execute(f"DROP DATABASE IF EXISTS {db_name}")
#     c.execute(f"CREATE DATABASE {db_name}")


In [10]:
import os

from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Connection settings are taken from the standard libpq environment variables
# when they are set, so real credentials never have to be committed with the
# notebook. The fallbacks are the local development defaults used so far.
db_name = os.environ.get("PGDATABASE", "vectordb")
host = os.environ.get("PGHOST", "db")
password = os.environ.get("PGPASSWORD", "testpwd")
port = os.environ.get("PGPORT", "5432")
user = os.environ.get("PGUSER", "testuser")

vector_store = PGVectorStore.from_params(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="llama2_paper",
    # Must equal the embedding model's output size: BAAI/bge-small-en
    # (the embed_model used in this notebook) produces 384-dimensional vectors.
    embed_dim=384,
)

In [11]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader

# Load ./data/llama2.pdf with the PyMuPDF-based reader. Later cells iterate over
# `documents` and read each item's `.text` and `.metadata`.
loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")

In [12]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter configured with a 1024 chunk size; all other options keep their defaults.
text_parser = SentenceSplitter(chunk_size=1024)

# Two parallel lists: text_chunks[i] was cut from documents[doc_idxs[i]].
# The index list lets a later step attach the source document's metadata
# to every chunk.
text_chunks = []
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    pieces = text_parser.split_text(doc.text)
    text_chunks += pieces
    doc_idxs += [doc_idx for _ in pieces]



In [13]:
from llama_index.core.schema import TextNode

# Wrap every text chunk in a TextNode carrying its source document's metadata.
nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idxs[idx]]
    # Copy the metadata instead of assigning the document's dict directly:
    # several chunks come from the same document, and sharing one dict would
    # make a metadata edit on one node silently change all its siblings (and
    # the source document).
    node.metadata = dict(src_doc.metadata)
    nodes.append(node)

In [14]:
# Embed all nodes through the model's batch API rather than one call per node;
# the per-node loop was slow enough to be interrupted. The text that is
# embedded includes the node metadata (metadata_mode="all").
node_texts = [node.get_content(metadata_mode="all") for node in nodes]
node_embeddings = embed_model.get_text_embedding_batch(
    node_texts, show_progress=True
)

# Embeddings come back in input order. Assign them only after the whole batch
# has succeeded, so an interrupted run never leaves nodes half-populated.
for node, node_embedding in zip(nodes, node_embeddings):
    node.embedding = node_embedding


KeyboardInterrupt: 

In [10]:
# Write the embedded nodes to the vector store. The call is left as the cell's
# final expression so its return value (a list of UUID-like strings, presumably
# the stored node ids) is displayed.
vector_store.add(nodes)

['cb2850a9-c555-4029-a0ea-d51dbb13a605',
 '96db18ac-b9ee-46c2-bb35-d363caed0378',
 'fc3960c1-b83b-4227-9aff-0998e047d4ae',
 '430298dc-0a40-4088-b5c3-7143d0ebf23c',
 '3e1b6d4a-0f21-4e2a-ba84-b26638fe0139',
 '05b17df6-0912-4bad-a5d0-afa4e2192be5',
 'e17cc6f5-8567-4ad9-899e-a276f9798ffb',
 'ea2126ac-0051-45ee-b016-250b163393ab',
 '357f1da0-c754-42c6-8d3d-3c2a76e7cf47',
 '5b3a432c-2f58-482b-8f1b-7c4412aa23ae',
 'fd5a42cf-5b21-40a5-bd4e-590724cb2652',
 'c54da034-851a-470a-be72-54159528008c',
 '0d36b98c-53e0-427f-9eb7-b28df3ad7178',
 '826ab7fc-2e0a-4656-87cd-5bfc9e6b0121',
 'eb9e8d30-a33a-4e2b-8fce-5100a74c1c76',
 '685bc9e0-3858-47f3-8b11-c9be2e425cb3',
 '1a0fe26a-b514-45bf-990e-4013402b9166',
 '45955a50-b0c7-485a-afe1-32f73992b291',
 '224b62e0-8f9f-4c1b-9564-36de01884bee',
 '77ece127-4ff3-41b1-a292-ad27d5552965',
 '88d126bc-c5a3-42ff-80ee-4a1491914f4f',
 '338357a4-7414-456b-8245-aa1eca61f849',
 '16db3e7a-bd54-4bff-a429-28c8be4e130b',
 '68d52a6c-acd4-449a-a325-938b13e4783b',
 '7fb91cbd-96c9-

In [15]:
# Question used for the manual, step-by-step retrieval in the following cells.
query_str = "Can you tell me about the key concepts for safety finetuning"


In [16]:
# Embed the question with the same model that embedded the stored nodes.
query_embedding = embed_model.get_query_embedding(query_str)


In [17]:
from llama_index.core.vector_stores import VectorStoreQuery

# Query mode passed to the vector store ("sparse" and "hybrid" are the
# alternatives noted in this cell).
query_mode = "default"

# Ask for the two nearest stored nodes to the query embedding.
vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding,
    similarity_top_k=2,
    mode=query_mode,
)

In [18]:
# Run the similarity search and print the text of the first returned node.
query_result = vector_store.query(vector_store_query)
print(query_result.nodes[0].get_content())


TruthfulQA ↑
ToxiGen ↓
MPT
7B
29.13
22.32
30B
35.25
22.61
Falcon
7B
25.95
14.53
40B
40.39
23.44
Llama 1
7B
27.42
23.00
13B
41.74
23.08
33B
44.19
22.57
65B
48.71
21.77
Llama 2
7B
33.29
21.25
13B
41.86
26.10
34B
43.45
21.19
70B
50.18
24.60
Table 11: Evaluation of pretrained LLMs on automatic safety benchmarks. For TruthfulQA, we present the
percentage of generations that are both truthful and informative (the higher the better). For ToxiGen, we
present the percentage of toxic generations (the smaller, the better).
Benchmarks give a summary view of model capabilities and behaviors that allow us to understand general
patterns in the model, but they do not provide a fully comprehensive view of the impact the model may have
on people or real-world outcomes; that would require study of end-to-end product deployments. Further
testing and mitigation should be done to understand bias and other social issues for the specific context
in which a system may be deployed. For this, it may be necessary

In [19]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

# Pair each retrieved node with its similarity score; the score stays None
# when the query result carries no similarities.
nodes_with_scores = []
similarities = query_result.similarities
for index, node in enumerate(query_result.nodes):
    score: Optional[float] = (
        similarities[index] if similarities is not None else None
    )
    nodes_with_scores.append(NodeWithScore(node=node, score=score))


In [20]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List


class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store.

    Embeds the incoming query string with the supplied embedding model, runs a
    similarity search against the supplied vector store, and wraps every hit in
    a ``NodeWithScore``.
    """

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Store the collaborators and search settings.

        Args:
            vector_store: Store that is queried in ``_retrieve``.
            embed_model: Object exposing ``get_query_embedding(str)``.
            query_mode: Mode forwarded to ``VectorStoreQuery``.
            similarity_top_k: Number of results requested per query.
        """
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Return the nodes most similar to ``query_bundle.query_str``.

        Args:
            query_bundle: Query whose ``query_str`` is embedded and searched.

        Returns:
            One ``NodeWithScore`` per node in the store's result; ``score`` is
            ``None`` when the store returns no similarities.
        """
        # Use the collaborators injected through __init__, not same-named
        # notebook globals, so the instance works with whatever it was given.
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = self._vector_store.query(vector_store_query)

        similarities = query_result.similarities
        nodes_with_scores: List[NodeWithScore] = []
        # `or []` tolerates a result object whose nodes field was left unset.
        for index, node in enumerate(query_result.nodes or []):
            score: Optional[float] = None
            if similarities is not None:
                score = similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [21]:
# Build the custom retriever over the notebook's store and embedding model,
# asking for the two closest nodes per query.
retriever = VectorDBRetriever(
    vector_store=vector_store,
    embed_model=embed_model,
    query_mode="default",
    similarity_top_k=2,
)


In [22]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Combine the custom retriever with the local llama.cpp LLM into a query engine.
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

In [23]:
# End-to-end query: retrieval plus answer generation by the local LLM.
# The llama_print_timings lines printed below come from this call.
query_str = "How does Llama 2 perform compared to other open-source models?"

response = query_engine.query(query_str)



llama_print_timings:        load time =   40041.65 ms
llama_print_timings:      sample time =      65.91 ms /   143 runs   (    0.46 ms per token,  2169.53 tokens per second)
llama_print_timings: prompt eval time =   61818.87 ms /   764 tokens (   80.91 ms per token,    12.36 tokens per second)
llama_print_timings:        eval time =   30660.24 ms /   142 runs   (  215.92 ms per token,     4.63 tokens per second)
llama_print_timings:       total time =   92929.78 ms /   906 tokens


In [20]:
# Show the generated answer (print applies str() to the response itself).
print(response)


 Based on the context information, Llama 2 performs on par or better than other open-source models such as PaLM (540B) on almost all benchmarks, but there is still a significant gap in performance between Llama 2 and closed-source models such as GPT-3.5 and GPT-4.


In [21]:
# Print the text of the first source node attached to the response.
print(response.source_nodes[0].get_content())


Additionally, Llama 2 70B model outperforms all open-source models.
In addition to open-source models, we also compare Llama 2 70B results to closed-source models. As shown
in Table 4, Llama 2 70B is close to GPT-3.5 (OpenAI, 2023) on MMLU and GSM8K, but there is a significant
gap on coding benchmarks. Llama 2 70B results are on par or better than PaLM (540B) (Chowdhery et al.,
2022) on almost all benchmarks. There is still a large gap in performance between Llama 2 70B and GPT-4
and PaLM-2-L.
We also analysed the potential data contamination and share the details in Section A.6.
Benchmark (shots)
GPT-3.5
GPT-4
PaLM
PaLM-2-L
Llama 2
MMLU (5-shot)
70.0
86.4
69.3
78.3
68.9
TriviaQA (1-shot)
–
–
81.4
86.1
85.0
Natural Questions (1-shot)
–
–
29.3
37.5
33.0
GSM8K (8-shot)
57.1
92.0
56.5
80.7
56.8
HumanEval (0-shot)
48.1
67.0
26.2
–
29.9
BIG-Bench Hard (3-shot)
–
–
52.3
65.7
51.2
Table 4: Comparison to closed-source models on academic benchmarks. Results for GPT-3.5 and GPT-4
are from OpenAI

In [22]:
# Second end-to-end query; only the first source node's text is printed here,
# not the generated answer.
query_str = "What is the parameter count of a Llama 2 model?"
response = query_engine.query(query_str)
print(response.source_nodes[0].get_content())

Llama.generate: prefix-match hit

llama_print_timings:        load time =   98013.41 ms
llama_print_timings:      sample time =       8.26 ms /    15 runs   (    0.55 ms per token,  1816.42 tokens per second)
llama_print_timings: prompt eval time =  198249.40 ms /  1018 tokens (  194.74 ms per token,     5.13 tokens per second)
llama_print_timings:        eval time =    7279.57 ms /    14 runs   (  519.97 ms per token,     1.92 tokens per second)
llama_print_timings:       total time =  205590.94 ms /  1032 tokens


A.7
Model Card
Table 52 presents a model card (Mitchell et al., 2018; Anil et al., 2023) that summarizes details of the models.
Model Details
Model Developers
Meta AI
Variations
Llama 2 comes in a range of parameter sizes—7B, 13B, and 70B—as well as
pretrained and fine-tuned variations.
Input
Models input text only.
Output
Models generate text only.
Model Architecture
Llama 2 is an auto-regressive language model that uses an optimized transformer
architecture. The tuned versions use supervised fine-tuning (SFT) and reinforce-
ment learning with human feedback (RLHF) to align to human preferences for
helpfulness and safety.
Model Dates
Llama 2 was trained between January 2023 and July 2023.
Status
This is a static model trained on an offline dataset. Future versions of the tuned
models will be released as we improve model safety with community feedback.
License
A custom commercial license is available at:
ai.meta.com/resources/
models-and-libraries/llama-downloads/
Where to send com-
me