In [1]:
!pip install -qU \
  datasets==2.14.6 \
  FlagEmbedding==1.1.5

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━

## Dataset Download

We're going to test with a more real world use-case, with messy, imperfect data. We will use the [`jamescalam/ai-arxiv-chunked`](https://huggingface.co/datasets/jamescalam/ai-arxiv-chunked) dataset.

In [2]:
from datasets import load_dataset

data = load_dataset("jamescalam/ai-arxiv-chunked", split="train")
data

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/153M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 41584
})

First we define our embedding function.

In [24]:
import os
import torch
from FlagEmbedding import FlagModel

# see if GPU is available for model to use
device = "0" if torch.cuda.is_available() else ""
os.environ["CUDA_VISIBLE_DEVICES"] = device
print(device)

model = FlagModel(
    'BAAI/bge-large-en-v1.5',
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    use_fp16=True  # did not note noticable difference between fp16 or fp32
)

def embed(docs: list[str]):
    doc_embeds = model.encode(docs)
    return doc_embeds

0


Use this to build a Numpy array of cohere embedding vectors.

In [25]:
from tqdm.auto import tqdm
import numpy as np

chunks = data["chunk"]
batch_size = 256

for i in tqdm(range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i+batch_size)
    chunk_batch = chunks[i:i_end]
    # embed current batch
    embed_batch = embed(chunk_batch)
    # add to existing np array if exists (otherwise create)
    if i == 0:
        arr = np.array(embed_batch)
    else:
        arr = np.concatenate([arr, np.array(embed_batch)])

  0%|          | 0/163 [00:00<?, ?it/s]


Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.85it/s]

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.62it/s]

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.84it/s]

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.92it/s]

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.83it/s]

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]

Inference Embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference Em

Now we need to create the query mechanism, this is simply a cosine similarity calculation between a query vector and our `arr` vectors.

In [26]:
from numpy.linalg import norm

# convert chunks list to array for easy indexing
chunk_arr = np.array(chunks)

def query(text: str, top_k: int=3) -> list[str]:
    # create query embedding
    xq = model.encode_queries([text])[0]
    xq = np.array(xq)
    # calculate dot product (these are normalized vectors)
    sim = np.dot(arr, xq.T)
    # get indices of top_k records
    idx = np.argpartition(sim, -top_k)[-top_k:]
    #scores = sim[idx]
    contexts = chunk_arr[idx]
    for c in contexts.tolist():
        print(c)
        print("----------")

In [27]:
query("why should I use llama 2?")

but BoolQ. Similarly, this model surpasses PaLM540B everywhere but on BoolQ and WinoGrande.
LLaMA-13B model also outperforms GPT-3 on
most benchmarks despite being 10 smaller.
3.2 Closed-book Question Answering
We compare LLaMA to existing large language
models on two closed-book question answering
benchmarks: Natural Questions (Kwiatkowski
et al., 2019) and TriviaQA (Joshi et al., 2017). For
both benchmarks, we report exact match performance in a closed book setting, i.e., where the models do not have access to documents that contain
evidence to answer the question. In Table 4, we
report performance on NaturalQuestions, and in Table 5, we report on TriviaQA. On both benchmarks,
LLaMA-65B achieve state-of-the-arts performance
in the zero-shot and few-shot settings. More importantly, the LLaMA-13B is also competitive on
these benchmarks with GPT-3 and Chinchilla, despite being 5-10 smaller. This model runs on a
single V100 GPU during inference.
0-shot 1-shot 5-shot 64-shot
Gopher 280B

In [28]:
query("can you tell me about red teaming for llama 2?")

Ricardo Lopez-Barquilla, Marc Shedroﬀ, Kelly Michelena, Allie Feinstein, Amit Sangani, Geeta
Chauhan,ChesterHu,CharltonGholson,AnjaKomlenovic,EissaJamil,BrandonSpence,Azadeh
Yazdan, Elisa Garcia Anzano, and Natascha Parks.
•ChrisMarra,ChayaNayak,JacquelinePan,GeorgeOrlin,EdwardDowling,EstebanArcaute,Philomena Lobo, Eleonora Presani, and Logan Kerr, who provided helpful product and technical organization support.
46
•Armand Joulin, Edouard Grave, Guillaume Lample, and Timothee Lacroix, members of the original
Llama team who helped get this work started.
•Drew Hamlin, Chantal Mora, and Aran Mun, who gave us some design input on the ﬁgures in the
paper.
•Vijai Mohan for the discussions about RLHF that inspired our Figure 20, and his contribution to the
internal demo.
•Earlyreviewersofthispaper,whohelpedusimproveitsquality,includingMikeLewis,JoellePineau,
Laurens van der Maaten, Jason Weston, and Omer Levy.
----------
improved various NLP tasks. The introduction of the Transformer architec

In [29]:
query("what is the best llm?")

Rank the {{num}} passages above based on their relevance to the search query. The passages
should be listed in descending order using identiﬁers, and the most relevant passages should be
listed ﬁrst, and the output format should be [] > [], e.g., [1] > [2]. Only response the ranking results,
do not say any word or explain.
B Related Work
B.1 Information Retrieval with LLMs
Recently, large language models (LLMs) have found increasing applications in information retrieval.
Several approaches have been proposed to utilize LLMs for passage retrieval. For example, SGPT (Muennighoff, 2022) generates text embeddings using GPT, DSI (Tay et al., 2022) proposes a differentiable
search index, and HyDE (Gao et al., 2022) generates pseudo-documents using GPT-3. In addition, LLMs
have also been used for passage re-ranking tasks. UPR (Sachan et al., 2022a) and SGPT-CE (Muennighoff,
2022) introduce instructional query generation methods, while HELM (Liang et al., 2022) utilizes instruction relevance g

In [30]:
query("what is the difference between gpt-3.5 and llama 2?")  # seems more relevant than other responses

ranked from top 1 to top 5. We compare the ﬁve ranked groups against the baseline, and show the
relative scores in Figure 4 (a,b). The ChatGPT and GPT-4 evaluation is consistent with the orders
6
60% 70% 80% 90% 100%LLaMA (13B)Alpaca (13B)Vicuna (13B)LLaMA_GPT4 (7B)LLaMA_GPT4 (7B, R1)BardChatGPTGPT4
67% 466 : 69776% 539 : 71293% 639 : 68887% 607 : 70089% 620 : 69392% 624 : 68195% 652 : 684100% 758 : 758(a) All chatbots against GPT-4, whose Chinese responses are translated from English
60% 70% 80% 90% 100%LLaMA (13B)Alpaca (13B)Vicuna (13B)LLaMA_GPT4 (7B)LLaMA_GPT4 (7B, R1)BardChatGPTGPT4
----------
to GPT-3 corresponds to the Stanford Alpaca model. From Figure 3(a), we observe that ( i) For the
“Helpfulness” criterion, GPT-4 is the clear winner with 54.12% of the votes. GPT-3 only wins 19.74%
of the time. ( ii) For the “Honesty” and “Harmlessness” criteria, the largest portion of votes goes
to the tie category, which is substantially higher than the winning categories but GPT-3 (Alpaca

---