In [1]:
!pip install -qU \
  datasets==2.14.6 \
  transformers==4.35.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m124.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25h

## Dataset Download

We're going to test with a more real world use-case, with messy, imperfect data. We will use the [`jamescalam/ai-arxiv-chunked`](https://huggingface.co/datasets/jamescalam/ai-arxiv-chunked) dataset.

In [2]:
from datasets import load_dataset

data = load_dataset("jamescalam/ai-arxiv-chunked", split="train")
data

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/153M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 41584
})

First we define our embedding function.

In [3]:
import torch
from torch.nn.functional import normalize
from transformers import AutoModel, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

model_id = "thenlper/gte-large"

# initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

def embed(docs: list[str]) -> list[list[float]]:
    # tokenize
    tokens = tokenizer(
        docs, padding=True, max_length=512, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        # process with model for token-level embeddings
        out = model(**tokens)
        # mask padding tokens
        last_hidden = out.last_hidden_state.masked_fill(
            ~tokens["attention_mask"][..., None].bool(), 0.0
        )
        # create mean pooled embeddings
        doc_embeds = last_hidden.sum(dim=1) / \
            tokens["attention_mask"].sum(dim=1)[..., None]
    return doc_embeds.cpu().numpy()

Using cuda


Downloading (…)okenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

Use this to build a Numpy array of cohere embedding vectors.

In [4]:
from tqdm.auto import tqdm
import numpy as np

chunks = data["chunk"]
batch_size = 128

for i in tqdm(range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i+batch_size)
    chunk_batch = chunks[i:i_end]
    # embed current batch
    embed_batch = embed(chunk_batch)
    # add to existing np array if exists (otherwise create)
    if i == 0:
        arr = embed_batch.copy()
    else:
        arr = np.concatenate([arr, embed_batch.copy()])

  0%|          | 0/325 [00:00<?, ?it/s]

Now we need to create the query mechanism, this is simply a cosine similarity calculation between a query vector and our `arr` vectors.

In [10]:
from numpy.linalg import norm

# convert chunks list to array for easy indexing
chunk_arr = np.array(chunks)

def query(text: str, top_k: int=3) -> list[str]:
    # create query embedding
    xq = embed([text])[0]
    # calculate cosine similarities
    sim = np.dot(arr, xq.T) / (norm(arr, axis=1)*norm(xq.T))
    # get indices of top_k records
    idx = np.argpartition(sim, -top_k)[-top_k:]
    docs = chunk_arr[idx]
    for d in docs.tolist():
        print(d)
        print("----------")

In [11]:
query("why should I use llama 2?")

•Zero-shot. We provide a textual description
of the task and a test example. The model
either provides an answer using open-ended
generation, or ranks the proposed answers.
•Few-shot. We provide a few examples of the
task (between 1 and 64) and a test example.
The model takes this text as input and generates the answer or ranks different options.
We compare LLaMA with other foundation models, namely the non-publicly available language
models GPT-3 (Brown et al., 2020), Gopher (Rae
et al., 2021), Chinchilla (Hoffmann et al., 2022)
and PaLM (Chowdhery et al., 2022), as well as
the open-sourced OPT models (Zhang et al., 2022),
GPT-J (Wang and Komatsuzaki, 2021), and GPTNeo (Black et al., 2022). In Section 4, we also
brieﬂy compare LLaMA with instruction-tuned
models such as OPT-IML (Iyer et al., 2022) and
Flan-PaLM (Chung et al., 2022).We evaluate LLaMA on free-form generation
tasks and multiple choice tasks. In the multiple
choice tasks, the objective is to select the most
----------
but

In [12]:
query("can you tell me about red teaming for llama 2?")

future work, we plan on explicitly comparing and contrasting (semi-)manual versus automated approaches
to red teaming in order to determine how the two methods vary in the efﬁcacy and diversity of resulting red
team attacks.
5.2 Policy Interventions
Red teaming entails working with inherently controversial subject matter, and most organizations that red
team systems have strong counter-incentives to share their ﬁndings.13This is a problem; if we cannot publicly
13Red team datasets include offensive content, and may potentially reveal embarrassing or sensitive details about an
institution’s AI system if released publicly.
14
discuss — in detail — how we red team systems and what we learn as a result, it will be difﬁcult to broadly
share the future risks, failures, and implications of yet-to-be developed systems. This problem gets worse
over time. As systems become more capable, the results of red teaming may surface increasingly undesirable
harms. Therefore, we need to change the incent

In [13]:
query("what is the best llm?")

faces negligible performance degradation compared to its uncompressed original, while it consumes
only 25% of the GPU memory required by the uncompressed version, thus supporting its effective
inference on 4RTX 3090 Ti (24G) or 8 RTX 2080 Ti (11G). We will attempt to further reduce
the resource requirements and keep the community updated on this important working item.
23
Technical Report 2022-10-06 (v1)
B E THICS : EVALUATION ON BIASES AND TOXICITY
Albeit LLMs’ strong abilities in language and beyond, which could bring substantial welfare to
human beings, they can potentially produce toxic and illegal contents for evil use (Weidinger et al.,
2021; Sheng et al., 2021; Dev et al., 2021; Bommasani et al., 2021). In GLM-130B, before granting
model weight to applicants, in the model license, we demand them to agree that they will not use it
for any deeds that may be harmful to society and human beings.
Additionally, from a technical perspective, we argue that we must also understand LLMs

In [14]:
query("what is the difference between gpt-4 and llama 2?")

-0.043
-0.009+0.0132-0.004 +0.0562
+0.0387-0.012
-0.076Alpaca: 0.39 LLaMA-GPT4: 0.34 GPT4: 0.37Figure 6: ROUGE-L on unnatural instructions evaluated with 9K samples. The instructions are
grouped into four subsets based on the ground-truth response length. The mean values are reported in
the legend. The difference with GPT-4 is reported on the bar per group. LLaMA-GPT4 is a closer
proxy to GPT-4 than Alpaca.
closely follow the behavior of GPT-4. When the sequence length is short, both LLaMA-GPT4 and
GPT-4 can generate responses that contains the simple ground truth answers, but add extra words to
make the response more chat-like, which probably leads to lower ROUGE-L scores.
5 R ELATED WORK
Instruction Tuning. Instruction tuning of LLMs is an increasingly popular research direction in
NLP (Zhong et al., 2021; Ouyang et al., 2022; Wei et al., 2021). Existing works aim to improve
the quality and scale of three factors in the development pipeline, including instruction-following
----------

---