In [1]:
!pip install -q accelerate==0.21.0 bitsandbytes==0.40.2 transformers==4.31.0 pandas datasets tqdm chromadb sentence-transformers

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import re
from collections import defaultdict

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

import pandas as pd
import datasets
import chromadb
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [4]:
dataset = datasets.load_dataset("jamescalam/llama-2-arxiv-papers-chunked")
df = dataset['train'].to_pandas()
df.head(5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,High-Performance Neural Networks\nfor Visual O...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,"January 2011\nAbstract\nWe present a fast, ful...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
2,1102.0183,2,promising architectures for such tasks. The mo...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
3,1102.0183,3,"Mutch and Lowe, 2008), whose lters are xed, ...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
4,1102.0183,4,We evaluate various networks on the handwritte...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]


In [5]:
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

texts = df['chunk'].to_list()
texts_embeddings = embedding_model.encode(texts)
texts_embeddings.shape

(4838, 768)

In [6]:
chroma_client = chromadb.Client()

collection = chroma_client.get_or_create_collection(name="rag")
collection.add(
    embeddings = texts_embeddings,
    documents=texts,
    ids= [str(i) for i in range(len(texts))]
)

In [7]:
def query_vector_db(query, n_results=3):
    results = collection.query(
        query_embeddings = embedding_model.encode(query).tolist(),
        n_results=n_results
    )
    return results['documents']
query = "What is llama2"
retrieved_results = query_vector_db(query)
print(retrieved_results[0])

['our responsible release strategy can be found in Section 5.3.\nTheremainderofthispaperdescribesourpretrainingmethodology(Section2),ﬁne-tuningmethodology\n(Section 3), approach to model safety (Section 4), key observations and insights (Section 5), relevant related\nwork (Section 6), and conclusions (Section 7).\n‡https://ai.meta.com/resources/models-and-libraries/llama/\n§We are delaying the release of the 34B model due to a lack of time to suﬃciently red team.\n¶https://ai.meta.com/llama\n‖https://github.com/facebookresearch/llama\n4\nFigure 4: Training of L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc : This process begins with the pretraining ofL/l.sc/a.sc/m.sc/a.sc /two.taboldstyle using publicly\navailableonlinesources. Followingthis,wecreateaninitialversionof L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc throughtheapplication', 'Ricardo Lopez-Barquilla, Marc Shedroﬀ, Kelly Michelena, Allie Feinstein, Amit Sangani, Geeta\nChauhan,ChesterHu,CharltonGholson,Anja

In [8]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load base model
model_name = "NousResearch/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model.config.use_cache = True
model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Create pipe
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=5000)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [9]:
def get_rag_responce(question):
    context = query_vector_db(question)
    full_prompt = f'''
[INST]
### Instruction: Give answer for the question strictly based on the context provided.

### Input: {question}

### Context : {context}
[/INST]
    '''
    result = pipe(full_prompt)
    return result[0]['generated_text']


prompt = "What is llama2"
print(get_rag_responce(prompt))




[INST]
### Instruction: Give answer for the question strictly based on the context provided.

### Input: What is llama2

### Context : [['our responsible release strategy can be found in Section 5.3.\nTheremainderofthispaperdescribesourpretrainingmethodology(Section2),ﬁne-tuningmethodology\n(Section 3), approach to model safety (Section 4), key observations and insights (Section 5), relevant related\nwork (Section 6), and conclusions (Section 7).\n‡https://ai.meta.com/resources/models-and-libraries/llama/\n§We are delaying the release of the 34B model due to a lack of time to suﬃciently red team.\n¶https://ai.meta.com/llama\n‖https://github.com/facebookresearch/llama\n4\nFigure 4: Training of L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc : This process begins with the pretraining ofL/l.sc/a.sc/m.sc/a.sc /two.taboldstyle using publicly\navailableonlinesources. Followingthis,wecreateaninitialversionof L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc throughtheapplicatio

In [10]:
prompt = "What is attention in relation to llama2"
print(get_rag_responce(prompt))


[INST]
### Instruction: Give answer for the question strictly based on the context provided.

### Input: What is attention in relation to llama2

### Context : [['attention graph, a path from node vat position k\ninli, to node uat position minlj, is a series of\nedges that connect these two nodes. If we look\nat the weight of each edge as the proportion of\ninformation transferred between two nodes, we\ncan compute how much of the information at v\nis propagated to uthrough a particular path by\nmultiplying the weights of all edges in that path.\nSince there may be more than one path between\ntwo nodes in the attention graph, to compute the\ntotal amount of information propagated from vtou,\nwe sum over all possible paths between these two\nnodes. At the implementation level, to compute the\nattentions from litolj, we recursively multiply the\nattention weights matrices in all the layers below.\n~A(li) =\x1a\nA(li)~A(li\x001)ifi > j\nA(li) ifi=j(1)\nIn this equation, ~Ais attention ro