In [32]:

from dotenv import load_dotenv
import os

# Load variables from the local env file into the process environment.
# Presumably this is where the PINECONE_API_KEY / OPENAI_API_KEY read by later cells come from.
load_dotenv(dotenv_path="../jerry.env")


True

In [10]:
import pinecone
import os

# Fail fast with a KeyError if the Pinecone credential is not in the environment.
api_key = os.environ["PINECONE_API_KEY"]

# Configure the Pinecone client for the project's environment/region.
pinecone.init(
    environment="us-west1-gcp-free",
    api_key=api_key,
)

In [11]:
# Create the "quickstart" index only when it does not exist yet, so this cell
# survives a re-run (create_index fails when the name is already taken).
# dimension=1536 has to match the size of the embedding vectors stored later
# (presumably OpenAI ada-002 embeddings; confirm against the embedding model used).
if "quickstart" not in pinecone.list_indexes():
    pinecone.create_index("quickstart", dimension=1536, metric="euclidean")


In [12]:
# Handle to the "quickstart" index; passed to the vector store in the next cell.
pinecone_index = pinecone.Index(index_name="quickstart")


In [13]:
from llama_index.vector_stores import PineconeVectorStore
# Wrap the Pinecone index handle in LlamaIndex's vector-store adapter.
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"


--2023-09-14 22:25:34--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 128.84.21.199
Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘data/llama2.pdf’


2023-09-14 22:25:45 (1.24 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]



In [15]:
from pathlib import Path
from llama_hub.file.pymu_pdf.base import PyMuPDFReader


In [17]:
# Read the downloaded paper into `documents`.
# The sample output further down (total_pages / source metadata) suggests one entry per PDF page.
loader = PyMuPDFReader()
documents = loader.load(
    file_path="./data/llama2.pdf",
)


In [19]:
from llama_index.text_splitter import SentenceSplitter
# Splitter used to break each document's text into chunks (chunk_size=1024).
text_splitter = SentenceSplitter(chunk_size=1024)

# Parallel lists: doc_idxs[i] is the position in `documents` that text_chunks[i]
# was cut from, so the source document's metadata can be attached to the chunk later.
text_chunks = []
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_splitter.split_text(doc.text)
    for chunk in cur_text_chunks:
        text_chunks.append(chunk)
        doc_idxs.append(doc_idx)


In [21]:
from llama_index.schema import TextNode
# Build one TextNode per chunk, carrying over the metadata of the document
# the chunk was cut from (looked up through the parallel doc_idxs list).
nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idxs[idx]]
    # Copy the dict instead of assigning it directly: otherwise every chunk of a
    # document (and the document itself) would share one mutable metadata dict,
    # and editing one node's metadata would silently change all of them.
    node.metadata = dict(src_doc.metadata)
    nodes.append(node)


In [23]:
# Sanity check: show the second node's text together with all of its metadata.
sample_node = nodes[1]
print(sample_node.get_content(metadata_mode="all"))

total_pages: 77
file_path: ./data/llama2.pdf
source: 2

Contents
1
Introduction
3
2
Pretraining
5
2.1
Pretraining Data .............................................5
2.2
Training Details .............................................5
2.3
Llama 2 Pretrained Model Evaluation ................................7
3
Fine-tuning
8
3.1
Supervised Fine-Tuning (SFT) .....................................9
3.2
Reinforcement Learning with Human Feedback (RLHF)
.....................9
3.3
System Message for Multi-Turn Consistency .............................16
3.4
RLHF Results
..............................................17
4
Safety
20
4.1
Safety in Pretraining
..........................................20
4.2
Safety Fine-Tuning
...........................................23
4.3
Red Teaming ...............................................28
4.4
Safety Evaluation of Llama 2-Chat ..................................29
5
Discussion
32
5.1
Learnings and Observations ......................................32
5.

In [36]:
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.llms import OpenAI

# One LLM client shared by both extractors; the key is read from the environment.
llm = OpenAI(
    model="gpt-3.5-turbo",
    api_key=os.environ["OPENAI_API_KEY"],
)

# Individual extractors, built in the same order they are applied.
title_extractor = TitleExtractor(nodes=5, llm=llm)
questions_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm)

# Combine the extractors; in_place=False is kept from the original configuration.
metadata_extractor = MetadataExtractor(
    extractors=[title_extractor, questions_extractor],
    in_place=False,
)


In [37]:
# Run the configured extractors over every node and rebind `nodes` to the result.
# NOTE(review): because `nodes` is both input and output, re-running this cell
# feeds already-processed nodes back into the extractors. Restart and run the
# node-building cell first if a clean pass is needed.
nodes = metadata_extractor.process_nodes(nodes)


Extracting questions:   0%|          | 0/107 [00:00<?, ?it/s]

In [45]:
from pprint import pprint

# Inspect the first node, including the metadata fields added by the extractors.
first_node = nodes[0]
pprint(first_node)

TextNode(id_='440dd934-e1d7-40ce-af68-4ba48fc741be', embedding=None, metadata={'total_pages': 77, 'file_path': './data/llama2.pdf', 'source': '1', 'document_title': 'Llama 2: Pretraining, Fine-tuning, Safety, and Discussion for Developing and Evaluating a Pretrained and Fine-Tuned Large Language Model, and Release of Llama 2 and Llama 2-Chat Models for Research and Commercial Use', 'questions_this_excerpt_can_answer': '1. What is the purpose of Llama 2 and what makes it different from other large language models?\n2. How does Llama 2-Chat perform compared to open-source chat models in terms of benchmarks and human evaluations?\n3. What are the details of the fine-tuning and safety improvements made to Llama 2-Chat, and how can the community contribute to the responsible development of large language models?'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['questions_this_excerpt_can_answer'], relationships={}, hash='98ef71d6b4ce195ef3323de8aa571372161f2b9864cc7aad637b3d1