In [4]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Step 1: Load all PDF files from the "data" directory

import os
from langchain.document_loaders import PyPDFLoader

data_dir = "data"
documents = []

for filename in os.listdir(data_dir):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(data_dir, filename)
        loader = PyPDFLoader(pdf_path)
        documents.extend(loader.load())


In [3]:
# Step 2: Split the text with recursive character-based splitting

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=64
)

chunked_docs = text_splitter.split_documents(documents)

In [4]:
# Step 3: Create a FAISS index using the HuggingFace embeddings

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings


EMDED_MODEL_NAME = "BAAI/bge-small-en-v1.5"

embed_model = HuggingFaceEmbeddings(model_name=EMDED_MODEL_NAME)

db = FAISS.from_documents(chunked_docs, embed_model)

  warn_deprecated(


In [5]:
# Step 4: Create a retriever using the FAISS index as the backend
TOP_K = 10

faiss_retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": TOP_K} )

In [6]:
# question = """How do they prepare the dataset for each data type? 
# Be as detailed as possible and present the information as a bullet-point list."""
question = """
How do they prepare datasets for text, images, and videos? 
Detail the steps involved, describe preprocessing techniques, and explain data structure and quantity used for pretraining. 
Present as a bullet-point list.
"""

In [7]:
docs = faiss_retriever.get_relevant_documents(question)

for doc in docs:
    print("\n====================================Top document====================================")
    print(doc.page_content)
    print("Metadata:")
    print(doc.metadata)


3 Pre-Training
Language model pre-training involves: (1)the curation and ﬁltering of a large-scale training corpus, (2)the
development of a model architecture and corresponding scaling laws for determining model size, (3)the
development of techniques for eﬃcient pre-training at large scale, and (4)the development of a pre-training
recipe. We present each of these components separately below.
3.1 Pre-Training Data
Metadata:
{'source': 'data\\llama3.pdf', 'page': 3}

tables and charts. Additionally, we use captions and OCR extractions from existing images to generate
additional conversational or question-answer data related to the images.
Video.Similar to the image adapter, we use academic datasets with pre-existing annotations and convert them
into appropriate textual instructions and target responses. The targets are converted to open-ended responses
or multiple-choice options, whichever is more appropriate. We ask humans to annotate videos with questions
Metadata:
{'source': 'data\\l

  warn_deprecated(


In [8]:
# step 5: Load and initialize the Llama 3 model
from langchain.llms import Ollama

llm = Ollama(model="llama3.1", base_url="http://127.0.0.1:11434")

In [9]:
# Step 6: Create a prompt template

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain


prompt_template = """
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:
{context}

Question: {question}

Answer:
"""

prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Step 7: Create an LLMChain
chain = LLMChain(llm=llm, prompt=prompt)

  warn_deprecated(


In [10]:
# Step 8: Function to answer questions 
def answer_question(question, retriever):
    # Retrieve relevant documents
    docs = retriever.get_relevant_documents(question)
    
    # Combine the retrieved documents into a single context string
    context = "\n\n".join([doc.page_content for doc in docs])
    
    # Generate the answer
    response = chain.run(context=context, question=question)
    
    return response


In [11]:
# Example usage
answer = answer_question(question, faiss_retriever)

print(f"Question: {question}")
print(f"Answer: {answer}")

  warn_deprecated(


Question: 
How do they prepare datasets for text, images, and videos? 
Detail the steps involved, describe preprocessing techniques, and explain data structure and quantity used for pretraining. 
Present as a bullet-point list.

Answer: Here are the steps involved in preparing datasets for text, images, and videos:

**Text Data:**

* Curation of web data from various sources (until 2023)
* Application of de-duplication methods and data cleaning mechanisms to obtain high-quality tokens
* Removal of domains containing large amounts of personally identifiable information (PII) and known adult content
* Use of captions and OCR extractions from existing images to generate additional conversational or question-answer data related to the images
* Conversion of academic datasets with pre-existing annotations into appropriate textual instructions and target responses

**Image Data:**

* Construction of image-text pairs via a complex data processing pipeline consisting of four main stages:
	+ Qu

## Ensemble Retriever
Add a BM25Retriever and combine it with the FAISS retriever as an EnsembleRetriever. The EnsembleRetriever will use weighted averaging to combine the results from both retrievers

In [12]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# initialize the bm25 retriever
bm25_retriever = BM25Retriever.from_documents(chunked_docs)
bm25_retriever.k = TOP_K

# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
)

In [13]:
docs = ensemble_retriever.get_relevant_documents(question)

for doc in docs:
    print("\n====================================Top document====================================")
    print(doc.page_content)
    print("Metadata:")
    print(doc.metadata)


still under development and not yet ready for release.
Before presenting the results of our experiments in Section 7.6 and 7.7, we describe the data we used to train
visual recognition capabilities, the model architecture of the vision components, how we scale training of those
components, and our pre-training and post-training recipes.
7.1 Data
We describe our image and video data separately below.
7.1.1 Image Data
Metadata:
{'source': 'data\\llama3.pdf', 'page': 53}

7.5.1 Supervised Finetuning Data
We describe our supervised ﬁnetuning (SFT) data for image and video capabilities separately below.
Image.We utilize a mix of diﬀerent datasets for supervised ﬁnetuning.
•Academic datasets. We convert a highly ﬁltered collection of existing academic datasets to question-
answer pairs using templates or via LLM rewriting. The LLM rewriting’s purpose is to augment the
data with diﬀerent instructions and to improve the language quality of answers.
Metadata:
{'source': 'data\\llama3.pdf', 'pa

In [14]:
# Example usage

answer = answer_question(question, ensemble_retriever)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: 
How do they prepare datasets for text, images, and videos? 
Detail the steps involved, describe preprocessing techniques, and explain data structure and quantity used for pretraining. 
Present as a bullet-point list.

Answer: Here is the answer:

**Preparing Datasets for Text, Images, and Videos:**

* **Text Data:**
	+ Create a dataset from various sources containing knowledge until 2023
	+ Apply de-duplication methods and data cleaning mechanisms on each data source to obtain high-quality tokens
	+ Remove domains with personally identifiable information (PII) or adult content
	+ Process HTML pages with mathematics and code content to preserve the structure of that content
* **Image Data:**
	+ Construct a dataset via a complex data processing pipeline:
		- Quality filtering: remove non-English captions and low-quality captions
		- Perceptual de-duplication: group similar images together
		- Resampling: resize images to uniform size
		- Optical character recognition (OCR): ex

## semantic chunk

In [15]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

data_folder = "data"
loader = DirectoryLoader(data_folder, glob="**/*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="standard_deviation")
semantic_chunks = semantic_chunker.create_documents([d.page_content for d in documents])
semantic_chunk_vectorstore = FAISS.from_documents(semantic_chunks, embed_model)


In [16]:
semantic_chunk_retriever = semantic_chunk_vectorstore.as_retriever(search_kwargs={"k" : TOP_K})
docs = semantic_chunk_retriever.invoke(question)

In [17]:
for doc in docs:
    print("\n====================================Top document====================================")
    print(doc.page_content)


Figure 11 Processing file uploads. Example of Llama 3 performing analysis and visualization of an uploaded ﬁle. We follow the principle that post-training should align the model to “know what it knows” rather than add
knowledge (Gekhman et al., 2024; Mielke et al., 2020). Our primary approach involves generating data that
aligns model generations with subsets of factual data present in the pre-training data. To achieve this, we
develop a knowledge probing technique that takes advantage of Llama 3’s in-context abilities. This data
generation process involves the following procedure:
1.Extract a data snippet from the pre-training data. 2.Generate a factual question about these snippets (context) by prompting Llama 3
3.Sample responses from Llama 3 to the question
4.Score the correctness of the generations using the original context as a reference and Llama 3 as a judge
5.Score the informativeness of the generations using Llama 3 as a judge
6.Generate a refusal for responses which are co

In [18]:
# Example usage

answer = answer_question(question, semantic_chunk_retriever)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: 
How do they prepare datasets for text, images, and videos? 
Detail the steps involved, describe preprocessing techniques, and explain data structure and quantity used for pretraining. 
Present as a bullet-point list.

Answer: Here are the steps involved in preparing datasets for text, images, and videos:

**Text Dataset Preparation:**

* Curate and filter a large-scale training corpus from various data sources
* Remove domains with personally identifiable information (PII) and adult content
* Apply de-duplication methods to remove redundant data
* Clean and preprocess the data using techniques such as tokenization and filtering

**Image Dataset Preparation:**

* Construct image-text pairs through a complex data processing pipeline consisting of:
	+ Quality filtering: Remove non-English captions, low-quality captions, and image-text pairs with low alignment scores
	+ Perceptual de-duplication: Reduce redundant data to improve model performance
	+ Resampling: Adjust the resolu

# Conclusion
1. The ensemble retriever with BM25 did not improve performance and lacks depth and video data. 
2. Faiss, with either RecursiveCharacterTextSplitter or SemanticTextSplitter, provides a clear, structured method for preparing text, image, and video data, enhancing understanding.