#### Complete LangChain Pipeline (end-to-end)

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')

In [1]:
# 1) Load Data
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [4]:
# 2) Embeddings + Vector DB
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [5]:
# 3) Load all PDF file

docs = DirectoryLoader(
    'papers', '**/*.pdf', loader_cls= PyPDFLoader
).load()
docs

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-05-23T01:29:23+00:00', 'author': '', 'keywords': '', 'moddate': '2022-05-23T01:29:23+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'papers/2203.14465v2.pdf', 'total_pages': 30, 'page': 0, 'page_label': '1'}, page_content='STaR: Self-Taught Reasoner\nBootstrapping Reasoning With Reasoning\nEric Zelikman∗1, Yuhuai Wu∗12, Jesse Mu1, Noah D. Goodman1\n1Department of Computer Science, Stanford University\n2 Google Research\n{ezelikman, yuhuai, muj, ngoodman}@stanford.edu\nAbstract\nGenerating step-by-step "chain-of-thought" rationales improves language model\nperformance on complex reasoning tasks like mathematics or commonsense\nquestion-answering. However, inducing language model rationale generation cur-\nrently requires either constructing massive rational

In [6]:
print('Loaded docs:', len(docs))
print('Example metadata:', docs[0].metadata)

Loaded docs: 52
Example metadata: {'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-05-23T01:29:23+00:00', 'author': '', 'keywords': '', 'moddate': '2022-05-23T01:29:23+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'papers/2203.14465v2.pdf', 'total_pages': 30, 'page': 0, 'page_label': '1'}


In [7]:
# 4) Split the text into chunks

splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, chunk_overlap = 200
)
chunks = splitter.split_documents(docs)

print('Total chunks:', len(chunks))
print('Example Chunk metadata:', chunks[0].metadata)

Total chunks: 215
Example Chunk metadata: {'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-05-23T01:29:23+00:00', 'author': '', 'keywords': '', 'moddate': '2022-05-23T01:29:23+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'papers/2203.14465v2.pdf', 'total_pages': 30, 'page': 0, 'page_label': '1'}


In [9]:
# 5) Create Embeddings

embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 2736.81it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [10]:
# 6) Store the Embeddings in Vector DB (FAISS)
vector_db = FAISS.from_documents(chunks, embeddings)

# saving to disk
vector_db.save_local('faiss_index')

In [15]:
# 7) Retrieval: query -> similar chunks

query = 'What is the main idea discussed in these papers'
retriever = vector_db.as_retriever(search_kwargs = {'k': 4})
top_docs = retriever.invoke(query)

In [16]:
print("\nTop matches:")
for i, d in enumerate(top_docs, 1):
    src = d.metadata.get('source', 'unknown')
    page = d.metadata.get('page', 'N/A')
    print(f'\n[{i}] Source: {src} | Page: {page}')
    print(d.page_content[:300], '...')


Top matches:

[1] Source: papers/2501.12948v1.pdf | Page: 1
4.2 Unsuccessful Attempts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 15
5 Conclusion, Limitations, and Future Work 16
A Contributions and Acknowledgments 20
2 ...

[2] Source: papers/2203.14465v2.pdf | Page: 9
Acknowledgements
We thank Imanol Schlag for his detailed feedback about this work, as well as Rose E Wang, Markus
Rabe, Aitor Lewkowycz, Rishi Bommasani, Allen Nie, Alex Tamkin, and Qian Huang. We thank
Cem Anil for his very helpful insight that rationale ﬁnetuning performance can be improved if the ...

[3] Source: papers/2203.14465v2.pdf | Page: 12
Appendix
A CommonsenseQA Error Patterns
Throughout our experiments, we came across a variety of interesting failure cases for commonsense
reasoning. Note that all the ﬁnal answers are correct – however, we take issue with the reasoning
used in order to arrive at those answers.
A.1 Question Implies A ...

[4] Source: papers/2203.14465v2.pdf | Page: 2