# NLP - CA6
* **Name:** Mohammad Mahdi Salmani
* **Student id:** 810102174

## Load libraries

In [None]:
%pip install -r requirements.txt

In [88]:
import os
import json
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import LocalFileStore
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_together import ChatTogether
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Literal
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from IPython.core.display import Markdown
from langchain_core.messages import HumanMessage, AIMessage
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain.embeddings import CacheBackedEmbeddings
from langchain.utilities.tavily_search import TavilySearchAPIWrapper
from langchain.tools.tavily_search import TavilySearchResults

## Part1: Load dataset

In [79]:
with open("configs.json") as f:
    configs = json.loads(f.read())

os.environ["TAVILY_API_KEY"] = configs["api"]["TAVILY_API_KEY"]
os.environ["TOGETHER_API_KEY"] = configs["api"]["TOGETHER_API_KEY"]

### Crawl pdf list from site

In [2]:
url = "https://stanford.edu/~jurafsky/slp3/"

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

chapters = []
for link in soup.find_all('a', href=True):
    if '.pdf' in link['href'] and link['href'].split('.')[0].isdigit():
        chapters.append(link['href'])

pdf_links = [url+link for link in chapters]
pdf_links

['https://stanford.edu/~jurafsky/slp3/2.pdf',
 'https://stanford.edu/~jurafsky/slp3/3.pdf',
 'https://stanford.edu/~jurafsky/slp3/4.pdf',
 'https://stanford.edu/~jurafsky/slp3/5.pdf',
 'https://stanford.edu/~jurafsky/slp3/6.pdf',
 'https://stanford.edu/~jurafsky/slp3/7.pdf',
 'https://stanford.edu/~jurafsky/slp3/8.pdf',
 'https://stanford.edu/~jurafsky/slp3/9.pdf',
 'https://stanford.edu/~jurafsky/slp3/10.pdf',
 'https://stanford.edu/~jurafsky/slp3/11.pdf',
 'https://stanford.edu/~jurafsky/slp3/13.pdf',
 'https://stanford.edu/~jurafsky/slp3/14.pdf',
 'https://stanford.edu/~jurafsky/slp3/15.pdf',
 'https://stanford.edu/~jurafsky/slp3/16.pdf',
 'https://stanford.edu/~jurafsky/slp3/17.pdf',
 'https://stanford.edu/~jurafsky/slp3/18.pdf',
 'https://stanford.edu/~jurafsky/slp3/19.pdf',
 'https://stanford.edu/~jurafsky/slp3/20.pdf',
 'https://stanford.edu/~jurafsky/slp3/21.pdf',
 'https://stanford.edu/~jurafsky/slp3/22.pdf',
 'https://stanford.edu/~jurafsky/slp3/23.pdf']

### Download PDFs

In [4]:
selected_pdf_links = pdf_links[:10] # Just retrieve from Chapter I: Fundamental Algorithms

In [12]:
documents = []
for pdf_link in tqdm(selected_pdf_links):
    loader = PyPDFLoader(pdf_link)
    documents.extend(loader.load())

100%|██████████| 10/10 [02:42<00:00, 16.28s/it]


In [13]:
len(documents)

275

### Chunking

In [14]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
chunks = text_splitter.split_documents(documents)

len(chunks)

904

### Vector Store

In [19]:
%pip install --upgrade --quiet  langchain-openai faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [18]:
embedding_function = HuggingFaceEmbeddings(show_progress=True, multi_process=True)

model.safetensors:   2%|2         | 10.5M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
store = LocalFileStore("./embedding_cache/")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(embedding_function, store, namespace=embedding_function.model_name)

## Part3: Retriever

In [30]:
# %%time
vector_store = FAISS.from_documents(documents=chunks, embedding=cached_embedder)

CPU times: total: 203 ms
Wall time: 2min 34s


In [57]:
query = "Recurrent Neural Networks"
results = vector_store.similarity_search_with_relevance_scores(query, k=3)

for i, result in enumerate(results):
    print(f'[{i+1}] Score: {result[1]:.3f}, Metadata: {result[0].metadata}')
    print(f'\n{result[0].page_content[:200]}...\n')

[1] Score: 0.589, Metadata: {'source': 'https://stanford.edu/~jurafsky/slp3/9.pdf', 'page': 0}

the prior context, in its recurrent connections , allowing the model’s decision to
depend on information from hundreds of words in the past. We’ll see how to apply
the model to the task of language mo...

[2] Score: 0.562, Metadata: {'source': 'https://stanford.edu/~jurafsky/slp3/9.pdf', 'page': 23}

covered:
• In simple Recurrent Neural Networks sequences are processed one element at
a time, with the output of each neural unit at time tbased both on the current
input at tand the hidden layer from...

[3] Score: 0.553, Metadata: {'source': 'https://stanford.edu/~jurafsky/slp3/9.pdf', 'page': 26}

neural networks. SemEval-2016 .
Gers, F. A., J. Schmidhuber, and F. Cummins. 2000. Learn-
ing to forget: Continual prediction with lstm. Neural
computation , 12(10):2451–2471.
Giles, C. L., G. M. Kuhn...



In [58]:
sample_queries = [
    "What is the advantage of Bidirectional RNNs?",
    "What is the differences between a stack and a queue.",
    "Who is the president of United States?"
]

In [60]:
faiss_retriever = vector_store.as_retriever(search_kwargs={"k": 3})

for i, question in enumerate(sample_queries):
    print(f'[Q{i+1}] {question}\n')
    results = faiss_retriever.invoke(query)
    for j, result in enumerate(results):
        print(f'Retrieve {j+1}) {result.metadata}')
        print(f'\n{result.page_content[:200]}...\n')

[Q1] What is the advantage of Bidirectional RNNs?

Retrieve 1) {'source': 'https://stanford.edu/~jurafsky/slp3/9.pdf', 'page': 0}

the prior context, in its recurrent connections , allowing the model’s decision to
depend on information from hundreds of words in the past. We’ll see how to apply
the model to the task of language mo...

Retrieve 2) {'source': 'https://stanford.edu/~jurafsky/slp3/9.pdf', 'page': 23}

covered:
• In simple Recurrent Neural Networks sequences are processed one element at
a time, with the output of each neural unit at time tbased both on the current
input at tand the hidden layer from...

Retrieve 3) {'source': 'https://stanford.edu/~jurafsky/slp3/9.pdf', 'page': 26}

neural networks. SemEval-2016 .
Gers, F. A., J. Schmidhuber, and F. Cummins. 2000. Learn-
ing to forget: Continual prediction with lstm. Neural
computation , 12(10):2451–2471.
Giles, C. L., G. M. Kuhn...

[Q2] What is the differences between a stack and a queue.

Retrieve 1) {'source': 'https://s

## Part3: EnsembleRetriever

In [None]:
%pip install --upgrade --quiet  rank_bm25

In [68]:
bm25_retriever = BM25Retriever.from_documents(documents=chunks, embedding=cached_embedder)

In [72]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.4, 0.6]
)

In [73]:
ensemble_retriever.invoke(sample_queries[0])

[Document(page_content='sequence from tto the end of the sequence.\nAbidirectional RNN (Schuster and Paliwal, 1997) combines two independentbidirectional\nRNN\nRNNs, one where the input is processed from the start to the end, and the other from\nthe end to the start. We then concatenate the two representations computed by the\nnetworks into a single vector that captures both the left and right contexts of an input\nat each point in time. Here we use either the semicolon ”;” or the equivalent symbol\n⊕to mean vector concatenation:\nht= [hf\nt;hb\nt]\n=hf\nt⊕hb\nt (9.18)\nFig. 9.11 illustrates such a bidirectional network that concatenates the outputs of\nthe forward and backward pass. Other simple ways to combine the forward and\nbackward contexts include element-wise addition or multiplication. The output at\neach step in time thus captures information to the left and to the right of the current\ninput. In sequence labeling applications, these concatenated outputs can serve as the\nbas

## Part4: Router Chain

In [82]:
llm = ChatTogether(
    model="meta-llama/Llama-3-70b-chat-hf",
    temperature= 0,
    )

In [83]:
router_prompt = ChatPromptTemplate.from_template(
    template="""
You are a routing assistant responsible for directing user queries to the appropriate tool: either a VectorStore, SearchEngine, or a fallback message.
Your VectorStore contains data about chapters of the Natural Language Processing book by Jurafsky. If the given query is about N-grams, Text Classifications, Embeddings, Sequence Labeling, RNNs, LSTMs, Transformers, or Large Language Models, choose VectorStore.
If the query is related to general computer science topics, choose SearchEngine.
If the query is not related to NLP or computer science, choose None.

Decide whether the given question should be routed to "VectorStore", "SearchEngine", or "None".
Provide only the name of the chosen tool and nothing else. If no tool is chosen, return the string "None".

{output_instructions}

Question: {question}

Response: """,
)

In [84]:
class ChosenTool(BaseModel):
    tool_name: Literal["VectorStore", "SearchEngine", "None"] = Field(description="the tool that was chosen by LLM in question routing stage")

# Create the parser
question_router_parser = PydanticOutputParser(pydantic_object=ChosenTool)

In [85]:
router_chain = router_prompt | llm | question_router_parser

## Part5: Search Engine Chain

### Tavily Search Sample

In [98]:
from langchain.agents import initialize_agent, AgentType

search = TavilySearchAPIWrapper()
tavily_tool = TavilySearchResults(api_wrapper=search)

agent_chain = initialize_agent(
    [tavily_tool],
    llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

agent_chain.run("Who is the president of Iran?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: Who is the president of Iran?
Thought: I need to look up the current president of Iran.
Action:
```
{
  "action": "tavily_search_results_json",
  "action_input": "current president of Iran"
}
```
[0m
Observation: [36;1m[1;3m[{'url': 'https://abcnews.go.com/International/wireStory/masoud-pezeshkian-heart-surgeon-rose-power-parliament-now-111709850', 'content': 'The stances of Iran\'s President-elect Masoud Pezeshkian reflect the dualities of being a reformist politician within Iran’s Shiite theocracy DUBAI, United Arab Emirates -- After the 2022 death of Mahsa Amini, Iranian lawmaker Masoud Pezeshkian wrote that it was “unacceptable in the Islamic Republic to arrest a girl for her hijab Trending Reader Picks 12-year-old attacked and taken by crocodile Would Harris be a stronger candidate than Biden? How to escape from life-threatening rip currents 4 people injured by shark in Gulf on July 4 Parents at beach with k

'The current president of Iran is Masoud Pezeshkian.'

In [None]:
%pip install tavily-python

In [104]:
from tavily import TavilyClient
client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
client.search("Who is the president of Iran?", search_depth="advanced")

{'query': 'Who is the president of Iran?',
 'follow_up_questions': None,
 'answer': None,
 'images': None,
 'results': [{'url': 'https://en.wikipedia.org/wiki/President_of_Iran',
   'title': 'President of Iran - Wikipedia',
   'content': "Powers and responsibilities[edit]\nThe President's duties include the following, subject to supervision and approval by the Supreme Leader:\nsome of these duties require the approval of the Supreme Leader.[28]\nOath of office[edit]\nI, as the President, upon the Holy Qur'an and in the presence of the Iranian nation, do hereby swear in the name of Almighty God to safeguard the official Faith, the system of the Islamic republic and the Constitution of the country; to use all my talents and abilities in the discharge of responsibilities undertaken by me; to devote myself to the service of the people, glory of the country, promotion of religion and morality, support of right and propagation of justice; to refrain from being autocratic; to protect the free

### Implement Search Engine Chain

In [117]:
from tavily import TavilyClient
from langchain_core.documents.base import Document
from langchain.chains.base import Chain


class SearchEngineChain:
    def __init__(self, api_key):
        self.client = TavilyClient(api_key=api_key)

    def __call__(self, query: str):
        results = self.search_tavily(query)
        documents = self.process_results(results)
        return documents

    def search_tavily(self, query: str):
        response = self.client.search(query, search_depth="advanced")
        return response["results"]

    def process_results(self, results: list[dict]):
        documents = []
        for result in results[:5]:  # حداکثر 5 نتیجه
            content = result.get("content")
            url = result.get("url")
            document = Document(page_content=content, metadata={"url": url}) #TODO: source
            documents.append(document)
        return documents

In [116]:
search_engine_chain = SearchEngineChain(api_key=os.environ['TAVILY_API_KEY'])

query = "Who is the president of Iran?"
result = search_engine_chain(query)
result

[Document(page_content="Powers and responsibilities[edit]\nThe President's duties include the following, subject to supervision and approval by the Supreme Leader:\nsome of these duties require the approval of the Supreme Leader.[28]\nOath of office[edit]\nI, as the President, upon the Holy Qur'an and in the presence of the Iranian nation, do hereby swear in the name of Almighty God to safeguard the official Faith, the system of the Islamic republic and the Constitution of the country; to use all my talents and abilities in the discharge of responsibilities undertaken by me; to devote myself to the service of the people, glory of the country, promotion of religion and morality, support of right and propagation of justice; to refrain from being autocratic; to protect the freedom and dignity of individuals and the rights of the Nation recognized by the Constitution; to spare no efforts in safeguarding the frontiers and the political, economic and cultural freedoms of the country; to guar

## Part6: Relevancy Check Chain

In [119]:
relevancy_check_prompt = ChatPromptTemplate.from_template(
    template="""
You are given a user query and a document. Your task is to evaluate if the document is relevant to the query.
Respond with only one word: "relevant" or "irrelevant".

{output_instructions}

Query: {query}

Document: {document}

Response: """
)

class Relevance(BaseModel):
    relevance: Literal["relevant", "irrelevant"]

relevance_parser = PydanticOutputParser(pydantic_object=Relevance)

relevancy_check_chain = relevancy_check_prompt | llm | relevance_parser