# Set-up

In [None]:
%pip install langchain_community tiktoken langchainhub chromadb langchain
%pip install sentence_transformers
%pip install -qU langchain-cohere

In [None]:
import os
import json
import torch
import psycopg2
import numpy as np
import pandas as pd
from langchain import hub
from langchain_cohere import ChatCohere
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Dataset

In [None]:
# Read Data
# Load the corpus through a context manager so the file handle is closed even
# when JSON parsing raises. The encoding is pinned so the read does not depend
# on the platform default.
with open('/content/corpus.json', encoding='utf-8') as f:
    corpus = json.load(f)

# Tabular view of the corpus; later cells read the 'body', 'title', 'author',
# 'source', 'published_at', 'category' and 'url' columns from it.
corpus_df = pd.DataFrame(corpus)

In [None]:
# Text Chunker
# Chunks are capped at 256 units with a 16-unit overlap; because
# length_function is `len`, those units are characters of the input string.
# NOTE(review): '\n' is the only separator, so a single line longer than
# chunk_size presumably cannot be split any further - confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=16,
    length_function=len,
    separators = ['\n']
)

# Chunking Text
# Each row's 'body' is passed on its own to create_documents, so the new
# 'chunks' column holds one list of chunk documents per row.
corpus_df['chunks'] = corpus_df['body'].apply(lambda x: text_splitter.create_documents([x]))

In [None]:
# Add metadata and change format of chunk to node
def add_metadata(df):
    """Attach row-level metadata to every chunk stored in ``df['chunks']``.

    For each row a metadata dict is built from the ``title``, ``author``,
    ``source``, ``published_at``, ``category`` and ``url`` columns and assigned
    to the ``metadata`` attribute of every object in that row's ``chunks``
    list. Missing values (``None``, NaN, ``pd.NA``) are stored as the string
    ``"None"``.

    Args:
        df: DataFrame containing the six columns above plus a ``chunks``
            column whose cells are lists of objects with a writable
            ``metadata`` attribute.

    Returns:
        The same DataFrame object; its ``chunks`` cells are rewritten in place.
    """
    metadata_fields = ('title', 'author', 'source', 'published_at', 'category', 'url')
    # Rows are read by position, so they are written back by position as well.
    # Writing with the label `i` only worked when the index was 0..n-1.
    chunks_col = df.columns.get_loc('chunks')

    for pos in range(len(df)):
        row = df.iloc[pos]

        metadata = {}
        for field in metadata_fields:
            value = row[field]
            # `is None` covers JSON nulls kept as objects; the isna check covers
            # nulls that pandas converted to NaN / pd.NA.
            is_missing = value is None or (
                pd.api.types.is_scalar(value) and pd.isna(value)
            )
            metadata[field] = "None" if is_missing else value

        node_chunks = []
        for node in row['chunks']:
            # Each chunk gets its own dict so editing one chunk's metadata
            # later cannot silently change its siblings.
            node.metadata = dict(metadata)
            node_chunks.append(node)

        df.iat[pos, chunks_col] = node_chunks

    return df

# add_metadata returns the frame it was given, so this rebinding keeps
# corpus_df pointing at the same, now annotated, DataFrame.
corpus_df = add_metadata(corpus_df)

In [None]:
# Flatten the per-row chunk lists into one list, preserving row order and the
# order of chunks within each row.
splits = [chunk for row_chunks in corpus_df['chunks'] for chunk in row_chunks]

# RAG

In [None]:
# Embeddings Model
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# BGE small English encoder; embeddings are normalized at encode time and a
# progress bar is shown while batches are embedded.
embed_model = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device=device),
    encode_kwargs=dict(normalize_embeddings=True),
    show_progress=True,
)

In [None]:
# Vector Store and Retriever
# Embed every chunk in `splits` and store the vectors in a Chroma collection
# persisted under /content.
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same chunks again - confirm, or clear the directory first.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embed_model,
    persist_directory="/content",
)

# Retriever configured for "mmr" search, returning 4 documents per query.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=4),
)

Batches:   0%|          | 0/677 [00:00<?, ?it/s]

In [74]:
# Prompt
# Start from the "rlm/rag-prompt" template pulled from the hub, then overwrite
# the template text of its first message in place. The replacement keeps the
# same two placeholders, {question} and {context}, and asks for short answers.
prompt = hub.pull("rlm/rag-prompt")
prompt.messages[0].prompt.template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use short and direct answers like 'Yes'/'No' and Names and keep the answer concise.
Question: {question}
Context: {context}
Answer:"""



In [75]:
# Post-processing
def format_docs(docs):
    """Return the ``page_content`` of every document joined by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [78]:
# Define the Cohere LLM
# This single chat model is shared by the RAG chain, the relevance grader and
# the question re-writer defined in later cells.
# NOTE(review): no API key is passed here; presumably ChatCohere reads it from
# the environment - confirm it is set before running this cell.
llm = ChatCohere(model="command-r-plus-08-2024")

In [79]:
# Chain
# Input is the bare question string. The dict step fans it out: "context" is
# the retriever's documents rendered by format_docs, "question" is the string
# passed through unchanged. The prompt, the LLM and a string parser follow.
# Note: a later cell rebinds the name `rag_chain` to a chain without the
# retrieval step, so this definition is only in effect until that cell runs.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [80]:
# Question
# The chain is called with the question text alone; retrieval happens inside
# the chain. The cell's last expression is the answer string, which the
# notebook displays.
rag_chain.invoke("Does the Yardbarker article suggest that Travis Kelce will have a productive outing against the Raiders' defense, while the Sporting News article indicates that the Raiders' offense was generally ineffective, except for a specific instance led by Josh Jacobs?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Yes.'

# **CRAG**



1.   Retrieve Documents
2.   Grade Documents
    *   All retrieved documents relevant:
        1. Generate Answer
    *   At least one document not relevant:
        1. Rewrite Query
        2. Web Search (results are added to the relevant documents)
        3. Generate Answer








In [None]:
%pip install langgraph tavily-python

In [None]:
from typing import List
from langchain.schema import Document
from typing_extensions import TypedDict
from langgraph.graph import END, StateGraph, START
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.tools.tavily_search import TavilySearchResults

In [98]:
# Rebinds `rag_chain` to a generation-only chain (no retriever step). The
# graph's `generate` node calls it with a dict holding "context" and
# "question", because retrieval and grading are done by separate graph nodes.
# NOTE(review): once this cell has run, the earlier string-only call
# `rag_chain.invoke("...")` presumably no longer works, since this chain
# expects both prompt variables - re-run the earlier chain cell first if needed.
rag_chain = prompt | llm | StrOutputParser()

# Grader

In [81]:
# Output schema for the relevance grader: a single string field that is meant
# to carry 'yes' or 'no'.
class GradeDocs(BaseModel):
    binary_score: str = Field(description = "Documents are relevant to the question, 'yes' or 'no'")


# LLM variant asked to answer in the GradeDocs structure.
# NOTE(review): grade_documents reads the result as score['binary_score'],
# i.e. it expects a dict-like result here - confirm what this call returns for
# the installed langchain-cohere version.
structured_llm_grader = llm.with_structured_output(GradeDocs)

# System instructions for the grader; the human message supplies the
# {document} text and the {question}.
system = """You are a grader assessing relevance of a retrieved document to a user question. \n
    If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Grader chain: invoked with {"question": ..., "document": ...}.
retrieval_grader = grade_prompt | structured_llm_grader

# Query Re-write

In [82]:
# System instructions for the question re-writer. This rebinds the name
# `system`, which the grader cell also uses for its own prompt text.
# NOTE(review): the prompt text below reads "You a question re-writer"
# (missing "are"); left untouched here because it is sent to the model.
system = """You a question re-writer that converts an input question to a better version that is optimized \n
     for web search. Look at the input and try to reason about the underlying semantic intent / meaning."""

# The human message carries the original {question}.
re_write_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        (
            "human",
            "Here is the initial question: \n\n {question} \n Formulate an improved question.",
        ),
    ]
)

# Re-writer chain: invoked with {"question": ...}; returns the model's reply
# as a plain string.
question_rewriter = re_write_prompt | llm | StrOutputParser()

# Web-search

In [83]:
# Tavily web-search tool limited to 3 results per query. The limit is set via
# the tool's `max_results` field; `k` is not a field of TavilySearchResults,
# so passing it did not restrict the number of results.
web_search_tool = TavilySearchResults(max_results=3)

# Graph

In [84]:
class GraphState(TypedDict):
    """
    State passed between the nodes of the corrective-RAG graph.

    Attributes:
        question: the user question; replaced by the re-written question when
            the transform_query node runs
        generation: answer text produced by the generate node
        web_search: "Yes" or "No" flag set by grade_documents; "Yes" routes
            the graph through query re-writing and web search
        documents: documents gathered so far. Although annotated as List[str],
            the nodes in this notebook store document objects here (they read
            `.page_content` and append `Document` instances)
    """

    question: str
    generation: str
    web_search: str
    documents: List[str]

In [99]:
# Retrieve Documents
def retrieve(state):
    """Fetch documents for the current question from the retriever.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        dict with ``documents`` (whatever the retriever returned) and the
        unchanged ``question``.
    """
    question = state["question"]
    # Retrieval. `invoke` is the Runnable entry point used for every other
    # component in this notebook; `get_relevant_documents` is deprecated.
    documents = retriever.invoke(question)
    return {"documents": documents, "question": question}

# Grade Documents
def grade_documents(state):
    """Keep only the retrieved documents the grader marks as relevant.

    Every document's ``page_content`` is sent to ``retrieval_grader`` together
    with the question. Documents graded "yes" are kept. Any other outcome
    (including a missing or malformed grade) drops the document and sets the
    ``web_search`` flag to "Yes".

    Args:
        state: Graph state; ``question`` and ``documents`` are read.

    Returns:
        dict with the filtered ``documents``, the unchanged ``question`` and
        ``web_search`` ("Yes" if at least one document was dropped, else "No").
    """
    question = state["question"]
    documents = state["documents"]

    # Score each doc
    filtered_docs = []
    web_search = "No"
    for d in documents:
        score = retrieval_grader.invoke(
            {"question": question, "document": d.page_content}
        )

        # The structured-output step may hand back a dict or a schema
        # instance (or nothing at all), so read the field in a way that works
        # for each of those shapes.
        if isinstance(score, dict):
            grade = score.get("binary_score")
        else:
            grade = getattr(score, "binary_score", None)

        # Tolerate case and surrounding whitespace in the model's answer.
        if isinstance(grade, str) and grade.strip().lower() == "yes":
            # Relevant document
            filtered_docs.append(d)
        else:
            # Not relevant (or ungradable) document
            web_search = "Yes"
    return {"documents": filtered_docs, "question": question, "web_search": web_search}

# Decide to search or generate result
def decide_to_generate(state):
    """Pick the branch to follow after grading.

    Args:
        state: Graph state; only ``state["web_search"]`` is read.

    Returns:
        "search" when the flag is "Yes", otherwise "generate_result". These
        are the keys of the conditional-edge mapping on "grade_documents".
    """
    # Only the flag matters for routing; the question and documents are not
    # needed here.
    if state["web_search"] == "Yes":
        return "search"
    return "generate_result"

# Transform query to be searchable
def transform_query(state):
    """Replace the question in the state with a re-written version.

    Args:
        state: Graph state; ``question`` and ``documents`` are read.

    Returns:
        dict with the same ``documents`` object and, under ``question``, the
        output of ``question_rewriter`` for the original question.
    """
    original_question, kept_docs = state["question"], state["documents"]

    # Re-write question
    return {
        "documents": kept_docs,
        "question": question_rewriter.invoke({"question": original_question}),
    }

# Search web
def web_search(state):
    """Run a web search for the question and add the result as one document.

    The ``content`` field of every search hit is joined with newlines into a
    single ``Document``, which is appended to the list already held in the
    state (the list is extended in place).

    Args:
        state: Graph state; ``question`` and ``documents`` are read.

    Returns:
        dict with the extended ``documents`` list and the unchanged
        ``question``.
    """
    query, gathered = state["question"], state["documents"]

    # Web search
    hits = web_search_tool.invoke({"query": query})
    combined_text = "\n".join(hit["content"] for hit in hits)
    gathered.append(Document(page_content=combined_text))

    return {"documents": gathered, "question": query}

# Finally Generate
def generate(state):
    """Produce the final answer from the gathered documents.

    Args:
        state: Graph state; ``question`` and ``documents`` are read.

    Returns:
        dict with the unchanged ``documents`` and ``question`` plus
        ``generation``, the output of ``rag_chain`` for that context/question.
    """
    asked, context_docs = state["question"], state["documents"]

    # RAG generation
    answer = rag_chain.invoke({"context": context_docs, "question": asked})
    return {"documents": context_docs, "question": asked, "generation": answer}

In [100]:
workflow = StateGraph(GraphState)

# Define the nodes: each name below is the one the edges refer to. The web
# search function is registered as "web_search_node".
for node_name, node_fn in (
    ("retrieve", retrieve),
    ("grade_documents", grade_documents),
    ("generate", generate),
    ("transform_query", transform_query),
    ("web_search_node", web_search),
):
    workflow.add_node(node_name, node_fn)

# Build graph: retrieval always feeds the grader.
workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "grade_documents")

# After grading, decide_to_generate returns "search" or "generate_result";
# the mapping turns that into the next node.
branch_targets = {
    "search": "transform_query",
    "generate_result": "generate",
}
workflow.add_conditional_edges("grade_documents", decide_to_generate, branch_targets)

# Search path: re-write the query, search the web, then generate.
workflow.add_edge("transform_query", "web_search_node")
workflow.add_edge("web_search_node", "generate")
workflow.add_edge("generate", END)

# Compile
app = workflow.compile()

# Use

In [102]:
from pprint import pprint

# Run
inputs = {"question": "Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?"}

# Each streamed `output` maps the name of a node to the state update that node
# returned. The answer is captured explicitly instead of relying on whatever
# the loop variable happens to hold after the loop.
generation = None
for output in app.stream(inputs):
    for key, value in output.items():
        # Node
        print(f"Node '{key}':")
        if "generation" in value:
            generation = value["generation"]
    # Plain print so the separator is shown as text, not as a quoted repr.
    print("\n---\n")

# Final generation
if generation is None:
    raise RuntimeError("The graph finished without producing a 'generation'.")
pprint(generation)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

"Node 'retrieve':"
'\n---\n'
{'binary_score': 'yes'}
{'binary_score': 'yes'}
{'binary_score': 'yes'}
{'binary_score': 'yes'}
"Node 'grade_documents':"
'\n---\n'
Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?
"Node 'generate':"
'\n---\n'
'Sam Bankman-Fried'
