# Clone the repository containing the dataset and install the required packages

In [None]:
%%capture
# Fetch the MultiHop-RAG repository; later cells read dataset/corpus.json and
# dataset/MultiHopRAG.json from inside it.
!git clone https://github.com/yixuantt/MultiHop-RAG.git


In [None]:
%%capture
%cd MultiHop-RAG/
!pip install sentencex openai qdrant-client[fastembed]

In [None]:
# Start from an empty "db" directory; the Qdrant client is later opened on path="db".
!rm -rf db
!mkdir db

In [None]:
import polars as pr
import json
from sentencex import segment
from openai import OpenAI
import ast
import time
import pickle
from qdrant_client import QdrantClient



# Do not hardcode credentials in the notebook: with no api_key argument the
# OpenAI client reads the key from the OPENAI_API_KEY environment variable and
# raises immediately if it is missing.
client = OpenAI(max_retries=3)
# Local, on-disk Qdrant store kept in the "db" directory.
qdrant_client = QdrantClient(path="db")


# Get around 100 URLs for quick testing

In [None]:
# Load the article corpus. The context manager closes the file handle (a bare
# open() inside json.load leaks it) and the encoding is pinned so the result
# does not depend on the platform default.
with open("dataset/corpus.json", encoding="utf-8") as corpus_file:
    data=json.load(corpus_file)
df=pr.from_dicts(data)
df.head(2)

title,author,source,published_at,category,url,body
str,str,str,str,str,str,str
"""200+ of the be…",,"""Mashable""","""2023-11-27T08:…","""entertainment""","""https://mashab…","""Table of Conte…"
"""ASX set to dro…","""Stan Choe""","""The Sydney Mor…","""2023-09-26T19:…","""business""","""https://www.sm…","""ETF provider B…"


In [None]:
# Load the multi-hop queries. The context manager closes the file handle (a
# bare open() inside json.load leaks it) and the encoding is pinned so the
# result does not depend on the platform default.
with open("dataset/MultiHopRAG.json", encoding="utf-8") as queries_file:
    queries=json.load(queries_file)
df_q=pr.from_dicts(queries)
df_q.head(2)


query,answer,question_type,evidence_list
str,str,str,list[struct[7]]
"""Who is the ind…","""Sam Bankman-Fr…","""inference_quer…","[{""The FTX trial is bigger than Sam Bankman-Fried"",""Elizabeth Lopatto"",""https://www.theverge.com/2023/9/28/23893269/ftx-sam-bankman-fried-trial-evidence-crypto"",""The Verge"",""technology"",""2023-09-28T12:00:00+00:00"",""Before his fall, Bankman-Fried made himself out to be the Good Boy of crypto — the trustworthy face of a sometimes-shady industry.""}, {""SBF’s trial starts soon, but how did he — and FTX — get here?"",""Jacquelyn Melinek"",""https://techcrunch.com/2023/10/01/ftx-lawsuit-timeline/"",""TechCrunch"",""technology"",""2023-10-01T14:00:29+00:00"",""The highly anticipated criminal trial for Sam Bankman-Fried, former CEO of bankrupt crypto exchange FTX, started Tuesday to determine whether he’s guilty of seven counts of fraud and conspiracy.""}, {""Sam Altman backs teens’ startup, Google unveils the Pixel 8 and TikTok tests an ad-free tier"",""Kyle Wiggers"",""https://techcrunch.com/2023/10/07/sam-altman-backs-a-teens-startup-google-unveils-the-pixel-8-and-tiktok-tests-an-ad-free-tier/"",""TechCrunch"",""technology"",""2023-10-07T20:15:26+00:00"",""The prosecution painted Bankman-Fried as someone who knowingly committed fraud to achieve great wealth, power and influence, while the defense countered that the FTX founder acted in good faith, never meant to commit fraud or steal and basically got in over his head.""}]"
"""Which individu…","""Donald Trump""","""inference_quer…","[{""Donald Trump defrauded banks with 'fantasy' to build his real estate empire, judge rules in a major repudiation against the former president"",""Michael R. Sisak, The Associated Press"",""https://fortune.com/2023/09/26/donald-trump-fraud-banks-insurers-real-estate-judge-new-york/"",""Fortune"",""business"",""2023-09-26T21:11:15+00:00"",""No apartment in New York City has ever sold for close to that amount, James said.""}, {""The $777 million surprise: Donald Trump is getting richer"",""Tom Maloney"",""https://www.theage.com.au/business/companies/the-777-million-surprise-donald-trump-is-getting-richer-20231108-p5eicf.html?ref=rss&utm_medium=rss&utm_source=rss_business"",""The Age"",""business"",""2023-11-07T22:22:05+00:00"",""The prosecution argues that was to mask a drop in the value of one of his other properties.""}]"


In [None]:
# Walk the queries in dataset order, keeping each question/answer/type and
# every evidence URL it cites; stop once more than 100 distinct URLs are known.
set_urls=set()
questions,answers,types=[],[],[]

query_rows=zip(df_q["evidence_list"],df_q["query"],df_q["answer"],df_q["question_type"])
for evidence_items,query_text,gold_answer,query_type in query_rows:
    questions.append(query_text)
    answers.append(gold_answer)
    types.append(query_type)
    set_urls.update(item["url"] for item in list(evidence_items))
    if len(set_urls)>100:
        break

print(len(set_urls))
print(list(zip(questions,answers)))
print(len(questions),len(answers))

101
[('Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?', 'Sam Bankman-Fried'), ("Which individual is implicated in both inflating the value of a Manhattan apartment to a figure not yet achieved in New York City's real estate history, according to 'Fortune', and is also accused of adjusting this apartment's valuation to compensate for a loss in another asset's worth, as reported by 'The Age'?", 'Donald Trump'), ('Who is the figure associated with generative AI technology whose departure from OpenAI was considered shocking according to Fortune, and is also the subject of a prevailing theory suggesting a lack of full truthfulness with the board as reported by TechCrunch?', 'Sam Altman'), ('Do the TechCrunch article on software companies and the Hacker News article on The Epoch Times both report an incr

# Helper functions

In [None]:
def call_llm_json(prompt,model="gpt-3.5-turbo-0125",temperature=0.1):
    """Send ``prompt`` to OpenAI as one user message, requesting a JSON object.

    Args:
        prompt: Text placed in the single user message.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first choice with surrounding whitespace stripped.
    """
    request={
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        # Ask the API to reply with a JSON object.
        "response_format": {"type": "json_object"},
    }
    completion=client.chat.completions.create(**request)
    first_choice=completion.choices[0]
    return first_choice.message.content.strip()

In [None]:
def call_llm(prompt,model="gpt-3.5-turbo-0125",temperature=0.1):
    """Send ``prompt`` to OpenAI as one user message and return the reply text.

    Args:
        prompt: Text placed in the single user message.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first choice with surrounding whitespace stripped.
    """
    user_message={"role": "user", "content": prompt}
    completion=client.chat.completions.create(
        model=model,
        messages=[user_message],
        temperature=temperature,
    )
    reply_text=completion.choices[0].message.content
    return reply_text.strip()

# Quickly divide the articles into chunks and save them to the vector DB

In [None]:
avg_chunk_size=6
# A trailing group with fewer sentences than this is folded into the final
# chunk instead of being stored as a tiny chunk of its own.
min_tail_size=4


def split_into_chunks(sentences,chunk_size,min_tail,overlap=1):
    """Group ``sentences`` into overlapping windows of about ``chunk_size`` items.

    Every window after the first starts ``overlap`` sentences early, so
    consecutive chunks share context. When the leftover after the last full
    window has fewer than ``min_tail`` sentences it is merged into that window.

    Args:
        sentences: Ordered list of sentence strings.
        chunk_size: Target number of sentences per window.
        min_tail: Smallest leftover that is kept as its own chunk.
        overlap: Number of sentences re-used from the previous window.

    Returns:
        A list of sentence lists; empty when ``sentences`` is empty.
    """
    total=len(sentences)
    remainder=total%chunk_size
    chunks=[]
    for start in range(0,total,chunk_size):
        # Clamp at 0: a negative start would slice from the end of the list
        # and reduce the first window to the last sentence only.
        window_start=max(0,start-overlap)
        at_last_full_window=(start+chunk_size+remainder)>total-1
        if at_last_full_window and remainder<min_tail:
            # Append the merged final chunk before stopping; otherwise the
            # tail of the article is never indexed.
            chunks.append(sentences[window_start:])
            break
        chunks.append(sentences[window_start:start+chunk_size])
    return chunks


# Create overlapping chunks for the articles whose URL is in set_urls and
# store them, with their source URL, in the vector DB.
for row in df.rows(named=True):

    if row["url"] not in set_urls:
        continue

    # Collapse all whitespace runs to single spaces. A dedicated name is used
    # so the loaded corpus in `data` is not overwritten.
    body_text=" ".join(row['body'].split())
    sent_list=list(segment("en",body_text))

    chunk_list=[]
    for chunk in split_into_chunks(sent_list,avg_chunk_size,min_tail_size):
        # Join with exactly one space so neighbouring sentences are neither
        # glued together nor double-spaced, whatever the segmenter preserved.
        stripped=[sentence.strip() for sentence in chunk]
        chunk_list.append(" ".join(piece for piece in stripped if piece))
    metadata=[{"source":row["url"]} for _ in chunk_list]

    # Add to the vector DB; skip articles that produced no chunks.
    if chunk_list:
        qdrant_client.add(collection_name="demo_collection",documents=chunk_list,metadata=metadata)



100%|██████████| 77.7M/77.7M [00:02<00:00, 32.0MiB/s]


# Let's try simple retrieval

In [None]:
def query_llm(context,query):
    """Ask the LLM to answer ``query`` using only ``context``.

    The prompt asks for an answer of at most 10 words, or an explicit
    "can't answer" statement when the context is insufficient.

    Returns:
        The text returned by ``call_llm``.
    """
    prompt=f"""given context: {context}\n Based on the given context, answer the query in not more than 10 words. However, if the query cannot be answered based on the given context, say I can't answer based on the given context.  \n query: {query} """
    return call_llm(prompt)

In [None]:
# Baseline pass: for every collected question fetch the 3 nearest chunks,
# concatenate them (each followed by its source URL) and ask the LLM.
result=[]
for question_text,gold_answer,question_type in zip(questions,answers,types):
    hits=qdrant_client.query(
        collection_name="demo_collection",
        query_text=f"""{question_text}""",
        limit=3,
    )

    context="".join(
        hit.metadata["document"]+" source: "+hit.metadata["source"]+" "
        for hit in hits
    )
    predicted=query_llm(context,question_text)
    result.append({
        "question":question_text,
        "answer":gold_answer,
        "context":context,
        "predicted":predicted,
        "type":question_type,
    })

df_result=pr.from_dicts(result)


In [None]:
# Print each baseline record field by field, followed by a separator line.
baseline_fields=(
    ("Question: ","question"),
    ("Answer: ","answer"),
    ("context: ","context"),
    ("predicted: ","predicted"),
    ("type: ","type"),
)
for record in df_result.rows(named=True):
    for label,key in baseline_fields:
        print(label,record[key])
    print("===============")

Question:  Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?
Answer:  Sam Bankman-Fried
context:  I think it’s important for people to think I look crazy.”) Because he was so successful at this kind of public relations, his fall from grace was another mark against an industry that was already roiled by bankruptcies and scandals.Some additional trouble for the crypto industry is likely to come from one crucial element of the fraud trial — the part where the government must prove intent.The first part of proving the government’s case is pretty simple and a little boring: prosecutors must show that certain transactions took place.Whatever records the Southern District of New York has for the transactions will be shown.“What conversations happened between him and his co-conspirators that are now





# **For the inference_query type the results are not that bad, but for the other query types they are quite poor. Let's see if we can improve the other query types by tweaking the prompt and adding some preprocessing (trying to get more relevant chunks by breaking the query into relevant phrases).**

In [None]:
def get_subphrases(query):
    """Ask the LLM to break ``query`` into simpler sub-phrases.

    Args:
        query: The question to decompose.

    Returns:
        The parsed reply. For a dict reply, "subphrases" defaults to an empty
        list and "question" to ``query`` when the model omits them, so callers
        can always index both keys.

    Raises:
        ValueError or SyntaxError: if the reply is neither valid JSON nor a
            Python literal.
    """
    output_format = {"subphrases":["list of sub phrases if it can be broken into smaller simpler sub phrases, or the original query"],"question":"the original question"}
    example_query1="Who's article was better about the rising mental health issues, Mayank or Suresh?"
    output_query1={"subphrases":["Mayank's article about mental health issues","Suresh's article about mental health issues"],"question":"Who's article was better about the rising mental health issues, Mayank or Suresh?"}
    example_query2="How many chambers does the heart have?"
    output_query2={"subphrases":[],"question":"How many chambers does the heart have?"}
    res=call_llm_json(f"""You are given a query. If the query can be broken into multiple smaller phrases,return sub phrases and the query. However, if the query is simple and cannot be broken into smaller phrases, just return the original query. A sub phrase is just a relevant part of the query, not a not a whole new query. \n query: {query} \n
    Examples:  query: {example_query1}:  Output: {output_query1} query: {example_query2}:  Output: {output_query2} \n
    Return the response only as a json with the following format: {output_format}. Always maintain the structure.
     """)
    # The reply is JSON, so parse it as JSON: ast.literal_eval rejects the JSON
    # literals true/false/null. literal_eval stays as a fallback for replies
    # written as a Python dict (single quotes), which the old code accepted.
    try:
        parsed=json.loads(res)
    except json.JSONDecodeError:
        parsed=ast.literal_eval(res)
    if isinstance(parsed,dict):
        parsed.setdefault("subphrases",[])
        parsed.setdefault("question",query)
    return parsed

In [None]:
def query_llm_others(context,query):
    """Get an analysis and a short answer for a temporal or comparison query.

    Args:
        context: Retrieved text the model must base its answer on.
        query: The question to answer.

    Returns:
        A ``(analysis, answer)`` tuple of strings.

    Raises:
        KeyError: if the reply lacks the "analysis" or "answer" key.
        ValueError or SyntaxError: if the reply is neither valid JSON nor a
            Python literal.
    """
    output_format = {"analysis":"analysis","answer":"answer"}
    res=call_llm_json(f"""given context: {context}\n query: {query}  You an expert in analysing and responding to queries related to temporal aspects or comparisons. First, analyse the given context in 80-100 words with respect to the given query. Then, based on the analysis, try to answer the query in not more than 5 words. However, if the query cannot be answered based on the analysis, say I can't answer based on the given context.  \n Return the response only as a json with the following format: {output_format} """)
    # The reply is JSON, so parse it as JSON: ast.literal_eval rejects the JSON
    # literals true/false/null. literal_eval stays as a fallback for replies
    # written as a Python dict (single quotes), which the old code accepted.
    try:
        parsed=json.loads(res)
    except json.JSONDecodeError:
        parsed=ast.literal_eval(res)
    # Coerce to str: the caller concatenates both values, and JSON lets the
    # model return a bare boolean or number (e.g. for a yes/no question).
    return str(parsed["analysis"]),str(parsed["answer"])

In [None]:
def _format_hits(search_result,seen):
    """Render hits as "<chunk> source: <url> " text, skipping chunks already in ``seen``."""
    parts=[]
    for hit in search_result:
        doc=hit.metadata["document"]
        source=hit.metadata["source"]
        if (doc,source) in seen:
            # Different sub-phrases can retrieve the same chunk; repeating it
            # only spends prompt tokens.
            continue
        seen.add((doc,source))
        parts.append(doc+" source: "+source+" ")
    return "".join(parts)


result1=[]
subphrases=[]

for q,a,t in zip(questions,answers,types):
    # Inference queries are skipped; this pass targets the other query types.
    if "inference" in t.lower():
        continue

    # Kept in its own name so the `result` list built by the baseline cell is
    # not replaced by a dict.
    parsed=get_subphrases(q)
    phrases=parsed.get("subphrases") or []
    subphrases.append((phrases,parsed.get("question",q)))

    # One single-hit search per sub-phrase; when the query could not be split,
    # fall back to one 3-hit search on the whole query.
    searches=[(phrase,1) for phrase in phrases] or [(q,3)]
    context=""
    seen_chunks=set()
    for search_text,limit in searches:
        search_result=qdrant_client.query(
            collection_name="demo_collection",
            query_text=f"""{search_text}""",
            limit=limit,
        )
        context+=_format_hits(search_result,seen_chunks)
        # Stop adding chunks once the context is already very long.
        if len(context)>10000:
            break

    # Answer the original query: the "question" echoed back by the model can
    # be a paraphrase that drops details such as the article names.
    analysis,answer=query_llm_others(context,q)
    answer+=" analysis: "+analysis

    result1.append({"question":q,"answer":a,"context":context,"predicted":answer,"type":t})

df_result1=pr.from_dicts(result1)

In [None]:
# Print each record of the sub-phrase pass field by field, then a separator.
labels_by_key={
    "question":"Question: ",
    "answer":"Answer: ",
    "context":"context: ",
    "predicted":"predicted: ",
    "type":"type: ",
}
for entry in df_result1.rows(named=True):
    for key,label in labels_by_key.items():
        print(label,entry[key])
    print("===============")

Question:  Do the TechCrunch article on software companies and the Hacker News article on The Epoch Times both report an increase in revenue related to payment and subscription models, respectively?
Answer:  Yes
context:  Microsoft has always been awesome to developers, always being respectful, giving developers a great deal and treating them as partners, you know?And so even as Microsoft was crushing corporate competitors, the developer experience was excellent.[Editor’s note: Netscape might feel differently.] “Even as Microsoft was crushing corporate competitors, the developer experience was excellent.” Google and Apple both treat developers as adversaries — they try to attack our revenue streams and prevent us from competing with their products.They’ve built these massive self-preferencing schemes all around excluding developers and disadvantaging third-party developers.I think this is very shortsighted.I think any tech company — Apple, Google included — would be much 

In [None]:
# Show the (sub-phrases, question) pairs recorded for each non-inference query.
print(subphrases)

[(['TechCrunch article on software companies', 'Hacker News article on The Epoch Times', 'increase in revenue related to payment and subscription models'], 'Do both articles report an increase in revenue related to payment and subscription models?'), (["TechCrunch article on Twitch's subscription revenue split policy", "TechCrunch article on Beeper's plans for Beeper Mini subscriptions"], "Does the TechCrunch article on Twitch's subscription revenue split policy indicate a different monetization strategy compared to the TechCrunch article on Beeper's plans for Beeper Mini subscriptions?"), (["The New York Times' article attribute the success of the Buffalo Bills' defense to the contributions of Jordan Poyer", "'Sporting News' article suggests that the Baltimore Ravens' defense needs to improve before their game against the Cincinnati Bengals"], "Does 'The New York Times' article attribute the success of the Buffalo Bills' defense to Jordan Poyer and 'Sporting News' suggest that the Bal

# **We have observed a notable improvement. While some responses are still incorrect, the overall improvement over the previous version is significant. We could tweak the prompt further, use a better model (GPT-4), change other settings (e.g. make better use of the metadata), or maybe create a knowledge graph. My aim was not to get the best accuracy but to see whether minor adjustments in query preprocessing and prompt refinement could enhance retrieval and the final outcomes. And they do.**