# SEMA Semantic Agent. Arxiv search powered by LLMs

What it does:
- Convert user query into keyword search queries
- Google search top 10 results with SERP API
- Scrape html for each result, convert to markdown
- Structure output using function calling -> json to get paper, title
- Call arxiv to get paper, abstract, metadata
- Call Google Scholar to get citations, ...
- Use LLM to answer user query based on the paper, evaluate answer relevance
- Rank results based on citations, relevance to user query
- Print results in structured format, give links to download, or to use in notebook LM

# Setup

load secret variables

In [212]:
import os
from dotenv import load_dotenv
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")
serp_api_key = os.environ.get("SERP_API_KEY")
gemini_api_key = os.environ.get("GEMINI_API_KEY")
llamaindex_api_key = os.environ.get("LLAMAINDEX_API_KEY")
perplexity_api_key = os.environ.get("PERPLEXITY")

# Hide part of the key
openai_api_key_hidden = openai_api_key[:3] + "*" * (len(openai_api_key) - 6) + openai_api_key[-3:]
serp_api_key_hidden = serp_api_key[:3] + "*" * (len(serp_api_key) - 6) + serp_api_key[-3:]
gemini_api_key_hidden = gemini_api_key[:3] + "*" * (len(gemini_api_key) - 6) + gemini_api_key[-3:]
llamaindex_api_key_hidden = llamaindex_api_key[:3] + "*" * (len(llamaindex_api_key) - 6) + llamaindex_api_key[-3:]
perplexity_api_key_hidden = perplexity_api_key[:3] + "*" * (len(perplexity_api_key) - 6) + perplexity_api_key[-3:]

# Print the hidden keys
print(f"OpenAI API Key (hidden): {openai_api_key_hidden}")
print(f"Serp API Key (hidden): {serp_api_key_hidden}")
print(f"Gemini API Key (hidden): {gemini_api_key_hidden}")
print(f"Llamaindex API Key (hidden): {llamaindex_api_key_hidden}")
print(f"Perplexity API Key (hidden): {perplexity_api_key_hidden}")

OpenAI API Key (hidden): sk-*********************************************0jF
Serp API Key (hidden): 68c**********************************************************266
Gemini API Key (hidden): AIz*********************************MUc
Llamaindex API Key (hidden): llx**********************************************hA3
Perplexity API Key (hidden): ppl***********************************************ffa


### Put data into a local database

establish db connection

In [213]:
import psycopg2
import os

def connection():
    """Creates and returns a new database connection."""
    try:
        conn = psycopg2.connect(
            user=os.environ["MY_INTEGRATION_USER"],
            password=os.environ["MY_INTEGRATION_PASSWORD"],
            host=os.environ["MY_INTEGRATION_HOST"],
            port=os.environ["MY_INTEGRATION_PORT"],
            database=os.environ["MY_INTEGRATION_DATABASE"]
        )
        
        # Test the connection
        with conn.cursor() as cursor:
            cursor.execute("SELECT version();")
            record = cursor.fetchone()
            # print("You are connected to - ", record)
        
        return conn  # Return the connection object if successful

    except (Exception, psycopg2.Error) as error:
        print("Error while connecting to database", error)
        return None  # Return None if connection was not successful

conn = connection()

In [214]:
DSN = (
    f"dbname={os.environ['MY_INTEGRATION_DATABASE']} "
    f"user={os.environ['MY_INTEGRATION_USER']} "
    f"password={os.environ['MY_INTEGRATION_PASSWORD']} "
    f"host={os.environ['MY_INTEGRATION_HOST']} "
    f"port={os.environ['MY_INTEGRATION_PORT']}"
)

In [215]:
import psycopg2
import os

# Function to create tables in the database
def create_tables():
    # Define your SQL statements for creating tables
    sql_commands = [
        """
        CREATE TABLE IF NOT EXISTS google_search_results (
            url TEXT PRIMARY KEY,
            html TEXT,
            scraping_status TEXT,
            processed_markdown TEXT,
            query TEXT
            title TEXT,
            snippet TEXT,
            job_id INT
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS Papers (
            id SERIAL PRIMARY KEY,
            paper_title TEXT,
            source_content TEXT,
            links TEXT,
            arxiv_link TEXT UNIQUE,
            arxiv_title TEXT,
            arxiv_abstract TEXT,
            arxiv_metadata TEXT,
            arxiv_filename TEXT,
            arxiv_paper_markdown TEXT,
            citations INTEGER,
            versions INTEGER
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS Query_Papers (
            id SERIAL PRIMARY KEY,
            query TEXT,
            arxiv_link TEXT,
            relevance_score REAL,
            final_rank INTEGER,
            relevant_answer TEXT,
            paper_stats TEXT,
            paper_metadata_filtered TEXT,
            download_link TEXT,
            relevant_snippets TEXT,
            job_id INT,
            CONSTRAINT unique_query_arxiv_link UNIQUE (query, arxiv_link)
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS jobs (
            job_id SERIAL PRIMARY KEY,
            query TEXT,
            job_status TEXT,
            printed_ranks INTEGER DEFAULT 0,
            terminal_output TEXT
            gpt_response TEXT
            perplexity_response TEXT
            final_response TEXT
            keyword_search_queries TEXT
            paper_search_queries TEXT
        );
        """
    ]
    try:
        with conn.cursor() as cursor:
            # Execute each SQL command separately
            for sql_command in sql_commands:
                cursor.execute(sql_command)
            conn.commit()  # Commit the transaction
            print("All tables are created successfully.")

    except (Exception, psycopg2.Error) as error:
        print("Failed to create tables", error)
        conn.rollback()  # Rollback the transaction on error

    finally:
        if conn:
            conn.close()
            print("Database connection is closed.")
# Main script execution
try:
    connection()
    create_tables()

except (Exception, psycopg2.Error) as error:
    print("Error while connecting to database", error)


Failed to create tables syntax error at or near "title"
LINE 8:             title TEXT,
                    ^

Database connection is closed.


clean arxiv URLs

In [216]:
import re
from bs4 import BeautifulSoup
from urllib.parse import unquote, urlparse, parse_qs

def extract_arxiv_url_from_wrapped_url(wrapped_url):
    # Unquote to handle deeply nested or doubly-encoded URLs
    wrapped_url = unquote(wrapped_url)
    # print('Wrapped URL:', wrapped_url)
    parsed_url = urlparse(wrapped_url)
    # print('Parsed URL:', parsed_url)
    query_params = parse_qs(parsed_url.query)
    # print('Query Parameters:', query_params)
    # Explore all possible URLs found in the 'url' parameter and extract the arXiv link
    if 'url' in query_params:
        for possible_url in query_params['url']:
            # Check if we have nested URLs and extract the innermost one
            while 'url=' in possible_url:
                inner_parsed = parse_qs(urlparse(possible_url).query)
                if 'url' in inner_parsed:
                    possible_url = inner_parsed['url'][0]
                else:
                    break  # Break if no inner 'url' parameter is found
            possible_url = unquote(possible_url)  # Ensure the inner URL is fully decoded
            # print('Possible URL after extraction:', possible_url)
            
            # Now check if the final extracted URL is an arXiv link
            if 'arxiv.org' in possible_url:
                # print('Extracted arXiv URL:', possible_url)
                return possible_url  # Return the first arXiv URL found
    return None  # Return None if no arXiv URL is found

def clean_arxiv_link(link):
    # Clean the extracted arXiv link
    link = unquote(link)  # Ensure the link is fully decoded
    link_obj = urlparse(link)

    # Handle typical arXiv link structures for abstract and PDF
    if re.search(r'(/abs/|/pdf/)[0-9]+\.[0-9]+', link_obj.path):
        clean_path = re.sub(r'/pdf/', '/abs/', link_obj.path.split('.pdf')[0])
        clean_path = re.sub(r'v\d+$', '', clean_path)  # Remove versioning
        return f"https://arxiv.org{clean_path}"
    # Handle FTP PDF links
    elif re.search(r'/ftp/arxiv/papers/[0-9]{4}/[0-9]{4}\.[0-9]+\.pdf', link_obj.path):
        paper_id = re.findall(r'/ftp/arxiv/papers/[0-9]{4}/([0-9]{4}\.[0-9]+)\.pdf', link_obj.path)[0]
        return f"https://arxiv.org/abs/{paper_id}"  # Convert to standard abstract link
    return None

insert arxiv URLs into db

In [217]:
def insert_arxiv_links_and_snippets_into_db(arxiv_links_dict, user_query):
    cleaned_arxiv_links = []
    snippets = []

    # Clean the arXiv links and prepare snippets
    for link, snippet in arxiv_links_dict.items():
        cleaned_link = clean_arxiv_link(link)
        if cleaned_link:
            cleaned_arxiv_links.append(cleaned_link)
            snippets.append(snippet)  # Keep the snippet aligned with its cleaned link

    # Insert cleaned links and their corresponding snippets into the database
    print(f"Cleaned arXiv links[{len(cleaned_arxiv_links)}]: {cleaned_arxiv_links}")
    if cleaned_arxiv_links and snippets:
        try:
            conn = connection()  # Establish your database connection here
            c = conn.cursor()

            # Insert arxiv links into Papers table in bulk
            arxiv_links_data = [(link,) for link in cleaned_arxiv_links]
            psycopg2.extras.execute_batch(
                c,
                "INSERT INTO Papers (arxiv_link) VALUES (%s) ON CONFLICT (arxiv_link) DO NOTHING",
                arxiv_links_data
            )

            # Insert records associated with user query and their snippets in Query_Papers table in bulk
            query_papers_data = [(user_query, link, snippet) for link, snippet in zip(cleaned_arxiv_links, snippets)]
            psycopg2.extras.execute_batch(
                c,
                "INSERT INTO Query_Papers (query, arxiv_link, relevant_snippets) VALUES (%s, %s, %s) ON CONFLICT (query, arxiv_link) DO NOTHING",
                query_papers_data
            )

            # Commit the transaction
            conn.commit()
            print(f"Successfully inserted records associated with the query '{user_query}' into the database. Links: {cleaned_arxiv_links}")
        except Exception as e:
            # Rollback any changes if an error occurs
            conn.rollback()
            print(f"Transaction rolled back. Error occurred: {e}")
        finally:
            if conn:
                conn.close()

In [267]:
def insert_first_arxiv_link_into_db(arxiv_links_dict, user_query):
    # Extract the first (and only) cleaned arXiv link and its snippet from the dictionary
    if arxiv_links_dict:
        cleaned_arxiv_link, snippet = next(iter(arxiv_links_dict.items()))
        try:
            conn = connection()  # Establish your database connection here
            c = conn.cursor()

            # Insert arxiv link into Papers table
            c.execute(
                "INSERT INTO Papers (arxiv_link) VALUES (%s) ON CONFLICT (arxiv_link) DO NOTHING",
                (cleaned_arxiv_link,)
            )

            # Insert record associated with user query and its snippet into Query_Papers table
            c.execute(
                "INSERT INTO Query_Papers (query, arxiv_link, relevant_snippets) VALUES (%s, %s, %s) ON CONFLICT (query, arxiv_link) DO NOTHING",
                (user_query, cleaned_arxiv_link, snippet)
            )

            # Commit the transaction
            conn.commit()
            print(f"Successfully inserted the record associated with the query '{user_query}' into the database. Link: {cleaned_arxiv_link}")
        except Exception as e:
            # Rollback any changes if an error occurs
            conn.rollback()
            print(f"Transaction rolled back. Error occurred: {e}")
        finally:
            if conn:
                conn.close()
    else:
        print("No valid arXiv link to insert into the database.")

In [260]:
def insert_arxiv_links_into_db(html_content, user_query, job_id):
    soup = BeautifulSoup(html_content, 'html.parser')
    links = soup.find_all('a', href=True)
    
    arxiv_links = []

    for link in links:
        href = unquote(link['href']).split(']')[0]  # Decode and preliminarily clean the URL
        if 'arxiv.org/ct?url=' in href:
            # Special handling for wrapped links
            extracted_url = extract_arxiv_url_from_wrapped_url(href)
            if extracted_url:
                # Clean and validate the extracted arXiv URL
                cleaned_link = clean_arxiv_link(extracted_url)
                if cleaned_link:
                    arxiv_links.append(cleaned_link)
                    continue  # Move to the next link after handling a wrapped URL

        # Regular processing for non-wrapped arXiv links
        if 'arxiv.org' in href and not any(x in href for x in ['/login', '/search', '/about', '/help', '/status']):
            cleaned_link = clean_arxiv_link(href)
            if cleaned_link:
                arxiv_links.append(cleaned_link)
    
    # Insert cleaned links into the database
    # print(f"Cleaned arXiv links[{len(arxiv_links)}]: {arxiv_links}")
    # print(f"Cleaned arXiv links[{len(arxiv_links)}]:")
    # for i, link in enumerate(arxiv_links, start=1):
    #     print(f"{i}. {link}")

    # Insertion into the database would go here - omitted for brevity
    if arxiv_links:
        try:
            conn = connection()
            c = conn.cursor()
            # Insert arxiv links into Papers table in bulk if there are any
            arxiv_links_data = [(link,) for link in arxiv_links]  # Prepare data for bulk insert
            psycopg2.extras.execute_batch(
                c, 
                "INSERT INTO Papers (arxiv_link) VALUES (%s) ON CONFLICT (arxiv_link) DO NOTHING",
                arxiv_links_data
            )
            
            # Insert records associated with user query in Query_Papers table in bulk
            query_papers_data = [(job_id,user_query, link) for link in arxiv_links]  # Prepare data
            psycopg2.extras.execute_batch(
                c, 
                "INSERT INTO Query_Papers (job_id, query, arxiv_link) VALUES (%s, %s, %s) ON CONFLICT (query, arxiv_link) DO NOTHING",
                query_papers_data
            )
            # Commit the transaction
            conn.commit()
            # print(f"Successfully inserted records associated with the query '{user_query}' into the database.")

        except Exception as e:
            # Rollback any changes if an error occurs
            conn.rollback()
            print(f"Transaction rolled back. Error occurred: {e}")
        if conn:
            conn.close()
# Connect to the database

# Example 
job_id = 1
user_query = "Example Query for Testing"
# Example HTML content
html_content = """
<html>
    <body>
        <p>Here are some arXiv papers that might interest you:</p>
        <a href="https://arxiv.org/abs/12457457234623434.56789">Paper 1</a>
        <a href="https://arxiv.org/abs/98724723463246234623466.54321">Paper 2</a>
        <a href="http://example.com">Non-arXiv link</a>
        <a href="https://arxiv.org/abs/11223472347234722.3344">Paper 3</a>
        <a href="https://info.arxiv.org/help/submit_latex_best_practices.html">LaTeX Best Practices</a>
        <a href="https://info.dev.arxiv.org/about/accessibility_html_error_messages.html">Accessibility Info</a>
        <a href="https://arxiv.org/abs/2112.08726v1">Versioned Paper</a>
        <a href="https://arxiv.org/pdf/2112.08726.pdf">PDF Paper</a>
        <a href="https://arxiv.org/ftp/arxiv/papers/2312/2312.00752.pdf">FTP PDF Paper</a>
        <a href="https://arxiv.org/pdf/2310.06825.pdf%5D%5D%3E">Dirty PDF Link</a>
        <a href="https://arxiv.org/abs/2308.09687v2">Versioned Paper 2</a>
        <a href="https://arxiv.org/ct?url=http://www.bibsonomy.org/BibtexHandler?requTask%3Dupload%26url%3Dhttps://arxiv.org/abs/2402.10200%26description%3DChain-of-Thought+Reasoning+Without+Prompting&v=50e87f9b">Wrapped Link 1</a>
        <a href="https://arxiv.org/ct?url=https://reddit.com/submit?url%3Dhttps://arxiv.org/abs/2402.99200%26title%3DChain-of-Thought+Reasoning+Without+Prompting&v=8392271f">Wrapped Link 2</a>
        <a href="https://arxiv.org/login">Login Page</a>
        <a href="https://arxiv.org/search/advanced">Advanced Search</a>
        <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+X">Author Search 1</a>
        <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+D">Author Search 2</a>
        <a href="https://info.arxiv.org/about">About arXiv</a>
        <a href="https://info.arxiv.org/about/donate.html">Donate to arXiv</a>
        <a href="https://info.arxiv.org/about/ourmembers.html">Our Members</a>
        <a href="https://info.arxiv.org/help">Help Page</a>
        <a href="https://info.arxiv.org/help/contact.html">Contact Page</a>
        <a href="https://status.arxiv.org">Status Page</a>
        <p>Here are some arXiv papers and other links that might interest you:</p>
        <a href="https://arxiv.org/pdf/2010.02903.pdf">Citations and Versions</a>
        <a href="https://arxiv.org/ct?url=http://www.bibsonomy.org/BibtexHandler?requTask%3Dupload%26url%3Dhttps://arxiv.org/abs/2201.11903%26description%3DChain-of-Thought+Prompting+Elicits+Reasoning+in+Large+Language+Models&v=be299b0a">Wrapped Link for Chain-of-Thought Prompting</a>
        <a href="https://arxiv.org/login">Login Page</a>
        <a href="https://arxiv.org/search/cs?searchtype=author&query=Gao,+Y">Author Search for Gao, Y</a>
        <a href="https://arxiv.org/abs/2005.11401v4">arXiv Paper Version 4</a>
        <a href="https://arxiv.org/pdf/2310.06825.pdf%5D%5D%3E">Dirty PDF Link</a>
        <a href="https://arxiv.org/ftp/arxiv/papers/2312/2312.00752.pdf">FTP PDF Paper</a>
        <a href="https://llamahub.ai/l/llama-packs/llama-index-packs-node-parser-semantic-chunking?from=all">LLamaHub AI Node Parser</a>
        <a href="https://arxiv.org/abs/2310.06147">arXiv Paper</a>
    </body>
</html>
"""
insert_arxiv_links_into_db(html_content, user_query, job_id)

google search

In [219]:
import requests
import json

# Set up your SERP API key
# It's better to use an environment variable for API keys

def search_google(query):
    params = {
        "engine": "google",
        "q": query,
        "api_key": serp_api_key,
        "location": "San Francisco Bay Area, United States",
        "google_domain": "google.com",
        "gl": "us",
        "hl": "en",
        "num": "10"
    }
    response = requests.get("https://serpapi.com/search", params=params)
    response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
    results = response.json()
    # Extracting only the needed information
    formatted_data = {
        "organic_results": [
            {
                "link": result["link"],
                "title": result["title"],
                "snippet": result.get("snippet", "")
            } for result in results.get("organic_results", [])
        ]
    }
    # Assuming search_results is your JSON dictionary obtained from the search
    organic_results = formatted_data.get('organic_results', [])

    # Initialize an empty list to store all the links
    all_links = []

    # Loop through each result in the organic results
    for result in results.get("organic_results", []):
        # Extract the link, title, and snippet if they exist and add them to the list
        link = result.get("link", "")
        title = result.get("title", "")
        snippet = result.get("snippet", "")
        if link:  # Only add to list if link is present
            all_links.append({"link": link, "title": title, "snippet": snippet})

    # Process search results to extract arXiv links and snippets
    arxiv_links_dict = {}
    for result in results.get("organic_results", []):
        link = result.get("link", "")
        snippet = result.get("snippet", "")
        cleaned_link = clean_arxiv_link(link)
        if cleaned_link:  # Only add to dict if link is an arXiv link
            arxiv_links_dict[cleaned_link] = snippet

    # Now call your function to insert arXiv links and snippets into the database
    if arxiv_links_dict:  # Only attempt to insert if we have arXiv links
        insert_arxiv_links_and_snippets_into_db(arxiv_links_dict, query)


    return all_links

# Example usage
query = "how many experts are queries routed to in deepseek moe"
# query = "Top academic papers on LLMs"
search_google(query)

Cleaned arXiv links[1]: ['https://arxiv.org/abs/2401.06066']
Successfully inserted records associated with the query 'how many experts are queries routed to in deepseek moe' into the database. Links: ['https://arxiv.org/abs/2401.06066']


[{'link': 'https://arxiv.org/html/2401.06066v1',
  'title': 'DeepSeekMoE: Towards Ultimate Expert Specialization in ...',
  'snippet': 'DeepSeekMoE has 1 shared expert and 63 routed experts, where each expert is 0.25 times the size of a standard FFN. Including DeepSeekMoE, all compared models ...'},
 {'link': 'https://arxiv.org/pdf/2401.06066',
  'title': 'DeepSeekMoE: Towards Ultimate Expert Specialization in ...',
  'snippet': 'Each MoE layer consists of 2 shared experts and 64 routed experts, where each expert is 0.25 times the size of a standard FFN. Each token ...'},
 {'link': 'https://www.linkedin.com/posts/pramodith_deepseek-mixture-of-experts-moe-proposes-activity-7152253019256991744-97Xz',
  'title': "Pramodith B.'s Post",
  'snippet': 'Most MoE models route inputs to a handful of experts i.e. 1 or 2. 🎛️. Where the outputs of each expert can be fused based on a relevance score.'},
 {'link': 'https://medium.com/@bnjmn_marie/deepseekmoe-moe-with-segmented-and-shared-experts-dedf

In [None]:
def search_google_for_specific_papers(query):
    params = {
        "engine": "google",
        "q": query,
        "api_key": serp_api_key,
        "location": "San Francisco Bay Area, United States",
        "google_domain": "google.com",
        "gl": "us",
        "hl": "en",
        "num": "10"
    }
    response = requests.get("https://serpapi.com/search", params=params)
    response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
    results = response.json()
    # Extracting only the needed information
    formatted_data = {
        "organic_results": [
            {
                "link": result["link"],
                "title": result["title"],
                "snippet": result.get("snippet", "")
            } for result in results.get("organic_results", [])
        ]
    }
    # Assuming search_results is your JSON dictionary obtained from the search
    organic_results = formatted_data.get('organic_results', [])

    # Initialize variables to store the first arXiv link and its snippet
    first_arxiv_link = None
    first_arxiv_snippet = ""

    # Loop through each result in the organic results
    for result in results.get("organic_results", []):
        link = result.get("link", "")
        snippet = result.get("snippet", "")
        cleaned_link = clean_arxiv_link(link)
        if cleaned_link:  # Check if link is an arXiv link and store the first one
            first_arxiv_link = cleaned_link
            first_arxiv_snippet = snippet
            break  # Stop searching once the first arXiv link is found

    # Insert the first arXiv link and its snippet into the database
    if first_arxiv_link:  # Only attempt to insert if an arXiv link was found
        arxiv_links_dict = {first_arxiv_link: first_arxiv_snippet}
        insert_first_arxiv_link_into_db(arxiv_links_dict, query)

    # Return the first arXiv link if found, otherwise return None
    return first_arxiv_link if first_arxiv_link else None


    return all_links

# Example usage
query = "how many experts are queries routed to in deepseek moe"
# query = "Top academic papers on LLMs"
search_google_for_specific_papers(query)

# Main logic

### Scrape the content of the page displayed in the search results

In [220]:
import requests
from bs4 import BeautifulSoup

def fetch_url_content(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, headers=headers)

    # Initialize the default response structure
    result = {
        "status": response.status_code,
        "soup": None
    }

    # Check if the request was successful
    if response.status_code == 200:
        # Adjusts encoding to match what the response seems to use
        response.encoding = response.apparent_encoding
        
        # Now using response.text to utilize the corrected encoding rather than response.content
        result['soup'] = BeautifulSoup(response.text, 'html.parser')
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

    return result

# Test the function with a URL
url = 'https://medium.com/@thedatabeast/top-10-breakthrough-research-papers-on-large-language-models-llms-in-2023-pioneering-7abfcb69da7f'
response = fetch_url_content(url)
print("Status Code:", response['status'])
print("Soup:", response['soup'])

Status Code: 200
Soup: <!DOCTYPE html>
<html lang="en"><head><title data-rh="true">Top 10 Breakthrough Research Papers on Large Language Models (LLMs) in 2023: Pioneering Developments and Practical Applications | by The Data Beast | Medium</title><meta charset="utf-8" data-rh="true"/><meta content="width=device-width,minimum-scale=1,initial-scale=1,maximum-scale=1" data-rh="true" name="viewport"/><meta content="#000000" data-rh="true" name="theme-color"/><meta content="Medium" data-rh="true" name="twitter:app:name:iphone"/><meta content="828256236" data-rh="true" name="twitter:app:id:iphone"/><meta content="Medium" data-rh="true" property="al:ios:app_name"/><meta content="828256236" data-rh="true" property="al:ios:app_store_id"/><meta content="com.medium.reader" data-rh="true" property="al:android:package"/><meta content="542599432471018" data-rh="true" property="fb:app_id"/><meta content="Medium" data-rh="true" property="og:site_name"/><meta content="article" data-rh="true" property="

### Convert resulting html into markdown

In [221]:
import html2text
# Function to convert HTML to Markdown
def html_to_markdown(html_content):
    # Create a converter object
    converter = html2text.HTML2Text()
    converter.ignore_links = False  # Set to True if you want to ignore converting links
    
    # Convert the HTML content to Markdown
    markdown = converter.handle(html_content)

    return markdown

markdown = html_to_markdown(str(response['soup']))
print(markdown)

[Open in
app](https://rsci.app.link/?%24canonical_url=https%3A%2F%2Fmedium.com%2Fp%2F7abfcb69da7f&%7Efeature=LoOpenInAppButton&%7Echannel=ShowPostUnderUser&source=---two_column_layout_nav----------------------------------)

Sign up

[Sign
in](/m/signin?operation=login&redirect=https%3A%2F%2Fmedium.com%2F%40thedatabeast%2Ftop-10-breakthrough-
research-papers-on-large-language-models-llms-
in-2023-pioneering-7abfcb69da7f&source=post_page---two_column_layout_nav
-----------------------global_nav-----------)

[](/?source=---two_column_layout_nav----------------------------------)

[Write](/m/signin?operation=register&redirect=https%3A%2F%2Fmedium.com%2Fnew-
story&source=---two_column_layout_nav-----------------------
new_post_topnav-----------)

[](/search?source=---two_column_layout_nav----------------------------------)

Sign up

[Sign
in](/m/signin?operation=login&redirect=https%3A%2F%2Fmedium.com%2F%40thedatabeast%2Ftop-10-breakthrough-
research-papers-on-large-language-models-llms-
in

### Convert markdown into a structured JSON format using function calling

we'll structure the JSON to include the page title, page summary, and details for each paragraph (title, content, and links)

In [222]:
from pydantic import BaseModel, HttpUrl
from typing import List
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.llms.openai import OpenAI
import tiktoken
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Define your Pydantic models
class Link(BaseModel):
    url: HttpUrl

class Paper(BaseModel):
    paper_title: str
    content: str
    links: List[Link] = []

class Page(BaseModel):
    title: str
    summary: str
    paragraphs: List[Paper]

# Define the OpenAI Pydantic program
def process_markdown(markdown: str, query: str):
    max_length: int = 16000  # Updated max length for token count

    # Check token length before splitting
    token_count = count_tokens(markdown)  # Implement this function
    if token_count > max_length:
        markdown_parts = split_into_parts(markdown, max_length)
    else:
        markdown_parts = [markdown]  # No need to split

    results = []
    for part in markdown_parts:
        print("Current part length (tokens):", count_tokens(part))

        # Define the OpenAI Pydantic program
        prompt_template_str = """
        Given the following markdown_content, extract only structured information about academic papers including paper title, content, and links. The papers should reflect answers to the user query {user_query}:
        {markdown_content}
        """
        program = OpenAIPydanticProgram.from_defaults(
            output_cls=Page,
            llm=OpenAI(model="gpt-3.5-turbo-1106"),
            prompt_template_str=prompt_template_str,
            allow_multiple=False,
            verbose=True,
        )

        # Run the program to get structured output
        description_str = f"Structured json of search results based on a user {query}"
        try:
            output = program(markdown_content=part, user_query=query, description=description_str)
            results.append(output)
        except Exception as e:
            # Catch all exceptions
            if hasattr(e, 'error') and 'message' in e.error:
                print(f"Error: {e.error['message']}")
            elif hasattr(e, 'args') and e.args:
                print(f"Error: {e.args[0]}")
            else:
                print(f"An unexpected error occurred: {e}")
            continue

    # Combine results from all parts or handle as needed
    combined_result = combine_page_results(results)
    return combined_result

# Function to count tokens (replace with your implementation)
def count_tokens(text: str) -> int:
    # Use your preferred tokenizer (e.g., tiktoken)
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo").encode
    return len(tokenizer(text))

# Assuming 'results' is a list of Page objects or similar structured data
def combine_page_results(results: List[Page]) -> Page:
    if not results:
        return None  # Or some default value
    
    # Start with the title and summary from the first result
    combined_title = results[0].title
    combined_summary = results[0].summary
    combined_paragraphs = []

    # Iterate through all results and combine the paragraphs
    for result in results:
        combined_paragraphs.extend(result.paragraphs)  # Assuming 'paragraphs' is a list of 'Paper' objects
    
    # Create a new combined Page object
    combined_page = Page(
        title=combined_title,
        summary=combined_summary,
        paragraphs=combined_paragraphs
    )
    return combined_page

def split_into_parts(text: str, max_length: int) -> List[str]:
    paragraphs = text.split('\n\n')
    parts = []
    current_part = ""

    for paragraph in paragraphs:
        if count_tokens(current_part) + count_tokens(paragraph) + 2 > max_length:  # +2 for the two newlines
            parts.append(current_part)
            current_part = paragraph  # Start new part with the current paragraph
        else:
            # Add paragraph to current part, include two newlines if it's not the first paragraph
            current_part += ('\n\n' + paragraph) if current_part else paragraph

    if current_part:  # Add the last part if not empty
        parts.append(current_part)
    
    return parts


result = process_markdown(markdown, query)
print(result)

Current part length (tokens): 2513
Function call: Page with args: {"title":"Top 10 Breakthrough Research Papers on Large Language Models (LLMs) in 2023: Pioneering Developments and Practical Applications","summary":"The following are the top 10 breakthrough research papers on large language models (LLMs) in 2023, along with their practical applications and details.","paragraphs":[{"paper_title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","content":"Released by Google AI Language team, BERT introduced a deep bidirectional architecture, which enhanced transfer learning demonstrated by unsupervised pre-training.","links":[{"url":"https://arxiv.org/abs/1810.04805"}]},{"paper_title":"BlenderBot 3: A deployed conversational agent that continually learns to responsibly engage","content":"From Meta AI, BlenderBot 3, with its 175 billion parameters, can scour the internet, setting it apart from other conversational bots.","links":[{"url":"https://arxiv.org

In [223]:
import json

# Assuming `output` is your object and it has a method `.dict()` to convert it to a dictionary.
# If `output` is already a dictionary, you can skip the `.dict()` conversion.
output_dict = result.dict() if hasattr(result, 'dict') else result

# Convert to JSON string with indentation for readability
pretty_output = json.dumps(output_dict, indent=4, default=str)

# Print with added line breaks
print(pretty_output)

{
    "title": "Top 10 Breakthrough Research Papers on Large Language Models (LLMs) in 2023: Pioneering Developments and Practical Applications",
    "summary": "The following are the top 10 breakthrough research papers on large language models (LLMs) in 2023, along with their practical applications and details.",
    "paragraphs": [
        {
            "paper_title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
            "content": "Released by Google AI Language team, BERT introduced a deep bidirectional architecture, which enhanced transfer learning demonstrated by unsupervised pre-training.",
            "links": [
                {
                    "url": "https://arxiv.org/abs/1810.04805"
                }
            ]
        },
        {
            "paper_title": "BlenderBot 3: A deployed conversational agent that continually learns to responsibly engage",
            "content": "From Meta AI, BlenderBot 3, with its 175 billion pa

In [224]:
def insert_papers_into_db(result, query):
    arxiv_links = []
    if result is None:
        print("No data to insert into Papers table.")
        return
    # Parse JSON data
    output_dict = result.dict() if hasattr(result, 'dict') else result
    pretty_output = json.dumps(output_dict, indent=4, default=str)
    data = json.loads(pretty_output)
    if data is None or 'paragraphs' not in data:
        print("Invalid or empty data.")
        print('Parsed website stuctured data=', data)
        return

    # Connect to SQLite database
    conn = connection()
    c = conn.cursor()

    try:
        # Start transaction
        c.execute("BEGIN;")
        # Insert data into Papers table
        for paragraph in data['paragraphs']:
            paper_title = paragraph['paper_title']
            source_content = paragraph['content']
            links = json.dumps(paragraph['links'])  # Convert list of links to JSON string
            # Initialize an empty arXiv link
            arxiv_link = None
            # Search for the arXiv link among the links
            for link in paragraph['links']:
                if 'arxiv.org' in link['url']:
                    temp_link = link['url'].replace('.pdf', '')  # Remove .pdf if present
                    # Remove any trailing file identifiers after the arXiv ID
                    temp_link = temp_link.split('/abs/')[1] if '/abs/' in temp_link else temp_link.split('/')[-1]
                    arxiv_link = 'https://arxiv.org/abs/' + temp_link  # Construct the cleaned arXiv link
                    # Add the arXiv link to the list
                    if arxiv_link not in arxiv_links:
                        arxiv_links.append(arxiv_link)
                    break  # Stop searching once the arXiv link is found
            # Check if the arxiv_link already exists in the database
            c.execute('SELECT COUNT(*) FROM Papers WHERE arxiv_link = %s', (arxiv_link,))
            if c.fetchone()[0] == 0:  # If the count is 0, then the link does not exist
                # SQL statement for inserting data
                insert_sql = '''
                INSERT INTO Papers (paper_title, source_content, links, arxiv_link) VALUES (%s, %s, %s, %s)
                '''
                c.execute(insert_sql, (paper_title, source_content, links, arxiv_link))
            else:
                print(f'Skipping insert: arXiv link already exists in the database: {arxiv_link}')

        for link in arxiv_links:
            # Insert new row into Query_Papers if it does not exist
            c.execute("INSERT INTO Query_Papers (query, arxiv_link) SELECT %s, %s WHERE NOT EXISTS (SELECT 1 FROM Query_Papers WHERE query = %s AND arxiv_link = %s)", (query, link, query, link))

        # Commit the transaction
        conn.commit()
        print(f"Processed and inserted links associated with the query '{query}' into the database.")

    except Exception as e:
        # Rollback the transaction on error
        conn.rollback()
        print(f"An error occurred: {e}. Transaction was rolled back.")

    finally:
        if conn:
            conn.close()
        pass

# Example data
query = "Example Query for Testing"
result_data = {
    "title": "Top 10 Breakthrough Research Papers on Large Language Models (LLMs) in 2023: Pioneering Developments and Practical Applications",
    "summary": "The following are the top 10 breakthrough research papers on large language models (LLMs) in 2023, along with their practical applications and details.",
    "paragraphs": [
        {"paper_title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", "content": "Released by Google AI Language team, BERT introduced a deep bidirectional architecture, which enhanced transfer learning demonstrated by unsupervised pre-training.", "links": [{"url": "https://arxiv.org/abs/1810.04805"}]},
        {"paper_title": "BlenderBot 3: A deployed conversational agent that continually learns to responsibly engage", "content": "From Meta AI, BlenderBot 3, with its 175 billion parameters, can scour the internet, setting it apart from other conversational bots.", "links": [{"url": "https://arxiv.org/abs/2208.03188"}]},
        # Add more papers as needed...
    ]
}
# result_string = json.dumps(result_data)
insert_papers_into_db(result_data, query)

Skipping insert: arXiv link already exists in the database: https://arxiv.org/abs/1810.04805
Skipping insert: arXiv link already exists in the database: https://arxiv.org/abs/2208.03188
Processed and inserted links associated with the query 'Example Query for Testing' into the database.


In [225]:
def print_papers_table():
    conn = connection()
    c = conn.cursor()

    try:
        # Start transaction (mainly useful if there are preceding data manipulations)
        c.execute("BEGIN;")

        # Query all records from Papers table
        query_sql = 'SELECT * FROM Papers'  # Add any condition if necessary
        c.execute(query_sql)

        # Fetch all rows from the query
        all_rows = c.fetchall()

        # Get the column names
        field_names = [description[0] for description in c.description]

        # Check if the table is not empty
        if all_rows:
            print("Preview of Papers Table:")
            for row_counter, row in enumerate(all_rows, start=1):
                print(f"Row {row_counter}:")
                row_with_field_names = {
                    field_name: (content[:60] + '...' if isinstance(content, str) and len(content) > 60 else content) 
                    for field_name, content in zip(field_names, row)
                }
                for field, content in row_with_field_names.items():
                    print(f"{field}: {content}")
                print("-------------")  # Separator for readability
        else:
            print("The Papers table is currently empty.")

        # Commit if there were preceding changes; otherwise, this is optional for read-only operations
        conn.commit()

    except Exception as e:
        # Rollback any changes if an exception occurs
        conn.rollback()
        print(f"An error occurred: {e}")

    finally:
        if conn:
            conn.close()
        pass

# Call the function
print_papers_table()

 67
-------------
Row 63:
paper_title: None
source_content: None
links: None
arxiv_link: https://arxiv.org/abs/2205.11822
arxiv_title: None
arxiv_abstract: None
arxiv_metadata: None
arxiv_filename: None
arxiv_paper_markdown: None
citations: 29
versions: 4
id: 54
-------------
Row 64:
paper_title: None
source_content: None
links: None
arxiv_link: https://arxiv.org/abs/2201.11990
arxiv_title: None
arxiv_abstract: None
arxiv_metadata: None
arxiv_filename: None
arxiv_paper_markdown: None
citations: 454
versions: 4
id: 83
-------------
Row 65:
paper_title: None
source_content: None
links: None
arxiv_link: https://arxiv.org/abs/2302.13971
arxiv_title: LLaMA: Open and Efficient Foundation Language Models
arxiv_abstract: We introduce LLaMA, a collection of foundation language mode...
arxiv_metadata: <?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://w...
arxiv_filename: LLaMA_Open_and_Efficient_Foundation_Language_Models.pdf
arxiv_paper_markdown: None
citations: 4450
versions: 13
id: 5

Define database function to insert scraping results

In [256]:
def insert_scraping_results(url, html, status, query, title, snippet, job_id):    
    conn = connection()
    c = conn.cursor()
    try:
        # Check if the URL already exists in the table
        c.execute('SELECT COUNT(*) FROM google_search_results WHERE url = %s', (url,))
        count = c.fetchone()[0]

        if count == 0:
            # URL does not exist, insert new row with all fields including title and snippet
            c.execute('''
                INSERT INTO google_search_results (url, html, scraping_status, query, title, snippet, job_id)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            ''', (url, html, status, query, title, snippet, job_id))
        else:
            # URL exists, skip inserting
            print("URL already exists in google_search_results. Skipping insert.")

        # Commit the transaction
        conn.commit()

    except Exception as e:
        # Rollback the transaction if an error occurs
        conn.rollback()
        print(f"An error occurred: {e}. Transaction was rolled back.")
    finally:
        if conn:
            conn.close()

# Example usage
insert_scraping_results('https://www.example.com/top-llm-research-papers-2023/', '<html lang="en-US"><head>..</html>', '200', 'example query', 'Example Title', 'This is an example snippet', '1')

URL already exists in google_search_results. Skipping insert.


define a database function to check for processed markdown

In [227]:
def check_processed_markdown(url: str) -> bool:
    conn = connection()
    """Check if the markdown for a given URL has already been processed."""
    c = conn.cursor()
    c.execute("SELECT processed_markdown FROM google_search_results WHERE url = %s", (url,))
    result = c.fetchone()
    if result and result[0]:
        # If there's processed markdown, return True
        return True
    return False
    if conn:
        conn.close()

# Example usage:
url = 'https://www.topbots.com/top-llm-research-papers-2023/'
check_processed_markdown(url) 

False

define a database function to insert processed markdown

In [228]:
def insert_processed_markdown(url: str, processed_markdown: dict):  # processed_markdown should be a dict based on usage
    try:
        conn = connection()
        c = conn.cursor()
        
        # First, check if the URL exists in the database
        c.execute('SELECT COUNT(*) FROM google_search_results WHERE url = %s', (url,))
        url_exists = c.fetchone()[0]
        
        if url_exists:
            # Convert processed_markdown to a JSON string
            processed_markdown_str = json.dumps(processed_markdown, indent=4)  # Assuming processed_markdown is always a dict based on your usage
            
            # Update the row where the URL matches, setting the processed_markdown column
            c.execute('''
                UPDATE google_search_results 
                SET processed_markdown = %s 
                WHERE url = %s;
            ''', (processed_markdown_str, url))
            conn.commit()
            print(f"Processed markdown inserted successfully for URL: {url}")
        else:
            print(f"No entry found in the database for URL: {url}. Update skipped.")
        
    except Exception as e:
        print(f"An error occurred while inserting processed markdown: {e}")
    finally:
        if conn:
            conn.close()
            
url = 'https://test.url'
processed_markdown = {
    "Function call": "Page",
    "args": {
        "title": "Top academic papers on LLMs",
        "summary": "A list of academic papers and resources related to Large Language Models (LLMs) and their applications.",
        "paragraphs": [
            {
                "paper_title": "Awesome-LLM-hallucination",
                "content": "LLM hallucination paper list.",
                "links": [
                    {"url": "https://github.com/LuckyyySTA/Awesome-LLM-hallucination"}
                ]
            },
            {
                "paper_title": "awesome-hallucination-detection",
                "content": "List of papers on hallucination detection in LLMs.",
                "links": [
                    {"url": "https://github.com/EdinburghNLP/awesome-hallucination-detection"}
                ]
            },
            {
                "paper_title": "LLMsPracticalGuide",
                "content": "A curated (still actively updated) list of practical guide resources of LLMs",
                "links": [
                    {"url": "https://github.com/Mooler0410/LLMsPracticalGuide"}
                ]
            },
            # Add other papers here in the same format
        ]
    }
}

insert_processed_markdown(url, processed_markdown)

No entry found in the database for URL: https://test.url. Update skipped.


### Get metadata from arxiv for the paper

In [229]:
import xml.etree.ElementTree as ET

def fetch_arxiv_paper_from_url(arxiv_url):
    # Extract the arXiv ID from the provided URL
    arxiv_id = arxiv_url.split('/')[-1]
    # Ensure that .pdf is not part of the arXiv ID
    arxiv_id = arxiv_id.replace('.pdf', '')  # Remove '.pdf' if it's part of the ID

    print("Fetching information for arXiv ID:", arxiv_id)

    # Define the base URL for the arXiv API
    base_url = 'http://export.arxiv.org/api/query?'
    query_params = 'id_list={}&max_results=1'.format(arxiv_id)
    final_url = base_url + query_params  # Construct the final URL
    print("Final API Request URL:", final_url)  # Debug: print the URL to be requested

    # Make the request
    response = requests.get(final_url)

    # Check if the request was successful
    if response.status_code == 200:
        print("Raw XML response received")
        xml_data = response.text
        root = ET.fromstring(xml_data)
        ns = {'atom': 'http://www.w3.org/2005/Atom'}  # Namespace for parsing

        # Extract paper details
        link_element = root.find('.//atom:entry/atom:link[@rel="related"]', ns)
        if link_element is not None:
            pdf_url = link_element.attrib['href']
        else:
            pdf_url = None
        title = root.find('.//atom:entry/atom:title', ns).text.strip()
        abstract = root.find('.//atom:entry/atom:summary', ns).text.strip()
        published_date = root.find('.//atom:entry/atom:published', ns).text.strip()

        # Extract authors
        authors = [author.find('atom:name', ns).text for author in root.findall('.//atom:entry/atom:author', ns)]

        # Generate a sanitized file name from the title
        file_name = title.replace(':', '').replace(' ', '_') + '.pdf'

        # Print extracted information for debugging
        print(f"PDF URL: {pdf_url}")
        print(f"Title: {title}")
        print(f"File Name: {file_name}")
        print(f"Abstract: {abstract[:100]}..." if len(abstract) > 100 else abstract)
        print(f"Published Date: {published_date}")
        print(f"Authors: {', '.join(authors)}")

        # Return the collected information
        return xml_data, pdf_url, title, file_name, abstract, published_date, authors
    else:
        print("Failed to fetch data from arXiv API. Status code:", response.status_code)
        return None, None, None, None, None, None, None

# Example usage
arxiv_url = 'https://arxiv.org/abs/2302.13971'
fetch_arxiv_paper_from_url(arxiv_url)

Fetching information for arXiv ID: 2302.13971
Final API Request URL: http://export.arxiv.org/api/query?id_list=2302.13971&max_results=1
Raw XML response received
PDF URL: http://arxiv.org/pdf/2302.13971v1
Title: LLaMA: Open and Efficient Foundation Language Models
File Name: LLaMA_Open_and_Efficient_Foundation_Language_Models.pdf
Abstract: We introduce LLaMA, a collection of foundation language models ranging from
7B to 65B parameters. We...
Published Date: 2023-02-27T17:11:15Z
Authors: Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample


('<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3D%26id_list%3D2302.13971%26start%3D0%26max_results%3D1" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=&amp;id_list=2302.13971&amp;start=0&amp;max_results=1</title>\n  <id>http://arxiv.org/api/qJuhZNxbRqWajNrNkNtkRSmyBuQ</id>\n  <updated>2024-03-14T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/2302.13971v1</id>\n    <updated>2023-02-27T17:11:15Z</updated>\n    <published>2023-02-27T17:11:15Z</published>\n    <title>LLaMA: Open and Efficient Foundation Language 

### Download pdf of the paper

In [230]:
def download_pdf(pdf_url, file_name):
    # Create the "papers" directory if it doesn't exist
    papers_dir = "papers"
    if not os.path.exists(papers_dir):
        os.makedirs(papers_dir)

    # Construct the full file path
    file_path = os.path.join(papers_dir, file_name)

    # Check if the file already exists
    if os.path.exists(file_path):
        print("The paper already exists.")
        return file_path  # Return the file path

    # Send a GET request to download the PDF
    response = requests.get(pdf_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Write the PDF content to the file
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print("The paper has been downloaded successfully.")
        return file_path  # Return the file path
    else:
        # Return a status error message
        error_message = f"Failed to download the paper. Status code: {response.status_code}"
        return error_message

# Example usage
download_pdf('http://arxiv.org/pdf/2302.13971v1', 'LLaMA_Open_and_Efficient_Foundation_Language_Models.pdf')



The paper already exists.


'papers/LLaMA_Open_and_Efficient_Foundation_Language_Models.pdf'

### Convert pdf into markdown

In [231]:
import nest_asyncio
from llama_parse import LlamaParse

# This function will convert a given PDF file to Markdown format using LlamaParse
def convert_pdf_to_markdown(file_name):
    # Necessary for running async code in notebooks or scripts
    nest_asyncio.apply()

    # Initialize the LlamaParse parser
    parser = LlamaParse(
        api_key=llamaindex_api_key,
        result_type="markdown",  # Choose "markdown" as the output format
        verbose=True,  # Enable verbose output to see detailed logs
    )
    
    # Define the path to your PDF file
    pdf_file_path = os.path.join("./papers/", file_name)
    print(pdf_file_path, "type:", type(pdf_file_path))
    # Convert the PDF to Markdown
    # This is a synchronous call, you can also use asynchronous calls as shown in the documentation
    documents = parser.load_data(pdf_file_path)

    # Return the converted documents
    return documents

# Define the path to your PDF file
file_name = "Retrieval-Augmented_Generation_for_Knowledge-Intensive_NLP_Tasks.pdf"
documents = convert_pdf_to_markdown(file_name)

./papers/Retrieval-Augmented_Generation_for_Knowledge-Intensive_NLP_Tasks.pdf type: <class 'str'>
Started parsing the file under job_id 9fafeba8-b5e0-49ca-9bc2-adc90b1c5c76
....

In [232]:
markdown_content = None
if documents:
    # Assuming the first document contains the content
    # Use the get_text() method to retrieve the Markdown content
    markdown_content = documents[0].get_text()
    print(markdown_content)

    # Optionally, write the markdown content to a file
    with open('converted_markdown.md', 'w', encoding='utf-8') as markdown_file:
        markdown_file.write(markdown_content)

## Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks

|Authors|Patrick Lewis†‡, Ethan Perez⋆, Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†, Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†|
|---|---|
|Affiliations|†Facebook AI Research; ‡University College London; ⋆New York University;|
|Email|plewis@fb.com|

Abstract

Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge remain open research problems. Pre-trained models with a differentiable access mechanism to explicit non-parametric memory have so far been

### Get citations and number of versions from Google Scholar

In [233]:
from serpapi import GoogleSearch

def get_scholar_citations_versions(query_url):
    params = {
        "api_key": serp_api_key,  # Ensure serp_api_key is defined elsewhere
        "engine": "google_scholar",
        "q": query_url,
        "hl": "en"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    # Initialize the return values
    number_of_citations = None
    number_of_versions = None

    # Extracting number of citations and versions
    if 'organic_results' in results:
        if 'inline_links' in results['organic_results'][0]:
            if 'cited_by' in results['organic_results'][0]['inline_links']:
                number_of_citations = results["organic_results"][0]["inline_links"]["cited_by"]["total"]

            if 'versions' in results['organic_results'][0]['inline_links']:
                number_of_versions = results["organic_results"][0]["inline_links"]["versions"]["total"]

    return number_of_citations, number_of_versions

query_url = 'https://arxiv.org/abs/2302.13971'
citations, versions = get_scholar_citations_versions(query_url)
print("Number of citations:", citations)
print("Number of versions:", versions)

Number of citations: 4477
Number of versions: 13


### Gemini summary and relevance score

Gemini Set up

In [234]:
import google.generativeai as genai
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel('gemini-1.0-pro')

In [235]:
model

genai.GenerativeModel(
    model_name='models/gemini-1.0-pro',
    generation_config={},
    safety_settings={},
    tools=None,
)

Given `arxiv` structure, summarize and evaluate against user prompt. Give a heuritic score.

In [236]:
import re
import json

def process_arxiv(mkdn, metdata, query):
  # 1 - `arxiv` dict
  def extract_markdown(markdown_text, pattern):
    # Use re.findall to find all matches of the pattern in the markdown text
    matches = re.findall(pattern, markdown_text, re.MULTILINE)

    # Return the first match (if any)
    if matches:
        return matches[0]
    else:
        return None

  paper_title = extract_markdown(mkdn, r'^##\s+(.*)$')
  if paper_title is None:
    print("extract_markdown for paper_title isn't working")

  abstract = extract_markdown(mkdn, r'^Abstract(.*)#')
  if abstract is None:
    print("extract_markdown for abstract isn't working...hardcoding the abstract instead")
    abstract = '''We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.'''

  arxiv = {'paper_title': paper_title, 'abstract': abstract, 'metadata': metadata, 'paper': mkdn}

  # 2 - Summarizer
  prompt = '''Please summarize the following paper in one sentence given the user query "{query}". The paper is provided in a structured format {paper_format} \n\nDocument: {document}'''.format(query=query, document=arxiv, paper_format={key: "" for key in arxiv.keys()})
  print(prompt, "\nGenerating summarization............")

  if model.count_tokens(prompt).total_tokens > 28_000:
    print("The prompt is too long, visiting https://aistudio.google.com/app/prompts/new_freeform to manually use Gemini 1.5 pro instead with the prompt above.")
  relevant_answer = model.generate_content(prompt).text

  print(relevant_answer)

  # 3 - Relevance scorer
  prompt = '''From a scale of 1 to 5, rate how relevant the following paper is with the user query "{query}". The paper is provided in a structured format {paper_format}. Please provide the score in the format of a json object with one key, 'score'. Example: {{"score": 5}}. Also please provide reasoning why it doesn't have a higher or lower relevance score. \n\nDocument: {document}'''.format(query=query, document=arxiv, paper_format={key: "" for key in arxiv.keys()})
  print(prompt, "\nGenerating............")

  if model.count_tokens(prompt).total_tokens > 28_000:
    print("The prompt is too long, visiting https://aistudio.google.com/app/prompts/new_freeform to manually use Gemini 1.5 pro instead with the prompt above.")

  model_response = model.generate_content(prompt).text

  re_match = re.search(r'"score": (\d+)', model_response)
  relevance_score = re_match.group(1)

  print("relevance score: " + relevance_score)

  return {
      'relevance_score': relevance_score,
      'relevant_answer': relevant_answer
  }

  query

  #@title `mkdn` and `metadata`
metadata = markdown_content #right now it's just the entire paper pdf

#@title Extractors to process `mkdn` and `metadata` into `arxiv` dict

import re

def extract_markdown(markdown_text, pattern):
  # Use re.findall to find all matches of the pattern in the markdown text
  matches = re.findall(pattern, markdown_text, re.MULTILINE)

  # Return the first match (if any)
  if matches:
      return matches[0]
  else:
      return None

paper_title = extract_markdown(metadata, r'^##\s+(.*)$')
if paper_title is None:
  print("extract_markdown for paper_title isn't working")

abstract = extract_markdown(metadata, r'^Abstract(.*)#')
if abstract is None:
  print("extract_markdown for abstract isn't working...hardcoding the abstract instead")
  abstract = '''We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.'''

arxiv = {'paper_title': paper_title, 'abstract': abstract, 'metadata': metadata, 'paper': metadata}



import json

# Convert to JSON string with indentation for readability
pretty_arxiv_output = json.dumps(arxiv, indent=4, default=str)

# Print with added line breaks
print("\narxiv=",)
print(pretty_arxiv_output)

extract_markdown for abstract isn't working...hardcoding the abstract instead

arxiv=
{
    "paper_title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    "abstract": "We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.",
    "metadata": "## Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks\n\n|Authors|Patrick Lewis\u2020\u2021, Ethan Perez\u22c6, Aleksandra Piktus\u2020, Fabio Petroni\u2020, Vladimir Karpukhin\u2020, Naman Goyal\u2020, Heinrich K\u00fcttler\u2020, Mike Lewis\u2020, Wen-tau Yi

### Perplexity logic

query answer

In [237]:
from openai import OpenAI
import textwrap

async def query_perplexity_response(job_id, user_query):
    # Message configuration for Perplexity
    messages = [
        {
            "role": "system",
            "content": (
                "You are an artificial intelligence assistant and you need to "
                "engage in a helpful, detailed, polite conversation with a user."
            ),
        },
        {
            "role": "user",
            "content": user_query
        },
    ]

    client = OpenAI(api_key=perplexity_api_key, base_url="https://api.perplexity.ai")

    # chat completion without streaming
    response = client.chat.completions.create(
        model="pplx-70b-online",
        messages=messages,
    )
    answer = response.choices[0].message.content 
    # Establish a new database connection
    async with aiopg.create_pool(DSN) as pool:
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                # Update the jobs table, setting perplexity_response
                await cur.execute(
                    "UPDATE jobs SET perplexity_response = %s WHERE job_id = %s",
                    (answer, job_id)  # Store the response, wrapped to 75 characters
                )

    return textwrap.fill(answer, width=75)  # Return the formatted answer

# Example usage
job_id = 3
user_query = "Top academic papers on LLMs"
asyncio.run(query_perplexity_response(job_id, user_query))

"As per your request, I'll provide a summary of top academic papers on Large\nLanguage Models (LLMs) based on the resources provided. These papers cover\na range of topics, from prompt engineering to the analysis of various LLM\narchitectures and their applications.  ### Latest Papers (as of May 2023):\nFrom the PromptingGuide.ai resource (), here are some notable papers:  1.\nDemonstration-Retrieved In-context Learning 2. Probing in Context: Toward\nBuilding Robust Classifiers via Probing Large Language Models 3. Skill-\nBased Few-Shot Selection for In-Context Learning 4. Exploring Chain-of-\nThought Style Prompting for Text-to-SQL 5. On Learning to Summarize with\nLarge Language Models as References 6. Element-aware Summarization with\nLarge Language Models: Expert-aligned Evaluation and Chain-of-Thought\nMethod 7. Small Language Models Improve Giants by Rewriting Their Outputs\n8. Prompting and Evaluating Large Language Models for Proactive Dialogues:\nClarification, Target-guided, 

### GPT logic

answer the user query

In [238]:
import openai
import aiopg
from openai import AsyncOpenAI

async def query_answer_with_gpt(job_id, user_query):

    # Constructing the prompt
    prompt = (
        f"Please answer the user query if you were a best in class expert on this subject. Keep answer short to 1 sentence. Include factual info and links. Query: {user_query}"
    )

    # Setting up OpenAI client
    client = AsyncOpenAI(api_key=openai_api_key) # Replace 'your_openai_api_key' with your actual OpenAI API key

    # Making an asynchronous API call
    response = await client.chat.completions.create(
        messages=[
            {"role": "system", "content": prompt}
        ],
        model="gpt-4-turbo-preview"  # You can switch to other models if needed
    )
    answer = response.choices[0].message.content  # Extracting the response   answer = response.choices[0].message.content  # Extracting the response

    async with aiopg.create_pool(DSN) as pool:
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                await cur.execute(
                    "UPDATE jobs SET gpt_response = %s WHERE job_id = %s",
                    (answer, job_id)  # Store the original query if sub_queries is None
                )

    return answer  # You might still return the answer for logging or other purposes

# Example
job_id = 3
user_query = "Top academic papers on LLMs" 
asyncio.run(query_answer_with_gpt(job_id, user_query))

'As of my last update in April 2023, "Attention Is All You Need" by Ashish Vaswani et al., which introduced the Transformer model foundational for current large language models (LLMs), is among the top academic papers in the field. You can access it here: https://arxiv.org/abs/1706.03762.'

generate keyword queries

In [239]:
import openai
import aiopg
from openai import AsyncOpenAI

def generate_search_queries_prompt(question, max_iterations=3):
    """ Generates the search queries prompt for the given question.
    Args: question (str): The question to generate the search queries prompt for
    Returns: str: The search queries prompt for the given question
    """

    return f'Write {max_iterations} google search queries to search online that form an objective opinion from the following: "{question}"' \
           f'You must respond in a json format: ["query 1", "query 2", "query 3"].'


async def get_sub_queries(job_id, user_query):
    client = AsyncOpenAI(api_key=openai_api_key) 
    attempts = 0
    max_attempts = 3
    sub_queries = None

    while attempts < max_attempts:
        try:
            response = await client.chat.completions.create(
                model="gpt-4-turbo-preview",
                messages=[
                    {"role": "system", "content": "You are an experienced academic research analyst."},
                    {"role": "user", "content": generate_search_queries_prompt(user_query, max_iterations=3)}
                ],
                response_format={ "type": "json_object" },
                temperature=0,
            )
            answer = response.choices[0].message.content
            parsed_response = json.loads(answer)  # This attempts to parse the JSON
            sub_queries = list(parsed_response.values())
            break  # Exit loop if parsing is successful
        except json.JSONDecodeError:  # If JSON is invalid
            print("Attempt: ", attempts, ". Invalid JSON: ", json.JSONDecodeError)
            attempts += 1  # Increment attempts and try again

    # If parsing successful or max attempts reached, update the database
    async with aiopg.create_pool(DSN) as pool:
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                # Update the jobs table, set keyword_search_queries
                await cur.execute(
                    "UPDATE jobs SET keyword_search_queries = %s WHERE job_id = %s",
                    (json.dumps(sub_queries or [user_query]), job_id)  # Store the original query if sub_queries is None
                )

    return sub_queries or [query]  # Return the sub queries or the original query if failed

# Example
job_id = 3
user_query = "Top academic papers on LLMs" 
asyncio.run(get_sub_queries(job_id, user_query))

['most cited academic papers on LLMs',
 'peer-reviewed research on large language models',
 'high impact publications on LLMs']

analyze gpt and perplexity response to generate more google queries

In [240]:
import openai
import aiopg
from openai import AsyncOpenAI

async def queries_based_on_LLM_responses(job_id, GPT, Perplexity):
    client = AsyncOpenAI(api_key=openai_api_key) 
    attempts = 0
    max_attempts = 3
    sub_queries = None
    parsing_successful = False

    while attempts < max_attempts:
        try:
            response = await client.chat.completions.create(
                model="gpt-4-turbo-preview",
                messages=[
                    {"role": "system", "content": "Extract titles of academic papers from the following responses. You must respond in a json format: ['title 1', 'title 2', 'title 3']"},
                    {"role": "assistant", "content": f'GPT response: "{GPT}"\nPerplexity response: "{Perplexity}"'}
                ],
                response_format={ "type": "json_object" },
                temperature=0,
            )
            answer = response.choices[0].message.content
            parsed_response = json.loads(answer)  # This attempts to parse the JSON
            sub_queries = parsed_response.get('titles', [])
            parsing_successful = True
            break  # Exit loop if parsing is successful
        except json.JSONDecodeError:  # If JSON is invalid
            print("Attempt: ", attempts, ". Invalid JSON: ", json.JSONDecodeError)
            print("Answer: ", answer)
            attempts += 1  # Increment attempts and try again

    if parsing_successful:
        # Only update the database if parsing was successful
        async with aiopg.create_pool(DSN) as pool:
            async with pool.acquire() as conn:
                async with conn.cursor() as cur:
                    await cur.execute(
                        "UPDATE jobs SET paper_search_queries = %s WHERE job_id = %s",
                        (json.dumps(sub_queries), job_id)  # Only update with sub_queries if parsing was successful
                    )
        return sub_queries  # Return the successfully parsed sub_queries
    else:
        return "Error: Unable to parse the JSON response correctly after multiple attempts."

# Example
job_id = 3
GPT = 'Identifying the "top" academic papers on Large Language Models (LLMs) is subjective, but one of the most foundational and widely cited in this area is "Attention is All You Need" by Vaswani et al., which introduced the Transformer architecture, the backbone of many current LLMs (https://arxiv.org/abs/1706.03762).' 
Perplexity = 'Based on the search results provided, I\'ll share a selection of top\nacademic papers on LLMs that would be particularly relevant for your\ninterest in handling research papers, summarizing, and citing research\npapers when possible.  1. **Sparks of Artificial General Intelligence:\nEarly experiments with GPT-4** – This paper from Microsoft Research\ndiscusses the capabilities and limits of GPT-4, a large language model. It\nexplores the potential for GPT-4 to generate new science and its ability to\ncite research papers when possible. The paper is mentioned in the OpenAI\nDeveloper Forum as a must-read for understanding GPT/LLM capabilities and\nuse.  2. **LLAMA: Open and Efficient Foundation Language Models** – This\nresearch paper introduces LLaMA, a collection of foundational language\nmodels by Meta AI, ranging from 7B to 65B parameters. The models were\ntrained on publicly available datasets without relying on proprietary or\nrestricted data. This paper is mentioned as a key contribution to the field\nby TopBots.  3. **Tree of Thoughts: Deliberate Problem Solving with Large\nLanguage Models** – This paper from Princeton University and Google\nDeepMind presents the "Tree of Thoughts" approach, which enables LLMs to\nmake deliberate decisions by considering multiple reasoning paths, self-\nevaluating choices, and making global decisions by looking ahead or\nbacktracking when needed. The approach is demonstrated to be effective on\nchallenging tasks like Game of 24, Creative Writing, and Crosswords.  4.\n**Prompting and Evaluating Large Language Models for Proactive Dialogues**\n– This paper, listed on the Prompt Engineering Guide, discusses prompting\nand evaluating LLMs for proactive dialogues, including clarification,\ntarget-guided, and non-collaborative scenarios. It covers recent advances\nin prompt engineering, which is crucial for effective interaction with LLMs\nlike ChatGPT or similar systems.  5. **A Comprehensive Overview of Large\nLanguage Models** – This paper is a comprehensive overview of LLMs,\ndiscussing architectures, training pipelines, and utilization in different\naspects, including a focus on LLMs for handling scientific literature and\nsummarization.  I hope this selection of papers provides valuable insights\ninto LLM capabilities, prompt engineering, and their potential to assist in\nresearch paper handling, summarization, and citations. Let me know if you\nneed further clarification or have additional questions!'
asyncio.run(queries_based_on_LLM_responses(job_id, GPT, Perplexity))

['Sparks of Artificial General Intelligence: Early experiments with GPT-4',
 'LLAMA: Open and Efficient Foundation Language Models',
 'Tree of Thoughts: Deliberate Problem Solving with Large Language Models',
 'Prompting and Evaluating Large Language Models for Proactive Dialogues',
 'A Comprehensive Overview of Large Language Models']

summary

In [241]:
import openai
import aiopg
from openai import AsyncOpenAI

async def query_info_with_gpt(paper_id, arxiv_paper_markdown, arxiv_metadata, user_query):

    MAX_CONTEXT_LENGTH = 15500 

    # Initial context setup and trimming
    context = f"Context: {arxiv_metadata}\n{arxiv_paper_markdown}"
    while count_tokens(context) > MAX_CONTEXT_LENGTH:
        char_to_token_ratio = len(context) / count_tokens(context)
        max_char_length = int(MAX_CONTEXT_LENGTH * char_to_token_ratio)
        context = context[:max_char_length]

    # Constructing the prompt
    prompt = (
        f"Summarize this in 100 characters based on the user query {user_query}: {context}"
    )

    # Setting up OpenAI client
    client = AsyncOpenAI(api_key=openai_api_key) # Replace 'your_openai_api_key' with your actual OpenAI API key

    # Making an asynchronous API call
    response = await client.chat.completions.create(
        messages=[
            {"role": "system", "content": prompt}
        ],
        model="gpt-4-turbo-preview"  # You can switch to other models if needed
    )
    answer = response.choices[0].message.content  # Extracting the response   answer = response.choices[0].message.content  # Extracting the response

    # Establish a new database connection
    async with aiopg.create_pool(DSN) as pool:
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                # Update the database with the answer
                await cur.execute(
                    "UPDATE Query_Papers SET relevant_answer = %s WHERE id = %s",
                    (answer, paper_id)
                )

    return answer  # You might still return the answer for logging or other purposes


# Example
user_query = "Top academic papers on LLMs" 
paper_id = 801
arxiv_paper_markdown = "We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.",
arxiv_metadata = "## Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks\n\nPatrick Lewis\u2020\u2021, Ethan Perez\u22c6, Aleksandra Piktus\u2020, Fabio Petroni\u2020, Vladimir Karpukhin\u2020, Naman Goyal\u2020, Heinrich K\u00fcttler\u2020\n\narXiv:2005.11401v4 [cs.CL] 12 Apr 2021\n\nMike Lewis\u2020, Wen-tau Yih\u2020, Tim Rockt\u00e4schel\u2020\u2021, Sebastian Riedel\u2020\u2021, Douwe Kiela\u2020\n\n\u2020Facebook AI Research; \u2021University College London; \u22c6New York University;\n\nplewis@fb.com\n\n### Abstract\n\nLarge pre-trained language models have been shown to store factual knowledge in their parameters, and achieve state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge remain open research problems. Pre-trained models with a differentiable access mechanism to explicit non-parametric memory have so far been only investigated for extractive downstream tasks. We explore a general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) \u2014 models which combine pre-trained parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a pre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passages across the whole generated sequence, and another which can use different passages per token. We fine-tune and evaluate our models on a wide range of knowledge-intensive NLP tasks and set the state of the art on three open domain QA tasks, outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generation tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art parametric-only seq2seq baseline.\n\n### Introduction\n\nPre-trained neural language models have been shown to learn a substantial amount of in-depth knowledge from data [47]. They can do so without any access to an external memory, as a parameterized implicit knowledge base [51, 52]. While this development is exciting, such models do have downsides: They cannot easily expand or revise their memory, can\u2019t straightforwardly provide insight into their predictions, and may produce \u201challucinations\u201d [38]. Hybrid models that combine parametric memory with non-parametric (i.e., retrieval-based) memories [20, 26, 48] can address some of these issues because knowledge can be directly revised and expanded, and accessed knowledge can be inspected and interpreted. REALM [20] and ORQA [31], two recently introduced models that combine masked language models [8] with a differentiable retriever, have shown promising results.\n---\n## Define \"middle ear\"(x)\n\nThe middle ear includes the tympanic cavity and the three ossicles.\n\n## Question Answering:\n\n|Question Query|Query|Retriever p|\u03b7|Document|Generator p|\u03b8|\n|---|---|---|---|---|---|---|\n|Barack Obama was born in Hawaii.(x)|Encoder|(Non-Parametric)| |Index|(Parametric)|Answer Generation supports (y)|\n\n## Fact Verification: Fact Query\n\nThe Divine Comedy (x)\n\n## Jeopardy Question Generation:\n\nAnswer Query\n\nFigure 1: Overview of our approach. We combine a pre-trained retriever (Query Encoder + Document Index) with a pre-trained seq2seq model (Generator) and fine-tune end-to-end. For query x, we use Maximum Inner Product Search (MIPS) to find the top-K documents zi. For final prediction y, we treat z as a latent variable and marginalize over seq2seq predictions given different documents. but have only explored open-domain extractive question answering. Here, we bring hybrid parametric and non-parametric memory to the \u201cworkhorse of NLP,\u201d i.e. sequence-to-sequence (seq2seq) models. We endow pre-trained, parametric-memory generation models with a non-parametric memory through a general-purpose fine-tuning approach which we refer to as retrieval-augmented generation (RAG). We build RAG models where the parametric memory is a pre-trained seq2seq transformer, and the non-parametric memory is a dense vector index of Wikipedia, accessed with a pre-trained neural retriever. We combine these components in a probabilistic model trained end-to-end (Fig. 1). The retriever (Dense Passage Retriever [26], henceforth DPR) provides latent documents conditioned on the input, and the seq2seq model (BART [32]) then conditions on these latent documents together with the input to generate the output. We marginalize the latent documents with a top-K approximation, either on a per-output basis (assuming the same document is responsible for all tokens) or a per-token basis (where different documents are responsible for different tokens). Like T5 [51] or BART, RAG can be fine-tuned on any seq2seq task, whereby both the generator and retriever are jointly learned. There has been extensive previous work proposing architectures to enrich systems with non-parametric memory which are trained from scratch for specific tasks, e.g. memory networks [64, 55], stack-augmented networks [25] and memory layers [30]. In contrast, we explore a setting where both parametric and non-parametric memory components are pre-trained and pre-loaded with extensive knowledge. Crucially, by using pre-trained access mechanisms, the ability to access knowledge is present without additional training. Our results highlight the benefits of combining parametric and non-parametric memory with generation for knowledge-intensive tasks\u2014tasks that humans could not reasonably be expected to perform without access to an external knowledge source. Our RAG models achieve state-of-the-art results on open Natural Questions [29], WebQuestions [3] and CuratedTrec [2] and strongly outperform recent approaches that use specialised pre-training objectives on TriviaQA [24]. Despite these being extractive tasks, we find that unconstrained generation outperforms previous extractive approaches. For knowledge-intensive generation, we experiment with MS-MARCO [1] and Jeopardy question generation, and we find that our models generate responses that are more factual, specific, and diverse than a BART baseline. For FEVER [56] fact verification, we achieve results within 4.3% of state-of-the-art pipeline models which use strong retrieval supervision. Finally, we demonstrate that the non-parametric memory can be replaced to update the models\u2019 knowledge as the world changes.\n\n## Methods\n\nWe explore RAG models, which use the input sequence x to retrieve text documents z and use them as additional context when generating the target sequence y. As shown in Figure 1, our models leverage two components: (i) a retriever p\u03b7(z|x) with parameters \u03b7 that returns (top-K truncated) distributions over text passages given a query x and (ii) a generator p\u03b8(yi|x, z, y1:i\u22121) parametrized\n\n1 Code to run experiments with RAG has been open-sourced as part of the HuggingFace Transformers Library [66] and can be found at https://github.com/huggingface/transformers/blob/master/ examples/rag/. An interactive demo of RAG models can be found at https://huggingface.co/rag/\n---\n## by \u03b8 that generates a current token based on a context of the previous i \u2212 1 tokens y1:i\u22121, the original input x and a retrieved passage z.\n\nTo train the retriever and generator end-to-end, we treat the retrieved document as a latent variable. We propose two models that marginalize over the latent documents in different ways to produce a distribution over generated text. In one approach, RAG-Sequence, the model uses the same document to predict each target token. The second approach, RAG-Token, can predict each target token based on a different document. In the following, we formally introduce both models and then describe the p\u03b7 and p\u03b8 components, as well as the training and decoding procedure.\n\n### Models\n\n|RAG-Sequence Model|The RAG-Sequence model uses the same retrieved document to generate the complete sequence. Technically, it treats the retrieved document as a single latent variable that is marginalized to get the seq2seq probability p(y|x) via a top-K approximation. Concretely, the top K documents are retrieved using the retriever, and the generator produces the output sequence probability for each document, which are then marginalized,|\n|---|---|\n| |pRAG-Sequence(y|x) \u2248 p\u03b7(z|x)p\u03b8(y|x, z) = p\u03b7(z|x) \u03a3 p\u03b8(yi|x, z, y1:i\u22121) z\u2208top-k(p(\u00b7|x)) z\u2208top-k(p(\u00b7|x)) i|\n|RAG-Token Model|In the RAG-Token model we can draw a different latent document for each target token and marginalize accordingly. This allows the generator to choose content from several documents when producing an answer. Concretely, the top K documents are retrieved using the retriever, and then the generator produces a distribution for the next output token for each document, before marginalizing, and repeating the process with the following output token, Formally, we define:|\n| |pRAG-Token(y|x) \u2248 \u03a3 z\u2208top-k(p(\u00b7|x)) p\u03b7(z|x)p\u03b8(yi|x, z, y1:i\u22121)|\n\nFinally, we note that RAG can be used for sequence classification tasks by considering the target class as a target sequence of length one, in which case RAG-Sequence and RAG-Token are equivalent.\n\n### Retriever: DPR\n\nThe retrieval component p\u03b7(z|x) is based on DPR [26]. DPR follows a bi-encoder architecture: p\u03b7(z|x) \u221d exp d(z)\u22a4q(x) d(z) = BERTd(z), q(x) = BERTq(x) where d(z) is a dense representation of a document produced by a BERTBASE document encoder [8], and q(x) a query representation produced by a query encoder, also based on BERTBASE. Calculating top-k(p\u03b7(\u00b7|x)), the list of k documents z with highest prior probability p\u03b7(z|x), is a Maximum Inner Product Search (MIPS) problem, which can be approximately solved in sub-linear time [23]. We use a pre-trained bi-encoder from DPR to initialize our retriever and to build the document index. This retriever was trained to retrieve documents which contain answers to TriviaQA [24] questions and Natural Questions [29]. We refer to the document index as the non-parametric memory.\n\n### Generator: BART\n\nThe generator component p\u03b8(yi|x, z, y1:i\u22121) could be modelled using any encoder-decoder. We use BART-large [32], a pre-trained seq2seq transformer [58] with 400M parameters. To combine the input x with the retrieved content z when generating from BART, we simply concatenate them. BART was pre-trained using a denoising objective and a variety of different noising functions. It has obtained state-of-the-art results on a diverse set of generation tasks and outperforms comparably-sized T5 models [32]. We refer to the BART generator parameters \u03b8 as the parametric memory henceforth.\n\n### Training\n\nWe jointly train the retriever and generator components without any direct supervision on what document should be retrieved. Given a fine-tuning training corpus of input/output pairs (xj, yj), we\n---\nminimize the negative marginal log-likelihood of each target, j \u2212log p(yj|xj) using stochastic gradient descent with Adam [28]. Updating the document encoder BERTd during training is costly as it requires the document index to be periodically updated as REALM does during pre-training [20]. We do not find this step necessary for strong performance, and keep the document encoder (and index) fixed, only fine-tuning the query encoder BERTq and the BART generator.\n\n## Decoding\n\nAt test time, RAG-Sequence and RAG-Token require different ways to approximate arg max y p(y|x).\n\n|RAG-Token|The RAG-Token model can be seen as a standard, autoregressive seq2seq generator with transition probability: p\u2032 \u03b8(yi|x, y1:i\u22121) = z\u2208top-k(p(\u00b7|x)) p\u03b7(zi|x)p\u03b8(yi|x, zi, y1:i\u22121) To decode, we can plug p\u2032 \u03b8(yi|x, y1:i\u22121) into a standard beam decoder.|\n|---|---|\n|RAG-Sequence|For RAG-Sequence, the likelihood p(y|x) does not break into a conventional per-token likelihood, hence we cannot solve it with a single beam search. Instead, we run beam search for each document z, scoring each hypothesis using p\u03b8(yi|x, z, y1:i\u22121). This yields a set of hypotheses Y, some of which may not have appeared in the beams of all documents. To estimate the probability of a hypothesis y we run an additional forward pass for each document z for which y does not appear in the beam, multiply generator probability with p\u03b7(z|x) and then sum the probabilities across beams for the marginals. We refer to this decoding procedure as \u201cThorough Decoding.\u201d For longer output sequences, |Y| can become large, requiring many forward passes. For more efficient decoding, we can make a further approximation that p\u03b8(y|x, zi) \u2248 0 where y was not generated during beam search from x, zi. This avoids the need to run additional forward passes once the candidate set Y has been generated. We refer to this decoding procedure as \u201cFast Decoding.\u201d|\n\n## Experiments\n\nWe experiment with RAG in a wide range of knowledge-intensive tasks. For all experiments, we use a single Wikipedia dump for our non-parametric knowledge source. Following Lee et al. [31] and Karpukhin et al. [26], we use the December 2018 dump. Each Wikipedia article is split into disjoint 100-word chunks, to make a total of 21M documents. We use the document encoder to compute an embedding for each document, and build a single MIPS index using FAISS [23] with a Hierarchical Navigable Small World approximation for fast retrieval [37]. During training, we retrieve the top k documents for each query. We consider k \u2208 {5, 10} for training and set k for test time using dev data. We now discuss experimental details for each task.\n\n### Open-domain Question Answering\n\nOpen-domain question answering (QA) is an important real-world application and common testbed for knowledge-intensive tasks [20]. We treat questions and answers as input-output text pairs (x, y) and train RAG by directly minimizing the negative log-likelihood of answers. We compare RAG to the popular extractive QA paradigm [5, 7, 31, 26], where answers are extracted spans from retrieved documents, relying primarily on non-parametric knowledge. We also compare to \u201cClosed-Book QA\u201d approaches [52], which, like RAG, generate answers, but which do not exploit retrieval, instead relying purely on parametric knowledge. We consider four popular open-domain QA datasets: Natural Questions (NQ) [29], TriviaQA (TQA) [24]. WebQuestions (WQ) [3] and CuratedTrec (CT) [2]. As CT and WQ are small, we follow DPR [26] by initializing CT and WQ models with our NQ RAG model. We use the same train/dev/test splits as prior work [31, 26] and report Exact Match (EM) scores. For TQA, to compare with T5 [52], we also evaluate on the TQA Wiki test set.\n\n### Abstractive Question Answering\n\nRAG models can go beyond simple extractive QA and answer questions with free-form, abstractive text generation. To test RAG\u2019s natural language generation (NLG) in a knowledge-intensive setting, we use the MSMARCO NLG task v2.1 [43]. The task consists of questions, ten gold passages retrieved from a search engine for each question, and a full sentence answer annotated from the retrieved passages. We do not use the supplied passages, only the questions and answers, to treat\n---\nMSMARCO as an open-domain abstractive QA task. MSMARCO has some questions that cannot be answered in a way that matches the reference answer without access to the gold passages, such as \"What is the weather in Volcano, CA?\" so performance will be lower without using gold passages. We also note that some MSMARCO questions cannot be answered using Wikipedia alone. Here, RAG can rely on parametric knowledge to generate reasonable responses.\n\nJeopardy Question Generation\n\nTo evaluate RAG\u2019s generation abilities in a non-QA setting, we study open-domain question generation. Rather than use questions from standard open-domain QA tasks, which typically consist of short, simple questions, we propose the more demanding task of generating Jeopardy questions. Jeopardy is an unusual format that consists of trying to guess an entity from a fact about that entity. For example, \"The World Cup\" is the answer to the question \"In 1986 Mexico scored as the first country to host this international sports competition twice.\" As Jeopardy questions are precise, factual statements, generating Jeopardy questions conditioned on their answer entities constitutes a challenging knowledge-intensive generation task.\n\nWe use the splits from SearchQA [10], with 100K train, 14K dev, and 27K test examples. As this is a new task, we train a BART model for comparison. Following [67], we evaluate using the SQuAD-tuned Q-BLEU-1 metric [42]. Q-BLEU is a variant of BLEU with a higher weight for matching entities and has higher correlation with human judgment for question generation than standard metrics. We also perform two human evaluations, one to assess generation factuality, and one for specificity. We define factuality as whether a statement can be corroborated by trusted external sources, and specificity as high mutual dependence between the input and output [33]. We follow best practice and use pairwise comparative evaluation [34]. Evaluators are shown an answer and two generated questions, one from BART and one from RAG. They are then asked to pick one of four options\u2014question A is better, question B is better, both are good, or neither is good.\n\nFact Verification\n\nFEVER [56] requires classifying whether a natural language claim is supported or refuted by Wikipedia, or whether there is not enough information to decide. The task requires retrieving evidence from Wikipedia relating to the claim and then reasoning over this evidence to classify whether the claim is true, false, or unverifiable from Wikipedia alone. FEVER is a retrieval problem coupled with a challenging entailment reasoning task. It also provides an appropriate testbed for exploring the RAG models\u2019 ability to handle classification rather than generation. We map FEVER class labels (supports, refutes, or not enough info) to single output tokens and directly train with claim-class pairs. Crucially, unlike most other approaches to FEVER, we do not use supervision on retrieved evidence. In many real-world applications, retrieval supervision signals aren\u2019t available, and models that do not require such supervision will be applicable to a wider range of tasks. We explore two variants: the standard 3-way classification task (supports/refutes/not enough info) and the 2-way (supports/refutes) task studied in Thorne and Vlachos [57]. In both cases we report label accuracy.\n\nResults\n\nOpen-domain Question Answering\n\n|Task|RAG|State-of-the-Art Models|\n|---|---|---|\n|All four open-domain QA tasks|RAG sets a new state of the art (only on the T5-comparable split for TQA)|RAG combines the generation flexibility of the \u201cclosed-book\u201d (parametric only) approaches and the performance of \"open-book\" retrieval-based approaches. Unlike REALM and T5+SSM, RAG enjoys strong results without expensive, specialized \u201csalient span masking\u201d pre-training [20]. It is worth noting that RAG\u2019s retriever is initialized using DPR\u2019s retriever, which uses retrieval supervision on Natural Questions and TriviaQA. RAG compares favourably to the DPR QA system, which uses a BERT-based \u201ccross-encoder\u201d to re-rank documents, along with an extractive reader. RAG demonstrates that neither a re-ranker nor extractive reader is necessary for state-of-the-art performance.|\n\nThere are several advantages to generating answers even when it is possible to extract them. Documents with clues about the answer but do not contain the answer verbatim can still contribute towards a correct answer being generated, which is not possible with standard extractive approaches, leading\n---\n## Table 1: Open-Domain QA Test Scores\n\n|Model|NQ|TQA|WQ|CT|\n|---|---|---|---|---|\n|Closed Book T5-11B [52]|34.5|- /50.1|37.4|-|\n|Book T5-11B+SSM[52]|36.6|- /60.5|44.7|-|\n|Open REALM [20]|40.4|- / -|40.7|46.8|\n|Book DPR [26]|41.5|57.9/ -|41.1|50.6|\n|RAG-Token|44.1|55.2/66.1|45.5|50.0|\n|RAG-Seq.|44.5|56.8/68.0|45.2|52.2|\n\n## Table 2: Generation and classification Test Scores\n\n|Model|Jeopardy|MSMARCO|FVR3|FVR2|\n|---|---|---|---|---|\n|B-1|QB-1|R-L|B-1|Label Acc.|\n|BART|15.1|19.7|38.2|41.6|\n|RAG-Tok.|17.3|22.2|40.1|41.5|\n|RAG-Seq.|14.7|21.4|40.8|44.2|\n\n4.2 Abstractive Question Answering\n\nAs shown in Table 2, RAG-Sequence outperforms BART on Open MS-MARCO NLG by 2.6 Bleu points and 2.6 Rouge-L points. RAG approaches state-of-the-art model performance, which is impressive given that (i) those models access gold passages with specific information required to generate the reference answer, (ii) many questions are unanswerable without the gold passages, and (iii) not all questions are answerable from Wikipedia alone. Table 3 shows some generated answers from our models. Qualitatively, we find that RAG models hallucinate less and generate factually correct text more often than BART. Later, we also show that RAG generations are more diverse than BART generations (see \u00a74.5).\n\n4.3 Jeopardy Question Generation\n\nTable 2 shows that RAG-Token performs better than RAG-Sequence on Jeopardy question generation, with both models outperforming BART on Q-BLEU-1. 4 shows human evaluation results, over 452 pairs of generations from BART and RAG-Token. Evaluators indicated that BART was more factual than RAG in only 7.1% of cases, while RAG was more factual in 42.7% of cases, and both RAG and BART were factual in a further 17% of cases, clearly demonstrating the effectiveness of RAG on the task over a state-of-the-art generation model. Evaluators also find RAG generations to be more specific by a large margin. Table 3 shows typical generations from each model.\n\nJeopardy questions often contain two separate pieces of information, and RAG-Token may perform best because it can generate responses that combine content from several documents. Figure 2 shows an example. When generating \u201cSun\u201d, the posterior is high for document 2 which mentions \u201cThe Sun Also Rises\u201d. Similarly, document 1 dominates the posterior when \u201cA Farewell to Arms\u201d is generated. Intriguingly, after the first token of each book is generated, the document posterior flattens. This observation suggests that the generator can complete the titles without depending on specific documents. In other words, the model\u2019s parametric knowledge is sufficient to complete the titles. We find evidence for this hypothesis by feeding the BART-only baseline with the partial decoding \"The Sun. BART completes the generation \"The Sun Also Rises\" is a novel by this author of \"The Sun Also Rises\" indicating the title \"The Sun Also Rises\" is stored in BART\u2019s parameters. Similarly, BART will complete the partial decoding \"The Sun Also Rises\" is a novel by this author of \"A with \"The Sun Also Rises\" is a novel by this author of \"A Farewell to Arms\". This example shows how parametric and non-parametric memories work together\u2014the non-parametric component helps to guide the generation, drawing out specific knowledge stored in the parametric memory.\n\n4.4 Fact Verification\n\nTable 2 shows our results on FEVER. For 3-way classification, RAG scores are within 4.3% of state-of-the-art models, which are complex pipeline systems with domain-specific architectures and substantial engineering, trained using intermediate retrieval supervision, which RAG does not require.\n---\n## Document 1\n\nhis works are considered classics of American literature ... His wartime experiences formed the basis for his novel \"A Farewell to Arms\" (1929) ...\n\n## Document 2\n\n... artists of the 1920s \"Lost Generation\" expatriate community. His debut novel, \"The Sun Also Rises\", was published in 1926.\n\nFigure 2: RAG-Token document posterior p(zi|x, yi, y\u2212i) for each generated token for input \u201cHemingway\" for Jeopardy generation with 5 retrieved documents. The posterior for document 1 is high when generating \u201cA Farewell to Arms\" and for document 2 when generating \u201cThe Sun Also Rises\".\n\n|Task|Input|Model|Generation|\n|---|---|---|---|\n|define middle ear|BART|?|The middle ear is the part of the ear between the middle ear and the nose.|\n| |RAG-T| |The middle ear is the portion of the ear internal to the eardrum.|\n|MS-MARCO|what currency needed in Scotland|BART|The currency needed in Scotland is Pound sterling.|\n| |RAG-T| |Pound is the currency needed in Scotland.|\n| |RAG-S| |The currency needed in Scotland is the pound sterling.|\n| |BART|?|This state has the largest number of counties in the U.S.|\n|Jeopardy Question Generation|Washington|RAG-T|It\u2019s the only U.S. state named for a U.S. president|\n| |RAG-S| |It\u2019s the state where you\u2019ll find Mount Rainier National Park|\n| |BART|*|This epic poem by Dante is divided into 3 parts: the Inferno, the Purgatorio & the Purgatorio|\n|The Divine Comedy|RAG-T| |Dante\u2019s \"Inferno\" is the first part of this epic poem|\n| |RAG-S| |This 14th century work is divided into 3 sections: \"Inferno\", \"Purgatorio\" & \"Paradiso\"|\n\nFor 2-way classification, we compare against Thorne and Vlachos [57], who train RoBERTa [35] to classify the claim as true or false given the gold evidence sentence. RAG achieves an accuracy within 2.7% of this model, despite being supplied with only the claim and retrieving its own evidence. We also analyze whether documents retrieved by RAG correspond to documents annotated as gold evidence in FEVER. We calculate the overlap in article titles between the top k documents retrieved by RAG and gold evidence annotations. We find that the top retrieved document is from a gold article in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases.\n\n## Additional Results\n\nGeneration Diversity: Section 4.3 shows that RAG models are more factual and specific than BART for Jeopardy question generation. Following recent work on diversity-promoting decoding, we also investigate generation diversity by calculating the ratio of distinct ngrams to total ngrams generated by different models. Table 5 shows that RAG-Sequence\u2019s generations are more diverse than RAG-Token\u2019s, and both are significantly more diverse than BART without needing any diversity-promoting decoding.\n\nRetrieval Ablations: A key feature of RAG is learning to retrieve relevant information for the task. To assess the effectiveness of the retrieval mechanism, we run ablations where we freeze the retriever during training. As shown in Table 6, learned retrieval improves results for all tasks.\n\nWe compare RAG\u2019s dense retriever to a word overlap-based BM25 retriever. Here, we replace RAG\u2019s retriever with a fixed BM25 system, and use BM25 retrieval scores as logits when calculating p(z|x). Table 6 shows the results. For FEVER, BM25 performs best, perhaps since FEVER claims are heavily entity-centric and thus well-suited for word overlap-based retrieval. Differentiable retrieval improves results on all other tasks, especially for Open-Domain QA, where it is crucial.\n\nIndex hot-swapping: An advantage of non-parametric memory models like RAG is that knowledge can be easily updated at test time. Parametric-only models like T5 or BART need further training to update their behavior as the world changes. To demonstrate, we build an index using the DrQA Wikipedia dump from December 2016 and compare outputs from RAG using this index to the newer index from our main results (December 2018). We prepare a list of 82 world leaders who had changed\n---\n|Model|NQ|TQA|WQ|CT|Jeopardy-QGen|MSMarco|FVR-3|FVR-2|\n|---|---|---|---|---|---|---|---|---|\n|RAG-Token-BM25|29.7|41.5|32.1|33.1|17.5|22.3|55.5|48.4|75.1|91.6|\n|RAG-Sequence-BM25|31.8|44.1|36.6|33.8|11.1|19.5|56.5|46.9|\n|RAG-Token-Frozen|37.8|50.1|37.1|51.1|16.7|21.7|55.9|49.4|72.9|89.4|\n|RAG-Sequence-Frozen|41.2|52.1|41.8|52.6|11.8|19.6|56.7|47.3|\n|RAG-Token|43.5|54.8|46.5|51.9|17.9|22.6|56.2|49.4|74.5|90.6|\n|RAG-Sequence|44.0|55.8|44.9|53.4|15.3|21.5|57.2|47.5|\n\n|Content|Page Number|\n|---|---|\n|Table 4: Human assessments for the Jeopardy Question Generation Task.| |\n|Table 5: Ratio of distinct to total tri-grams for generation tasks.| |\n|Table 6: Ablations on the dev set. As FEVER is a classification task, both RAG models are equivalent.| |\n\nBetween these dates and use a template \u201cWho is {position}?\u201d (e.g. \u201cWho is the President of Peru?\u201d) to query our NQ RAG model with each index. RAG answers 70% correctly using the 2016 index for 2016 world leaders and 68% using the 2018 index for 2018 world leaders. Accuracy with mismatched indices is low (12% with the 2018 index and 2016 leaders, 4% with the 2016 index and 2018 leaders). This shows we can update RAG\u2019s world knowledge by simply replacing its non-parametric memory.\n\nEffect of Retrieving more documents: Models are trained with either 5 or 10 retrieved latent documents, and we do not observe significant differences in performance between them. We have the flexibility to adjust the number of retrieved documents at test time, which can affect performance and runtime. Figure 3 (left) shows that retrieving more documents at test time monotonically improves Open-domain QA results for RAG-Sequence, but performance peaks for RAG-Token at 10 retrieved documents. Figure 3 (right) shows that retrieving more documents leads to higher Rouge-L for RAG-Token at the expense of Bleu-1, but the effect is less pronounced for RAG-Sequence.\n\n| |NQ Answer Recall @ K|\n|---|---|\n|NQ Exact Match|80|\n\nFigure 3: Left: NQ performance as more documents are retrieved. Center: Retrieval recall performance in NQ. Right: MS-MARCO Bleu-1 and Rouge-L as more documents are retrieved.\n\nRelated Work: Single-Task Retrieval - Prior work has shown that retrieval improves performance across a variety of NLP tasks when considered in isolation. Such tasks include open-domain question answering, fact checking, fact completion, long-form question answering, Wikipedia article generation, dialogue, translation, and language modeling. Our work unifies previous successes in incorporating retrieval into individual tasks, showing that a single retrieval-based architecture is capable of achieving strong performance across several tasks.\n---\n## General-Purpose Architectures for NLP\n\nPrior work on general-purpose architectures for NLP tasks has shown great success without the use of retrieval. A single, pre-trained language model has been shown to achieve strong performance on various classification tasks in the GLUE benchmarks [60, 61] after fine-tuning [49, 8]. GPT-2 [50] later showed that a single, left-to-right, pre-trained language model could achieve strong performance across both discriminative and generative tasks. For further improvement, BART [32] and T5 [51, 52] propose a single, pre-trained encoder-decoder model that leverages bi-directional attention to achieve stronger performance on discriminative and generative tasks. Our work aims to expand the space of possible tasks with a single, unified architecture, by learning a retrieval module to augment pre-trained, generative language models.\n\n## Learned Retrieval\n\nThere is significant work on learning to retrieve documents in information retrieval, more recently with pre-trained, neural language models [44, 26] similar to ours. Some work optimizes the retrieval module to aid in a specific, downstream task such as question answering, using search [46], reinforcement learning [6, 63, 62], or a latent variable approach [31, 20] as in our work. These successes leverage different retrieval-based architectures and optimization techniques to achieve strong performance on a single task, while we show that a single retrieval-based architecture can be fine-tuned for strong performance on a variety of tasks.\n\n## Memory-based Architectures\n\nOur document index can be seen as a large external memory for neural networks to attend to, analogous to memory networks [64, 55]. Concurrent work [14] learns to retrieve a trained embedding for each entity in the input, rather than to retrieve raw text as in our work. Other work improves the ability of dialog models to generate factual text by attending over fact embeddings [15, 13]. A key feature of our memory is that it is comprised of raw text rather distributed representations, which makes the memory both (i) human-readable, lending a form of interpretability to our model, and (ii) human-writable, enabling us to dynamically update the model\u2019s memory by editing the document index. This approach has also been used in knowledge-intensive dialog, where generators have been conditioned on retrieved text directly, albeit obtained via TF-IDF rather than end-to-end learnt retrieval [9].\n\n## Retrieve-and-Edit approaches\n\nOur method shares some similarities with retrieve-and-edit style approaches, where a similar training input-output pair is retrieved for a given input, and then edited to provide a final output. These approaches have proved successful in a number of domains including Machine Translation [18, 22] and Semantic Parsing [21]. Our approach does have several differences, including less of emphasis on lightly editing a retrieved item, but on aggregating content from several pieces of retrieved content, as well as learning latent retrieval, and retrieving evidence documents rather than related training pairs. This said, RAG techniques may work well in these settings, and could represent promising future work.\n\n## Discussion\n\nIn this work, we presented hybrid generation models with access to parametric and non-parametric memory. We showed that our RAG models obtain state of the art results on open-domain QA. We found that people prefer RAG\u2019s generation over purely parametric BART, finding RAG more factual and specific. We conducted an thorough investigation of the learned retrieval component, validating its effectiveness, and we illustrated how the retrieval index can be hot-swapped to update the model without requiring any retraining. In future work, it may be fruitful to investigate if the two components can be jointly pre-trained from scratch, either with a denoising objective similar to BART or some another objective. Our work opens up new research directions on how parametric and non-parametric memories interact and how to most effectively combine them, showing promise in being applied to a wide variety of NLP tasks.\n---\n## Broader Impact\n\nThis work offers several positive societal benefits over previous work: the fact that it is more strongly grounded in real factual knowledge (in this case Wikipedia) makes it \u201challucinate\u201d less with generations that are more factual, and offers more control and interpretability. RAG could be employed in a wide variety of scenarios with direct benefit to society, for example by endowing it with a medical index and asking it open-domain questions on that topic, or by helping people be more effective at their jobs.\n\nWith these advantages also come potential downsides: Wikipedia, or any potential external knowledge source, will probably never be entirely factual and completely devoid of bias. Since RAG can be employed as a language model, similar concerns as for GPT-2 [50] are valid here, although arguably to a lesser extent, including that it might be used to generate abuse, faked or misleading content in the news or on social media; to impersonate others; or to automate the production of spam/phishing content [54]. Advanced language models may also lead to the automation of various jobs in the coming decades [16]. In order to mitigate these risks, AI systems could be employed to fight against misleading content and automated spam/phishing.\n\n## Acknowledgments\n\nThe authors would like to thank the reviewers for their thoughtful and constructive feedback on this paper, as well as HuggingFace for their help in open-sourcing code to run RAG models. The authors would also like to thank Kyunghyun Cho and Sewon Min for productive discussions and advice. EP thanks supports from the NSF Graduate Research Fellowship. PL is supported by the FAIR PhD program.\n\n## References\n\n[1] Payal Bajaj, Daniel Campos, Nick Craswell, Li Deng, Jianfeng Gao, Xiaodong Liu, Rangan Majumder, Andrew McNamara, Bhaskar Mitra, Tri Nguyen, Mir Rosenberg, Xia Song, Alina Stoica, Saurabh Tiwary, and Tong Wang. MS MARCO: A Human Generated MAchine Reading COmprehension Dataset. arXiv:1611.09268 [cs], November 2016. URL http://arxiv.org/abs/1611.09268. arXiv: 1611.09268.\n[2] Petr Baudi\u0161 and Jan \u0160ediv` y. Modeling of pe question answering task in pe yodaqa system. In International Conference of pe Cross-Language Evaluation Forum for European Languages, pages 222\u2013228. Springer, 2015. URL https://link.springer.com/chapter/10.1007%2F978-3-319-24027-5_20.\n[3] Jonapan Berant, Andrew Chou, Roy Frostig, and Percy Liang. Semantic Parsing on Freebase from Question-Answer Pairs. In Proceedings of pe 2013 Conference on Empirical Mepods in Natural Language Processing, pages 1533\u20131544, Seattle, Washington, USA, October 2013. Association for Computational Linguistics. URL http://www.aclweb.org/anpology/D13-1160.\n[4] Bin Bi, Chenliang Li, Chen Wu, Ming Yan, and Wei Wang. Palm: Pre-training an autoencod-ing&autoregressive language model for context-conditioned generation. ArXiv, abs/2004.07159, 2020. URL https://arxiv.org/abs/2004.07159.\n[5] Danqi Chen, Adam Fisch, Jason Weston, and Antoine Bordes. Reading Wikipedia to Answer Open-Domain Questions. In Proceedings of pe 55p Annual Meeting of pe Association for Computational Linguistics (Volume 1: Long Papers), pages 1870\u20131879, Vancouver, Canada, July 2017. Association for Computational Linguistics. doi: 10.18653/v1/P17-1171. URL https://www.aclweb.org/anpology/P17-1171.\n[6] Eunsol Choi, Daniel Hewlett, Jakob Uszkoreit, Illia Polosukhin, Alexandre Lacoste, and Jonapan Berant. Coarse-to-fine question answering for long documents. In Proceedings of pe 55p Annual Meeting of pe Association for Computational Linguistics (Volume 1: Long Papers), pages 209\u2013220, Vancouver, Canada, July 2017. Association for Computational Linguistics. doi: 10.18653/v1/P17-1020. URL https://www.aclweb.org/anpology/P17-1020.\n---\nChristopher Clark and Matt Gardner. Simple and Effective Multi-Paragraph Reading Comprehension. arXiv:1710.10723 [cs], October 2017. URL http://arxiv.org/abs/1710.10723. arXiv: 1710.10723.\nJacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of pe 2019 Conference of pe Norp American Chapter of pe Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pages 4171\u20134186, Minneapolis, Minnesota, June 2019. Association for Computational Linguistics. doi: 10.18653/v1/N19-1423. URL https://www.aclweb.org/anpology/N19-1423.\nEmily Dinan, Stephen Roller, Kurt Shuster, Angela Fan, Michael Auli, and Jason Weston. Wizard of wikipedia: Knowledge-powered conversational agents. In International Conference on Learning Representations, 2019. URL https://openreview.net/forum?id=r1l73iRqKm.\nMatpew Dunn, Levent Sagun, Mike Higgins, V. Ugur Guney, Volkan Cirik, and Kyunghyun Cho. SearchQA: A New Q&A Dataset Augmented wip Context from a Search Engine. arXiv:1704.05179 [cs], April 2017. URL http://arxiv.org/abs/1704.05179. arXiv: 1704.05179.\nAngela Fan, Mike Lewis, and Yann Dauphin. Hierarchical neural story generation. In Proceedings of pe 56p Annual Meeting of pe Association for Computational Linguistics (Volume 1: Long Papers), pages 889\u2013898, Melbourne, Australia, July 2018. Association for Computational Linguistics. doi: 10.18653/v1/P18-1082. URL https://www.aclweb.org/anpology/P18-1082.\nAngela Fan, Yacine Jernite, Epan Perez, David Grangier, Jason Weston, and Michael Auli. ELI5: Long form question answering. In Proceedings of pe 57p Annual Meeting of pe Association for Computational Linguistics, pages 3558\u20133567, Florence, Italy, July 2019. Association for Computational Linguistics. doi: 10.18653/v1/P19-1346. URL https://www.aclweb.org/anpology/P19-1346.\nAngela Fan, Claire Gardent, Chloe Braud, and Antoine Bordes. Augmenting transformers wip KNN-based composite memory, 2020. URL https://openreview.net/forum?id=H1gx1CNKPH.\nThibault F\u00e9vry, Livio Baldini Soares, Nicholas FitzGerald, Eunsol Choi, and Tom Kwiatkowski. Entities as experts: Sparse memory access wip entity supervision. ArXiv, abs/2004.07202, 2020. URL https://arxiv.org/abs/2004.07202.\nMarjan Ghazvininejad, Chris Brockett, Ming-Wei Chang, Bill Dolan, Jianfeng Gao, Wentau Yih, and Michel Galley. A knowledge-grounded neural conversation model. In AAAI Conference on Artificial Intelligence, 2018. URL https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16710.\nKatja Grace, John Salvatier, Allan Dafoe, Baobao Zhang, and Owain Evans. When will AI exceed human performance? evidence from AI experts. CoRR, abs/1705.08807, 2017. URL http://arxiv.org/abs/1705.08807.\nJiatao Gu, Yong Wang, Kyunghyun Cho, and Victor O.K. Li. Search engine guided neural machine translation. In AAAI Conference on Artificial Intelligence, 2018. URL https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/17282.\nJiatao Gu, Yong Wang, Kyunghyun Cho, and Victor O.K. Li. Search engine guided neural machine translation. In 32nd AAAI Conference on Artificial Intelligence, AAAI 2018, 32nd AAAI Conference on Artificial Intelligence, AAAI 2018, pages 5133\u20135140. AAAI press, 2018. 32nd AAAI Conference on Artificial Intelligence, AAAI 2018 ; Conference date: 02-02-2018 Through 07-02-2018.\nKelvin Guu, Tatsunori B. Hashimoto, Yonatan Oren, and Percy Liang. Generating sentences by editing prototypes. Transactions of pe Association for Computational Linguistics, 6:437\u2013450, 2018. doi: 10.1162/tacl_a_00030. URL https://www.aclweb.org/anpology/Q18-1031.\n---\n## References\n\n[20] Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Ming-Wei Chang. REALM: Retrieval-augmented language model pre-training. ArXiv, abs/2002.08909, 2020. URL https://arxiv.org/abs/2002.08909.\n[21] Tatsunori B Hashimoto, Kelvin Guu, Yonatan Oren, and Percy S Liang. A retrieve-and-edit framework for predicting structured outputs. In S. Bengio, H. Wallach, H. Larochelle, K. Grauman, N. Cesa-Bianchi, and R. Garnett, editors, Advances in Neural Information Processing Systems 31, pages 10052\u201310062. Curran Associates, Inc., 2018. URL http://papers.nips.cc/paper/8209-a-retrieve-and-edit-framework-for-predicting-structured-outputs.pdf.\n[22] Nabil Hossain, Marjan Ghazvininejad, and Luke Zettlemoyer. Simple and effective retrieve-edit-rerank text generation. In Proceedings of pe 58p Annual Meeting of pe Association for Computational Linguistics, pages 2532\u20132538, Online, July 2020. Association for Computational Linguistics. doi: 10.18653/v1/2020.acl-main.228. URL https://www.aclweb.org/anpology/2020.acl-main.228.\n[23] Jeff Johnson, Matpijs Douze, and Herv\u00e9 J\u00e9gou. Billion-scale similarity search wip gpus. arXiv preprint arXiv:1702.08734, 2017. URL https://arxiv.org/abs/1702.08734.\n[24] Mandar Joshi, Eunsol Choi, Daniel Weld, and Luke Zettlemoyer. TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension. In Proceedings of pe 55p Annual Meeting of pe Association for Computational Linguistics (Volume 1: Long Papers), pages 1601\u20131611, Vancouver, Canada, July 2017. Association for Computational Linguistics. doi: 10.18653/v1/P17-1147. URL https://www.aclweb.org/anpology/P17-1147.\n[25] Armand Joulin and Tomas Mikolov. Inferring algoripmic patterns wip stack-augmented recurrent nets. In Proceedings of pe 28p International Conference on Neural Information Processing Systems - Volume 1, NIPS\u201915, page 190\u2013198, Cambridge, MA, USA, 2015. MIT Press. URL https://papers.nips.cc/paper/5857-inferring-algoripmic-patterns-wip-stack-augmented-recurrent-nets.\n[26] Vladimir Karpukhin, Barlas Oguz, Sewon Min, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. Dense passage retrieval for open-domain question answering. arXiv preprint arXiv:2004.04906, 2020. URL https://arxiv.org/abs/2004.04906.\n[27] Urvashi Khandelwal, Omer Levy, Dan Jurafsky, Luke Zettlemoyer, and Mike Lewis. Generalization prough memorization: Nearest neighbor language models. In International Conference on Learning Representations, 2020. URL https://openreview.net/forum?id=HklBjCEKvH.\n[28] Diederik P. Kingma and Jimmy Ba. Adam: A mepod for stochastic optimization. In Yoshua Bengio and Yann LeCun, editors, 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, 2015. URL http://arxiv.org/abs/1412.6980.\n[29] Tom Kwiatkowski, Jennimaria Palomaki, Olivia Redfield, Michael Collins, Ankur Parikh, Chris Alberti, Danielle Epstein, Illia Polosukhin, Matpew Kelcey, Jacob Devlin, Kenton Lee, Kristina N. Toutanova, Llion Jones, Ming-Wei Chang, Andrew Dai, Jakob Uszkoreit, Quoc Le, and Slav Petrov. Natural Questions: a Benchmark for Question Answering Research. Transactions of pe Association of Computational Linguistics, 2019. URL https://tomkwiat.users.x20web.corp.google.com/papers/natural-questions/main-1455-kwiatkowski.pdf.\n[30] Guillaume Lample, Alexandre Sablayrolles, Marc\u2019 Aurelio Ranzato, Ludovic Denoyer, and Herve Jegou. Large memory layers wip product keys. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d\u2019 Alch\u00e9-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32, pages 8548\u20138559. Curran Associates, Inc., 2019. URL http://papers.nips.cc/paper/9061-large-memory-layers-wip-product-keys.pdf.\n[31] Kenton Lee, Ming-Wei Chang, and Kristina Toutanova. Latent retrieval for weakly supervised open domain question answering. In Proceedings of pe 57p Annual Meeting of pe Association\n---\n## References\n\n[32] Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Veselin Stoyanov, and Luke Zettlemoyer. BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461, 2019. URL https://arxiv.org/abs/1910.13461.\n[33] Jiwei Li, Michel Galley, Chris Brockett, Jianfeng Gao, and Bill Dolan. A diversity-promoting objective function for neural conversation models. In Proceedings of pe 2016 Conference of pe Norp American Chapter of pe Association for Computational Linguistics: Human Language Technologies, pages 110\u2013119, San Diego, California, June 2016. Association for Computational Linguistics. doi: 10.18653/v1/N16-1014. URL https://www.aclweb.org/anpology/N16-1014.\n[34] Margaret Li, Jason Weston, and Stephen Roller. Acute-eval: Improved dialogue evaluation wip optimized questions and multi-turn comparisons. ArXiv, abs/1909.03087, 2019. URL https://arxiv.org/abs/1909.03087.\n[35] Hairong Liu, Mingbo Ma, Liang Huang, Hao Xiong, and Zhongjun He. Robust neural machine translation wip joint textual and phonetic embedding. In Proceedings of pe 57p Annual Meeting of pe Association for Computational Linguistics, pages 3044\u20133049, Florence, Italy, July 2019. Association for Computational Linguistics. doi: 10.18653/v1/P19-1291. URL https://www.aclweb.org/anpology/P19-1291.\n[36] Peter J. Liu*, Mohammad Saleh*, Etienne Pot, Ben Goodrich, Ryan Sepassi, Lukasz Kaiser, and Noam Shazeer. Generating wikipedia by summarizing long sequences. In International Conference on Learning Representations, 2018. URL https://openreview.net/forum?id=Hyg0vbWC-.\n[37] Yury A. Malkov and D. A. Yashunin. Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE Transactions on Pattern Analysis and Machine Intelligence, 42:824\u2013836, 2016. URL https://arxiv.org/abs/1603.09320.\n[38] Gary Marcus. The next decade in ai: four steps towards robust artificial intelligence. arXiv preprint arXiv:2002.06177, 2020. URL https://arxiv.org/abs/2002.06177.\n[39] Luca Massarelli, Fabio Petroni, Aleksandra Piktus, Myle Ott, Tim Rockt\u00e4schel, Vassilis Plachouras, Fabrizio Silvestri, and Sebastian Riedel. How decoding strategies affect pe verifiability of generated text. arXiv preprint arXiv:1911.03587, 2019. URL https://arxiv.org/abs/1911.03587.\n[40] Paulius Micikevicius, Sharan Narang, Jonah Alben, Gregory Diamos, Erich Elsen, David Garcia, Boris Ginsburg, Michael Houston, Oleksii Kuchaiev, Ganesh Venkatesh, and Hao Wu. Mixed precision training. In ICLR, 2018. URL https://openreview.net/forum?id=r1gs9JgRZ.\n[41] Nikita Moghe, Siddharpa Arora, Suman Banerjee, and Mitesh M. Khapra. Towards exploiting background knowledge for building conversation systems. In Proceedings of pe 2018 Conference on Empirical Mepods in Natural Language Processing, pages 2322\u20132332, Brussels, Belgium, October-November 2018. Association for Computational Linguistics. doi: 10.18653/v1/D18-1255. URL https://www.aclweb.org/anpology/D18-1255.\n[42] Preksha Nema and Mitesh M. Khapra. Towards a better metric for evaluating question generation systems. In Proceedings of pe 2018 Conference on Empirical Mepods in Natural Language Processing, pages 3950\u20133959, Brussels, Belgium, October-November 2018. Association for Computational Linguistics. doi: 10.18653/v1/D18-1429. URL https://www.aclweb.org/anpology/D18-1429.\n[43] Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, Rangan Majumder, and Li Deng. MS MARCO: A human generated machine reading comprehension dataset. In Tarek Richard Besold, Antoine Bordes, Artur S. d\u2019Avila Garcez, and Greg Wayne, editors, Proceedings of pe Workshop on Cognitive Computation: Integrating neural and symbolic.\n---\napproaches 2016 co-located with the 30th Annual Conference on Neural Information Processing Systems (NIPS 2016), Barcelona, Spain, December 9, 2016, volume 1773 of CEUR Workshop Proceedings. CEUR-WS.org, 2016. URL http://ceur-ws.org/Vol-1773/CoCoNIPS_2016_paper9.pdf.\n\n[44] Rodrigo Nogueira and Kyunghyun Cho. Passage re-ranking with BERT. arXiv preprint arXiv:1901.04085, 2019. URL https://arxiv.org/abs/1901.04085.\n\n[45] Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, and Michael Auli. fairseq: A fast, extensible toolkit for sequence modeling. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (Demonstrations), pages 48\u201353, Minneapolis, Minnesota, June 2019. Association for Computational Linguistics. doi: 10.18653/v1/N19-4009. URL https://www.aclweb.org/anthology/N19-4009.\n\n[46] Ethan Perez, Siddharth Karamcheti, Rob Fergus, Jason Weston, Douwe Kiela, and Kyunghyun Cho. Finding generalizable evidence by learning to convince q&a models. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 2402\u20132411, Hong Kong, China, November 2019. Association for Computational Linguistics. doi: 10.18653/v1/D19-1244. URL https://www.aclweb.org/anthology/D19-1244.\n\n[47] Fabio Petroni, Tim Rockt\u00e4schel, Sebastian Riedel, Patrick Lewis, Anton Bakhtin, Yuxiang Wu, and Alexander Miller. Language models as knowledge bases? In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 2463\u20132473, Hong Kong, China, November 2019. Association for Computational Linguistics. doi: 10.18653/v1/D19-1250. URL https://www.aclweb.org/anthology/D19-1250.\n\n[48] Fabio Petroni, Patrick Lewis, Aleksandra Piktus, Tim Rockt\u00e4schel, Yuxiang Wu, Alexander H. Miller, and Sebastian Riedel. How context affects language models\u2019 factual predictions. In Automated Knowledge Base Construction, 2020. URL https://openreview.net/forum?id=025X0zPfn.\n\n[49] Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. Improving Language Understanding by Generative Pre-Training, 2018. URL https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf.\n\n[50] Alec Radford, Jeff Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. Language models are unsupervised multitask learners, 2019. URL https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf.\n\n[51] Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. Exploring the limits of transfer learning with a unified text-to-text transformer. arXiv e-prints, 2019. URL https://arxiv.org/abs/1910.10683.\n\n[52] Adam Roberts, Colin Raffel, and Noam Shazeer. How much knowledge can you pack into the parameters of a language model? arXiv e-prints, 2020. URL https://arxiv.org/abs/2002.08910.\n\n[53] Stephen Robertson and Hugo Zaragoza. The probabilistic relevance framework: Bm25 and beyond. Found. Trends Inf. Retr., 3(4):333\u2013389, April 2009. ISSN 1554-0669. doi: 10.1561/1500000019. URL https://doi.org/10.1561/1500000019.\n\n[54] Irene Solaiman, Miles Brundage, Jack Clark, Amanda Askell, Ariel Herbert-Voss, Jeff Wu, Alec Radford, and Jian-Bing Wang. Release strategies and the social impacts of language models. ArXiv, abs/1908.09203, 2019.\n\n[55] Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, and Rob Fergus. End-to-end memory networks. In C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama, and R. Garnett, editors, Advances in Neural Information Processing Systems 28, pages 2440\u20132448. Curran Associates, Inc., 2015. URL http://papers.nips.cc/paper/5846-end-to-end-memory-networks.pdf.\n---\n# References\n\n[56] James Thorne, Andreas Vlachos, Christos Christodoulopoulos, and Arpit Mittal. FEVER: a large-scale dataset for fact extraction and VERification. In Proceedings of pe 2018 Conference of pe Norp American Chapter of pe Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers), pages 809\u2013819, New Orleans, Louisiana, June 2018. Association for Computational Linguistics. doi: 10.18653/v1/N18-1074. URL https://www.aclweb.org/anpology/N18-1074.\n[57] James H. Thorne and Andreas Vlachos. Avoiding catastrophic forgetting in mitigating model biases in sentence-pair classification wip elastic weight consolidation. ArXiv, abs/2004.14366, 2020. URL https://arxiv.org/abs/2004.14366.\n[58] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanapan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30, pages 5998\u20136008. Curran Associates, Inc., 2017. URL http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf.\n[59] Ashwin Vijayakumar, Michael Cogswell, Ramprasaap Selvaraju, Qing Sun, Stefan Lee, David Crandall, and Dhruv Batra. Diverse beam search for improved description of complex scenes. AAAI Conference on Artificial Intelligence, 2018. URL https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/17329.\n[60] Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel Bowman. GLUE: A multi-task benchmark and analysis platform for natural language understanding. In Proceedings of pe 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP, pages 353\u2013355, Brussels, Belgium, November 2018. Association for Computational Linguistics. doi: 10.18653/v1/W18-5446. URL https://www.aclweb.org/anpology/W18-5446.\n[61] Alex Wang, Yada Pruksachatkun, Nikita Nangia, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel Bowman. SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32, pages 3261\u20133275. Curran Associates, Inc., 2019. URL https://arxiv.org/abs/1905.00537.\n[62] Shuohang Wang, Mo Yu, Xiaoxiao Guo, Zhiguo Wang, Tim Klinger, Wei Zhang, Shiyu Chang, Gerry Tesauro, Bowen Zhou, and Jing Jiang. R3: Reinforced ranker-reader for open-domain question answering. In Sheila A. McIlraip and Kilian Q. Weinberger, editors, Proceedings of pe Thirty-Second AAAI Conference on Artificial Intelligence, (AAAI-18), pe 30p innovative Applications of Artificial Intelligence (IAAI-18), and pe 8p AAAI Symposium on Educational Advances in Artificial Intelligence (EAAI-18), New Orleans, Louisiana, USA, February 2-7, 2018, pages 5981\u20135988. AAAI Press, 2018. URL https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16712.\n[63] Shuohang Wang, Mo Yu, Jing Jiang, Wei Zhang, Xiaoxiao Guo, Shiyu Chang, Zhiguo Wang, Tim Klinger, Gerald Tesauro, and Murray Campbell. Evidence aggregation for answer re-ranking in open-domain question answering. In ICLR, 2018. URL https://openreview.net/forum?id=rJl3yM-Ab.\n[64] Jason Weston, Sumit Chopra, and Antoine Bordes. Memory networks. In Yoshua Bengio and Yann LeCun, editors, 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, 2015. URL http://arxiv.org/abs/1410.3916.\n[65] Jason Weston, Emily Dinan, and Alexander Miller. Retrieve and refine: Improved sequence generation models for dialogue. In Proceedings of pe 2018 EMNLP Workshop SCAI: The 2nd International Workshop on Search-Oriented Conversational AI, pages 87\u201392, Brussels, Belgium, October 2018. Association for Computational Linguistics. doi: 10.18653/v1/W18-5713. URL https://www.aclweb.org/anpology/W18-5713.\n---\n## References\n\n[66] Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anpony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. Huggingface\u2019s transformers: State-of-pe-art natural language processing. ArXiv, abs/1910.03771, 2019.\n[67] Shiyue Zhang and Mohit Bansal. Addressing semantic drift in question generation for semi-supervised question answering. In Proceedings of pe 2019 Conference on Empirical Mepods in Natural Language Processing and pe 9p International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 2495\u20132509, Hong Kong, China, November 2019. Association for Computational Linguistics. doi: 10.18653/v1/D19-1253. URL https://www.aclweb.org/anpology/D19-1253.\n[68] Wanjun Zhong, Jingjing Xu, Duyu Tang, Zenan Xu, Nan Duan, Ming Zhou, Jiahai Wang, and Jian Yin. Reasoning over semantic-level graph for fact checking. ArXiv, abs/1909.03745, 2019. URL https://arxiv.org/abs/1909.03745.\n---\n## Appendices for Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks\n\n### Implementation Details\n\nFor Open-domain QA we report test numbers using 15 retrieved documents for RAG-Token models. For RAG-Sequence models, we report test results using 50 retrieved documents, and we use the Thorough Decoding approach since answers are generally short. We use greedy decoding for QA as we did not find beam search improved results. For Open-MSMarco and Jeopardy question generation, we report test numbers using ten retrieved documents for both RAG-Token and RAG-Sequence, and we also train a BART-large model as a baseline. We use a beam size of four, and use the Fast Decoding approach for RAG-Sequence models, as Thorough Decoding did not improve performance.\n\n### Human Evaluation\n\n|Which sentence is more factually true?|Select option|\n|---|---|\n|Noje: Scna Guesucn?|snterzt \"The8r Nuso Rists|\n|IncicateFich|Farcncllic AM; on Im|\n|Iclbwng sentarces Is Mca luclualy Injb[ealecllo|ZLbko Uaino cnoc urdoco|\n\nFigure 4: Annotation interface for human evaluation of factuality. A pop-out for detailed instructions and a worked example appear when clicking \"view tool guide\".\n\nFigure 4 shows the user interface for human evaluation. To avoid any biases for screen position, which model corresponded to sentence A and sentence B was randomly selected for each example. Annotators were encouraged to research the topic using the internet, and were given detailed instructions and worked examples in a full instructions tab. We included some gold sentences in order to assess the accuracy of the annotators. Two annotators did not perform well on these examples and their annotations were removed from the results.\n\n### Training Setup Details\n\nWe train all RAG models and BART baselines using Fairseq. We train with mixed precision floating point arithmetic, distributing training across 8, 32GB NVIDIA V100 GPUs, though training and inference can be run on one GPU. We find that doing Maximum Inner Product Search with FAISS is sufficiently fast on CPU, so we store document index vectors on CPU, requiring approximately 100 GB of CPU memory for all of Wikipedia. After submission, We have ported our code to HuggingFace Transformers, which achieves equivalent performance to the previous version but is a cleaner and easier to use implementation. This version is also open-sourced. We also compress the document index using FAISS\u2019s compression tools, reducing the CPU memory requirement to 36GB. Scripts to run experiments with RAG can be found at https://github.com/huggingface/transformers/blob/master/examples/rag/README.md and an interactive demo of a RAG model can be found at https://huggingface.co/rag/\n\n2. https://github.com/pytorch/fairseq\n\n3. https://github.com/huggingface/transformers\n---\n## Further Details on Open-Domain QA\n\nFor open-domain QA, multiple answer annotations are often available for a given question. These answer annotations are exploited by extractive models during training as typically all the answer annotations are used to find matches within documents when preparing training data. For RAG, we also make use of multiple annotation examples for Natural Questions and WebQuestions by training the model with each (q, a) pair separately, leading to a small increase in accuracy. For TriviaQA, there are often many valid answers to a given question, some of which are not suitable training targets, such as emoji or spelling variants. For TriviaQA, we filter out answer candidates if they do not occur in top 1000 documents for the query.\n\n## CuratedTrec preprocessing\n\nThe answers for CuratedTrec are given in the form of regular expressions, which has been suggested as a reason why it is unsuitable for answer-generation models [20]. To overcome this, we use a pre-processing step where we first retrieve the top 1000 documents for each query, and use the answer that most frequently matches the regex pattern as the supervision target. If no matches are found, we resort to a simple heuristic: generate all possible permutations for each regex, replacing non-deterministic symbols in the regex nested tree structure with a whitespace.\n\n## TriviaQA Evaluation setups\n\nThe open-domain QA community customarily uses public development datasets as test datasets, as test data for QA datasets is often restricted and dedicated to reading comprehension purposes. We report our results using the datasets splits used in DPR [26], which are consistent with common practice in Open-domain QA. For TriviaQA, this test dataset is the public TriviaQA Web Development split. Roberts et al. [52] used the TriviaQA official Wikipedia test set instead. F\u00e9vry et al. [14] follow this convention in order to compare with Roberts et al. [52] (See appendix of [14]). We report results on both test sets to enable fair comparison to both approaches. We find that our performance is much higher using the official Wiki test set, rather than the more conventional open-domain test set, which we attribute to the official Wiki test set questions being simpler to answer from Wikipedia.\n\n## Further Details on FEVER\n\nFor FEVER classification, we follow the practice from [32], and first re-generate the claim, and then classify using the representation of the final hidden state, before finally marginalizing across documents to obtain the class probabilities. The FEVER task traditionally has two sub-tasks. The first is to classify the claim as either \"Supported\", \"Refuted\" or \"Not Enough Info\", which is the task we explore in the main paper. FEVER\u2019s other sub-task involves extracting sentences from Wikipedia as evidence supporting the classification prediction. As FEVER uses a different Wikipedia dump to us, directly tackling this task is not straightforward. We hope to address this in future work.\n\n## Null Document Probabilities\n\nWe experimented with adding \"Null document\" mechanism to RAG, similar to REALM [20] in order to model cases where no useful information could be retrieved for a given input. Here, if k documents were retrieved, we would additionally \"retrieve\" an empty document and predict a logit for the null document, before marginalizing over k + 1 predictions. We explored modelling this null document logit by learning (i) a document embedding for the null document, (ii) a static learnt bias term, or (iii) a neural network to predict the logit. We did not find that these improved performance, so in the interests of simplicity, we omit them. For Open MS-MARCO, where useful retrieved documents cannot always be retrieved, we observe that the model learns to always retrieve a particular set of documents for questions that are less likely to benefit from retrieval, suggesting that null document mechanisms may not be necessary for RAG.\n\n## Parameters\n\nOur RAG models contain the trainable parameters for the BERT-base query and document encoder of DPR, with 110M parameters each (although we do not train the document encoder ourselves) and 406M trainable parameters from BART-large, 406M parameters, making a total of 626M trainable.\n---\n|Task|Train|Development|Test|\n|---|---|---|---|\n|Natural Questions|79169|8758|3611|\n|TriviaQA|78786|8838|11314|\n|WebQuestions|3418|362|2033|\n|CuratedTrec|635|134|635|\n|Jeopardy Question Generation|97392|13714|26849|\n|MS-MARCO|153726|12468|101093*|\n|FEVER-3-way|145450|10000|10000|\n|FEVER-2-way|96966|6666|6666|\n\nparameters. The best performing \"closed-book\" (parametric only) open-domain QA model is T5-11B\nwith 11 Billion trainable parameters. The T5 model with the closest number of parameters to our\nmodels is T5-large (770M parameters), which achieves a score of 28.9 EM on Natural Questions [52],\nsubstantially below the 44.5 that RAG-Sequence achieves, indicating that hybrid parametric/non-\nparametric models require far fewer trainable parameters for strong open-domain QA performance.\nThe non-parametric memory index does not consist of trainable parameters, but does consists of 21M\n728 dimensional vectors, consisting of 15.3B values. These can be easily be stored at 8-bit floating\npoint precision to manage memory and disk footprints.\n\n## Retrieval Collapse\n\nIn preliminary experiments, we observed that for some tasks such as story generation [11], the\nretrieval component would \u201ccollapse\u201d and learn to retrieve the same documents regardless of the\ninput. In these cases, once retrieval had collapsed, the generator would learn to ignore the documents,\nand the RAG model would perform equivalently to BART. The collapse could be due to a less-explicit\nrequirement for factual knowledge in some tasks, or the longer target sequences, which could result\nin less informative gradients for the retriever. Perez et al. [46] also found spurious retrieval results\nwhen optimizing a retrieval component in order to improve performance on downstream tasks.\n\n## Number of instances per dataset\n\nThe number of training, development and test datapoints in each of our datasets is shown in Table 7.",
asyncio.run(query_info_with_gpt(paper_id, arxiv_paper_markdown, arxiv_metadata, user_query))

'RAG models combine pre-trained language models with a Wikipedia index for NLP tasks, achieving state-of-the-art results on QA tasks.'

### Relevance score (placeholder)

# Processing loops

Extract search results from Google based on user query

In [261]:
import json
import time

class PageEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Page):
            return obj.dict()  # Convert Page to a dictionary
        elif isinstance(obj, Paper):
            return obj.dict()  # Convert Paper to a dictionary
        elif isinstance(obj, Link):
            return obj.dict()  # Convert Link to a dictionary
        return json.JSONEncoder.default(self, obj)  # Handle other types

def fetch_and_process(link, query, title, snippet, job_id):
    conn = None
    try:
        conn = connection()  # Open a new connection
        c = conn.cursor()  # Create a new cursor

        c.execute("SELECT scraping_status, html FROM google_search_results WHERE url = %s", (link,))
        result = c.fetchone()
        if result and result[0] == '200':
            print(f"Status: {result[0]}, already fetched for URL: {link}")
            html_content = result[1].replace('\x00', '')  # Sanitize HTML content from database
        else:
            response = fetch_url_content(link)
            print(f"Status:{response['status']} for URL: {link}")
            html_content = response['soup'].decode('utf-8', 'replace') if response['status'] == 200 else ""
            html_content = html_content.replace('\x00', '')
            insert_scraping_results(link, html_content, str(response['status']), query, title, snippet, job_id)

        if html_content:
            insert_arxiv_links_into_db(html_content, query, job_id)  # Adjust 'insert_arxiv_links_into_db' to take 'conn' and 'c' as additional parameters

    except psycopg2.OperationalError as e:
        print(f"Database operation failed for URL: {link}, Error: {e}")
        if conn:
            conn.rollback()  # Roll back any changes due to error

    finally:
        if c:
            c.close()  # Close the cursor
        if conn:
            conn.close()  # Close the connection

def search_and_fetch_google(job_id):
    conn = connection()  # Open a new connection
    c = conn.cursor()  # Create a new cursor

    c.execute("SELECT query, keyword_search_queries, paper_search_queries FROM jobs WHERE job_id = %s", (job_id,))
    job = c.fetchone()
    if job:
        # Unpack the fields from the job row
        query, keyword_search_queries, paper_search_queries = job
        # Convert keyword_search_queries from JSON format to Python list
        paper_search_queries = json.loads(paper_search_queries) if paper_search_queries else []
        keyword_search_queries = json.loads(keyword_search_queries)
        all_search_queries = [query] + keyword_search_queries
        print(f"General Queries: {all_search_queries}, Paper Specific Queries: {paper_search_queries}")
    else:
        print(f"No job found with job_id {job_id}")
        return

    # Now that you have the search queries, iterate over them
    for search_query in all_search_queries:
        search_results = search_google(search_query)  # Pass each search query to your search function
        print(search_query, search_results)

        # Sequential execution
        for result in search_results:  # Each 'result' is a dictionary
            try:
                # Extract the URL from the result dictionary
                url = result['link']  # Correctly access the 'link' from the dictionary
                # Optionally, you can also pass 'title' and 'snippet' if needed
                title = result['title']
                snippet = result['snippet']
                # Assuming fetch_and_process can handle these additional data, adjust accordingly
                data = fetch_and_process(url, query, title, snippet, job_id)  # Modify this line as necessary based on your function's parameters
            except Exception as exc:
                print(f'fetch_and_process exception: {exc}')

    # Second loop: Iterate over paper-specific search queries
    for search_query in paper_search_queries:
        search_results = search_google_for_specific_papers(search_query)  # Pass each search query to your search function
        print(search_query, search_results)

    print('Finished extracting search results pages')

# Example usage
job_id = 3
search_and_fetch_google(job_id)


Keyword search queries: ['Top academic papers on LLMs', 'most cited academic papers on LLMs', 'peer-reviewed research on large language models', 'high impact publications on LLMs']
Top academic papers on LLMs [{'link': 'https://www.topbots.com/top-llm-research-papers-2023/', 'title': '10 Transformative LLM Research Papers of 2023 from ...', 'snippet': 'Top LLM Research Papers 2023 · 1. LLaMA by Meta AI · 2. LLaMA 2 by Meta AI · 3. GPT-4 by OpenAI · 4. Sparks of AGI by Microsoft · 5. BLIP-2 by ...'}, {'link': 'https://medium.com/@thedatabeast/top-10-breakthrough-research-papers-on-large-language-models-llms-in-2023-pioneering-7abfcb69da7f', 'title': 'Top 10 Breakthrough Research Papers on Large ...', 'snippet': 'Top 10 Breakthrough Research Papers on Large Language Models (LLMs) in 2023: Pioneering Developments and Practical Applications.'}, {'link': 'https://levelup.gitconnected.com/best-papers-on-large-language-models-ac01b13b94b3', 'title': 'Best Papers on Large Language Models (LLMs

Extracting and processing arxiv papers: pdf, markdown, metadata, citations, versions

In [263]:
import json
import psycopg2.extras
import concurrent.futures

def get_scholar_citations_versions_parallel(arxiv_link):
    try:
        # Fetch citations and versions
        number_of_citations, number_of_versions = get_scholar_citations_versions(arxiv_link)
        return arxiv_link, number_of_citations, number_of_versions
    except Exception as e:
        print(f"An error occurred while processing paper {arxiv_link}: {e}")
        return arxiv_link, None, None  # Return None values if error

def get_scholar_citations_versions_loop(job_id):
    conn = connection()  # Ensure this is a valid connection function
    c = conn.cursor()

    try:
        # Fetch the first 30 Query_Papers rows associated with the given query
        c.execute("""
            SELECT id, arxiv_link FROM Query_Papers 
            WHERE job_id = %s 
            LIMIT 30
        """, (job_id,))
        query_papers_to_update = c.fetchall()

        # Initialize the lists for batch updates
        papers_updates = []
        query_papers_updates = []

        # Prepare for parallel execution
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Map arxiv_links to future results
            future_to_arxiv_link = {executor.submit(get_scholar_citations_versions_parallel, arxiv_link): (paper_id, arxiv_link) 
                                    for paper_id, arxiv_link in query_papers_to_update}

            # Collect results as they complete
            for future in concurrent.futures.as_completed(future_to_arxiv_link):
                paper_id, arxiv_link = future_to_arxiv_link[future]
                try:
                    arxiv_link, number_of_citations, number_of_versions = future.result()
                    if number_of_citations is not None and number_of_versions is not None:
                        # Append data for batch update in Papers table
                        papers_updates.append((number_of_citations, number_of_versions, arxiv_link))
                        # Create JSON object with citations and versions, append for batch update in Query_Papers
                        paper_stats_json = json.dumps({'citations': number_of_citations, 'versions': number_of_versions})
                        query_papers_updates.append((paper_stats_json, paper_id))
                except Exception as e:
                    print(f"An error occurred while processing future for paper {arxiv_link}: {e}")

        # Perform batch updates
        psycopg2.extras.execute_batch(c, "UPDATE Papers SET citations = %s, versions = %s WHERE arxiv_link = %s",
                                      papers_updates)
        psycopg2.extras.execute_batch(c, "UPDATE Query_Papers SET paper_stats = %s WHERE id = %s",
                                      query_papers_updates)

        # Commit all changes
        conn.commit()

    except Exception as e:
        # If an exception occurs, roll back all database changes
        conn.rollback()
        print(f"An error occurred while fetching Query_Papers for the job_id '{job_id}': {e}")

    finally:
        # Ensure resources are cleaned up
        c.close()
        conn.close()

# Example usage
job_id = 3
get_scholar_citations_versions_loop(job_id)

In [266]:
def fetch_arxiv_paper_from_url_loop(job_id):
    conn = connection()  # Ensure this is a function that returns a DB connection
    c = conn.cursor()

    # Select records from Query_Papers related to the specific query and with final_rank between 1 and 10
    try:
        c.execute("""
            SELECT Query_Papers.id, Papers.paper_title, Papers.arxiv_link
            FROM Query_Papers
            JOIN Papers ON Query_Papers.arxiv_link = Papers.arxiv_link
            WHERE Query_Papers.job_id = %s AND final_rank BETWEEN 1 AND 20
            ORDER BY final_rank ASC
        """, (job_id,))

        papers_to_update = c.fetchall()

        for q_id, paper_title, arxiv_link in papers_to_update:
            print(f"Updating missing information for paper: {paper_title}")
            if arxiv_link:
                try:
                    # Fetch paper metadata from arXiv
                    xml_data, pdf_url, title, file_name, abstract, published_date, authors = fetch_arxiv_paper_from_url(arxiv_link)

                    # Update Papers table with fetched metadata
                    c.execute("""
                        UPDATE Papers 
                        SET arxiv_title = %s, arxiv_abstract = %s, arxiv_metadata = %s, arxiv_filename = %s 
                        WHERE arxiv_link = %s
                    """, (title, abstract, xml_data, file_name, arxiv_link))

                    # Update Query_Papers table with filtered metadata and download link
                    paper_metadata_filtered = {'title': title, 'abstract': abstract, 'published_date': published_date, 'authors': authors}
                    c.execute("""
                        UPDATE Query_Papers 
                        SET paper_metadata_filtered = %s, download_link = %s 
                        WHERE id = %s
                    """, (json.dumps(paper_metadata_filtered), pdf_url, q_id))

                    # Commit the transaction
                    conn.commit()

                except Exception as e:
                    print(f"An error occurred while updating paper {paper_title}: {e}")
            else:
                print(f"No arXiv link found for paper: {paper_title}")
    except Exception as e:
        print(f"An error occurred while fetching Query_Papers for the job_id '{job_id}': {e}")
    finally:
        if conn is not None:
            c.close()
            conn.close()

# Example usage
job_id = 3
fetch_arxiv_paper_from_url_loop(job_id)

In [245]:
def download_pdf_loop(query):
    conn = connection()  # Make sure this is a function that returns a DB connection
    c = conn.cursor()

    # Select records from Query_Papers related to the specific query and with final_rank between 1 and 10
    try:
        c.execute("""
            SELECT Query_Papers.id, Papers.paper_title, Papers.arxiv_link, Papers.arxiv_filename
            FROM Query_Papers
            JOIN Papers ON Query_Papers.arxiv_link = Papers.arxiv_link
            WHERE Query_Papers.query = %s AND final_rank BETWEEN 1 AND 10
            ORDER BY final_rank ASC
        """, (query,))

        papers_metadata = c.fetchall()

        for id, paper_title, arxiv_link, file_name in papers_metadata:
            print(f"Downloading PDF for paper: {paper_title}")
            if arxiv_link and file_name:
                # Typically, the PDF URL is derived from the arXiv link, adjust as necessary
                pdf_url = f'https://arxiv.org/pdf/{arxiv_link.split("/")[-1]}.pdf'  # Adjust based on actual URL format

                # Download the PDF
                file_path_or_error = download_pdf(pdf_url, file_name)
                if 'Failed' not in file_path_or_error:
                    print(f"Download successful: {file_path_or_error}")
                else:
                    print(f"Download failed for paper: {paper_title}")
            else:
                print(f"No valid arXiv link or filename found for paper: {paper_title}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        if conn is not None:
            c.close()
            conn.close()


In [246]:
def convert_pdf_to_markdown_loop():
    # Connect to SQLite database
    conn = connection()
    c = conn.cursor()

    # Update papers with missing arxiv_paper_markdown
    c.execute("SELECT id, arxiv_filename FROM Papers WHERE (arxiv_paper_markdown IS NULL OR arxiv_paper_markdown = '' OR arxiv_paper_markdown = 'None') AND arxiv_filename IS NOT NULL AND arxiv_filename != ''")
    papers_to_update = c.fetchall()

    for id, arxiv_filename in papers_to_update:
        try:
            # Convert PDF to Markdown
            markdown_content = convert_pdf_to_markdown(arxiv_filename)

            # Update Papers table with Markdown content
            c.execute("UPDATE Papers SET arxiv_paper_markdown = %s WHERE id = %s", (markdown_content, rowid))
            conn.commit()
        except Exception as e:
            print(f"An error occurred while updating paper id {id}: {e}")

    print("Finished converting pdfs to markdown loop")
    if conn is not None:
        # Close the cursor and connection
        c.close()
        conn.close()


Process papers against user query to arrive at the relevant answer and relevance score

abstract

In [247]:
import asyncio
import aiopg

# Construct DSN (Data Source Name) string from environment variables
dsn = (
    f"dbname={os.environ['MY_INTEGRATION_DATABASE']} "
    f"user={os.environ['MY_INTEGRATION_USER']} "
    f"password={os.environ['MY_INTEGRATION_PASSWORD']} "
    f"host={os.environ['MY_INTEGRATION_HOST']} "
    f"port={os.environ['MY_INTEGRATION_PORT']}"
)

async def LLM_process_abstract_loop(query):
    async with aiopg.create_pool(dsn) as pool:  # Use a connection pool
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                # Execute your SELECT query
                await cur.execute("""
                    SELECT id, query, arxiv_link, relevance_score, final_rank, relevant_answer, paper_stats, paper_metadata_filtered, download_link
                    FROM Query_Papers
                    WHERE (relevant_answer IS NULL OR relevant_answer = '')
                    AND query = %s AND final_rank BETWEEN 1 AND 10
                    ORDER BY final_rank
                """, (query,))
                query_papers_to_update = await cur.fetchall()
                print(f"Total papers to process for '{query}': {len(query_papers_to_update)}")

                # Map tasks to their papers and prepare for concurrent processing
                tasks = {asyncio.create_task(query_info_with_gpt(paper[0], paper[2], paper[3], query)): paper for paper in query_papers_to_update}

                # Process tasks as they complete
                for future in asyncio.as_completed(tasks):
                    result = await future  # In this context, result is just for logging or additional processing
                    print(f"Processing completed with result: {result}")

                print("Finished processing query papers.")

# Example usage
user_query = "Top academic papers on LLMs"
asyncio.run(LLM_process_abstract_loop(user_query))

Total papers to process for 'Top academic papers on LLMs': 0
Finished processing query papers.


________________________________________________________________________________________________
# RANKING
________________________________________________________________________________________________

ranking by citations, versions

In [264]:
def update_final_ranks(job_id):
    conn = connection()
    c = conn.cursor()

    # Fetch the corresponding papers along with their paper_stats where paper_stats is not null
    c.execute("""
        SELECT id, paper_stats 
        FROM Query_Papers 
        WHERE job_id = %s AND final_rank IS NULL AND paper_stats IS NOT NULL
    """, (job_id,))
    papers = c.fetchall()

    # Initialize lists to store rankings based on citations and versions
    citation_ranks = []
    version_ranks = []

    # First loop to collect citation and version counts
    for paper in papers:
        id, stats_json = paper
        if stats_json:
            # Check if stats_json is not null
            stats = json.loads(stats_json)
            citations = stats.get('citations', 0) or 0  # Ensure default is 0 if None
            versions = stats.get('versions', 0) or 0  # Ensure default is 0 if None
            citation_ranks.append((id, citations))
            version_ranks.append((id, versions))

    # Sort and rank based on citations and versions separately
    citation_ranks.sort(key=lambda x: x[1], reverse=True)
    version_ranks.sort(key=lambda x: x[1], reverse=True)
    citation_rank_dict = {paper_id: rank + 1 for rank, (paper_id, _) in enumerate(citation_ranks)}
    version_rank_dict = {paper_id: rank + 1 for rank, (paper_id, _) in enumerate(version_ranks)}

    # Combine the rankings to calculate the final rank
    final_ranks = []
    for id, _ in papers:
        # Calculate average of the ranks; use large number if paper doesn't have rank in either
        citation_rank = citation_rank_dict.get(id, len(papers))
        version_rank = version_rank_dict.get(id, len(papers))
        avg_rank = (citation_rank + version_rank) / 2.0
        final_ranks.append((id, avg_rank))

    # Sort papers based on the average rank
    final_ranks.sort(key=lambda x: x[1])

    # Update the final_rank column based on this ordering
    for rank, (id, _) in enumerate(final_ranks, start=1):
        # start=1 for ranking starting from 1
        c.execute("UPDATE Query_Papers SET final_rank = %s WHERE id = %s", (rank, id))

    # Commit the changes to the database
    conn.commit()

    # Close the database connection
    conn.close()
    print("Finished updating final ranks for query papers.")

# Example execution
job_id = 3
update_final_ranks(job_id)

Finished updating final ranks for query papers.


reranking with publication date

In [265]:
def update_final_ranks_with_date(job_id):
    conn = connection()
    c = conn.cursor()
    try:
        # For each query, fetch the corresponding papers along with their paper_stats
        c.execute("SELECT id, paper_stats, paper_metadata_filtered FROM Query_Papers WHERE job_id = %s AND final_rank IS NULL", (job_id,)) #
        papers = c.fetchall()
        print(f"Total papers to process for '{job_id}': {len(papers)}")
        final_ranks = []

        # First loop to collect citation and version counts
        for paper in papers:
            id, stats_json, metadata_json = paper
            print(f"Processing paper {id}")
            # Parse paper statistics and metadata
            if stats_json and metadata_json:
                print(f"Stats and metadata found for paper {id}")
                stats = json.loads(stats_json)
                metadata = json.loads(metadata_json)

                # Extract citations, versions, and publication date
                citations = stats.get('citations', 0) or 0
                versions = stats.get('versions', 0) or 0
                published_date_str = metadata.get('published_date')
                
                # Calculate days since publication
                if published_date_str:
                    published_date = datetime.strptime(published_date_str.split('T')[0], '%Y-%m-%d')
                    days_since_published = (datetime.now() - published_date).days
                    days_since_published = max(days_since_published, 1)  # Avoid division by zero

                    # Adjust citations and versions based on days since publication
                    citations_per_day = citations / days_since_published
                    versions_per_day = versions / days_since_published

                    final_ranks.append((id, citations_per_day, versions_per_day))
            else:
                print(f"Missing stats or metadata for paper {id}")

        # Combine the rankings based on adjusted citations and versions
        # Use geometric mean of citations_per_day and versions_per_day for final ranking score
        final_ranks = [(paper_id, (citations * versions) ** 0.5) for paper_id, citations, versions in final_ranks]
        final_ranks.sort(key=lambda x: x[1], reverse=True)  # Sort based on the final ranking score, highest first

        # Update the final_rank column based on this ordering
        for rank, (id, _) in enumerate(final_ranks, start=1):
            print(f"Attempting to update final rank for paper {id} to {rank}")
            c.execute("UPDATE Query_Papers SET final_rank = %s WHERE id = %s", (rank, id))
            print(f"Updated final rank for paper {id} to {rank}")

        # Commit the changes to the database
        conn.commit()
    except Exception as e:
        print(f"An error occurred: {e}")
        conn.rollback()  # Roll back the transaction on error
    finally:
        conn.close()
    print(f"Finished updating final ranks for '{job_id}' in table query_papers.")

# Example execution
job_id = 3
update_final_ranks(job_id)

Finished updating final ranks for query papers.


capturing terminal output in logs

In [250]:
from datetime import datetime

def print_and_update_terminal_output(job_id, new_text):
    # Print the new text to the terminal
    print(new_text)

    # Prepare the message with a timestamp
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_message = f"[{timestamp}] - {new_text}\n"

    # Ensure the 'logs' directory exists
    logs_dir = os.path.join(os.getcwd(), 'logs')
    if not os.path.exists(logs_dir):
        os.makedirs(logs_dir)

    # Define the path for the log file, naming it with the job_id
    log_file_path = os.path.join(logs_dir, f"{job_id}.log")

    # Write the log message to the file
    with open(log_file_path, 'a') as log_file:
        log_file.write(log_message)

# Final loop

In [251]:
import asyncio

# Connect to the SQLite database
conn = connection()
c = conn.cursor()

try:
    while True:  # Infinite loop to keep checking for new jobs
        # Query to find new jobs with status 'new'
        c.execute("SELECT job_id, query FROM jobs WHERE job_status = 'new'")
        new_jobs = c.fetchall()
        GPT=None
        Perplexity=None
        # Check if there are any new jobs
        if new_jobs:
            print("Found new jobs:", new_jobs)
            # Process the new jobs
            for job in new_jobs:
                job_id, job_query = job  # Get the job_id and query from the tuple

                # Update the job_status to 'running' for the new job
                c.execute("UPDATE jobs SET job_status = 'running' WHERE job_id = %s", (job_id,))
                conn.commit()
                print(f"Updated job: {job_id}, query: {job_query} to 'running'")                

                Perplexity=query_perplexity_response(job_id, user_query)
                GPT=query_answer_with_gpt(job_id, user_query)
                get_sub_queries(job_id, user_query)
                queries_based_on_LLM_responses(job_id, GPT, Perplexity)

                print_and_update_terminal_output(job_id, f"search_and_fetch_google")
                search_and_fetch_google(job_id)

                print_and_update_terminal_output(job_id, f"get_scholar_citations_versions_loop")
                get_scholar_citations_versions_loop(job_id)

                print_and_update_terminal_output(job_id, f"update_final_ranks")
                update_final_ranks(job_id)

                print_and_update_terminal_output(job_id, f"fetch_arxiv_paper_from_url_loop")
                fetch_arxiv_paper_from_url_loop(job_id)

                update_final_ranks_with_date(job_id)
                print_and_update_terminal_output(job_id, f"update_final_ranks_with_date")

                # print_and_update_terminal_output(job_id, f"LLM_process_abstract_loop")
                # try:
                #     asyncio.run(LLM_process_abstract_loop(job_query))
                # except RuntimeError:  # asyncio.run() cannot be called from a running event loop
                #     loop = asyncio.get_event_loop()
                #     if loop.is_running():
                #         loop.create_task(LLM_process_abstract_loop(job_query))
                #     else:
                #         loop.run_until_complete(LLM_process_abstract_loop(job_query))
                
                # Update the job_status to 'done' after processing is complete
                c.execute("UPDATE jobs SET job_status = 'done' WHERE job_id = %s", (job_id,))
                conn.commit()
                print(f"Updated job {job_id} to 'done'")
        
        # Wait for half a second before checking again
        time.sleep(0.5)
except KeyboardInterrupt:
    print("Stopped by user")
finally:
    # Close the database connection when done
    conn.close()


Stopped by user


In [252]:
# def erase_all_data():
#     # List of all your table names
#     table_names = ['google_search_results', 'Papers', 'Query_Papers', 'jobs']

#     # Open a new connection
#     conn = connection()
#     c = conn.cursor()

#     try:

#         # Truncate each table
#         for table in table_names:
#             c.execute(f"TRUNCATE TABLE {table} RESTART IDENTITY CASCADE;")  # RESTART IDENTITY resets serial counters, CASCADE deletes data in dependent tables as well

#         # Commit the transaction
#         conn.commit()
#         print("All data has been erased from all tables.")
#     except Exception as e:
#         # If an error occurs, rollback any changes made during the transaction
#         conn.rollback()
#         print(f"An error occurred: {e}. Transaction rolled back.")
#     finally:
#         # Close the cursor and connection
#         c.close()
#         conn.close()

# # Call the function
# erase_all_data()


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6d52007a-f237-4857-b1f1-3ccb95216ee4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>