# SEMA Semantic Agent. Arxiv search powered by LLMs

What it does:
- Convert user query into keyword search queries
- Google search top 10 results with SERP API
- Scrape html for each result, convert to markdown
- Structure output using function calling -> json to get paper, title
- Call arxiv to get paper, abstract, metadata
- Call Google Scholar to get citations, ...
- Use LLM to answer user query based on the paper, evaluate answer relevance
- Rank results based on citations, relevance to user query
- Print results in structured format, give links to download, or to use in notebook LM

# Setup

load secret variables

In [40]:
import os
from dotenv import load_dotenv
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")
serp_api_key = os.environ.get("SERP_API_KEY")
gemini_api_key = os.environ.get("GEMINI_API_KEY")
llamaindex_api_key = os.environ.get("LLAMAINDEX_API_KEY")

# Hide part of the key
openai_api_key_hidden = openai_api_key[:3] + "*" * (len(openai_api_key) - 6) + openai_api_key[-3:]
serp_api_key_hidden = serp_api_key[:3] + "*" * (len(serp_api_key) - 6) + serp_api_key[-3:]
gemini_api_key_hidden = gemini_api_key[:3] + "*" * (len(gemini_api_key) - 6) + gemini_api_key[-3:]
llamaindex_api_key_hidden = llamaindex_api_key[:3] + "*" * (len(llamaindex_api_key) - 6) + llamaindex_api_key[-3:]

# Print the hidden keys
print(f"OpenAI API Key (hidden): {openai_api_key_hidden}")
print(f"Serp API Key (hidden): {serp_api_key_hidden}")
print(f"Gemini API Key (hidden): {gemini_api_key_hidden}")
print(f"Llamaindex API Key (hidden): {llamaindex_api_key_hidden}")

OpenAI API Key (hidden): sk-*********************************************0jF
Serp API Key (hidden): 68c**********************************************************266
Gemini API Key (hidden): AIz*********************************MUc
Llamaindex API Key (hidden): llx**********************************************hA3


In [41]:
import requests
import json

# Set up your SERP API key
# It's better to use an environment variable for API keys

def search_google(query):
    params = {
        "engine": "google",
        "q": query,
        "api_key": serp_api_key,
        "location": "San Francisco Bay Area, United States",
        "google_domain": "google.com",
        "gl": "us",
        "hl": "en",
        "num": "10"
    }
    response = requests.get("https://serpapi.com/search", params=params)
    response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
    results = response.json()
    # Extracting only the needed information
    formatted_data = {
        "organic_results": [
            {
                "link": result["link"],
                "title": result["title"],
                "snippet": result.get("snippet", "")
            } for result in results.get("organic_results", [])
        ]
    }
    # Assuming search_results is your JSON dictionary obtained from the search
    organic_results = formatted_data.get('organic_results', [])

    # Initialize an empty list to store all the links
    all_links = []

    # Loop through each result in the organic results
    for result in organic_results:
        # Extract the link if it exists and add it to the list
        if 'link' in result:
            all_links.append(result['link'])
    return all_links

# Example usage
query = "Top academic papers on LLMs"
search_google(query)

['https://www.topbots.com/top-llm-research-papers-2023/',
 'https://medium.com/@thedatabeast/top-10-breakthrough-research-papers-on-large-language-models-llms-in-2023-pioneering-7abfcb69da7f',
 'https://levelup.gitconnected.com/best-papers-on-large-language-models-ac01b13b94b3',
 'https://community.openai.com/t/foundational-must-read-gpt-llm-papers/197003',
 'https://www.reddit.com/r/MLQuestions/comments/ze9e5x/can_anyone_recommend_an_llm_that_handles_research/',
 'https://analyticsindiamag.com/13-not-to-miss-research-papers-on-llms/',
 'https://github.com/Hannibal046/Awesome-LLM',
 'https://yousefhosni.medium.com/top-important-llm-papers-for-the-week-from-01-01-to-07-01-4e3be08ac69b',
 'https://magazine.sebastianraschka.com/p/10-ai-research-papers-2023']

# Main logic

### Scrape the content of the page displayed in the search results

In [104]:
import requests
from bs4 import BeautifulSoup

def fetch_url_content(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, headers=headers)

    # Initialize the default response structure
    result = {
        "status": response.status_code,
        "soup": None
    }

    # Check if the request was successful
    if response.status_code == 200:
        # Adjusts encoding to match what the response seems to use
        response.encoding = response.apparent_encoding
        
        # Now using response.text to utilize the corrected encoding rather than response.content
        result['soup'] = BeautifulSoup(response.text, 'html.parser')
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

    return result

# Test the function with a URL
url = 'https://medium.com/@thedatabeast/top-10-breakthrough-research-papers-on-large-language-models-llms-in-2023-pioneering-7abfcb69da7f'
response = fetch_url_content(url)
print("Status Code:", response['status'])
print("Soup:", response['soup'])

Status Code: 200
Soup: <!DOCTYPE html>
<html lang="en"><head><title data-rh="true">Top 10 Breakthrough Research Papers on Large Language Models (LLMs) in 2023: Pioneering Developments and Practical Applications | by The Data Beast | Medium</title><meta charset="utf-8" data-rh="true"/><meta content="width=device-width,minimum-scale=1,initial-scale=1,maximum-scale=1" data-rh="true" name="viewport"/><meta content="#000000" data-rh="true" name="theme-color"/><meta content="Medium" data-rh="true" name="twitter:app:name:iphone"/><meta content="828256236" data-rh="true" name="twitter:app:id:iphone"/><meta content="Medium" data-rh="true" property="al:ios:app_name"/><meta content="828256236" data-rh="true" property="al:ios:app_store_id"/><meta content="com.medium.reader" data-rh="true" property="al:android:package"/><meta content="542599432471018" data-rh="true" property="fb:app_id"/><meta content="Medium" data-rh="true" property="og:site_name"/><meta content="article" data-rh="true" property="

### Convert resulting html into markdown

In [43]:
import html2text
# Function to convert HTML to Markdown
def html_to_markdown(html_content):
    # Create a converter object
    converter = html2text.HTML2Text()
    converter.ignore_links = False  # Set to True if you want to ignore converting links
    
    # Convert the HTML content to Markdown
    markdown = converter.handle(html_content)

    return markdown

markdown = html_to_markdown(str(response['soup']))
print(markdown)

[Open in
app](https://rsci.app.link/?%24canonical_url=https%3A%2F%2Fmedium.com%2Fp%2F7abfcb69da7f&%7Efeature=LoOpenInAppButton&%7Echannel=ShowPostUnderUser&source=---two_column_layout_nav----------------------------------)

Sign up

[Sign
in](/m/signin?operation=login&redirect=https%3A%2F%2Fmedium.com%2F%40thedatabeast%2Ftop-10-breakthrough-
research-papers-on-large-language-models-llms-
in-2023-pioneering-7abfcb69da7f&source=post_page---two_column_layout_nav
-----------------------global_nav-----------)

[](/?source=---two_column_layout_nav----------------------------------)

[Write](/m/signin?operation=register&redirect=https%3A%2F%2Fmedium.com%2Fnew-
story&source=---two_column_layout_nav-----------------------
new_post_topnav-----------)

[](/search?source=---two_column_layout_nav----------------------------------)

Sign up

[Sign
in](/m/signin?operation=login&redirect=https%3A%2F%2Fmedium.com%2F%40thedatabeast%2Ftop-10-breakthrough-
research-papers-on-large-language-models-llms-
in

### Convert markdown into a structured JSON format using function calling

we'll structure the JSON to include the page title, page summary, and details for each paragraph (title, content, and links)

In [44]:
from pydantic import BaseModel, HttpUrl
from typing import List
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.llms.openai import OpenAI
import tiktoken
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Define your Pydantic models
class Link(BaseModel):
    url: HttpUrl

class Paper(BaseModel):
    paper_title: str
    content: str
    links: List[Link] = []

class Page(BaseModel):
    title: str
    summary: str
    paragraphs: List[Paper]

# Define the OpenAI Pydantic program
def process_markdown(markdown: str, query: str):
    max_length: int = 16000  # Updated max length for token count

    # Check token length before splitting
    token_count = count_tokens(markdown)  # Implement this function
    if token_count > max_length:
        markdown_parts = split_into_parts(markdown, max_length)
    else:
        markdown_parts = [markdown]  # No need to split

    results = []
    for part in markdown_parts:
        print("Current part length (tokens):", count_tokens(part))

        # Define the OpenAI Pydantic program
        prompt_template_str = """
        Given the following markdown_content, extract only structured information about academic papers including paper title, content, and links. The papers should reflect answers to the user query {user_query}:
        {markdown_content}
        """
        program = OpenAIPydanticProgram.from_defaults(
            output_cls=Page,
            llm=OpenAI(model="gpt-3.5-turbo-1106"),
            prompt_template_str=prompt_template_str,
            allow_multiple=False,
            verbose=True,
        )

        # Run the program to get structured output
        description_str = f"Structured json of search results based on a user {query}"
        try:
            output = program(markdown_content=part, user_query=query, description=description_str)
            results.append(output)
        except Exception as e:
            # Catch all exceptions
            if hasattr(e, 'error') and 'message' in e.error:
                print(f"Error: {e.error['message']}")
            elif hasattr(e, 'args') and e.args:
                print(f"Error: {e.args[0]}")
            else:
                print(f"An unexpected error occurred: {e}")
            continue

    # Combine results from all parts or handle as needed
    combined_result = combine_page_results(results)
    return combined_result

# Function to count tokens (replace with your implementation)
def count_tokens(text: str) -> int:
    # Use your preferred tokenizer (e.g., tiktoken)
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo").encode
    return len(tokenizer(text))

# Assuming 'results' is a list of Page objects or similar structured data
def combine_page_results(results: List[Page]) -> Page:
    if not results:
        return None  # Or some default value
    
    # Start with the title and summary from the first result
    combined_title = results[0].title
    combined_summary = results[0].summary
    combined_paragraphs = []

    # Iterate through all results and combine the paragraphs
    for result in results:
        combined_paragraphs.extend(result.paragraphs)  # Assuming 'paragraphs' is a list of 'Paper' objects
    
    # Create a new combined Page object
    combined_page = Page(
        title=combined_title,
        summary=combined_summary,
        paragraphs=combined_paragraphs
    )
    return combined_page

def split_into_parts(text: str, max_length: int) -> List[str]:
    paragraphs = text.split('\n\n')
    parts = []
    current_part = ""

    for paragraph in paragraphs:
        if count_tokens(current_part) + count_tokens(paragraph) + 2 > max_length:  # +2 for the two newlines
            parts.append(current_part)
            current_part = paragraph  # Start new part with the current paragraph
        else:
            # Add paragraph to current part, include two newlines if it's not the first paragraph
            current_part += ('\n\n' + paragraph) if current_part else paragraph

    if current_part:  # Add the last part if not empty
        parts.append(current_part)
    
    return parts


result = process_markdown(markdown, query)
print(result)

Current part length (tokens): 2513
Function call: Page with args: {"title":"Top 10 Breakthrough Research Papers on Large Language Models (LLMs) in 2023: Pioneering Developments and Practical Applications","summary":"The top 10 breakthrough research papers on large language models (LLMs) in 2023, including practical applications and pioneering developments.","paragraphs":[{"paper_title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","content":"Released by Google AI Language team, BERT introduced a deep bidirectional architecture, which enhanced transfer learning demonstrated by unsupervised pre-training.","links":[{"url":"https://arxiv.org/abs/1810.04805"}]},{"paper_title":"BlenderBot 3: A deployed conversational agent that continually learns to responsibly engage","content":"From Meta AI, BlenderBot 3, with its 175 billion parameters, can scour the internet, setting it apart from other conversational bots.","links":[{"url":"https://arxiv.org/abs/2208

In [45]:
import json

# Assuming `output` is your object and it has a method `.dict()` to convert it to a dictionary.
# If `output` is already a dictionary, you can skip the `.dict()` conversion.
output_dict = result.dict() if hasattr(result, 'dict') else result

# Convert to JSON string with indentation for readability
pretty_output = json.dumps(output_dict, indent=4, default=str)

# Print with added line breaks
print(pretty_output)

{
    "title": "Top 10 Breakthrough Research Papers on Large Language Models (LLMs) in 2023: Pioneering Developments and Practical Applications",
    "summary": "The top 10 breakthrough research papers on large language models (LLMs) in 2023, including practical applications and pioneering developments.",
    "paragraphs": [
        {
            "paper_title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
            "content": "Released by Google AI Language team, BERT introduced a deep bidirectional architecture, which enhanced transfer learning demonstrated by unsupervised pre-training.",
            "links": [
                {
                    "url": "https://arxiv.org/abs/1810.04805"
                }
            ]
        },
        {
            "paper_title": "BlenderBot 3: A deployed conversational agent that continually learns to responsibly engage",
            "content": "From Meta AI, BlenderBot 3, with its 175 billion parameters,

### Put data into a local database

establish db connection

In [129]:
import psycopg2
import os

def connection():
    """Creates and returns a new database connection."""
    try:
        conn = psycopg2.connect(
            user=os.environ["MY_INTEGRATION_USER"],
            password=os.environ["MY_INTEGRATION_PASSWORD"],
            host=os.environ["MY_INTEGRATION_HOST"],
            port=os.environ["MY_INTEGRATION_PORT"],
            database=os.environ["MY_INTEGRATION_DATABASE"]
        )
        
        # Test the connection
        with conn.cursor() as cursor:
            cursor.execute("SELECT version();")
            record = cursor.fetchone()
            print("You are connected to - ", record)
        
        return conn  # Return the connection object if successful

    except (Exception, psycopg2.Error) as error:
        print("Error while connecting to database", error)
        return None  # Return None if connection was not successful

conn = connection()

In [48]:
import psycopg2
import os

# Function to create tables in the database
def create_tables():
    # Define your SQL statements for creating tables
    sql_commands = [
        """
        CREATE TABLE IF NOT EXISTS google_search_results (
            url TEXT PRIMARY KEY,
            html TEXT,
            scraping_status TEXT,
            processed_markdown TEXT,
            query TEXT
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS Papers (
            id SERIAL PRIMARY KEY,
            paper_title TEXT,
            source_content TEXT,
            links TEXT,
            arxiv_link TEXT UNIQUE,
            arxiv_title TEXT,
            arxiv_abstract TEXT,
            arxiv_metadata TEXT,
            arxiv_filename TEXT,
            arxiv_paper_markdown TEXT,
            citations INTEGER,
            versions INTEGER
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS Query_Papers (
            id SERIAL PRIMARY KEY,
            query TEXT,
            arxiv_link TEXT,
            relevance_score REAL,
            final_rank INTEGER,
            relevant_answer TEXT,
            paper_stats TEXT,
            paper_metadata_filtered TEXT,
            download_link TEXT,
            CONSTRAINT unique_query_arxiv_link UNIQUE (query, arxiv_link)
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS jobs (
            job_id SERIAL PRIMARY KEY,
            query TEXT,
            job_status TEXT,
            printed_ranks INTEGER DEFAULT 0
        );
        """
    ]
    try:
        with conn.cursor() as cursor:
            # Execute each SQL command separately
            for sql_command in sql_commands:
                cursor.execute(sql_command)
            conn.commit()  # Commit the transaction
            print("All tables are created successfully.")

    except (Exception, psycopg2.Error) as error:
        print("Failed to create tables", error)
        conn.rollback()  # Rollback the transaction on error

    finally:
        if conn:
            conn.close()
            print("Database connection is closed.")
# Main script execution
try:
    connection()
    create_tables()

except (Exception, psycopg2.Error) as error:
    print("Error while connecting to database", error)


You are connected to -  ('PostgreSQL 15.1 (Ubuntu 15.1-1.pgdg20.04+1) on aarch64-unknown-linux-gnu, compiled by gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0, 64-bit',)
All tables are created successfully.
Database connection is closed.


In [117]:
def insert_arxiv_links_into_db(html_content, user_query):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all hyperlinks within the HTML content
    links = soup.find_all('a', href=True)
    
    # Filter for links containing 'arxiv.org'
    arxiv_links = [link['href'] for link in links if 'arxiv.org' in link['href']]
    print(f"arxiv_links[{len(arxiv_links)}]: {arxiv_links}")

    if arxiv_links:
        try:
            conn = connection()
            c = conn.cursor()
            # Assuming auto-commit is enabled by default; otherwise, manage transactions explicitly if needed
            # If transactions need to be managed manually, ensure this is done outside of a transaction block

            # Insert arxiv links into Papers table in bulk if there are any
            arxiv_links_data = [(link,) for link in arxiv_links]  # Prepare data for bulk insert
            psycopg2.extras.execute_batch(
                c, 
                "INSERT INTO Papers (arxiv_link) VALUES (%s) ON CONFLICT (arxiv_link) DO NOTHING",
                arxiv_links_data
            )
            
            # Insert records associated with user query in Query_Papers table in bulk
            query_papers_data = [(user_query, link) for link in arxiv_links]  # Prepare data
            psycopg2.extras.execute_batch(
                c, 
                "INSERT INTO Query_Papers (query, arxiv_link) VALUES (%s, %s) ON CONFLICT (query, arxiv_link) DO NOTHING",
                query_papers_data
            )

            # Commit the transaction
            conn.commit()
            # print(f"Successfully inserted records associated with the query '{user_query}' into the database.")

        except Exception as e:
            # Rollback any changes if an error occurs
            conn.rollback()
            print(f"Transaction rolled back. Error occurred: {e}")
        if conn:
            conn.close()
# Connect to the database

# Example user query
user_query = "Example Query for Testing"
# Example HTML content
html_content = """
<html>
    <body>
        <p>Here are some arXiv papers that might interest you:</p>
        <a href="https://arxiv.org/abs/12457457234623434.56789">Paper 1</a>
        <a href="https://arxiv.org/abs/98724723463246234623466.54321">Paper 2</a>
        <a href="http://example.com">Non-arXiv link</a>
        <a href="https://arxiv.org/abs/11223472347234722.3344">Paper 3</a>
    </body>
</html>
"""
insert_arxiv_links_into_db(html_content, user_query)

arxiv_links[3]: ['https://arxiv.org/abs/12457457234623434.56789', 'https://arxiv.org/abs/98724723463246234623466.54321', 'https://arxiv.org/abs/11223472347234722.3344']


In [50]:
def insert_papers_into_db(result, query):
    arxiv_links = []
    if result is None:
        print("No data to insert into Papers table.")
        return
    # Parse JSON data
    output_dict = result.dict() if hasattr(result, 'dict') else result
    pretty_output = json.dumps(output_dict, indent=4, default=str)
    data = json.loads(pretty_output)
    if data is None or 'paragraphs' not in data:
        print("Invalid or empty data.")
        print('Parsed website stuctured data=', data)
        return

    # Connect to SQLite database
    conn = connection()
    c = conn.cursor()

    try:
        # Start transaction
        c.execute("BEGIN;")
        # Insert data into Papers table
        for paragraph in data['paragraphs']:
            paper_title = paragraph['paper_title']
            source_content = paragraph['content']
            links = json.dumps(paragraph['links'])  # Convert list of links to JSON string
            # Initialize an empty arXiv link
            arxiv_link = None
            # Search for the arXiv link among the links
            for link in paragraph['links']:
                if 'arxiv.org' in link['url']:
                    temp_link = link['url'].replace('.pdf', '')  # Remove .pdf if present
                    # Remove any trailing file identifiers after the arXiv ID
                    temp_link = temp_link.split('/abs/')[1] if '/abs/' in temp_link else temp_link.split('/')[-1]
                    arxiv_link = 'https://arxiv.org/abs/' + temp_link  # Construct the cleaned arXiv link
                    # Add the arXiv link to the list
                    if arxiv_link not in arxiv_links:
                        arxiv_links.append(arxiv_link)
                    break  # Stop searching once the arXiv link is found
            # Check if the arxiv_link already exists in the database
            c.execute('SELECT COUNT(*) FROM Papers WHERE arxiv_link = %s', (arxiv_link,))
            if c.fetchone()[0] == 0:  # If the count is 0, then the link does not exist
                # SQL statement for inserting data
                insert_sql = '''
                INSERT INTO Papers (paper_title, source_content, links, arxiv_link) VALUES (%s, %s, %s, %s)
                '''
                c.execute(insert_sql, (paper_title, source_content, links, arxiv_link))
            else:
                print(f'Skipping insert: arXiv link already exists in the database: {arxiv_link}')

        for link in arxiv_links:
            # Insert new row into Query_Papers if it does not exist
            c.execute("INSERT INTO Query_Papers (query, arxiv_link) SELECT %s, %s WHERE NOT EXISTS (SELECT 1 FROM Query_Papers WHERE query = %s AND arxiv_link = %s)", (query, link, query, link))

        # Commit the transaction
        conn.commit()
        print(f"Processed and inserted links associated with the query '{query}' into the database.")

    except Exception as e:
        # Rollback the transaction on error
        conn.rollback()
        print(f"An error occurred: {e}. Transaction was rolled back.")

    finally:
        if conn:
            conn.close()
        pass

# Example data
query = "Example Query for Testing"
result_data = {
    "title": "Top 10 Breakthrough Research Papers on Large Language Models (LLMs) in 2023: Pioneering Developments and Practical Applications",
    "summary": "The following are the top 10 breakthrough research papers on large language models (LLMs) in 2023, along with their practical applications and details.",
    "paragraphs": [
        {"paper_title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", "content": "Released by Google AI Language team, BERT introduced a deep bidirectional architecture, which enhanced transfer learning demonstrated by unsupervised pre-training.", "links": [{"url": "https://arxiv.org/abs/1810.04805"}]},
        {"paper_title": "BlenderBot 3: A deployed conversational agent that continually learns to responsibly engage", "content": "From Meta AI, BlenderBot 3, with its 175 billion parameters, can scour the internet, setting it apart from other conversational bots.", "links": [{"url": "https://arxiv.org/abs/2208.03188"}]},
        # Add more papers as needed...
    ]
}
# result_string = json.dumps(result_data)
insert_papers_into_db(result_data, query)

You are connected to -  ('PostgreSQL 15.1 (Ubuntu 15.1-1.pgdg20.04+1) on aarch64-unknown-linux-gnu, compiled by gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0, 64-bit',)
Skipping insert: arXiv link already exists in the database: https://arxiv.org/abs/1810.04805
Skipping insert: arXiv link already exists in the database: https://arxiv.org/abs/2208.03188
Processed and inserted links associated with the query 'Example Query for Testing' into the database.


In [51]:
def print_papers_table():
    conn = connection()
    c = conn.cursor()

    try:
        # Start transaction (mainly useful if there are preceding data manipulations)
        c.execute("BEGIN;")

        # Query all records from Papers table
        query_sql = 'SELECT * FROM Papers'  # Add any condition if necessary
        c.execute(query_sql)

        # Fetch all rows from the query
        all_rows = c.fetchall()

        # Get the column names
        field_names = [description[0] for description in c.description]

        # Check if the table is not empty
        if all_rows:
            print("Preview of Papers Table:")
            for row_counter, row in enumerate(all_rows, start=1):
                print(f"Row {row_counter}:")
                row_with_field_names = {
                    field_name: (content[:60] + '...' if isinstance(content, str) and len(content) > 60 else content) 
                    for field_name, content in zip(field_names, row)
                }
                for field, content in row_with_field_names.items():
                    print(f"{field}: {content}")
                print("-------------")  # Separator for readability
        else:
            print("The Papers table is currently empty.")

        # Commit if there were preceding changes; otherwise, this is optional for read-only operations
        conn.commit()

    except Exception as e:
        # Rollback any changes if an exception occurs
        conn.rollback()
        print(f"An error occurred: {e}")

    finally:
        if conn:
            conn.close()
        pass

# Call the function
print_papers_table()

arxiv_filename: FlashAttention_Fast_and_Memory-Efficient_Exact_Attention_wit...
arxiv_paper_markdown: None
citations: 599
versions: 9
id: 20
-------------
Row 140:
paper_title: None
source_content: None
links: None
arxiv_link: https://arxiv.org/abs/2303.15647
arxiv_title: Scaling Down to Scale Up: A Guide to Parameter-Efficient Fin...
arxiv_abstract: This paper presents a systematic overview and comparison of
...
arxiv_metadata: <?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://w...
arxiv_filename: Scaling_Down_to_Scale_Up_A_Guide_to_Parameter-Efficient_Fine...
arxiv_paper_markdown: None
citations: 61
versions: 2
id: 25
-------------
Row 141:
paper_title: None
source_content: None
links: None
arxiv_link: https://arxiv.org/abs/2203.02155
arxiv_title: Training language models to follow instructions with human f...
arxiv_abstract: Making language models bigger does not inherently make them ...
arxiv_metadata: <?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://w...
arxiv_f

Define database function to insert scraping results

In [108]:
def insert_scraping_results(url, html, status, query):    
    conn = connection()
    c = conn.cursor()
    try:
        # Check if the URL already exists in the table
        c.execute('SELECT COUNT(*) FROM google_search_results WHERE url = %s', (url,))
        count = c.fetchone()[0]

        if count == 0:
            # URL does not exist, insert new row
            c.execute('''
                INSERT INTO google_search_results (url, html, scraping_status, query)
                VALUES (%s, %s, %s, %s)
            ''', (url, html, status, query))
        else:
            # URL exists, skip inserting
            print("URL already exists in google_search_results. Skipping insert.")

        # Commit the transaction
        conn.commit()

    except Exception as e:
        # Rollback the transaction if an error occurs
        conn.rollback()
        print(f"An error occurred: {e}. Transaction was rolled back.")
    if conn:
        conn.close()

# Example
insert_scraping_results('https://www.topbots.com/top-llm-research-papers-2023/', '<html lang="en-US"><head>..</html>', 200, 'example query')

URL already exists in google_search_results. Skipping insert.


define a database function to check for processed markdown

In [53]:
def check_processed_markdown(url: str) -> bool:
    conn = connection()
    """Check if the markdown for a given URL has already been processed."""
    c = conn.cursor()
    c.execute("SELECT processed_markdown FROM google_search_results WHERE url = %s", (url,))
    result = c.fetchone()
    if result and result[0]:
        # If there's processed markdown, return True
        return True
    return False
    if conn:
        conn.close()

# Example usage:
url = 'https://www.topbots.com/top-llm-research-papers-2023/'
check_processed_markdown(url) 

You are connected to -  ('PostgreSQL 15.1 (Ubuntu 15.1-1.pgdg20.04+1) on aarch64-unknown-linux-gnu, compiled by gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0, 64-bit',)


False

define a database function to insert processed markdown

In [54]:
def insert_processed_markdown(url: str, processed_markdown: dict):  # processed_markdown should be a dict based on usage
    try:
        conn = connection()
        c = conn.cursor()
        
        # First, check if the URL exists in the database
        c.execute('SELECT COUNT(*) FROM google_search_results WHERE url = %s', (url,))
        url_exists = c.fetchone()[0]
        
        if url_exists:
            # Convert processed_markdown to a JSON string
            processed_markdown_str = json.dumps(processed_markdown, indent=4)  # Assuming processed_markdown is always a dict based on your usage
            
            # Update the row where the URL matches, setting the processed_markdown column
            c.execute('''
                UPDATE google_search_results 
                SET processed_markdown = %s 
                WHERE url = %s;
            ''', (processed_markdown_str, url))
            conn.commit()
            print(f"Processed markdown inserted successfully for URL: {url}")
        else:
            print(f"No entry found in the database for URL: {url}. Update skipped.")
        
    except Exception as e:
        print(f"An error occurred while inserting processed markdown: {e}")
    finally:
        if conn:
            conn.close()
            
url = 'https://test.url'
processed_markdown = {
    "Function call": "Page",
    "args": {
        "title": "Top academic papers on LLMs",
        "summary": "A list of academic papers and resources related to Large Language Models (LLMs) and their applications.",
        "paragraphs": [
            {
                "paper_title": "Awesome-LLM-hallucination",
                "content": "LLM hallucination paper list.",
                "links": [
                    {"url": "https://github.com/LuckyyySTA/Awesome-LLM-hallucination"}
                ]
            },
            {
                "paper_title": "awesome-hallucination-detection",
                "content": "List of papers on hallucination detection in LLMs.",
                "links": [
                    {"url": "https://github.com/EdinburghNLP/awesome-hallucination-detection"}
                ]
            },
            {
                "paper_title": "LLMsPracticalGuide",
                "content": "A curated (still actively updated) list of practical guide resources of LLMs",
                "links": [
                    {"url": "https://github.com/Mooler0410/LLMsPracticalGuide"}
                ]
            },
            # Add other papers here in the same format
        ]
    }
}

insert_processed_markdown(url, processed_markdown)

You are connected to -  ('PostgreSQL 15.1 (Ubuntu 15.1-1.pgdg20.04+1) on aarch64-unknown-linux-gnu, compiled by gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0, 64-bit',)
No entry found in the database for URL: https://test.url. Update skipped.


### Get metadata from arxiv for the paper

In [55]:
import xml.etree.ElementTree as ET

def fetch_arxiv_paper_from_url(arxiv_url):
    # Extract the arXiv ID from the provided URL
    arxiv_id = arxiv_url.split('/')[-1]
    # Ensure that .pdf is not part of the arXiv ID
    arxiv_id = arxiv_id.replace('.pdf', '')  # Remove '.pdf' if it's part of the ID

    print("Fetching information for arXiv ID:", arxiv_id)

    # Define the base URL for the arXiv API
    base_url = 'http://export.arxiv.org/api/query?'
    query_params = 'id_list={}&max_results=1'.format(arxiv_id)
    final_url = base_url + query_params  # Construct the final URL
    print("Final API Request URL:", final_url)  # Debug: print the URL to be requested

    # Make the request
    response = requests.get(final_url)

    # Check if the request was successful
    if response.status_code == 200:
        print("Raw XML response received")
        xml_data = response.text
        root = ET.fromstring(xml_data)
        ns = {'atom': 'http://www.w3.org/2005/Atom'}  # Namespace for parsing

        # Extract paper details
        link_element = root.find('.//atom:entry/atom:link[@rel="related"]', ns)
        if link_element is not None:
            pdf_url = link_element.attrib['href']
        else:
            pdf_url = None
        title = root.find('.//atom:entry/atom:title', ns).text.strip()
        abstract = root.find('.//atom:entry/atom:summary', ns).text.strip()
        published_date = root.find('.//atom:entry/atom:published', ns).text.strip()

        # Extract authors
        authors = [author.find('atom:name', ns).text for author in root.findall('.//atom:entry/atom:author', ns)]

        # Generate a sanitized file name from the title
        file_name = title.replace(':', '').replace(' ', '_') + '.pdf'

        # Print extracted information for debugging
        print(f"PDF URL: {pdf_url}")
        print(f"Title: {title}")
        print(f"File Name: {file_name}")
        print(f"Abstract: {abstract[:100]}..." if len(abstract) > 100 else abstract)
        print(f"Published Date: {published_date}")
        print(f"Authors: {', '.join(authors)}")

        # Return the collected information
        return xml_data, pdf_url, title, file_name, abstract, published_date, authors
    else:
        print("Failed to fetch data from arXiv API. Status code:", response.status_code)
        return None, None, None, None, None, None, None

# Example usage
arxiv_url = 'https://arxiv.org/abs/2302.13971'
fetch_arxiv_paper_from_url(arxiv_url)

Fetching information for arXiv ID: 2302.13971
Final API Request URL: http://export.arxiv.org/api/query?id_list=2302.13971&max_results=1
Raw XML response received
PDF URL: http://arxiv.org/pdf/2302.13971v1
Title: LLaMA: Open and Efficient Foundation Language Models
File Name: LLaMA_Open_and_Efficient_Foundation_Language_Models.pdf
Abstract: We introduce LLaMA, a collection of foundation language models ranging from
7B to 65B parameters. We...
Published Date: 2023-02-27T17:11:15Z
Authors: Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample


('<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3D%26id_list%3D2302.13971%26start%3D0%26max_results%3D1" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=&amp;id_list=2302.13971&amp;start=0&amp;max_results=1</title>\n  <id>http://arxiv.org/api/qJuhZNxbRqWajNrNkNtkRSmyBuQ</id>\n  <updated>2024-03-08T00:00:00-05:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/2302.13971v1</id>\n    <updated>2023-02-27T17:11:15Z</updated>\n    <published>2023-02-27T17:11:15Z</published>\n    <title>LLaMA: Open and Efficient Foundation Language 

### Download pdf of the paper

In [56]:
def download_pdf(pdf_url, file_name):
    # Create the "papers" directory if it doesn't exist
    papers_dir = "papers"
    if not os.path.exists(papers_dir):
        os.makedirs(papers_dir)

    # Construct the full file path
    file_path = os.path.join(papers_dir, file_name)

    # Check if the file already exists
    if os.path.exists(file_path):
        print("The paper already exists.")
        return file_path  # Return the file path

    # Send a GET request to download the PDF
    response = requests.get(pdf_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Write the PDF content to the file
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print("The paper has been downloaded successfully.")
        return file_path  # Return the file path
    else:
        # Return a status error message
        error_message = f"Failed to download the paper. Status code: {response.status_code}"
        return error_message

# Example usage
download_pdf('http://arxiv.org/pdf/2302.13971v1', 'LLaMA_Open_and_Efficient_Foundation_Language_Models.pdf')



The paper already exists.


'papers/LLaMA_Open_and_Efficient_Foundation_Language_Models.pdf'

### Convert pdf into markdown

In [57]:
import nest_asyncio
from llama_parse import LlamaParse

# This function will convert a given PDF file to Markdown format using LlamaParse
def convert_pdf_to_markdown(file_name):
    # Necessary for running async code in notebooks or scripts
    nest_asyncio.apply()

    # Initialize the LlamaParse parser
    parser = LlamaParse(
        api_key=llamaindex_api_key,
        result_type="markdown",  # Choose "markdown" as the output format
        verbose=True,  # Enable verbose output to see detailed logs
    )
    
    # Define the path to your PDF file
    pdf_file_path = os.path.join("./papers/", file_name)
    print(pdf_file_path, "type:", type(pdf_file_path))
    # Convert the PDF to Markdown
    # This is a synchronous call, you can also use asynchronous calls as shown in the documentation
    documents = parser.load_data(pdf_file_path)

    # Return the converted documents
    return documents

# Define the path to your PDF file
file_name = "Retrieval-Augmented_Generation_for_Knowledge-Intensive_NLP_Tasks.pdf"
documents = convert_pdf_to_markdown(file_name)

./papers/Retrieval-Augmented_Generation_for_Knowledge-Intensive_NLP_Tasks.pdf type: <class 'str'>
Started parsing the file under job_id 6a8f5721-ae50-4ddc-90a3-6bb2dbddeb07


In [58]:
markdown_content = None
if documents:
    # Assuming the first document contains the content
    # Use the get_text() method to retrieve the Markdown content
    markdown_content = documents[0].get_text()
    print(markdown_content)

    # Optionally, write the markdown content to a file
    with open('converted_markdown.md', 'w', encoding='utf-8') as markdown_file:
        markdown_file.write(markdown_content)

## Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks

Patrick Lewis†‡, Ethan Perez⋆, Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†

arXiv:2005.11401v4 [cs.CL] 12 Apr 2021

Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†

†Facebook AI Research; ‡University College London; ⋆New York University;

plewis@fb.com

### Abstract

Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge remain open research problems. Pre-trained models with a differentiable access mechanism to explicit non-parametric memory have so far b

### Get citations and number of versions from Google Scholar

In [59]:
from serpapi import GoogleSearch

def get_scholar_citations_versions(query_url):
    params = {
        "api_key": serp_api_key,  # Ensure serp_api_key is defined elsewhere
        "engine": "google_scholar",
        "q": query_url,
        "hl": "en"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    # Initialize the return values
    number_of_citations = None
    number_of_versions = None

    # Extracting number of citations and versions
    if 'organic_results' in results:
        if 'inline_links' in results['organic_results'][0]:
            if 'cited_by' in results['organic_results'][0]['inline_links']:
                number_of_citations = results["organic_results"][0]["inline_links"]["cited_by"]["total"]

            if 'versions' in results['organic_results'][0]['inline_links']:
                number_of_versions = results["organic_results"][0]["inline_links"]["versions"]["total"]

    return number_of_citations, number_of_versions

query_url = 'https://arxiv.org/abs/2302.13971'
citations, versions = get_scholar_citations_versions(query_url)
print("Number of citations:", citations)
print("Number of versions:", versions)

Number of citations: 4397
Number of versions: 13


### Gemini summary and relevance score

Gemini Set up

In [60]:
import google.generativeai as genai
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel('gemini-1.0-pro')

In [61]:
model

genai.GenerativeModel(
    model_name='models/gemini-1.0-pro',
    generation_config={},
    safety_settings={},
    tools=None,
)

Given `arxiv` structure, summarize and evaluate against user prompt. Give a heuritic score.

In [62]:
import re
import json

def process_arxiv(mkdn, metdata, query):
  # 1 - `arxiv` dict
  def extract_markdown(markdown_text, pattern):
    # Use re.findall to find all matches of the pattern in the markdown text
    matches = re.findall(pattern, markdown_text, re.MULTILINE)

    # Return the first match (if any)
    if matches:
        return matches[0]
    else:
        return None

  paper_title = extract_markdown(mkdn, r'^##\s+(.*)$')
  if paper_title is None:
    print("extract_markdown for paper_title isn't working")

  abstract = extract_markdown(mkdn, r'^Abstract(.*)#')
  if abstract is None:
    print("extract_markdown for abstract isn't working...hardcoding the abstract instead")
    abstract = '''We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.'''

  arxiv = {'paper_title': paper_title, 'abstract': abstract, 'metadata': metadata, 'paper': mkdn}

  # 2 - Summarizer
  prompt = '''Please summarize the following paper in one sentence given the user query "{query}". The paper is provided in a structured format {paper_format} \n\nDocument: {document}'''.format(query=query, document=arxiv, paper_format={key: "" for key in arxiv.keys()})
  print(prompt, "\nGenerating summarization............")

  if model.count_tokens(prompt).total_tokens > 28_000:
    print("The prompt is too long, visiting https://aistudio.google.com/app/prompts/new_freeform to manually use Gemini 1.5 pro instead with the prompt above.")
  relevant_answer = model.generate_content(prompt).text

  print(relevant_answer)

  # 3 - Relevance scorer
  prompt = '''From a scale of 1 to 5, rate how relevant the following paper is with the user query "{query}". The paper is provided in a structured format {paper_format}. Please provide the score in the format of a json object with one key, 'score'. Example: {{"score": 5}}. Also please provide reasoning why it doesn't have a higher or lower relevance score. \n\nDocument: {document}'''.format(query=query, document=arxiv, paper_format={key: "" for key in arxiv.keys()})
  print(prompt, "\nGenerating............")

  if model.count_tokens(prompt).total_tokens > 28_000:
    print("The prompt is too long, visiting https://aistudio.google.com/app/prompts/new_freeform to manually use Gemini 1.5 pro instead with the prompt above.")

  model_response = model.generate_content(prompt).text

  re_match = re.search(r'"score": (\d+)', model_response)
  relevance_score = re_match.group(1)

  print("relevance score: " + relevance_score)

  return {
      'relevance_score': relevance_score,
      'relevant_answer': relevant_answer
  }

  query

  #@title `mkdn` and `metadata`
metadata = markdown_content #right now it's just the entire paper pdf

#@title Extractors to process `mkdn` and `metadata` into `arxiv` dict

import re

def extract_markdown(markdown_text, pattern):
  # Use re.findall to find all matches of the pattern in the markdown text
  matches = re.findall(pattern, markdown_text, re.MULTILINE)

  # Return the first match (if any)
  if matches:
      return matches[0]
  else:
      return None

paper_title = extract_markdown(metadata, r'^##\s+(.*)$')
if paper_title is None:
  print("extract_markdown for paper_title isn't working")

abstract = extract_markdown(metadata, r'^Abstract(.*)#')
if abstract is None:
  print("extract_markdown for abstract isn't working...hardcoding the abstract instead")
  abstract = '''We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.'''

arxiv = {'paper_title': paper_title, 'abstract': abstract, 'metadata': metadata, 'paper': metadata}



import json

# Convert to JSON string with indentation for readability
pretty_arxiv_output = json.dumps(arxiv, indent=4, default=str)

# Print with added line breaks
print("\narxiv=",)
print(pretty_arxiv_output)

extract_markdown for abstract isn't working...hardcoding the abstract instead

arxiv=
{
    "paper_title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    "abstract": "We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.",
    "metadata": "## Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks\n\nPatrick Lewis\u2020\u2021, Ethan Perez\u22c6, Aleksandra Piktus\u2020, Fabio Petroni\u2020, Vladimir Karpukhin\u2020, Naman Goyal\u2020, Heinrich K\u00fcttler\u2020\n\narXiv:2005.11401v4 [cs.CL] 12 Apr 2

### OpenAI summary

In [134]:
import openai
from openai import AsyncOpenAI
import sqlite3

async def query_info_with_gpt(arxiv_paper_markdown, arxiv_metadata, user_query):
    MAX_CONTEXT_LENGTH = 15500 

    # Initial context setup and trimming
    context = f"Context: {arxiv_metadata}\n{arxiv_paper_markdown}"
    while count_tokens(context) > MAX_CONTEXT_LENGTH:
        char_to_token_ratio = len(context) / count_tokens(context)
        max_char_length = int(MAX_CONTEXT_LENGTH * char_to_token_ratio)
        context = context[:max_char_length]

    # Constructing the prompt
    prompt = (
        f"Summarize this in 100 characters based on the user query {user_query}: {context}"
    )

    # Setting up OpenAI client
    client = AsyncOpenAI(api_key=openai_api_key)

    # Making an asynchronous API call
    try:
        response = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": prompt}
            ],
            model="gpt-4"  # You can switch to other models if needed
        )
        answer = response.choices[0].message.content  # Extracting the response
        return answer
    except openai.error.InternalServerError as e:
        print(f"OpenAI API Internal Server Error: {e}")

# Define the main operation
async def test():
    # Connect to the SQLite database
    conn = connection()
    c = conn.cursor()

    # Fetch data from the Papers table
    c.execute("SELECT * FROM Papers WHERE arxiv_metadata IS NOT NULL LIMIT 1")
    row = c.fetchone()

    # Check if data exists
    if row:
        _, _, _, _, _, _, _, arxiv_metadata, _, arxiv_paper_markdown, _, _ = row
        user_query = "Insert your query here"  # Define the user query as needed

        # Call the GPT function and handle response
        response = await query_info_with_gpt(arxiv_paper_markdown, arxiv_metadata, user_query)
        print("LLM response:", response)

        # Here, insert the response into the database if needed, or handle it as necessary

    else:
        print("No data found in the Papers table.")
    if conn is not None:
        conn.close()  # Ensure the connection is closed if it's not None
# Execute the main operation
import asyncio
asyncio.run(test())

LLM response: The file is about FEVER, a large-scale dataset used for fact extraction and verification.


### Relevance score (placeholder)

# Processing loops

Extract search results from Google based on user query

In [109]:
import json
import time

class PageEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Page):
            return obj.dict()  # Convert Page to a dictionary
        elif isinstance(obj, Paper):
            return obj.dict()  # Convert Paper to a dictionary
        elif isinstance(obj, Link):
            return obj.dict()  # Convert Link to a dictionary
        return json.JSONEncoder.default(self, obj)  # Handle other types

def fetch_and_process(link, query):
    conn = None
    try:
        conn = connection()  # Open a new connection
        c = conn.cursor()  # Create a new cursor

        c.execute("SELECT scraping_status, html FROM google_search_results WHERE url = %s", (link,))
        result = c.fetchone()
        if result and result[0] == '200':
            print(f"Status: {result[0]}, already fetched for URL: {link}")
            html_content = result[1].replace('\x00', '')  # Sanitize HTML content from database
        else:
            response = fetch_url_content(link)
            print(f"Status:{response['status']} for URL: {link}")
            html_content = response['soup'].decode('utf-8', 'replace') if response['status'] == 200 else ""
            html_content = html_content.replace('\x00', '')
            insert_scraping_results(link, html_content, str(response['status']), query)  # Ensure 'insert_scraping_results' correctly uses the 'conn' and 'c' objects

        if html_content:
            insert_arxiv_links_into_db(html_content, query)  # Adjust 'insert_arxiv_links_into_db' to take 'conn' and 'c' as additional parameters

    except psycopg2.OperationalError as e:
        print(f"Database operation failed for URL: {link}, Error: {e}")
        if conn:
            conn.rollback()  # Roll back any changes due to error

    finally:
        if c:
            c.close()  # Close the cursor
        if conn:
            conn.close()  # Close the connection

def search_and_fetch_google(query):
    search_results = search_google(query)  # Ensure this function is defined elsewhere
    print(query, search_results)

    # Sequential execution
    for link in search_results:
        try:
            data = fetch_and_process(link, query)
        except Exception as exc:
            print(f'fetch_and_process exception: {exc}')

    print('Finished extracting search results pages')

# Example usage
query = "Top academic papers on Chain of Thought"
search_and_fetch_google(query)


Top academic papers on Chain of Thought ['https://arxiv.org/abs/2201.11903', 'https://openreview.net/pdf?id=_VjQlMeSB_J', 'https://www.kdnuggets.com/2023/07/power-chain-thought-prompting-large-language-models.html', 'https://www.linkedin.com/pulse/chain-thought-new-frontier-prompt-engineering-tiran-dagan-wkuce', 'https://research.google/pubs/self-consistency-improves-chain-of-thought-reasoning-in-language-models/', 'https://openreview.net/forum?id=_VjQlMeSB_J', 'https://arxiv.org/pdf/2201.11903', 'https://medium.com/@JerryCuomo/lets-think-step-by-step-advanced-reasoning-in-business-with-chain-of-thought-prompting-dd5ae8a6008', 'https://www.searchenginejournal.com/google-chain-of-thought-prompting/450106/', 'https://www.youtube.com/watch?v=538uaE-AACs']
Status: 200, already fetched for URL: https://arxiv.org/abs/2201.11903
arxiv_links[34]: ['https://info.arxiv.org/about/ourmembers.html', 'https://info.arxiv.org/about/donate.html', 'https://info.arxiv.org/help', 'https://arxiv.org/search

Extracting and processing arxiv papers: pdf, markdown, metadata, citations, versions

In [132]:
import json
import psycopg2.extras

def get_scholar_citations_versions_loop(query):
    conn = connection()  # Ensure this is a valid connection function
    c = conn.cursor()

    try:
        # Fetch the first 30 Query_Papers rows associated with the given query
        c.execute("""
            SELECT id, arxiv_link FROM Query_Papers 
            WHERE query = %s 
            LIMIT 30
        """, (query,))  # Limit to the first 30 results
        query_papers_to_update = c.fetchall()

        # Prepare batch update lists
        papers_updates = []
        query_papers_updates = []

        for paper_id, arxiv_link in query_papers_to_update:
            try:
                # Fetch citations and versions
                number_of_citations, number_of_versions = get_scholar_citations_versions(arxiv_link)
                
                # Append data for batch update in Papers table
                papers_updates.append((number_of_citations, number_of_versions, arxiv_link))
                
                # Create JSON object with citations and versions, append for batch update in Query_Papers
                paper_stats_json = json.dumps({'citations': number_of_citations, 'versions': number_of_versions})
                query_papers_updates.append((paper_stats_json, paper_id))
                
            except Exception as e:
                print(f"An error occurred while processing paper {arxiv_link}: {e}")

        # Perform batch updates
        psycopg2.extras.execute_batch(c, "UPDATE Papers SET citations = %s, versions = %s WHERE arxiv_link = %s",
                                      papers_updates)
        psycopg2.extras.execute_batch(c, "UPDATE Query_Papers SET paper_stats = %s WHERE id = %s",
                                      query_papers_updates)

        # Commit all changes
        conn.commit()

    except Exception as e:
        # If an exception occurs, roll back all database changes
        conn.rollback()
        print(f"An error occurred while fetching Query_Papers for the query '{query}': {e}")

    finally:
        # Ensure resources are cleaned up
        c.close()
        conn.close()

# Assuming get_scholar_citations_versions and connection are correctly defined elsewhere


In [69]:
def fetch_arxiv_paper_from_url_loop(query):
    conn = connection()  # Ensure this is a function that returns a DB connection
    c = conn.cursor()

    # Select records from Query_Papers related to the specific query and with final_rank between 1 and 10
    try:
        c.execute("""
            SELECT Query_Papers.id, Papers.paper_title, Papers.arxiv_link
            FROM Query_Papers
            JOIN Papers ON Query_Papers.arxiv_link = Papers.arxiv_link
            WHERE Query_Papers.query = %s AND final_rank BETWEEN 1 AND 10
            ORDER BY final_rank ASC
        """, (query,))

        papers_to_update = c.fetchall()

        for q_id, paper_title, arxiv_link in papers_to_update:
            print(f"Updating missing information for paper: {paper_title}")
            if arxiv_link:
                try:
                    # Fetch paper metadata from arXiv
                    xml_data, pdf_url, title, file_name, abstract, published_date, authors = fetch_arxiv_paper_from_url(arxiv_link)

                    # Update Papers table with fetched metadata
                    c.execute("""
                        UPDATE Papers 
                        SET arxiv_title = %s, arxiv_abstract = %s, arxiv_metadata = %s, arxiv_filename = %s 
                        WHERE arxiv_link = %s
                    """, (title, abstract, xml_data, file_name, arxiv_link))

                    # Update Query_Papers table with filtered metadata and download link
                    paper_metadata_filtered = {'title': title, 'abstract': abstract, 'published_date': published_date, 'authors': authors}
                    c.execute("""
                        UPDATE Query_Papers 
                        SET paper_metadata_filtered = %s, download_link = %s 
                        WHERE id = %s
                    """, (json.dumps(paper_metadata_filtered), pdf_url, q_id))

                    # Commit the transaction
                    conn.commit()

                except Exception as e:
                    print(f"An error occurred while updating paper {paper_title}: {e}")
            else:
                print(f"No arXiv link found for paper: {paper_title}")
    except Exception as e:
        print(f"An error occurred while fetching Query_Papers for the query '{query}': {e}")
    finally:
        if conn is not None:
            c.close()
            conn.close()

In [70]:
def download_pdf_loop(query):
    conn = connection()  # Make sure this is a function that returns a DB connection
    c = conn.cursor()

    # Select records from Query_Papers related to the specific query and with final_rank between 1 and 10
    try:
        c.execute("""
            SELECT Query_Papers.id, Papers.paper_title, Papers.arxiv_link, Papers.arxiv_filename
            FROM Query_Papers
            JOIN Papers ON Query_Papers.arxiv_link = Papers.arxiv_link
            WHERE Query_Papers.query = %s AND final_rank BETWEEN 1 AND 10
            ORDER BY final_rank ASC
        """, (query,))

        papers_metadata = c.fetchall()

        for id, paper_title, arxiv_link, file_name in papers_metadata:
            print(f"Downloading PDF for paper: {paper_title}")
            if arxiv_link and file_name:
                # Typically, the PDF URL is derived from the arXiv link, adjust as necessary
                pdf_url = f'https://arxiv.org/pdf/{arxiv_link.split("/")[-1]}.pdf'  # Adjust based on actual URL format

                # Download the PDF
                file_path_or_error = download_pdf(pdf_url, file_name)
                if 'Failed' not in file_path_or_error:
                    print(f"Download successful: {file_path_or_error}")
                else:
                    print(f"Download failed for paper: {paper_title}")
            else:
                print(f"No valid arXiv link or filename found for paper: {paper_title}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        if conn is not None:
            c.close()
            conn.close()


In [71]:
def convert_pdf_to_markdown_loop():
    # Connect to SQLite database
    conn = connection()
    c = conn.cursor()

    # Update papers with missing arxiv_paper_markdown
    c.execute("SELECT id, arxiv_filename FROM Papers WHERE (arxiv_paper_markdown IS NULL OR arxiv_paper_markdown = '' OR arxiv_paper_markdown = 'None') AND arxiv_filename IS NOT NULL AND arxiv_filename != ''")
    papers_to_update = c.fetchall()

    for id, arxiv_filename in papers_to_update:
        try:
            # Convert PDF to Markdown
            markdown_content = convert_pdf_to_markdown(arxiv_filename)

            # Update Papers table with Markdown content
            c.execute("UPDATE Papers SET arxiv_paper_markdown = %s WHERE id = %s", (markdown_content, rowid))
            conn.commit()
        except Exception as e:
            print(f"An error occurred while updating paper id {id}: {e}")

    print("Finished converting pdfs to markdown loop")
    if conn is not None:
        # Close the cursor and connection
        c.close()
        conn.close()


Process papers against user query to arrive at the relevant answer and relevance score

abstract

In [73]:
import asyncio

async def LLM_process_abstract_loop(query):
    # Connect to the database (for reading and writing)
    conn = connection()
    c = conn.cursor()

    # Modify the SQL query to fetch rows for a given query with final_rank between 1 and 10
    # and where the relevant_answer is missing
    c.execute("""
        SELECT id, query, arxiv_link, relevance_score, final_rank, relevant_answer, paper_stats, paper_metadata_filtered, download_link
        FROM Query_Papers
        WHERE (relevant_answer IS NULL OR relevant_answer = 'None')
        AND query = %s AND final_rank BETWEEN 1 AND 10
        ORDER BY final_rank
    """, (query,))
    query_papers_to_update = c.fetchall()

    # Count and print the total number of papers to process
    total_papers = len(query_papers_to_update)
    print(f"Total papers to process for '{query}': {total_papers}")

    for i, query_paper in enumerate(query_papers_to_update):
        id, _, arxiv_link, relevance_score, final_rank, _, paper_stats, paper_metadata_filtered, download_link = query_paper
        print(f"Papers to process: ({total_papers - i}). Processing query paper: {arxiv_link}")

        # Retrieve corresponding paper's abstract and metadata from Papers table
        c.execute("SELECT arxiv_abstract, arxiv_metadata FROM Papers WHERE arxiv_link = %s", (arxiv_link,))
        paper_data = c.fetchone()

        if paper_data:
            arxiv_abstract, arxiv_metadata = paper_data

            if arxiv_abstract and arxiv_metadata:
                # Assuming query_info_with_gpt is an asynchronous function
                relevant_answer = await query_info_with_gpt(arxiv_abstract, arxiv_metadata, query)  # Use await since this function is now async
                print(f"Relevant answer: {relevant_answer}")

                # Update the Query_Papers table with the relevant_answer
                c.execute("UPDATE Query_Papers SET relevant_answer = %s WHERE id = %s", (relevant_answer, id))
                conn.commit()

            else:
                print(f"Missing content or metadata for paper: {arxiv_link}")

        else:
            print(f"No corresponding paper found for query paper: {arxiv_link}")

    if conn is not None:
        # Close the cursor and connection
        c.close()
        conn.close()

    print("Finished processing query papers.")


________________________________________________________________________________________________
# RANKING
________________________________________________________________________________________________

In [130]:
def update_final_ranks(query):
    conn = connection()
    c = conn.cursor()

    # Fetch all unique queries from the Query_Papers table where final_rank is null
    c.execute("SELECT DISTINCT query FROM Query_Papers WHERE final_rank IS NULL")
    queries = c.fetchall()

    for query in queries:
        # For each query, fetch the corresponding papers along with their paper_stats
        c.execute("SELECT id, paper_stats FROM Query_Papers WHERE query = %s AND final_rank IS NULL", (query[0],))
        papers = c.fetchall()

        # Initialize lists to store rankings based on citations and versions
        citation_ranks = []
        version_ranks = []

        # First loop to collect citation and version counts
        for paper in papers:
            id, stats_json = paper
            if stats_json:
                # Check if stats_json is not null
                stats = json.loads(stats_json)
                citations = stats.get('citations', 0) or 0  # Ensure default is 0 if None
                versions = stats.get('versions', 0) or 0  # Ensure default is 0 if None
                citation_ranks.append((id, citations))
                version_ranks.append((id, versions))

        # Sort and rank based on citations and versions separately
        citation_ranks.sort(key=lambda x: x[1], reverse=True)
        version_ranks.sort(key=lambda x: x[1], reverse=True)
        citation_rank_dict = {paper_id: rank + 1 for rank, (paper_id, _) in enumerate(citation_ranks)}
        version_rank_dict = {paper_id: rank + 1 for rank, (paper_id, _) in enumerate(version_ranks)}

        # Combine the rankings to calculate the final rank
        final_ranks = []
        for id, _ in papers:
            # Calculate average of the ranks; use large number if paper doesn't have rank in either
            citation_rank = citation_rank_dict.get(id, len(papers))
            version_rank = version_rank_dict.get(id, len(papers))
            avg_rank = (citation_rank + version_rank) / 2.0
            final_ranks.append((id, avg_rank))

        # Sort papers based on the average rank
        final_ranks.sort(key=lambda x: x[1])

        # Update the final_rank column based on this ordering
        for rank, (id, _) in enumerate(final_ranks, start=1):
            # start=1 for ranking starting from 1
            c.execute("UPDATE Query_Papers SET final_rank = %s WHERE id = %s", (rank, id))

        # Commit the changes to the database
        conn.commit()

    # Close the database connection
    conn.close()
    print("Finished updating final ranks for query papers.")

# Example execution
update_final_ranks("Top academic papers on RAG")

Finished updating final ranks for query papers.


# Final loop

In [133]:
import asyncio

# Connect to the SQLite database
conn = connection()
c = conn.cursor()

try:
    while True:  # Infinite loop to keep checking for new jobs
        # Query to find new jobs with status 'new'
        c.execute("SELECT job_id, query FROM jobs WHERE job_status = 'new'")
        new_jobs = c.fetchall()
        
        # Check if there are any new jobs
        if new_jobs:
            print("Found new jobs:", new_jobs)
            # Process the new jobs
            for job in new_jobs:
                job_id, job_query = job  # Get the job_id and query from the tuple
                # Update the job_status to 'running' for the new job
                c.execute("UPDATE jobs SET job_status = 'running' WHERE job_id = %s", (job_id,))
                conn.commit()
                print(f"Updated job: {job_id}, query: {job_query} to 'running'")                

                # Now, process the query using your functions
                print(f"search_and_fetch_google: {job_query}")
                search_and_fetch_google(job_query)
                print(f"get_scholar_citations_versions_loop: {job_query}")
                get_scholar_citations_versions_loop(job_query)
                print(f"update_final_ranks: {job_query}")
                update_final_ranks(job_query)
                print(f"fetch_arxiv_paper_from_url_loop: {job_query}")
                fetch_arxiv_paper_from_url_loop(job_query)
                # Process abstract loop for LLM
                print(f"LLM_process_abstract_loop: {job_query}")

                #download_pdf_loop(query)

                try:
                    asyncio.run(LLM_process_abstract_loop(job_query))
                except RuntimeError:  # asyncio.run() cannot be called from a running event loop
                    loop = asyncio.get_event_loop()
                    if loop.is_running():
                        loop.create_task(LLM_process_abstract_loop(job_query))
                    else:
                        loop.run_until_complete(LLM_process_abstract_loop(job_query))
                
                # Update the job_status to 'done' after processing is complete
                c.execute("UPDATE jobs SET job_status = 'done' WHERE job_id = %s", (job_id,))
                conn.commit()
                print(f"Updated job {job_id} to 'done'")
        
        # Wait for half a second before checking again
        time.sleep(0.5)
except KeyboardInterrupt:
    print("Stopped by user")
finally:
    # Close the database connection when done
    conn.close()


Found new jobs: [(6, 'Top academic papers on RAG')]
Updated job: 6, query: Top academic papers on RAG to 'running'
search_and_fetch_google: Top academic papers on RAG
Top academic papers on RAG ['https://isamu-website.medium.com/literature-review-on-rag-retrieval-augmented-generation-for-custom-domains-325bcef98be4', 'https://paperswithcode.com/method/rag', 'https://typeset.io/questions/what-are-the-latest-papers-on-rag-42ftizufgr', 'https://arxiv.org/abs/2312.10997', 'https://www.pinecone.io/blog/rag-study/', 'https://www.promptingguide.ai/research/rag', 'https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00530/114590/Improving-the-Domain-Adaptation-of-Retrieval', 'https://nexla.com/ai-infrastructure/retrieval-augmented-generation/', 'https://medium.com/@thedatabeast/revolutionizing-ai-with-rag-implementing-retrieval-augmented-generation-for-breakthrough-f1509b5c9db0', 'https://arxiv.org/html/2401.05856v1']
Status:200 for URL: https://isamu-website.medium.com/literature-review-on-

In [115]:
# def erase_all_data():
#     # List of all your table names
#     table_names = ['google_search_results', 'Papers', 'Query_Papers', 'jobs']

#     # Open a new connection
#     conn = connection()
#     c = conn.cursor()

#     try:

#         # Truncate each table
#         for table in table_names:
#             c.execute(f"TRUNCATE TABLE {table} RESTART IDENTITY CASCADE;")  # RESTART IDENTITY resets serial counters, CASCADE deletes data in dependent tables as well

#         # Commit the transaction
#         conn.commit()
#         print("All data has been erased from all tables.")
#     except Exception as e:
#         # If an error occurs, rollback any changes made during the transaction
#         conn.rollback()
#         print(f"An error occurred: {e}. Transaction rolled back.")
#     finally:
#         # Close the cursor and connection
#         c.close()
#         conn.close()

# # Call the function
# erase_all_data()


All data has been erased from all tables.


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6d52007a-f237-4857-b1f1-3ccb95216ee4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>