In [1]:
import os
import shutil
import requests
import base64
from PIL import Image
from PyPDF2 import PdfReader
from unstructured.partition.pdf import partition_pdf
from pix2text import Pix2Text
import fitz  # PyMuPD
from IPython.display import display, Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to fetch papers from arXiv API
def fetch_arxiv_papers(keyword, max_results=5):
    import arxiv  # Ensure arxiv library is installed
    search = arxiv.Search(
        query=keyword,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    papers = []
    for result in search.results():
        papers.append({
            "title": result.title,
            "authors": [author.name for author in result.authors],
            "summary": result.summary,
            "pdf_url": result.pdf_url
        })
    return papers

In [3]:
# Function to clear the downloads folder
def clear_downloads_folder(output_dir):
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)  # Remove all files and subdirectories
    os.makedirs(output_dir)  # Recreate the folder

# Function to download PDF
def download_pdf(pdf_url, output_dir="downloads"):
    os.makedirs(output_dir, exist_ok=True)
    pdf_name = pdf_url.split("/")[-1] + ".pdf"
    pdf_path = os.path.join(output_dir, pdf_name)

    # Save the PDF locally
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(pdf_path, "wb") as f:
            f.write(response.content)
        return pdf_path
    else:
        print(f"Failed to download {pdf_url} with status code {response.status_code}")
        return None

In [4]:
# Function to extract text using fitz and Pix2Text
def extract_text_from_pdf_with_latex(pdf_file):
    math_extractor = Pix2Text()
    pdf_document = fitz.open(pdf_file)
    extracted_text = ""

    for page_num in range(len(pdf_document)):
        page = pdf_document[page_num]
        # Extract raw text
        text = page.get_text()
        extracted_text += f"Page {page_num + 1} Text:\n{text}\n"

        # Extract images for potential LaTeX expressions
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))
            # Extract LaTeX code using Pix2Text
            latex_code = math_extractor(image)
            extracted_text += f"\nMath Expression (Image {img_index + 1}):\n{latex_code}\n"

    pdf_document.close()
    return extracted_text

In [5]:
# Function to process PDF for tables and images using partition_pdf
def process_pdf_with_partition(file_path, output_path):
    chunks = partition_pdf(
        filename=file_path,
        infer_table_structure=True,
        strategy="hi_res",
        extract_image_block_types=["Image","Table"],
        image_output_dir_path=output_path,
        extract_image_block_to_payload=True,
    )

    tables = []
    images = []

    for chunk in chunks:
        if "Table" in str(type(chunk)):
            tables.append(chunk)
        elif "Image" in str(type(chunk)):
            images.append(chunk.metadata.image_base64)

    return tables, images

In [6]:
def display_base64_image(base64_code):
    """Display a base64-encoded image inline (for Jupyter Notebook or similar)."""
    image_data = base64.b64decode(base64_code)
    display(Image(data=image_data))

In [7]:
from snowflake.snowpark import Session
import snowflake.connector as sf
from snowflake.snowpark.types import StructType, StructField, StringType, IntegerType

In [16]:
import uuid
import json
import snowflake.connector as sf
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Snowflake connection setup
def connect_to_snowflake():
    return sf.connect(
        user=snowflake_user,
        password=snowflake_password,
        account=snowflake_account,
        database=snowflake_database,
        warehouse=snowflake_warehouse,
        schema="public"
    )


In [10]:
# Chunk text using RecursiveCharacterTextSplitter
def chunk_text_with_langchain(text, chunk_size=500, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_text(text)

In [None]:
keyword = "electron"
max_results = 3
output_dir = "downloads"

# Step 1: Fetch papers from arXiv
papers = fetch_arxiv_papers(keyword, max_results)

# Step 2: Clear downloads folder
clear_downloads_folder(output_dir)

# Hierarchical structure for extracted data
extracted_data = {}

# Step 3: Process papers
for paper in papers:
    print(f"Processing: {paper['title']}")
    pdf_path = download_pdf(paper["pdf_url"], output_dir)

    if pdf_path:
        print(f"Saved PDF: {pdf_path}")

        # Generate a unique ID for the paper
        paper_id = str(uuid.uuid4())

        # Initialize storage for this paper
        extracted_data[paper_id] = {
            "title": paper["title"],
            "text_chunks": [],
            "tables": [],
            "images": [],
        }

        # Extract text and chunk it
        text = extract_text_from_pdf_with_latex(pdf_path)
        text_chunks = chunk_text_with_langchain(text)
        extracted_data[paper_id]["text_chunks"].extend(text_chunks)

        # Extract tables and images
        tables, images = process_pdf_with_partition(pdf_path, output_dir)
        extracted_data[paper_id]["tables"].extend([table.text for table in tables])
        extracted_data[paper_id]["images"].extend(images)

        print(f"Data collected for paper: {paper['title']}")
    else:
        print(f"Failed to process {paper['title']}")

  for result in search.results():
 


Processing: Impact of Electron-Electron Cusp on Configuration Interaction Energies
Saved PDF: downloads/0102536v1.pdf
Data collected for paper: Impact of Electron-Electron Cusp on Configuration Interaction Energies
Processing: Electron thermal conductivity owing to collisions between degenerate electrons
Saved PDF: downloads/0608371v1.pdf
Data collected for paper: Electron thermal conductivity owing to collisions between degenerate electrons
Processing: Electron pairing: from metastable electron pair to bipolaron
Saved PDF: downloads/1802.06593v1.pdf
Data collected for paper: Electron pairing: from metastable electron pair to bipolaron


In [41]:


for paper_id, content in extracted_data.items():
    #curr pdf
    tables=content["tables"]
    tableSummary=[]
    for table in tables:
        prompt = f"""You are an assistant tasked with summarizing tables for retrieval. \
        These summaries will be embedded and used to retrieve the raw table elements. \
        Give a concise summary of the table that is well optimized for retrieval. {table} """
        sendToMistral(prompt)
        print("\n")
        extracted_data[paper_id]["tableSummary"]=tableSummary
    
    break

  stream = Complete(
 


<class 'str'>
 The table presents data on the performance of HF − QMC (Hartree-Fock minus Quantum Monte Carlo) calculations for different values of rc (a.u.) and cmin. The table includes the following columns:

- **rc (a.u.)**: Values ranging from 3 × 10^−2 to 0.00.
- **cmin**: Values ranging from 1 × 10^−3 to 0.0.
- **HF − QMC**: Numerical values corresponding to the calculations.
- **Percentages**: Percentage values indicating the relative performance or accuracy of the calculations.

The table shows how the HF − QMC values and their corresponding percentages change with different combinations of rc and cmin. The percentages range from 11.01% to 100.00%, indicating varying levels of accuracy or performance.

This summary is optimized for retrieval by highlighting the key parameters (rc, cmin) and the main data points (HF − QMC values and percentages).

<class 'str'>
 The table presents data for HF and QMC methods across different values of rc (1 × 10−2, 1 × 10−3, 1 × 10−4) and cmin (

In [None]:
def encode_image(image_path):
    ''' Getting the base64 string '''
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

![alt text](image.png)

In [None]:
# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="multi_modal_rag",
                     embedding_function=OpenAIEmbeddings())
# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)
# Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(text_summaries)
]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))
# Add tables
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=s, metadata={id_key: table_ids[i]})
    for i, s in enumerate(table_summaries)
]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, tables)))
# Add image summaries
img_ids = [str(uuid.uuid4()) for _ in img_base64_list]
summary_img = [
    Document(page_content=s, metadata={id_key: img_ids[i]})
    for i, s in enumerate(image_summaries)
]
retriever.vectorstore.add_documents(summary_img)
retriever.docstore.mset(list(zip(img_ids, img_base64_list)))

In [None]:
# Store base64 encoded images
img_base64_list = []
# Store image summaries
image_summaries = []
# Prompt : Our prompt here is customized to the type of images we have which is chart in our case
prompt = "Describe the image in detail. Be specific about graphs, such as bar plots."
# Read images, encode to base64 strings
for img_file in sorted(os.listdir(path)):
    if img_file.endswith('.jpg'):
        img_path = os.path.join(path, img_file)
        base64_image = encode_image(img_path)
        img_base64_list.append(base64_image)
        img_capt = image_captioning(base64_image,prompt)
        time.sleep(60)
        image_summaries.append(image_captioning(img_capt,prompt))

In [47]:
# Insert data into Snowflake
def insert_into_snowflake(conn, paper_id, section_type, content, metadata):
    metadata_json = json.dumps(metadata)  # Convert dictionary to JSON string
    query = f"""
    INSERT INTO parsed_papers (paper_id, section_type, content, metadata)
    VALUES (%s, %s, %s, PARSE_JSON(%s))
    """
    with conn.cursor() as cur:
        cur.execute(query, (paper_id, section_type, content, metadata_json))


In [24]:
from snowflake.snowpark import Session
from snowflake.cortex import Complete
 
snowflake_config = {
    "account": snowflake_account,
    "user": snowflake_user,
    "password": snowflake_password,
    "role": snowflake_role,
    "warehouse": snowflake_warehouse,
    "database": snowflake_database,
    # "schema": "<your_schema>"
}

session = Session.builder.configs(snowflake_config).create()

In [37]:
from snowflake.cortex import Complete
def sendToMistral(prompt):
    # session = connect_to_snowflake()
    stream = Complete(
    "mistral-large2",
    prompt,
    session=session)  
    # print(f"stream {stream} , type {type(stream)}"  )
    for update in stream:
        print(update, end = "")
        
    

In [38]:
type(sendToMistral("hi"))

  stream = Complete(
 


<class 'str'>
 Hello! How can I assist you today? Let's chat about anything you'd like. 😊

NoneType

In [None]:
prompt_text = """You are an assistant tasked with summarizing tables for retrieval. \
These summaries will be embedded and used to retrieve the raw table elements. \
Give a concise summary of the table that is well optimized for retrieval. Table:{element} """

In [48]:
# Step 4: Insert collected data into Snowflake


try:
    for paper_id, content in extracted_data.items():
        # Insert text chunks
        for chunk in content["text_chunks"]:
            insert_into_snowflake(conn, paper_id, "TEXT", chunk, {"title": content["title"]})

        # Insert tables
        # for table in content["tables"]:
        #     insert_into_snowflake(conn, paper_id, "TABLE", table, {"title": content["title"]})

        # # Insert images
        # for image_path in content["images"]:
        #     insert_into_snowflake(conn, paper_id, "IMAGE", image_path, {"title": content["title"]})

        print(f"Inserted data for paper ID: {paper_id}")
finally:
    conn.close()


ProgrammingError: 002014 (22000): SQL compilation error:
Invalid expression [PARSE_JSON('{"title": "Impact of Electron-Electron Cusp on Configuration Interaction Energies"}')] in VALUES clause

In [None]:
# Define the stage and local file path
# stage_name = 'research_papers_db.public.fomc'
# file_path = './downloads/0608371v1.pdf'

# # Upload the PDF to the Snowflake stage
# with conn.cursor() as cur:
#     cur.execute(f"PUT file://{file_path} @{stage_name} AUTO_COMPRESS=FALSE")
#     print("File uploaded successfully.")


In [None]:
# # Define the stage name and file name
# stage_name = 'RESEARCH_PAPERS_DB.PUBLIC.FOMC'
# file_name = '0608371v1.pdf.gz'  # Use the .gz file name since it's compressed

# # SQL query to call the parse_document function
# query = f"""
# SELECT TO_VARCHAR(
#     SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
#         '@{stage_name}', 
#         '{file_name}', 
#         '{{"mode": "LAYOUT"}}'
#     ):content
# ) AS LAYOUT;
# """

# # Execute the query and fetch the result
# with conn.cursor() as cur:
#     cur.execute(query)
#     result = cur.fetchall()
#     for row in result:
#         print(row)