In [6]:
# Install dependencies with %pip so they land in the environment of the
# running kernel; `!pip` uses whichever pip is first on the shell PATH, which
# may be a different interpreter than the one executing this notebook.
# Run this cell before the import cells below.
%pip install python-dotenv
# snowflake-snowpark-python provides the `snowflake.snowpark` module that the
# session cell imports.
%pip install snowflake-connector-python snowflake-snowpark-python
%pip install arxiv requests PyPDF2 cortex
%pip install streamlit

Collecting streamlit
  Using cached streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Using cached cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting click<9,>=7.0 (from streamlit)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting numpy<3,>=1.23 (from streamlit)
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting pandas<3,>=1.4.0 (from streamlit)
  Using cached pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pillow<12,>=7.1.0 (from streamlit)
  Using cached pillow-11.1.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.1 kB)
Collecting protobuf<6,>=3.20 (from streamlit)
  Using cached protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2

In [5]:
import streamlit as st
from snowflake.snowpark import Session

from dotenv import load_dotenv
import os

# Load environment variables from .env file so that credentials are never
# written into the notebook itself.
load_dotenv()

snowflake_account = os.getenv("SNOWFLAKE_ACCOUNT")
snowflake_user = os.getenv("SNOWFLAKE_USER")
snowflake_password = os.getenv("SNOWFLAKE_PASSWORD")
snowflake_role = os.getenv("SNOWFLAKE_ROLE")
snowflake_warehouse = os.getenv("SNOWFLAKE_WAREHOUSE")
snowflake_database = os.getenv("SNOWFLAKE_DATABASE")
snowflake_schema = os.getenv("SNOWFLAKE_SCHEMA")

# Alternative source for the same settings when running under Streamlit:
# read each key from st.secrets instead, e.g. st.secrets["SNOWFLAKE_ACCOUNT"].

# os.getenv returns None for unset variables. Check the login settings up
# front so a missing value is reported by name instead of surfacing later as
# a connection error.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("SNOWFLAKE_ACCOUNT", snowflake_account),
        ("SNOWFLAKE_USER", snowflake_user),
        ("SNOWFLAKE_PASSWORD", snowflake_password),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required Snowflake settings: "
        + ", ".join(_missing_settings)
        + ". Set them in the environment or in a .env file."
    )

# Create a Snowflake session
session = Session.builder.configs({
    "account": snowflake_account,
    "user": snowflake_user,
    "password": snowflake_password,
    "role": snowflake_role,
    "warehouse": snowflake_warehouse,
    "database": snowflake_database,
    "schema": snowflake_schema,
}).create()

st.write("Connected to Snowflake successfully!")


ModuleNotFoundError: No module named 'streamlit'

In [9]:
import arxiv
import requests
import os
import snowflake.connector
from PyPDF2 import PdfReader

In [10]:
# Function to fetch papers from arXiv API
def fetch_arxiv_papers(keyword, max_results=5):
    """Search arXiv for ``keyword`` and return metadata for the matches.

    Args:
        keyword: Query string passed to ``arxiv.Search``.
        max_results: Maximum number of results requested from the search.

    Returns:
        A list of dicts, one per result, with the keys ``"title"``,
        ``"authors"`` (list of author names), ``"summary"`` and ``"pdf_url"``.
        Results are requested sorted by relevance.
    """
    search = arxiv.Search(
        query=keyword,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    # Search.results() is deprecated in arxiv 2.x; iterating the search through
    # a Client is the supported way to run it.
    client = arxiv.Client()
    return [
        {
            "title": result.title,
            "authors": [author.name for author in result.authors],
            "summary": result.summary,
            "pdf_url": result.pdf_url,
        }
        for result in client.results(search)
    ]



In [11]:
# Function to download PDF and extract text
def download_and_extract_pdf(pdf_url, output_dir="downloads", timeout=30):
    """Download a PDF, save it under ``output_dir`` and return its text.

    Args:
        pdf_url: URL of the PDF. The last path segment of the URL is used as
            the local file name.
        output_dir: Directory the PDF is written to; created if missing.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The text of all pages joined together, with every run of whitespace
        collapsed to a single space.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    os.makedirs(output_dir, exist_ok=True)

    response = requests.get(pdf_url, timeout=timeout)
    # Stop here on 4xx/5xx so an HTML error page is never saved and parsed as
    # if it were a PDF.
    response.raise_for_status()

    pdf_path = os.path.join(output_dir, pdf_url.split("/")[-1])

    # Save the PDF locally
    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(response.content)

    # Extract text from the PDF. `or ""` guards against a page yielding no
    # text object, and joining with a newline keeps the last word of one page
    # from fusing with the first word of the next.
    reader = PdfReader(pdf_path)
    text = "\n".join(page.extract_text() or "" for page in reader.pages)

    # Remove excessive whitespace
    return " ".join(text.split())

In [None]:
# Snowflake connection parameters.
# Each value is read from the matching SNOWFLAKE_* environment variable so
# that real credentials never need to be typed into the notebook; when a
# variable is unset the original placeholder string is used instead.
SNOWFLAKE_CONFIG = {
    "account": os.getenv("SNOWFLAKE_ACCOUNT", "your_account"),
    "user": os.getenv("SNOWFLAKE_USER", "your_user"),
    "password": os.getenv("SNOWFLAKE_PASSWORD", "your_password"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE", "your_warehouse"),
    "database": os.getenv("SNOWFLAKE_DATABASE", "your_database"),
    "schema": os.getenv("SNOWFLAKE_SCHEMA", "your_schema"),
}


In [None]:
# Function to store data in Snowflake
def store_in_snowflake(papers):
    """Write paper records to the ``arxiv_papers`` table in Snowflake.

    Connection settings are taken from the module-level ``SNOWFLAKE_CONFIG``.

    NOTE: the table is created with ``CREATE OR REPLACE TABLE``, so an
    existing ``arxiv_papers`` table is replaced on every call.

    Args:
        papers: Iterable of dicts with the keys ``"title"``, ``"authors"``
            (iterable of strings), ``"summary"``, ``"pdf_url"`` and
            ``"content"``.

    Raises:
        KeyError: If a paper is missing one of the required keys. This is
            raised before any connection is opened.
    """
    insert_sql = """
            INSERT INTO arxiv_papers (title, authors, summary, pdf_url, content)
            VALUES (%s, %s, %s, %s, %s)
        """

    # Build every row first so a malformed record fails before the existing
    # table has been replaced.
    rows = [
        (
            paper["title"],
            ", ".join(paper["authors"]),
            paper["summary"],
            paper["pdf_url"],
            paper["content"],
        )
        for paper in papers
    ]

    # Connect to Snowflake
    conn = snowflake.connector.connect(
        account=SNOWFLAKE_CONFIG["account"],
        user=SNOWFLAKE_CONFIG["user"],
        password=SNOWFLAKE_CONFIG["password"],
        warehouse=SNOWFLAKE_CONFIG["warehouse"],
        database=SNOWFLAKE_CONFIG["database"],
        schema=SNOWFLAKE_CONFIG["schema"],
    )
    # try/finally guarantees the cursor and connection are closed even when a
    # statement raises part-way through.
    try:
        cursor = conn.cursor()
        try:
            # Create a table for storing papers
            cursor.execute("""
        CREATE OR REPLACE TABLE arxiv_papers (
            title STRING,
            authors STRING,
            summary STRING,
            pdf_url STRING,
            content STRING
        )
    """)

            # Insert paper data, one parameterized statement per paper
            for row in rows:
                cursor.execute(insert_sql, row)

            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()




In [None]:
# Main function
def main():
    """Run the pipeline: search arXiv, then download each PDF and extract text.

    For every paper returned by ``fetch_arxiv_papers`` the extracted text is
    stored under the paper's ``"content"`` key and a short preview is printed.
    Storing the results in Snowflake is currently disabled (commented out).
    """
    keyword = "neural networks"  # Example keyword
    max_results = 5
    # Print only the start of each document; a full paper is far too long to
    # be useful as cell output.
    preview_chars = 500

    # Step 1: Fetch papers from arXiv
    papers = fetch_arxiv_papers(keyword, max_results)

    # Step 2: Download PDFs and extract text
    for paper in papers:
        paper["content"] = download_and_extract_pdf(paper["pdf_url"])
        print(f"{paper['title']}: {paper['content'][:preview_chars]}")

    # Step 3: Store data in Snowflake
    # store_in_snowflake(papers)
    # print(f"Stored {len(papers)} papers in Snowflake.")

# Run the pipeline
if __name__ == "__main__":
    main()