# Install Ollama in Colab

In [None]:
# enable support for gpu
!apt-get install lshw

# download ollama
!curl -fsSL https://ollama.com/install.sh | sh

# start ollama in background to orevent blocking the terminal
!nohup ollama serve &

# run ollama with llama2 model
!ollama run llama2

# or run ollama with mistral model
!ollama run mistral

^C


### Import libraries


In [None]:
!pip install langchain_community
!pip install pypdf
!pip install fastembed
!pip install chromadb



In [None]:
import os
import sys

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma

### Split the document into chunks and store them in a vector store

In [None]:
def ingest(
    pdf_path="/content/sample_data/SQL Server 2012 T-SQL Recipes.pdf",
    persist_directory="./sql_chroma_db",
    chunk_size=1024,
    chunk_overlap=100,
):
    """Load a PDF, split it into chunks and persist their embeddings in Chroma.

    Args:
        pdf_path: Path of the PDF to index.
        persist_directory: Directory in which Chroma stores the collection.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Prints how many loaded documents were split into how many chunks.
    Returns nothing; the result is the persisted vector store on disk.
    """
    # Load the documents without pre-splitting them. load_and_split() would
    # already chunk the text with a default splitter, so the splitter below
    # would re-chunk chunks and add_start_index would no longer be an offset
    # into the loaded document.
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    # Split the pages by character count
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(pages)
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")

    # Embed the chunks and write them to the persistent vector store
    embedding = FastEmbedEmbeddings()
    Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        persist_directory=persist_directory,
    )

In [None]:
# Generate the vector store only once: calling ingest() again on an existing
# store adds the same chunks a second time, so skip it when the persisted
# directory is already present (delete ./sql_chroma_db to force a rebuild).
if not os.path.isdir("./sql_chroma_db"):
    ingest()

Split 23 documents into 47 chunks.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from huggingface_hub import login

# Keep the token out of the notebook: reuse access_token_read if an earlier
# cell defined it, otherwise read it from the HF_TOKEN environment variable.
access_token_read = globals().get("access_token_read") or os.environ.get("HF_TOKEN")

# Authentication is optional for public models, so a missing token is not fatal.
if access_token_read:
    login(token=access_token_read)
else:
    print("HF_TOKEN is not set; skipping Hugging Face login.")

### Create a RAG chain that retrieves relevant chunks and prepares a response

In [None]:
def rag_chain(
    model_name="llama3",
    persist_directory="./sql_chroma_db",
    k=3,
    score_threshold=0.5,
):
    """Build a retrieval chain over the persisted Chroma store.

    Args:
        model_name: Name of the Ollama model handed to ``ChatOllama``.
        persist_directory: Directory of the Chroma store to query.
        k: Value passed as ``k`` in the retriever's ``search_kwargs``.
        score_threshold: Value passed as ``score_threshold`` in the
            retriever's ``search_kwargs``.

    Returns:
        The chain produced by ``create_retrieval_chain``. ``ask`` invokes it
        with ``{"input": question}`` and reads ``"answer"`` and ``"context"``
        from the result.
    """
    model = ChatOllama(model=model_name)

    # {input} is the user question; {context} receives the retrieved chunks.
    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context.
        If you don't know the answer, then reply, No Context available for this question {input}. [/Instructions] </s>
        [Instructions] Question: {input}
        Context: {context}
        Answer: [/Instructions]
        """
    )

    # Load the vector store with the same embedding class used to build it
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding)

    # Create the retriever with a similarity-score cut-off
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": k,
            "score_threshold": score_threshold,
        },
    )

    # Stuff the retrieved documents into the prompt, then wire in the retriever
    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)
    return chain


In [None]:
# NOTE(review): looks like a leftover scratch cell — confirm it can be removed.
print("Test")

In [None]:
# NOTE(review): looks like a leftover scratch cell — confirm it can be removed.
print("Test")

In [None]:
def ask(query: str, chain=None):
    """Answer ``query`` with the RAG chain and print the answer and its sources.

    Args:
        query: The question to ask.
        chain: Optional pre-built chain exposing ``invoke``. When omitted, a
            new chain is created with ``rag_chain()``; pass one in to avoid
            rebuilding the embeddings and vector store on every question.

    Prints the answer followed by one ``Source:`` line per context document.
    Returns nothing.
    """
    if chain is None:
        chain = rag_chain()

    # invoke chain
    result = chain.invoke({"input": query})

    # print results
    print(result["answer"])
    for doc in result["context"]:
        # Not every document is guaranteed to carry a "source" entry.
        print("Source: ", doc.metadata.get("source", "unknown"))

# Extract Documents

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

# Listing-page URL template; the {} placeholder is filled with the `skip`
# offset and each page is requested with show=100.
base_url = (
    "https://arxiv.org/list/astro-ph/2025-01"
    "?skip={}&show=100"
)

# Directory that receives the downloaded PDFs
output_folder = "/content/arxiv_pdfs"

# Function to create a folder if it doesn't exist
def create_folder(folder_path):
    """Create ``folder_path`` (and any missing parents) if it does not exist.

    ``exist_ok=True`` makes the call a no-op for an existing directory and
    removes the gap between checking for the directory and creating it.

    Raises:
        FileExistsError: if ``folder_path`` exists but is not a directory.
    """
    os.makedirs(folder_path, exist_ok=True)

# Function to download a single PDF
def download_pdf(pdf_url, output_folder, timeout=30):
    """Download one PDF from arxiv.org into ``output_folder``.

    Args:
        pdf_url: Link to the PDF; it is resolved against ``https://arxiv.org``,
            so a site-relative link such as ``/pdf/2501.04834`` works.
        output_folder: Existing directory the file is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or ``None`` when the download failed
        (the failure is printed and nothing is written).
    """
    # Construct the full URL
    full_url = urljoin('https://arxiv.org', pdf_url)

    # Name the file after the last path segment; append ".pdf" only when the
    # link does not already end with it, to avoid "name.pdf.pdf".
    pdf_name = pdf_url.rstrip('/').split('/')[-1]
    if not pdf_name.lower().endswith('.pdf'):
        pdf_name += '.pdf'
    pdf_path = os.path.join(output_folder, pdf_name)

    print(f"Downloading: {pdf_name}")
    try:
        response = requests.get(full_url, timeout=timeout)
        # Without this check an HTTP error page would be saved as a "PDF".
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed: {pdf_name} ({exc})")
        return None

    # Save the PDF
    with open(pdf_path, 'wb') as f:
        f.write(response.content)

    print(f"Saved: {pdf_name}")
    return pdf_path

# Function to scrape PDFs from a single page
def scrape_pdfs(page_url, output_folder, timeout=30):
    """Download every PDF linked from one listing page.

    Args:
        page_url: URL of the listing page to fetch.
        output_folder: Directory handed to ``download_pdf`` for each link.
        timeout: Seconds to wait for the listing page before giving up.

    A page that cannot be fetched is reported and skipped, so one bad page
    does not stop a multi-page crawl. Returns nothing.
    """
    print(f"Scraping: {page_url}")

    # Fetch the page content
    try:
        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch {page_url} ({exc})")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links to PDFs
    pdf_links = soup.find_all('a', {'title': 'Download PDF'})

    # Download all PDFs
    for link in pdf_links:
        pdf_url = link.get('href')
        if not pdf_url:
            # An anchor without href has nothing to download.
            continue
        download_pdf(pdf_url, output_folder)

# Main scraping loop
def scrape_arxiv_pdfs(start_page=0, end_page=1000, output_folder=output_folder):
    """Walk the listing pages and download the PDFs linked from each one.

    ``start_page`` and ``end_page`` are ``skip`` offsets substituted into
    ``base_url``; they are stepped by 100 (``end_page`` exclusive), matching
    the ``show=100`` page size in the URL template.
    """
    # The destination must exist before the first file is written.
    create_folder(output_folder)

    for offset in range(start_page, end_page, 100):
        listing_url = base_url.format(offset)
        scrape_pdfs(listing_url, output_folder)

        # Pause between listing pages to go easy on the server.
        time.sleep(3)

# Run the crawl over listing offsets 0, 100, 200, 300 and 400
scrape_arxiv_pdfs(
    start_page=0,
    end_page=500,
    output_folder=output_folder,
)


In [None]:
# Recursively pack the downloaded PDFs into /content/arxiv_pdfs.zip
# (presumably so they can be downloaded from Colab as one file — confirm).
!zip -r /content/arxiv_pdfs.zip /content/arxiv_pdfs/


  adding: content/arxiv_pdfs/ (stored 0%)
  adding: content/arxiv_pdfs/2501.04834.pdf (deflated 28%)
  adding: content/arxiv_pdfs/2501.04768.pdf (deflated 4%)
  adding: content/arxiv_pdfs/2501.04737.pdf (deflated 13%)
  adding: content/arxiv_pdfs/2501.06950.pdf (deflated 4%)
  adding: content/arxiv_pdfs/2501.05883.pdf (deflated 21%)
  adding: content/arxiv_pdfs/2501.03082.pdf (deflated 20%)
  adding: content/arxiv_pdfs/2501.06453.pdf (deflated 13%)
  adding: content/arxiv_pdfs/2501.04095.pdf (deflated 17%)
  adding: content/arxiv_pdfs/2501.06297.pdf (deflated 23%)
  adding: content/arxiv_pdfs/2501.06498.pdf (deflated 26%)
  adding: content/arxiv_pdfs/2501.05187.pdf (deflated 8%)
  adding: content/arxiv_pdfs/2501.05393.pdf (deflated 6%)
  adding: content/arxiv_pdfs/2501.03791.pdf (deflated 8%)
  adding: content/arxiv_pdfs/2501.06982.pdf (deflated 23%)
  adding: content/arxiv_pdfs/2501.00544.pdf (deflated 1%)
  adding: content/arxiv_pdfs/2501.05114.pdf (deflated 6%)
  adding: content/arx