# Retrieval Augmented Generation

In [1]:
import faiss
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
import requests
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
from pathlib import Path

In [2]:
def find_pdf_links(url):
    """
    Finds and returns all the PDF links present in a webpage.

    Relative links are resolved against the URL the page was actually
    served from, so every returned entry is an absolute URL that can be
    passed directly to `requests.get`. The '.pdf' suffix is matched
    case-insensitively.

    Args:
    url (str): The URL of the webpage to scan for PDF links.

    Returns:
    list: A list of absolute URLs (str) that are linked to PDF files.

    Raises:
    Exception: If the page does not answer with HTTP status 200.
    """
    # Imported locally so the function does not depend on a file-level import
    from urllib.parse import urljoin

    # Send a GET request to the specified URL; the timeout keeps a stalled
    # server from blocking the caller forever
    response = requests.get(url, timeout=30)

    # Check if the request was successful
    if response.status_code != 200:
        raise Exception(f"Failed to load page: {url}")

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # response.url is the final URL after any redirects, which is the base
    # that relative hrefs on the page refer to
    base_url = response.url

    # Keep anchors whose href ends in '.pdf' (any letter case) and turn
    # them into absolute URLs; already-absolute hrefs pass through unchanged
    pdf_links = [
        urljoin(base_url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].lower().endswith('.pdf')
    ]

    return pdf_links

def download_to_file(pdf_url, filepath):
    """
    Downloads a PDF from the given URL, saving it inside the indicated directory.

    The file is named after the last segment of the URL path; any query
    string or fragment in the URL is ignored when building the name.

    Args:
    pdf_url (str): The URL from where to download the PDF.
    filepath (str): The directory to save the PDF in. It is not created
        here, so it must already exist.

    Returns:
    int: Size in bytes of the downloaded PDF, or 0 if the server did not
        answer with HTTP status 200.
    """
    # Imported locally so the function does not depend on a file-level import
    from urllib.parse import urlparse

    # Derive the file name from the path component only, so that
    # 'paper.pdf?download=1' is stored as 'paper.pdf'
    filename = os.path.basename(urlparse(pdf_url).path)
    target = Path(filepath) / filename

    # Stream the body so large PDFs are never held in memory at once; the
    # timeout keeps a stalled server from blocking the caller forever
    with requests.get(pdf_url, stream=True, timeout=30) as pdf_response:
        if pdf_response.status_code != 200:
            print(f"Failed to download PDF: {pdf_url}")
            return 0

        # Create a file to store the PDF
        with open(target, 'wb') as f:
            for chunk in pdf_response.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

            # The write position after the last chunk is the file size
            total_size = f.tell()

    return total_size

In [None]:
# Load the sentence-embedding model used to encode queries below.
# NOTE(review): trust_remote_code=True lets the model repository supply its
# own Python code at load time -- confirm the source is trusted.
embed_model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1",
    trust_remote_code=True,
)

In [3]:
# The generation model and its tokenizer are left disabled for now:
# language_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

# Sample queries to embed.
# NOTE(review): the 'search_query: ' prefix is presumably the task prefix the
# embedding model expects for queries -- confirm against the model card.
sentences = [
    'search_query: What is TSNE?',
    'search_query: Who is Laurens van der Maaten?',
]

# Encode both queries in one call and show the resulting vectors
embeddings = embed_model.encode(sentences)
print(embeddings)

[[ 1.0951382e-02  5.7414606e-02 -1.1036388e-02 ...  3.5131579e-05
  -2.8092245e-02 -2.1599913e-02]
 [-1.3367120e-02  2.7091343e-02 -2.3367403e-02 ...  2.8799422e-02
  -1.0674847e-02  2.8820729e-02]]


In [5]:
# Retrieve the PDF links from the Moltensalt website, keeping only the
# first num_pdfs_to_download of them
url = 'http://moltensalt.org/references/static/downloads/pdf/index.html'
num_pdfs_to_download = 10
pdf_links = find_pdf_links(url)[:num_pdfs_to_download]

# Make sure the target directory exists; exist_ok=True makes this a no-op
# when it is already there and avoids the check-then-create race
download_dir = './pdf_dataset'
os.makedirs(download_dir, exist_ok=True)

# Download each PDF, accumulating the number of bytes written
total_size = 0
for pdf_link in tqdm(pdf_links):
    total_size += download_to_file(pdf_link, download_dir)

print(f'Total size of downloaded PDFs: {total_size / 1e6:.2f} MB')


100%|██████████| 10/10 [00:26<00:00,  2.61s/it]

Total size of downloaded PDFs: 25.98 MB



