In [1]:
from langchain_tavily import TavilySearch
from dotenv import load_dotenv
import os
# Load environment variables from .env file
load_dotenv()
# Initialize TavilySearch with the API key from environment variables
os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY")
import httpx
from typing import Optional, List, Dict, Any
import arxiv
from io import BytesIO
import re
import asyncio


In [8]:
tool = TavilySearch(
    max_results=2,
    topic="general",
    include_raw_content = True
    # include_answer=False,
    # include_raw_content=False,
    # include_images=False,
    # include_image_descriptions=False,
    # search_depth="basic",
    # time_range="day",
    # include_domains=None,
    # exclude_domains=None
)

tool = TavilySearch(
    max_results=5,  # Increase for better chances of finding good content
    topic="general",
    include_raw_content=False,  # Start with False - use summary first
    include_answer=True,  # Let Tavily synthesize an answer
    search_depth="advanced",  # Better content extraction
    include_domains=["arxiv.org"],  # Restrict to arxiv for academic papers
    # exclude_domains=["twitter.com", "reddit.com"]  # Filter out low-quality sources
)

#res = tool.invoke({"query": "Explain the first LLAMA paper in detail"})

res = tool.invoke({"query": "LLaMA Large Language Model Meta arxiv"})


In [None]:
final_links = []
for r in resp['results']:
    # Extract URL
    url = r.get('url', 'No URL found')

    # Extract the type of content
    index_where_arxiv = url.find("arxiv.org/") + len("arxiv.org/")

    # # Content type
    last_half = url[index_where_arxiv:].split("/")
    content_type = last_half[0] if index_where_arxiv != -1 else "Unknown"

    if content_type == "pdf":
        final_links.append(url)

    else:


SyntaxError: invalid syntax (378857508.py, line 12)

In [19]:
# Method 1: Using PyPDF2 (simple, good for basic text extraction)
async def get_arxiv_text_pypdf2(arxiv_url: str) -> Dict[str, any]:
    """
    Extract text from arXiv PDF using PyPDF2.
    Install: pip install PyPDF2
    """
    import PyPDF2
    
    try:
        # Get PDF bytes
        arxiv_id = extract_arxiv_id(arxiv_url)
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
        
        async with httpx.AsyncClient() as client:
            response = await client.get(pdf_url)
            response.raise_for_status()
            pdf_bytes = response.content
        
        # Extract text
        pdf_file = BytesIO(pdf_bytes)
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        
        text_content = ""
        for page_num, page in enumerate(pdf_reader.pages):
            text_content += f"\n--- Page {page_num + 1} ---\n"
            text_content += page.extract_text()
        
        return {
            "status": "success",
            "text": text_content,
            "num_pages": len(pdf_reader.pages),
            "word_count": len(text_content.split()),
            "method": "PyPDF2"
        }
        
    except Exception as e:
        return {"status": "error", "error": str(e)}


In [20]:
asyncio.run(get_arxiv_text_pypdf2(arxiv_url = "https://arxiv.org/pdf/2302.13971"))

RuntimeError: asyncio.run() cannot be called from a running event loop

In [8]:
httpx.get('https://arxiv.org/pdf/2302.13971').content.decode()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8f in position 10: invalid start byte

In [None]:
def convert_arxiv_to_pdf(arxiv_url):
    """
    Convert an arXiv URL to a direct PDF link.
    """
    if "arxiv.org/abs/" in arxiv_url:
        pdf_url = arxiv_url.replace("arxiv.org/abs/", "arxiv.org/pdf/")
        return pdf_url + ".pdf"
    return arxiv_url


def scrape_arxiv_paper(link_to_arxiv_pdf):
    

In [9]:
res

{'query': 'LLaMA Large Language Model Meta arxiv',
 'follow_up_questions': None,
 'answer': 'The LLaMA model is a large language model developed by Meta AI, released in 2023. It ranges from 7B to 65B parameters, and is known for its efficient training using publicly available datasets. LLaMA models have shown competitive performance compared to other large language models.',
 'images': [],
 'results': [{'title': 'arXiv:2402.07950v1 [cs.CR] 10 Feb 2024',
   'url': 'https://arxiv.org/pdf/2402.07950v1',
   'content': 'The model was named as LLaMA (Large Language Model Meta AI) [12, 13] and was released in February 2023, and the second model called LLaMA-2 [14], in ... An Adapter Family for Parameter-Efficient Fine-Tuning of Large Language Models." arXiv preprint arXiv:2304.01933 (2023). [4] Chang, Yupeng, Xu Wang, Jindong Wang, Yuan Wu, Linyi Yang, Kaijie',
   'score': 0.7482976,
   'raw_content': None},
  {'title': 'Large Language Models: A Survey - arXiv.org',
   'url': 'https://arxiv.o

In [5]:
print(res['results'][0]['raw_content'])

![Ankur’s Newsletter](https://substackcdn.com/image/fetch/w_80,h_80,c_fill,f_auto,q_auto:good,fl_progressive:steep,g_auto/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2793f3d2-b75e-4405-abed-a419427aca14_900x900.png)

# [Ankur’s Newsletter](/)

#### Share this post

![](https://substackcdn.com/image/fetch/w_520,h_272,c_fill,f_auto,q_auto:good,fl_progressive:steep,g_auto/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc1c9214-566b-4a0d-8664-927b9fbf9fc2_1121x434.png)
![Ankur’s Newsletter](https://substackcdn.com/image/fetch/w_36,h_36,c_fill,f_auto,q_auto:good,fl_progressive:steep,g_auto/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2793f3d2-b75e-4405-abed-a419427aca14_900x900.png)

# LLaMA 1 vs LLaMA 2: A Deep Dive into Meta’s LLMs

### Discover Meta’s journey into the world of LLMs and how LLaMA 2 compares to its successor, LLaMA 1.

![Ankur A. Patel's avatar](https://substackcdn.com/image/fetch/w_36,h_36,c_fill