<a href="https://colab.research.google.com/github/noambassat/Rag_on_arxiv/blob/main/RAG_with_arxiv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [149]:
from openai import OpenAI
from google.colab import userdata
import os
from transformers import GPT2TokenizerFast

In [150]:
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime, timedelta
from tqdm import tqdm, trange
import numpy as np
import pandas as pd
import re

In [151]:
open_ai_key=userdata.get('open_ai_key')
os.environ['OPENAI_API_KEY'] = open_ai_key
client = OpenAI()


In [152]:
"https://arxiv.org/list/cs.HC/" + datetime(2025,3,26).strftime("%Y-%m")

'https://arxiv.org/list/cs.HC/2025-03'

In [153]:
def get_url(date):
  path = "https://arxiv.org/list/cs.HC/" + date.strftime("%Y-%m")
  content = requests.get(path).text
  return [a["href"] for a in bs(content, "html.parser").find_all("a",{"title":"View HTML"})]

In [154]:
# urls = sum([get_url(datetime(2025, 3, 26)-timedelta(days=i*30))
#         for i in trange(5)],[])

urls = get_url(datetime(2025, 3, 26))
print(urls[0])

https://arxiv.org/html/2503.00144v1


In [155]:
def get_article(url):
    content = requests.get(url).text
    article = bs(content, "html.parser").find("div",{"class":"ltx_page_content"})
    return [s.text for s in article.find_all("section")]

In [156]:
articles = pd.DataFrame(
    {
        "url":urls,
        "article":[get_article(url) for url in tqdm(urls)]
    }
)

100%|██████████| 47/47 [00:15<00:00,  2.95it/s]


In [157]:
articles["article"].isna().sum()

np.int64(0)

In [158]:
articles.shape

(47, 2)

In [159]:
articles.head()

Unnamed: 0,url,article
0,https://arxiv.org/html/2503.00144v1,[\n\n1 Introduction\n\nProgramming is an impor...
1,https://arxiv.org/html/2503.00149v2,[\n\n1. Introduction\n\nTactile charts are an ...
2,https://arxiv.org/html/2503.00228v1,[\n\n1. Introduction\n\nSimilarity is a fundam...
3,https://arxiv.org/html/2503.00257v1,[\n\n1. Introduction\n\nThe Japanese manga111M...
4,https://arxiv.org/html/2503.00303v1,[\n\nI Introduction\n\n\nThis document is a mo...


In [None]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

In [160]:
paragraphs = articles.explode('article').rename(columns={"article":"paragraph"}) # for each url there are several paragraph
paragraphs = paragraphs[
    (paragraphs['paragraph'].str.split().map(len) > 10) &
    (paragraphs['paragraph'].progress_apply(lambda x: len(tokenizer.encode(x)) <= 8000))
]
paragraphs.head()

100%|██████████| 1697/1697 [00:07<00:00, 224.11it/s]


Unnamed: 0,url,paragraph
0,https://arxiv.org/html/2503.00144v1,\n\n1 Introduction\n\nProgramming is an import...
0,https://arxiv.org/html/2503.00144v1,\n\n2 Related Work\n\n\n2.0.1 Learner-System C...
0,https://arxiv.org/html/2503.00144v1,\n\n2.0.1 Learner-System Control Dynamics.\n\n...
0,https://arxiv.org/html/2503.00144v1,\n\n2.0.2 Participatory Design in Educational ...
0,https://arxiv.org/html/2503.00144v1,\n\n3 Participatory Design with Learners and I...


In [161]:
paragraphs.shape

(1668, 2)


# Embeddings

In [162]:
def get_embedding(texts, model='text-embedding-ada-002'):
    text = [text.replace("\n", " ") for text in texts]

    return [res.embedding for res in client.embeddings.create(input = texts, model=model).data]

In [163]:
batch_size=100
embeddings = []
for i in trange(0, len(paragraphs), batch_size):
    embeddings.extend(get_embedding(paragraphs["paragraph"].iloc[i:i+batch_size]))

100%|██████████| 17/17 [00:30<00:00,  1.80s/it]


In [None]:
tqdm.pandas()
paragraphs["embeddings"] = paragraphs["paragraph"].progress_apply(get_embedding)

  1%|▏         | 22/1668 [00:06<07:38,  3.59it/s]