<a href="https://colab.research.google.com/github/noambassat/Rag_on_arxiv/blob/main/RAG_with_arxiv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [176]:
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime, timedelta
from tqdm import tqdm, trange
import numpy as np
import pandas as pd
import re

In [177]:
from openai import OpenAI
from google.colab import userdata
import os
from transformers import GPT2TokenizerFast
from sklearn.metrics.pairwise import cosine_similarity

In [178]:
open_ai_key=userdata.get('open_ai_key')
os.environ['OPENAI_API_KEY'] = open_ai_key
client = OpenAI()


In [179]:
"https://arxiv.org/list/cs.HC/" + datetime(2025,3,26).strftime("%Y-%m")

'https://arxiv.org/list/cs.HC/2025-03'

In [180]:
def get_url(date):
  path = "https://arxiv.org/list/cs.HC/" + date.strftime("%Y-%m")
  content = requests.get(path).text
  return [a["href"] for a in bs(content, "html.parser").find_all("a",{"title":"View HTML"})]

In [181]:
# urls = sum([get_url(datetime(2025, 3, 26)-timedelta(days=i*30))
#         for i in trange(5)],[])

urls = get_url(datetime(2025, 3, 26))
print(urls[0])

https://arxiv.org/html/2503.00144v1


In [182]:
def get_article(url):
    content = requests.get(url).text
    article = bs(content, "html.parser").find("div",{"class":"ltx_page_content"})
    return [s.text for s in article.find_all("section")]

In [183]:
articles = pd.DataFrame(
    {
        "url":urls,
        "article":[get_article(url) for url in tqdm(urls)]
    }
)

100%|██████████| 47/47 [00:26<00:00,  1.80it/s]


In [184]:
articles["article"].isna().sum()

np.int64(0)

In [185]:
articles.shape

(47, 2)

In [186]:
articles.head()

Unnamed: 0,url,article
0,https://arxiv.org/html/2503.00144v1,[\n\n1 Introduction\n\nProgramming is an impor...
1,https://arxiv.org/html/2503.00149v2,[\n\n1. Introduction\n\nTactile charts are an ...
2,https://arxiv.org/html/2503.00228v1,[\n\n1. Introduction\n\nSimilarity is a fundam...
3,https://arxiv.org/html/2503.00257v1,[\n\n1. Introduction\n\nThe Japanese manga111M...
4,https://arxiv.org/html/2503.00303v1,[\n\nI Introduction\n\n\nThis document is a mo...


In [187]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

In [188]:
paragraphs = articles.explode('article').rename(columns={"article":"paragraph"}) # for each url there are several paragraph
paragraphs = paragraphs[
    (paragraphs['paragraph'].str.split().map(len) > 10) &
    (paragraphs['paragraph'].progress_apply(lambda x: len(tokenizer.encode(x)) <= 8000))
]
paragraphs.head()

  0%|          | 0/1697 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1157 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 1697/1697 [00:07<00:00, 216.53it/s]


Unnamed: 0,url,paragraph
0,https://arxiv.org/html/2503.00144v1,\n\n1 Introduction\n\nProgramming is an import...
0,https://arxiv.org/html/2503.00144v1,\n\n2 Related Work\n\n\n2.0.1 Learner-System C...
0,https://arxiv.org/html/2503.00144v1,\n\n2.0.1 Learner-System Control Dynamics.\n\n...
0,https://arxiv.org/html/2503.00144v1,\n\n2.0.2 Participatory Design in Educational ...
0,https://arxiv.org/html/2503.00144v1,\n\n3 Participatory Design with Learners and I...


In [189]:
paragraphs.shape

(1668, 2)


# Embeddings

In [190]:
def get_embedding(texts, model='text-embedding-ada-002'):
    text = [text.replace("\n", " ") for text in texts]

    return [res.embedding for res in client.embeddings.create(input = texts, model=model).data]

In [191]:
batch_size=100
embeddings = []
for i in trange(0, len(paragraphs), batch_size):
    embeddings.extend(get_embedding(paragraphs["paragraph"].iloc[i:i+batch_size]))

100%|██████████| 17/17 [00:34<00:00,  2.04s/it]


In [192]:
paragraphs["embedding"] = embeddings
paragraphs.head(2)

Unnamed: 0,url,paragraph,embedding
0,https://arxiv.org/html/2503.00144v1,\n\n1 Introduction\n\nProgramming is an import...,"[0.00459527550265193, -0.0004130418528802693, ..."
0,https://arxiv.org/html/2503.00144v1,\n\n2 Related Work\n\n\n2.0.1 Learner-System C...,"[-0.0010354060214012861, -0.004327224101871252..."


In [193]:
query = "What are the latest advancements in using LLMs for multimodal learning and human-computer interaction? Provide insights about how to improve retrieval systems using RAG and LangChain."
query_embedding = get_embedding([query])[0]

In [194]:
best_idx = paragraphs['embedding'].map(lambda x: cosine_similarity([x], [query_embedding])[0][0]).idxmax()
best_paragraph = paragraphs.iloc[best_idx]['paragraph']

In [195]:
best_paragraph

'\n\n5.2.2 Preferred learner-system control model.\n\nFor the preferred learner-system control for the type of help, 39 (23%) learners preferred L (learner-controlled), 92 (53%) preferred L-S (mainly controlled by the learner), 34 (20%) preferred S-L (mainly controlled by the AI system), and only 7 (4%) chose S (AI-system-controlled).\nThe preferences for learner-system control models for the level of help were similar, where 38 (22%) learners preferred the L model, 85 (49.4%) preferred L-S, 43 (25%) preferred S-L, and only 6 (3.5%) chose S.\nBy assigning numerical values to these different learner-system control models (L: 0, L-S: 1, S-L: 2, S: 3), we performed a Pearson correlation between learners’ preferred model and their self-efficacy.\nAt α=0.01𝛼0.01\\alpha=0.01italic_α = 0.01 level, We found a significant correlation between preferences for learner-system control models for the level of help (p = 0.008), which was weak negative (r=-0.20).\nThis indicate that learners with highe

In [204]:
prompt = (
    "Here's a piece of text:\n"+

    best_paragraph + '\n\n' +
    "I have a question about this text: "+ query+
    "Please answer in a short"
    )

print(prompt)

Here's a piece of text:


5.2.2 Preferred learner-system control model.

For the preferred learner-system control for the type of help, 39 (23%) learners preferred L (learner-controlled), 92 (53%) preferred L-S (mainly controlled by the learner), 34 (20%) preferred S-L (mainly controlled by the AI system), and only 7 (4%) chose S (AI-system-controlled).
The preferences for learner-system control models for the level of help were similar, where 38 (22%) learners preferred the L model, 85 (49.4%) preferred L-S, 43 (25%) preferred S-L, and only 6 (3.5%) chose S.
By assigning numerical values to these different learner-system control models (L: 0, L-S: 1, S-L: 2, S: 3), we performed a Pearson correlation between learners’ preferred model and their self-efficacy.
At α=0.01𝛼0.01\alpha=0.01italic_α = 0.01 level, We found a significant correlation between preferences for learner-system control models for the level of help (p = 0.008), which was weak negative (r=-0.20).
This indicate that learn

In [205]:
gpt  = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
)

gpt.choices[0].message.content

'The latest advancements in using Large Language Models (LLMs) for multimodal learning and human-computer interaction include leveraging transformer-based models like GPT-3 and CLIP to handle multiple modalities such as text, images, and audio simultaneously. These models enable more natural and intuitive interactions between users and machines. To improve retrieval systems using tools like Retrieval-Augmented Generation (RAG) and LangChain, one can enhance system capabilities by incorporating diverse datasets, fine-tuning the models for specific tasks, and exploring innovative ways to combine retrieval and generation techniques for more accurate and efficient information retrieval.'