In [2]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
#!pip install feedparser

In [5]:
#!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [3]:
import requests
from bs4 import BeautifulSoup
import feedparser
from datetime import datetime
from urllib.parse import quote_plus
import random
import pandas as pd
import jsonlines
import json
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

## Парсинг

In [3]:
import time
def parse_arxiv_only_html_by_category(category: str,
                                      top_n: int = 10,
                                      max_results: int = 50
                                     ) -> list[dict]:
    """
    Parsing meta information of articles with a filter based on those that have an html link.
    """
    
    base_url = f"http://export.arxiv.org/api/query?search_query=cat:{category}&sortBy=lastUpdatedDate&sortOrder=descending"
    start = 0
    data = []

    keywords = ["collection", "proceeding", "volume", "workshop", "multiple papers"]
    query = base_url + f"&start={start}&max_results={max_results}"
    feed = feedparser.parse(query)
    
    
    
    with tqdm(total=top_n, desc=f"{category}: Parsing category html links") as pbar:
        while True:
            query = base_url + f"&start={start}&max_results={max_results}"
            feed = feedparser.parse(query)
            #if len(feed.entries) == 0:
                #print(f'Only {len(data)} papers were found for query "{query}"!')
            for entry in feed.entries:
                html_link = entry.link.replace("abs", "html")
                status_code = requests.get(html_link).status_code
                abstract = entry.get("summary", "").lower()
                check_keywords = any(keyword in abstract for keyword in keywords)
                if (200 <= status_code) and (status_code < 300) and not check_keywords:
                    meta = dict(entry)
                    meta["html_link"] = entry.link.replace("abs", "html")
                    data.append(dict(meta))
                    pbar.update(1)
                    if len(data) == top_n:
                        return data
                time.sleep(6)

            start += max_results
            if start >= 6000:
                break
            time.sleep(6)
    return data

### Выделение текста статьи по html-ссылке

In [4]:
import requests
from bs4 import BeautifulSoup
import re

In [4]:
def fetch_html(url):
    """
    Fetches the HTML content of the given URL.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"Failed to fetch the URL: {url}")
        
def parse_article_html(html_content):
    """
    Parses the HTML content of the Arxiv article to extract the clean text.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    article = soup.find("article")
    # Remove unnecessary elements like images, tables, and acknowledgments
    for element in article.find_all(['script', 'style', 'noscript', 'table', 'img', 'figcaption', 'blockquote', "figure"]):
        element.decompose()
    
    article_structure = dict()
    # Extract the title (optional)
    title = article.find('h1', {'class': 'ltx_title ltx_title_document'})
    title_text = title.get_text(strip=True) if title else "No Title Found"
    article_structure["title_text"] = title_text
    # Extract the abstract
    abstract = article.find("div", {"class": "ltx_abstract"})
    if abstract is not None:
        abstract_p = abstract.find("p")
        abstract_text = abstract_p.get_text(strip=True) if abstract_p else "No Abstract Found"
        article_structure["abstract_text"] = abstract_text
    # Extract the main content of the article
    sections = article.find_all("section", recursive=False)
    for section in sections:
        section_head = section.find(re.compile("h"))
        section_name = section_head.get_text(strip=True) if section_head else ""
        if not re.search(r'acknowledgment|references', section_name, re.I):
            section_text = []
            paragraphs = section.find_all("p")
            if len(paragraphs) != 0:
                for paragraph in paragraphs:
                    paragraph_text = paragraph.get_text(strip=True) if paragraph else "No Paragraph Found"
                    section_text.append(paragraph.get_text(strip=True))
                if len(section_text) > 0:
                    section_text = "\n".join(section_text)
                    article_structure[section_name] = section_text

    return article_structure

In [6]:
import json
def write_to_jsonl(data, file_path):
    """
    Write a single JSON object to a JSONL file.
    
    """
    with open(file_path, "a") as file: 
        file.write(json.dumps(data) + "\n")

In [23]:
jsonl_file_path = "articles.jsonl"

In [7]:
def read_from_jsonl(file_path):
    """
    Read all JSON objects from a JSONL file.
    """
    with open(file_path, "r") as file:
        return [json.loads(line.strip()) for line in file]

### Полный запуск

In [35]:
jsonl_file_path = "articles.jsonl"

In [8]:
num_articles_by_category={"CS.AI": 1000,
                          "cs.CC": 1000,
                          "cs.DB": 1000,
                          "cs.DM": 500,
                          "cs.DS": 700,
                          "cs.IR": 1000,
                          "cs.LG": 1000,
                          "stat.ML": 500
                         }

In [None]:
import copy
jsonl_file_path = "articles.jsonl"
meta_by_category = {}
for cat, num_articles in tqdm(num_articles_by_category.items(), desc="Total progress parsing meta for all categories"):
    articles_metadata = parse_arxiv_only_html_by_category(category=cat, top_n=num_articles, max_results=100)
    meta_by_category[cat] = articles_metadata
    for i, article_meta in tqdm(enumerate(articles_metadata), desc=f"{cat}: Parsing text from html links"):
        article_html = article_meta["html_link"]
        try:
            article_structure = copy.deepcopy(article_meta)
            article_structure["parsing_result"] = parse_article_html(fetch_html(article_html))
            article_structure["category"] = cat
            write_to_jsonl(article_structure, jsonl_file_path)
            del article_structure
        except Exception as e:
            print(f"Something wrong with article cat={cat}, index={i}, html_link={article_html}:\n{e}")
        time.sleep(6)

Total progress parsing meta for all categories:   0%|          | 0/8 [00:00<?, ?it/s]

CS.AI: Parsing category html links:   0%|          | 0/1000 [00:00<?, ?it/s]

CS.AI: Parsing text from html links: 0it [00:00, ?it/s]

Something wrong with article cat=CS.AI, index=147, html_link=http://arxiv.org/html/2404.11064v3:
'NoneType' object has no attribute 'find_all'
Something wrong with article cat=CS.AI, index=157, html_link=http://arxiv.org/html/2412.13461v1:
'NoneType' object has no attribute 'find_all'
Something wrong with article cat=CS.AI, index=165, html_link=http://arxiv.org/html/2412.13441v1:
'NoneType' object has no attribute 'find_all'
Something wrong with article cat=CS.AI, index=179, html_link=http://arxiv.org/html/2412.13386v1:
'NoneType' object has no attribute 'find_all'
Something wrong with article cat=CS.AI, index=228, html_link=http://arxiv.org/html/2412.13238v1:
'NoneType' object has no attribute 'find_all'
Something wrong with article cat=CS.AI, index=785, html_link=http://arxiv.org/html/2410.08760v2:
'NoneType' object has no attribute 'find_all'
Something wrong with article cat=CS.AI, index=896, html_link=http://arxiv.org/html/2406.05348v2:
'NoneType' object has no attribute 'find_all'

cs.CC: Parsing category html links:   0%|          | 0/1000 [00:00<?, ?it/s]

cs.CC: Parsing text from html links: 0it [00:00, ?it/s]

### Парсинг для RAGAS

In [9]:
df = pd.read_parquet("hf://datasets/taesiri/arxiv_qa/data/train-00000-of-00001.parquet")

In [10]:
df.tail(20)

Unnamed: 0,paper_id,model,question,answer
210560,1709.07807,claude-2.0,Design 10 in-depth questions about method prop...,Here are 10 in-depth questions about the meth...
210561,1709.07807,claude-2.0,Summarize the paper in one sentence.,The paper introduces a category theoretic fra...
210562,1709.07807,claude-2.0,Summarize the paper in one paragraphs.,Here is a one paragraph summary of the paper:...
210563,1709.07807,claude-2.0,Design 10 in-depth questions about the method ...,Here are 10 potential in-depth questions abou...
210564,1709.07807,claude-2.0,Write a very high-quality paragraph summarizin...,Here is a high-quality paragraph summarizing ...
210565,909.091,claude-2.0,What is the central research question or hypot...,This paper addresses the question of whether ...
210566,909.091,claude-2.0,What is the main contribution of this paper?,This paper presents a new Lax pair for the 2D...
210567,909.091,claude-2.0,"What is the TL;DR (Too Long, Didn't Read) vers...",Here is a one sentence TL;DR summary of the p...
210568,909.091,claude-2.0,How does this paper compare to other research ...,This paper presents a new method for generati...
210569,909.091,claude-2.0,What future research directions do the authors...,Here are some key future research directions ...


In [11]:
df.shape

(210580, 4)

In [11]:
paper_ids = list(df["paper_id"].value_counts(dropna=False).sort_values(ascending=False).index)

In [14]:
available_cats = num_articles_by_category.keys()
available_cats

dict_keys(['CS.AI', 'cs.CC', 'cs.DB', 'cs.DM', 'cs.DS', 'cs.IR', 'cs.LG', 'stat.ML'])

In [17]:
def ragas_parsing(paper_ids, available_cats, top_n):
    batch_size = 100
    data = []
    with tqdm(total=top_n, desc="Total process") as pbar:
        for i in range(0, len(paper_ids), batch_size):
            idx_batch = paper_ids[i: i + batch_size]
            articles_ids = ",".join(idx_batch)
            query = f"http://export.arxiv.org/api/query?id_list={articles_ids}&sortBy=lastUpdatedDate"
            feed = feedparser.parse(query)
            time.sleep(6)
            for entry in feed.entries:
                html_link = entry.link.replace("abs", "html")
                status_code = requests.get(html_link).status_code
                time.sleep(6)
                if (200 <= status_code) and (status_code < 300):
                    cats = [tag["term"] for tag in entry.tags]
                    if any(cat in available_cats for cat in cats):
                        meta = dict(entry)
                        meta["html_link"] = entry.link.replace("abs", "html")
                        try:
                            meta["parsing_result"] = parse_article_html(fetch_html(html_link))
                            meta["category"] = cats[0]
                            time.sleep(6)
                            data.append(meta)
                            write_to_jsonl(meta, "articles.jsonl")
                            pbar.update(1)
                            if len(data) == top_n:
                                return data
                        except Exception as e:
                            print(f"article: {html_link}\nexception:{e}")
    print(f"Find only {len(data)} papers")
    return data

In [18]:
ragas_data = ragas_parsing(paper_ids, available_cats, top_n=25)

Total process:   0%|          | 0/25 [00:00<?, ?it/s]

### Добавление Attention is all you need 

In [19]:
article_id = "1706.03762"
article_link = "http://arxiv.org/abs/" + article_id
base_url = f"http://export.arxiv.org/api/query?id_list={article_id}&sortBy=lastUpdatedDate"
feed = feedparser.parse(base_url)

In [23]:
feed.entries

[{'id': 'http://arxiv.org/abs/1706.03762v7',
  'guidislink': True,
  'link': 'http://arxiv.org/abs/1706.03762v7',
  'updated': '2023-08-02T00:41:18Z',
  'updated_parsed': time.struct_time(tm_year=2023, tm_mon=8, tm_mday=2, tm_hour=0, tm_min=41, tm_sec=18, tm_wday=2, tm_yday=214, tm_isdst=0),
  'published': '2017-06-12T17:57:34Z',
  'published_parsed': time.struct_time(tm_year=2017, tm_mon=6, tm_mday=12, tm_hour=17, tm_min=57, tm_sec=34, tm_wday=0, tm_yday=163, tm_isdst=0),
  'title': 'Attention Is All You Need',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': 'http://export.arxiv.org/api/query?id_list=1706.03762&sortBy=lastUpdatedDate',
   'value': 'Attention Is All You Need'},
  'summary': 'The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network 

In [24]:
html_link = feed.entries[0].link.replace("abs", "html")

In [25]:
requests.get(html_link).status_code

200

In [26]:
data = pd.read_json("articles.jsonl", lines=True)

In [30]:
feed.entries[0].id in data["id"]

False

In [33]:
meta = dict(feed.entries[0])
meta["html_link"] = feed.entries[0].link.replace("abs", "html")

In [35]:
meta["parsing_result"] = parse_article_html(fetch_html(meta["html_link"]))

In [36]:
meta["category"] = "cs.LG"

In [37]:
write_to_jsonl(meta, "articles.jsonl")

### Устранение дубликатов

In [38]:
data = pd.read_json("articles.jsonl", lines=True)

In [39]:
data.head()

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,links,arxiv_primary_category,tags,html_link,parsing_result,category,arxiv_comment,arxiv_doi,arxiv_journal_ref,arxiv_affiliation
0,http://arxiv.org/abs/2412.14170v1,True,http://arxiv.org/abs/2412.14170v1,2024-12-18T18:59:53Z,"[2024, 12, 18, 18, 59, 53, 2, 353, 0]",2024-12-18T18:59:53Z,"[2024, 12, 18, 18, 59, 53, 2, 353, 0]",E-CAR: Efficient Continuous Autoregressive Ima...,"{'type': 'text/plain', 'language': None, 'base...",Recent advances in autoregressive (AR) models ...,...,"[{'href': 'http://arxiv.org/abs/2412.14170v1',...","{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2412.14170v1,{'title_text': 'E-CAR: Efficient Continuous Au...,CS.AI,,,,
1,http://arxiv.org/abs/2412.14167v1,True,http://arxiv.org/abs/2412.14167v1,2024-12-18T18:59:49Z,"[2024, 12, 18, 18, 59, 49, 2, 353, 0]",2024-12-18T18:59:49Z,"[2024, 12, 18, 18, 59, 49, 2, 353, 0]",VideoDPO: Omni-Preference Alignment for Video ...,"{'type': 'text/plain', 'language': None, 'base...",Recent progress in generative diffusion models...,...,"[{'href': 'http://arxiv.org/abs/2412.14167v1',...","{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2412.14167v1,{'title_text': 'VideoDPO: Omni-Preference Alig...,CS.AI,,,,
2,http://arxiv.org/abs/2412.14158v1,True,http://arxiv.org/abs/2412.14158v1,2024-12-18T18:53:22Z,"[2024, 12, 18, 18, 53, 22, 2, 353, 0]",2024-12-18T18:53:22Z,"[2024, 12, 18, 18, 53, 22, 2, 353, 0]",AKiRa: Augmentation Kit on Rays for optical vi...,"{'type': 'text/plain', 'language': None, 'base...",Recent advances in text-conditioned video diff...,...,"[{'href': 'http://arxiv.org/abs/2412.14158v1',...","{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2412.14158v1,{'title_text': 'AKiRa: Augmentation Kit on Ray...,CS.AI,,,,
3,http://arxiv.org/abs/2412.14146v1,True,http://arxiv.org/abs/2412.14146v1,2024-12-18T18:44:08Z,"[2024, 12, 18, 18, 44, 8, 2, 353, 0]",2024-12-18T18:44:08Z,"[2024, 12, 18, 18, 44, 8, 2, 353, 0]",Advanced Reasoning and Transformation Engine f...,"{'type': 'text/plain', 'language': None, 'base...",This paper presents the Advanced Reasoning and...,...,"[{'href': 'http://arxiv.org/abs/2412.14146v1',...","{'term': 'cs.AI', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2412.14146v1,{'title_text': 'Advanced Reasoning and Transfo...,CS.AI,,,,
4,http://arxiv.org/abs/2410.23953v3,True,http://arxiv.org/abs/2410.23953v3,2024-12-18T18:41:48Z,"[2024, 12, 18, 18, 41, 48, 2, 353, 0]",2024-10-31T14:07:26Z,"[2024, 10, 31, 14, 7, 26, 3, 305, 0]",Representative Social Choice: From Learning Th...,"{'type': 'text/plain', 'language': None, 'base...",Social choice theory is the study of preferenc...,...,"[{'href': 'http://arxiv.org/abs/2410.23953v3',...","{'term': 'cs.LG', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2410.23953v3,{'title_text': 'Representative Social Choice:F...,CS.AI,Full version (20 pages). Under review. Receive...,,,


In [40]:
data.shape

(6069, 24)

In [42]:
df = data.drop_duplicates("id")

In [43]:
df.head()

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,links,arxiv_primary_category,tags,html_link,parsing_result,category,arxiv_comment,arxiv_doi,arxiv_journal_ref,arxiv_affiliation
0,http://arxiv.org/abs/2412.14170v1,True,http://arxiv.org/abs/2412.14170v1,2024-12-18T18:59:53Z,"[2024, 12, 18, 18, 59, 53, 2, 353, 0]",2024-12-18T18:59:53Z,"[2024, 12, 18, 18, 59, 53, 2, 353, 0]",E-CAR: Efficient Continuous Autoregressive Ima...,"{'type': 'text/plain', 'language': None, 'base...",Recent advances in autoregressive (AR) models ...,...,"[{'href': 'http://arxiv.org/abs/2412.14170v1',...","{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2412.14170v1,{'title_text': 'E-CAR: Efficient Continuous Au...,CS.AI,,,,
1,http://arxiv.org/abs/2412.14167v1,True,http://arxiv.org/abs/2412.14167v1,2024-12-18T18:59:49Z,"[2024, 12, 18, 18, 59, 49, 2, 353, 0]",2024-12-18T18:59:49Z,"[2024, 12, 18, 18, 59, 49, 2, 353, 0]",VideoDPO: Omni-Preference Alignment for Video ...,"{'type': 'text/plain', 'language': None, 'base...",Recent progress in generative diffusion models...,...,"[{'href': 'http://arxiv.org/abs/2412.14167v1',...","{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2412.14167v1,{'title_text': 'VideoDPO: Omni-Preference Alig...,CS.AI,,,,
2,http://arxiv.org/abs/2412.14158v1,True,http://arxiv.org/abs/2412.14158v1,2024-12-18T18:53:22Z,"[2024, 12, 18, 18, 53, 22, 2, 353, 0]",2024-12-18T18:53:22Z,"[2024, 12, 18, 18, 53, 22, 2, 353, 0]",AKiRa: Augmentation Kit on Rays for optical vi...,"{'type': 'text/plain', 'language': None, 'base...",Recent advances in text-conditioned video diff...,...,"[{'href': 'http://arxiv.org/abs/2412.14158v1',...","{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2412.14158v1,{'title_text': 'AKiRa: Augmentation Kit on Ray...,CS.AI,,,,
3,http://arxiv.org/abs/2412.14146v1,True,http://arxiv.org/abs/2412.14146v1,2024-12-18T18:44:08Z,"[2024, 12, 18, 18, 44, 8, 2, 353, 0]",2024-12-18T18:44:08Z,"[2024, 12, 18, 18, 44, 8, 2, 353, 0]",Advanced Reasoning and Transformation Engine f...,"{'type': 'text/plain', 'language': None, 'base...",This paper presents the Advanced Reasoning and...,...,"[{'href': 'http://arxiv.org/abs/2412.14146v1',...","{'term': 'cs.AI', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2412.14146v1,{'title_text': 'Advanced Reasoning and Transfo...,CS.AI,,,,
4,http://arxiv.org/abs/2410.23953v3,True,http://arxiv.org/abs/2410.23953v3,2024-12-18T18:41:48Z,"[2024, 12, 18, 18, 41, 48, 2, 353, 0]",2024-10-31T14:07:26Z,"[2024, 10, 31, 14, 7, 26, 3, 305, 0]",Representative Social Choice: From Learning Th...,"{'type': 'text/plain', 'language': None, 'base...",Social choice theory is the study of preferenc...,...,"[{'href': 'http://arxiv.org/abs/2410.23953v3',...","{'term': 'cs.LG', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",http://arxiv.org/html/2410.23953v3,{'title_text': 'Representative Social Choice:F...,CS.AI,Full version (20 pages). Under review. Receive...,,,


In [52]:
df.to_json("articles.jsonl")

### Посмотрим, сколько вопросов из датасета для RAGAS-а имеют статьи, которые попали в результат парсинга

In [53]:
dt = pd.read_json("articles.jsonl")

In [58]:
df_ragas = pd.read_parquet("hf://datasets/taesiri/arxiv_qa/data/train-00000-of-00001.parquet")

In [74]:
dt_idxs = dt["id"].apply(lambda x: x.split("/")[-1].split("v")[0]).values

In [75]:
dt_idxs

array(['2412.14170', '2412.14167', '2412.14158', ..., '2307.06949',
       '2309.11499', '1706.03762'], dtype=object)

In [84]:
in_data = []
for ragas_paper_id in df_ragas["paper_id"]:
    if ragas_paper_id in dt_idxs:
        in_data.append(ragas_paper_id)

In [87]:
in_data[:5]

['2309.11499', '2309.11499', '2309.11499', '2309.11499', '2309.11499']

In [90]:
df_ragas.query(f"paper_id in {in_data}").to_json("ragas_dataset.jsonl")

In [91]:
sub_df_ragas = df_ragas.query(f"paper_id in {in_data}")

In [92]:
sub_df_ragas.shape

(1618, 4)

In [None]:
#1618 вопросов