In [1]:
import json
import os
from urllib.parse import quote_plus, urlparse

import requests
from bs4 import BeautifulSoup
from lxml import html

In [2]:
def build_search_url(query: str, size: int, start: int) -> str:
    """Build the arXiv search URL for one page of results.

    Args:
        query: Free-text search terms. Leading/trailing whitespace is dropped
            and runs of whitespace between terms become a single ``+``.
        size: Value of the ``size`` query parameter.
        start: Value of the ``start`` query parameter.

    Returns:
        The search URL, with results ordered by ``-announced_date_first``.
    """
    # Percent-encode the terms so characters such as ``&``, ``#`` or ``+``
    # stay part of the query instead of being read as URL syntax.
    query_parsed = quote_plus(' '.join(query.split()))

    return f"https://arxiv.org/search/?query={query_parsed}&searchtype=all&source=header&order=-announced_date_first&size={size}&start={start}"

def download_page(url, save_folder, timeout=30):
    """Download ``url`` and save the response body as an ``.html`` file.

    The file is named after the last segment of the URL path with ``.html``
    appended, and is written inside ``save_folder``. The outcome is printed.

    Args:
        url: Address of the page to fetch.
        save_folder: Directory to write into; it is not created here.
        timeout: Seconds to wait for the server before giving up.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # A single unreachable page should not abort the caller's loop.
        print(f"Failed to download page from: {url} ({error})")
        return

    if response.status_code != 200:
        print(f"Failed to download page from: {url}")
        return

    # Use only the path component so a query string, fragment or trailing
    # slash cannot end up in (or empty out) the file name.
    page_name = urlparse(url).path.rstrip("/").split("/")[-1]
    file_name = os.path.join(save_folder, page_name + ".html")
    with open(file_name, "wb") as f:
        f.write(response.content)
    print(f"Page downloaded: {file_name}")

def download_articles(articles, save_folder, timeout=30):
    """Download the ar5iv HTML page of each search result.

    For every result the first ``<a>`` link is taken, its host is switched
    from ``arxiv.org`` to ``ar5iv.org`` and a HEAD request (following
    redirects) is sent. If the final URL equals the original link the
    article is skipped; otherwise the ar5iv page is downloaded.

    Args:
        articles: Iterable of parsed result elements exposing ``find("a")``.
        save_folder: Directory the pages are written to.
        timeout: Seconds to wait for the HEAD request before giving up.
    """
    for article in articles:
        anchor = article.find("a")
        link = anchor.get("href") if anchor is not None else None
        if not link:
            # Results without a usable link cannot be resolved; skip them
            # instead of failing the whole batch.
            print("Search result has no link. Skipping...")
            continue

        html_link = link.replace("arxiv.org", "ar5iv.org")

        try:
            response_mod = requests.head(html_link, allow_redirects=True, timeout=timeout)
        except requests.RequestException as error:
            print(f"Failed to reach {html_link} ({error}). Skipping...")
            continue

        # NOTE(review): presumably ar5iv redirects back to the arXiv page
        # when it has no HTML rendering of the article - confirm.
        if response_mod.url == link:
            print(f"{link} redirects to the original link. Skipping...")
        else:
            print(f"{link} doesn't redirect to the original link. Downloading...")
            download_page(html_link, save_folder)

def get_articles(query: str, save_folder: str, size: int = 200, timeout: float = 30):
    """Page through the arXiv search results for ``query`` and download them.

    Each result page is parsed for ``<p class="list-title is-inline-block">``
    elements, which are handed to ``download_articles``. Paging stops at the
    first page without such elements or when a search request fails.

    Args:
        query: Free-text search terms.
        save_folder: Directory for the downloaded pages; created if missing.
        size: Number of results requested per page.
        timeout: Seconds to wait for each search request before giving up.
    """
    os.makedirs(save_folder, exist_ok=True)

    start = 0

    while True:
        search_url = build_search_url(query, size, start)
        try:
            response = requests.get(search_url, timeout=timeout)
        except requests.RequestException as error:
            print(f"Search request failed: {search_url} ({error})")
            break

        if response.status_code != 200:
            # Report the failure so an incomplete crawl is not mistaken for
            # the end of the result list.
            print(f"Search request failed with status {response.status_code}: {search_url}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        articles = soup.find_all("p", class_="list-title is-inline-block")

        if not articles:
            break

        download_articles(articles, save_folder)

        start += size

In [3]:
# Run the arXiv search for the query below and store the downloaded article
# pages in directory_path (get_articles creates the folder when it is missing).
search_query = "deepmatcher"
directory_path = "articles/deepmatcher"

get_articles(search_query, directory_path)

https://arxiv.org/abs/2301.02993 doesn't redirect to the original link. Downloading...
Page downloaded: articles/deepmatcher\2301.02993.html
https://arxiv.org/abs/2211.06975 doesn't redirect to the original link. Downloading...
Page downloaded: articles/deepmatcher\2211.06975.html
https://arxiv.org/abs/2103.04489 doesn't redirect to the original link. Downloading...
Page downloaded: articles/deepmatcher\2103.04489.html
https://arxiv.org/abs/2102.07134 doesn't redirect to the original link. Downloading...
Page downloaded: articles/deepmatcher\2102.07134.html
https://arxiv.org/abs/1802.05664 doesn't redirect to the original link. Downloading...
Page downloaded: articles/deepmatcher\1802.05664.html
https://arxiv.org/abs/1506.07656 doesn't redirect to the original link. Downloading...
Page downloaded: articles/deepmatcher\1506.07656.html
https://arxiv.org/abs/1503.02427 doesn't redirect to the original link. Downloading...
Page downloaded: articles/deepmatcher\1503.02427.html


In [4]:
def clean_html(element, attributes_to_keep=None):
    """Strip attributes from ``element`` and all of its descendants, in place.

    Args:
        element: Root element; it and everything yielded by ``element.iter()``
            are modified.
        attributes_to_keep: Iterable of attribute names to preserve. Defaults
            to ``rowspan`` and ``colspan``.

    Returns:
        The same ``element`` object that was passed in.
    """
    if attributes_to_keep is None:
        kept = frozenset({'rowspan', 'colspan'})
    else:
        kept = frozenset(attributes_to_keep)

    for child in element.iter():
        # Snapshot the keys first: the mapping is modified while looping.
        for attribute in list(child.attrib.keys()):
            if attribute not in kept:
                del child.attrib[attribute]
    return element


def extract_tables_from_html(html_file_path):
    """Return the cleaned HTML of every table figure in an HTML file.

    A table figure is any ``<figure>`` element whose ``class`` attribute
    contains ``ltx_table``. Each one is passed through ``clean_html``,
    serialized to a string and stripped of newline characters.

    Args:
        html_file_path: Path of the HTML file to parse.

    Returns:
        A list with one HTML string per matching figure, in document order.
    """
    tree = html.parse(html_file_path)
    tables_xpath = tree.xpath('//figure[contains(@class, "ltx_table")]')

    extracted_tables = []
    for table in tables_xpath:
        clean_table = clean_html(table)
        # with_tail=False keeps text that follows the closing </figure> tag
        # out of the result; the serializer includes it by default.
        table_bytes = html.tostring(clean_table, method='html', encoding='utf-8', with_tail=False)
        table_string = table_bytes.decode(encoding='utf-8')
        # NOTE(review): newlines are removed rather than replaced by a space,
        # so words separated only by a line break get joined - confirm intent.
        table_string = table_string.replace('\n', '')
        extracted_tables.append(table_string)

    return extracted_tables

def extract_tables_from_directory(dir_path):
    """Extract the tables of every ``.html`` file directly inside ``dir_path``.

    Args:
        dir_path: Directory to scan (not recursive).

    Returns:
        A dict mapping each file name without its extension to the list
        returned by ``extract_tables_from_html`` for that file. Keys are
        inserted in sorted file-name order.
    """
    extracted_tables_map = {}

    # os.listdir gives no ordering guarantee; sorting makes the result (and
    # anything written from it) reproducible across platforms.
    for filename in sorted(os.listdir(dir_path)):
        if not filename.endswith(".html"):
            continue
        file_path = os.path.join(dir_path, filename)
        if not os.path.isfile(file_path):
            # A sub-directory that happens to end in ".html" is not a page.
            continue
        article_id = os.path.splitext(filename)[0]
        extracted_tables_map[article_id] = extract_tables_from_html(file_path)

    return extracted_tables_map

def save_tables_to_json(extracted_tables_map, output_file):
    """Write the articles that have at least one table to a JSON file.

    Args:
        extracted_tables_map: Dict mapping article ids to lists of table
            strings. Entries with an empty list are left out of the file.
        output_file: Destination path. Missing parent directories are created.
    """
    filtered_tables_map = {article_id: tables for article_id, tables in extracted_tables_map.items() if tables}

    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        # open() does not create directories; do it here so a fresh checkout
        # can run the notebook without manual setup.
        os.makedirs(parent_dir, exist_ok=True)

    # Fix the encoding so the file does not depend on the platform default.
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(filtered_tables_map, json_file, indent=4)

In [6]:
# Folder scanned for downloaded ".html" article pages.
directory_path = "articles/deepmatcher"
articles_tables_map = extract_tables_from_directory(directory_path)

# Report how many table figures were found for each article.
for article_id, article_tables in articles_tables_map.items():
    print(f"Article ID: {article_id} - # Tables Found: {len(article_tables)}")

# Destination of the JSON export; save_tables_to_json leaves out articles
# whose table list is empty.
extraction_path = 'extracted_tables/deepmatcher.json'

save_tables_to_json(articles_tables_map, extraction_path)

Article ID: 1503.02427 - # Tables Found: 2
Article ID: 1506.07656 - # Tables Found: 7
Article ID: 1802.05664 - # Tables Found: 0
Article ID: 2102.07134 - # Tables Found: 3
Article ID: 2103.04489 - # Tables Found: 9
Article ID: 2211.06975 - # Tables Found: 13
Article ID: 2301.02993 - # Tables Found: 8
