In [1]:
import json
import os
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from lxml import html

In [2]:
def build_search_url(query: str, size: int, start: int) -> str:
    """Build the arXiv search URL for one page of results.

    Args:
        query: Free-text search terms. Leading/trailing whitespace is dropped
            and inner runs of whitespace are collapsed to a single separator.
        size: Value of the ``size`` URL parameter (results per page).
        start: Value of the ``start`` URL parameter (offset of the first result).

    Returns:
        The complete search URL as a string.
    """
    # Percent-encode the terms (spaces become ``+``) so that characters such
    # as ``&``, ``#``, ``=`` or ``+`` stay inside the ``query`` parameter
    # instead of being interpreted as URL syntax.
    query_parsed = quote_plus(' '.join(query.split()))

    return f"https://arxiv.org/search/?query={query_parsed}&searchtype=all&source=header&order=-announced_date_first&size={size}&start={start}"

def download_page(url, save_folder):
    """Download ``url`` and save the response body as an ``.html`` file.

    The file is written to ``save_folder`` and named after the last path
    segment of ``url`` with ``.html`` appended. The outcome is reported with
    ``print``; nothing is returned.

    Args:
        url: Address of the page to fetch.
        save_folder: Directory the file is written to (created if missing).
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        # Report and carry on, like the non-200 branch below, so one bad
        # page does not abort the whole run.
        print(f"Failed to download page from: {url} ({error})")
        return

    if response.status_code != 200:
        print(f"Failed to download page from: {url}")
        return

    os.makedirs(save_folder, exist_ok=True)
    # Drop trailing slashes first; otherwise the last segment is empty and
    # every such page would be written to the same ".html" file.
    page_name = url.rstrip("/").split("/")[-1]
    file_name = os.path.join(save_folder, page_name + ".html")
    with open(file_name, "wb") as f:
        f.write(response.content)
    print(f"Page downloaded: {file_name}")

def download_articles(articles, save_folder):
    """Download the ar5iv HTML page of every search result that has one.

    For each result the first ``<a>`` link is taken, its host is rewritten
    from ``arxiv.org`` to ``ar5iv.org``, and a HEAD request (following
    redirects) is issued. When the final URL equals the original arXiv link
    the article is skipped; otherwise the ar5iv page is downloaded.
    NOTE(review): presumably ar5iv redirects back to arXiv when it has no
    HTML rendering of the paper -- confirm.

    Args:
        articles: Iterable of parsed result elements exposing ``find("a")``
            (as produced by BeautifulSoup's ``find_all``).
        save_folder: Directory the downloaded pages are written to.
    """
    for article in articles:
        anchor = article.find("a")
        link = anchor.get("href") if anchor is not None else None
        if not link:
            # A result without a usable link used to raise TypeError/KeyError
            # and abort the whole batch.
            print("Search result without a link. Skipping...")
            continue
        html_link = link.replace("arxiv.org", "ar5iv.org")

        try:
            # Timeout so a single stalled request cannot hang the crawl.
            response_mod = requests.head(html_link, allow_redirects=True, timeout=30)
        except requests.RequestException as error:
            print(f"Failed to check {html_link} ({error}). Skipping...")
            continue

        if response_mod.url == link:
            print(f"{link} redirects to the original link. Skipping...")
        else:
            print(f"{link} doesn't redirect to the original link. Downloading...")
            download_page(html_link, save_folder)

def get_articles(query: str, save_folder: str):
    """Page through the arXiv search results for ``query`` and download them.

    Result pages are requested 200 entries at a time. Each page is parsed
    for ``<p class="list-title is-inline-block">`` elements, which are handed
    to ``download_articles``. Paging stops at the first page without such
    elements or at the first failed request.

    Args:
        query: Free-text search terms.
        save_folder: Directory for the downloaded pages (created if missing).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_folder, exist_ok=True)

    start = 0
    size = 200

    while True:
        search_url = build_search_url(query, size, start)
        try:
            # Timeout so a stalled search request cannot hang the notebook.
            response = requests.get(search_url, timeout=30)
        except requests.RequestException as error:
            print(f"Search request failed: {search_url} ({error})")
            break

        if response.status_code != 200:
            # Say why paging stopped instead of ending silently.
            print(f"Search request failed with status {response.status_code}: {search_url}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        articles = soup.find_all("p", class_="list-title is-inline-block")

        if not articles:
            break

        download_articles(articles, save_folder)

        start += size

In [3]:
# Page through the arXiv search results for this query and save the pages
# that get_articles downloads into `directory_path`.
search_query = "entity resolution"
directory_path = "experiments/articles/entity resolution"

get_articles(search_query, directory_path)

https://arxiv.org/abs/2404.01140 redirects to the original link. Skipping...
https://arxiv.org/abs/2403.20329 redirects to the original link. Skipping...
https://arxiv.org/abs/2403.19036 redirects to the original link. Skipping...
https://arxiv.org/abs/2403.17469 redirects to the original link. Skipping...
https://arxiv.org/abs/2403.17245 redirects to the original link. Skipping...
https://arxiv.org/abs/2403.12092 redirects to the original link. Skipping...
https://arxiv.org/abs/2403.06434 redirects to the original link. Skipping...
https://arxiv.org/abs/2403.06097 redirects to the original link. Skipping...
https://arxiv.org/abs/2403.05895 redirects to the original link. Skipping...
https://arxiv.org/abs/2402.17543 redirects to the original link. Skipping...
https://arxiv.org/abs/2402.13714 doesn't redirect to the original link. Downloading...
Page downloaded: articles/entity resolution\2402.13714.html
https://arxiv.org/abs/2402.13571 redirects to the original link. Skipping...
https:

In [2]:
def clean_html(element, attributes_to_keep=frozenset({'rowspan', 'colspan'})):
    """Strip attributes from ``element`` and all its descendants, in place.

    Every attribute whose name is not in ``attributes_to_keep`` is deleted,
    which covers ``class``, ``style`` and ``id`` as well.

    Args:
        element: Root element of the subtree to clean; must provide
            ``iter()`` and nodes with an ``attrib`` mapping.
        attributes_to_keep: Attribute names to preserve. Defaults to
            ``rowspan`` and ``colspan``.

    Returns:
        The same ``element`` object, after modification.
    """
    for child in element.iter():
        # Iterate over a snapshot of the keys: the mapping is mutated below.
        for attribute in list(child.attrib.keys()):
            if attribute not in attributes_to_keep:
                del child.attrib[attribute]
    return element


def extract_tables_from_html(html_file_path):
    """Extract every table figure of an HTML file as a cleaned HTML string.

    A table figure is any ``<figure>`` whose ``class`` attribute contains
    ``ltx_table``. Each match is cleaned with ``clean_html`` (which modifies
    the parsed tree in place), serialised, and stripped of newlines.

    Args:
        html_file_path: Path of the HTML file to parse.

    Returns:
        A list of dicts ``{'table': <html string>, 'processed': False}``,
        one per matching figure, in document order.
        NOTE(review): ``processed`` is presumably a flag for a later
        pipeline stage -- confirm.
    """
    tree = html.parse(html_file_path)
    tables_xpath = tree.xpath('//figure[contains(@class, "ltx_table")]')

    extracted_tables = []
    for table in tables_xpath:
        clean_table = clean_html(table)
        # with_tail=False: serialise the figure only. By default lxml also
        # appends the text that follows the closing tag, which would leak
        # unrelated document text into the table string.
        table_bytes = html.tostring(
            clean_table, method='html', encoding='utf-8', with_tail=False
        )
        table_string = table_bytes.decode(encoding='utf-8').replace('\n', '')
        extracted_tables.append({'table': table_string, 'processed': False})

    return extracted_tables

def extract_tables_from_directory(dir_path):
    """Run the table extraction on every ``.html`` file in ``dir_path``.

    Args:
        dir_path: Directory to scan (not recursive).

    Returns:
        A dict mapping each file name without its extension to the list
        returned by ``extract_tables_from_html`` for that file. Entries that
        do not end in ``.html`` are ignored.
    """
    html_names = (name for name in os.listdir(dir_path) if name.endswith(".html"))
    return {
        os.path.splitext(name)[0]: extract_tables_from_html(os.path.join(dir_path, name))
        for name in html_names
    }

def save_tables_to_json(extracted_tables_map, output_file):
    """Write the extracted tables to ``output_file`` as indented JSON.

    Articles whose table list is empty are left out of the file.

    Args:
        extracted_tables_map: Dict mapping an article id to its list of
            extracted tables.
        output_file: Path of the JSON file to write; missing parent
            directories are created.
    """
    filtered_tables_map = {
        article_id: tables
        for article_id, tables in extracted_tables_map.items()
        if tables
    }

    # Create the target folder up front so a long extraction run is not
    # lost to a FileNotFoundError at the very last step.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the file independent of the platform default.
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(filtered_tables_map, json_file, indent=4)

In [6]:
# Folder holding the downloaded articles, and the JSON file the tables go to.
directory_path = "experiments/articles/nba"
extraction_path = 'experiments/extracted_tables/nba.json'

articles_tables_map = extract_tables_from_directory(directory_path)

# Report the table count per article while accumulating the grand total.
num_tables = 0
for article_id in articles_tables_map:
    article_tables = articles_tables_map[article_id]
    num_article_tables = len(article_tables)
    print(f"Article ID: {article_id} - # Tables Found: {num_article_tables}")
    num_tables = num_tables + num_article_tables
print(f"\nTotal number of tables found: {num_tables}")

# save_tables_to_json drops articles without tables before writing.
save_tables_to_json(articles_tables_map, extraction_path)

Article ID: 0709.3487 - # Tables Found: 1
Article ID: 0904.2060 - # Tables Found: 0
Article ID: 1008.0705 - # Tables Found: 6
Article ID: 1107.5793 - # Tables Found: 0
Article ID: 1109.2825 - # Tables Found: 1
Article ID: 1201.0058 - # Tables Found: 2
Article ID: 1210.2452 - # Tables Found: 5
Article ID: 1301.3523 - # Tables Found: 8
Article ID: 1302.4735 - # Tables Found: 1
Article ID: 1401.0942 - # Tables Found: 1
Article ID: 1403.7548 - # Tables Found: 4
Article ID: 1404.2927 - # Tables Found: 0
Article ID: 1405.0231 - # Tables Found: 5
Article ID: 1501.00060 - # Tables Found: 1

Total number of tables found: 35
