In [1]:
import arxiv
import re
import time
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [2]:
def get_arxiv_articles_with_html(query, max_results=100):
    '''
    Return list of ArXiV documents offering an (LaTeXML-postprocessed) HTML variant
    '''
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )
    
    articles_with_html = []
    
    for result in client.results(search):
        if any(link.title == "pdf" for link in result.links):
            articles_with_html.append({
                'id': result.entry_id,
                'title': result.title,
                'pdf_url': result.entry_id.replace('abs', 'pdf') + '.pdf',
                'html_url': result.entry_id.replace('/abs/', '/html/')
            })
    
    return articles_with_html



def download_plain_text_from_html(url):
    '''
    Download text from URL
    '''
    # Send a GET request to the URL
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the text from the HTML
    text = soup.get_text()

    return text

def beautify_text(text):
    # Remove leading and trailing whitespaces
    text = text.strip()
    
    # Normalize multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    
    # Handle special cases for specific artifacts
    replacements = {
        r'\\xa0': ' ',  # Replace non-breaking space with regular space
        r'\\u2062': '',  # Remove invisible separator
        r'\\n': '\n',  # Normalize escaped newlines
        r'\\t': '\t',  # Normalize escaped tabs
    }
    
    for pattern, replacement in replacements.items():
        text = re.sub(pattern, replacement, text)
    
    # Remove any remaining escape sequences
    text = re.sub(r'\\', '', text)
    
    return text

def extract_title(url):
    """
    Extract title from ArXiV HTML website
    """
    # Send a GET request to the URL
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the title element using its class
    title_element = soup.find('h1', class_='ltx_title ltx_title_document')
    
    # Extract and return the title text
    if title_element:
        return title_element.get_text(strip=True)
    else:
        return "Title not found"

def extract_authors(url):
    '''
    Extract author information from ArXiV HTML website
    '''
    # Send a GET request to the URL
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the authors div
    authors_div = soup.find('div', class_='ltx_authors')
    
    # Extract and return the names of the authors
    authors = []
    if authors_div:
        author_spans = authors_div.find_all('span', class_='ltx_personname')
        for span in author_spans:
            authors.append(span.get_text(strip=True))
    
    return authors

# Function to extract date and domain from the given URL using Selenium
def extract_date_and_domain(url, sec_to_timeout:int=3):
    # Initialize the WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run headless browser
    driver = webdriver.Chrome(options=options)

    # Load the page
    driver.get(url)

    try:
        # Wait for the watermark-tr div to be present
        watermark_div = WebDriverWait(driver, sec_to_timeout).until(
            EC.presence_of_element_located((By.ID, "watermark-tr"))
        )

        # Extract the text from the div
        text = watermark_div.text.strip()

        # Extract the arXiv ID, domain, and date using regular expression
        match = re.search(r'arXiv:\d{4}\.\d{5}(?:v\d)? \[(.*?)\] (\d{2} \w{3} \d{4})', text)
        if match:
            domain = match.group(1)
            date = match.group(2)
        else:
            domain = ''
            date = ''
    except TimeoutException:
        # Handle timeout exception
        domain = ''
        date = ''
    finally:
        # Close the WebDriver
        driver.quit()

    return {'date': date, 'domain': domain}

# Function to extract abstract from the given URL using Selenium
def extract_abstract(url, sec_to_timeout=10):
    # Initialize the WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run headless browser
    driver = webdriver.Chrome(options=options)

    # Load the page
    driver.get(url)

    try:
        # Wait for the abstract div to be present
        abstract_div = WebDriverWait(driver, sec_to_timeout).until(
            EC.presence_of_element_located((By.ID, "abstract"))
        )

        # Extract the text from the div
        abstract_text = abstract_div.text.strip()
    except TimeoutException:
        # Handle timeout exception
        abstract_text = None
    finally:
        # Close the WebDriver
        driver.quit()

    return abstract_text

def extract_author_emails(url, sec_to_timeout=5):
    # Initialize the WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run headless browser
    driver = webdriver.Chrome(options=options)

    # Load the page
    driver.get(url)

    try:
        # Wait for the authors and emails to be present
        authors_div = WebDriverWait(driver, sec_to_timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "ltx_authors"))
        )
        
        # Extract author names
        author_elements = authors_div.find_elements(By.CLASS_NAME, "ltx_personname")
        authors = [author.text.strip() for author in author_elements]

        # Extract emails
        email_elements = driver.find_elements(By.CLASS_NAME, "ltx_contact.ltx_role_email")
        emails = [email.text.strip() for email in email_elements]

        # Create a dictionary with author names and emails
        author_emails = {author: email for author, email in zip(authors, emails)}
    except TimeoutException:
        # Handle timeout exception
        author_emails = {}
    finally:
        # Close the WebDriver
        driver.quit()

    return author_emails

def post_process_emails(author_emails:dict):
    """
    Post-processes author/email dictinary extracted from ArXiV to match author names and email addresses (if present) 
    """
    # Initialize result dictionary
    processed_emails = {}
    unmatched_count = 1

    for names, emails in author_emails.items():
        name_list = [name.strip() for name in names.split(',')]
        email_list = [email.strip() for email in emails.split(',')]

        # Create a map for name substrings
        name_map = {name.split()[-1].lower(): name for name in name_list}

        for email in email_list:
            email_user = email.split('@')[0].lower()
            matched = False

            for key, full_name in name_map.items():
                if key in email_user:
                    processed_emails[full_name] = email
                    matched = True
                    break

            if not matched:
                processed_emails[f'unmatched_name_{unmatched_count}'] = email
                unmatched_count += 1

        # Assign empty string to names without an email address
        for name in name_list:
            if name not in processed_emails:
                processed_emails[name] = ''

    return processed_emails

def clean_text(text):
    # Remove unwanted LaTeX commands
    text = re.sub(r'\\[A-Za-z]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    text = text.replace('†', '')  # Remove any specific unwanted characters
    text = text.replace('&', '')  # Remove leading & symbols
    return text

def extract_emails_ltx_contact(url, sec_to_timeout=10):
    '''Extract author emails and institutions from an ArXiv paper page'''
    # Initialize the WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run headless browser
    driver = webdriver.Chrome(options=options)

    # Load the page
    driver.get(url)

    author_emails = {}
    author_institutions = {}
    unknown_author_counter = 1
    
    try:
        # Wait until the ltx_authors element is present
        WebDriverWait(driver, sec_to_timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "ltx_authors"))
        )

        authors = driver.find_elements(By.CLASS_NAME, "ltx_role_author")

        for author in authors:
            # Extracting the author's name
            name_elem = author.find_element(By.CLASS_NAME, "ltx_personname")
            name = clean_text(name_elem.text.strip())

            # Extracting the email address
            email_elem = author.find_elements(By.CLASS_NAME, "ltx_contact.ltx_role_email")
            email = email_elem[0].text.strip() if email_elem else ''

            # Cleaning the name if it has unwanted characters
            name = re.sub(r'[*\d]', '', name).strip()

            # Handling cases with multiple authors in one element
            if ' and ' in name or ', ' in name:
                multiple_names = re.split(r' and |, ', name)
                for n in multiple_names:
                    n = clean_text(n)
                    if not n:
                        n = f'unknown_author_{unknown_author_counter}'
                        unknown_author_counter += 1
                    author_emails[n.strip()] = email
            else:
                name = clean_text(name)
                if not name:
                    name = f'unknown_author_{unknown_author_counter}'
                    unknown_author_counter += 1
                author_emails[name] = email

            # Extracting the institution
            institution_elems = author.find_elements(By.CLASS_NAME, "ltx_contact.ltx_role_affiliation")
            if institution_elems:
                for inst_elem in institution_elems:
                    institution = clean_text(inst_elem.text.strip())
                    if institution not in author_institutions:
                        author_institutions[institution] = []
                    author_institutions[institution].append(name)

        for institution in author_institutions:
            author_institutions[institution] = list(set(author_institutions[institution]))

    finally:
        driver.quit()

    return {'emails': author_emails, 'institutions': author_institutions}
    
# debug
#print(text)

In [4]:
# 
email_url_list = {
    'https://arxiv.org/html/2408.00630v1' : 4, # 4, easy
    'https://arxiv.org/html/2408.00164v1' : 1, # X 1 trivial, 1 mail
    'https://arxiv.org/html/2408.00685v1' : 1, # X 1, easy
    'https://arxiv.org/html/2408.00615v1' : 1, # 1, easy
    'https://arxiv.org/html/2405.01212v1' : 1, # 1  easy: 1 mail at btm
    'https://arxiv.org/html/2405.01201v1' : 1, # 1  easy: again
    'https://arxiv.org/html/2405.01380v1' : 1, # 1 easy
    'https://arxiv.org/html/2407.18948v1' : 1, # 1 easy, 1 email
    'https://arxiv.org/html/2407.20418v1' : 3, # 3, easy
    'https://arxiv.org/html/2405.01297v1' : 5, # 5 emails individually
    'https://arxiv.org/html/2405.01348v1' : 2, # 2 mails
    'https://arxiv.org/html/2405.01407v2' : 1, # 1 email, followed by institute (deloitte)
    'https://arxiv.org/html/2408.00095v1' : 2, # 2 emails, bit messy
    'https://arxiv.org/html/2408.00013v1' : 4, # 4 mails
    'https://arxiv.org/html/2407.21070v1' : 1, # 1 email
    'https://arxiv.org/html/2407.21132v1' : 2, # 2 emails
    'https://arxiv.org/html/2407.20457v1' : 1, # 1 tricky: email hidden at the end
    'https://arxiv.org/html/2405.00661v1' : 2, # 2, leading $1$ that requires filtering
    'https://arxiv.org/html/2408.00014v1' : 3, # 3 emails
    'https://arxiv.org/html/2408.00024v1' : 2, # 2 tricky, 2 mails at the end of names
    'https://arxiv.org/html/2408.00041v1' : 7, # 7 emails right after each name
    'https://arxiv.org/html/2408.00014v1' : 3, # 3 mails at btm
    'https://arxiv.org/html/2408.00003v1' : 3, # 3 mails at btm followed by `email"` each
    'https://arxiv.org/html/2408.00265v1' : 4, # 4 emails below name
    'https://arxiv.org/html/2408.00131v1' : 1, # 1 challenging: 1 mail hyperlinked to one of the names
    'https://arxiv.org/html/2402.06758v3' : 2, # 2 challenging 2 mails hyperlinked
    'https://arxiv.org/html/2408.00291v1' : 2, # 2 mails at the end
    'https://arxiv.org/html/2408.00732v1' : 1, # 1 email at the end
    'https://arxiv.org/html/2408.00688v1' : 1, # 1 tricky, 1 email at the bottom
    'https://arxiv.org/html/2408.00721v1' : 4, # 4, at the end of names
    'https://arxiv.org/html/2408.00670v1' : 3, # 3 challenging, burried in names
    'https://arxiv.org/html/2408.00336v1' : 2, # 2, tricky, only 2 authors
    'https://arxiv.org/html/2407.20927v1' : 1, # 1, tricky, only one author
    'https://arxiv.org/html/2408.00322v1' : 1, # 1, tricky, not first but second though
    'https://arxiv.org/html/2407.20747v1' : 2, # 2, tricky,
    'https://arxiv.org/html/2408.00757v1' : 2, # 2 tricky
    'https://arxiv.org/html/2408.00725v1' : 4, # 4 tricky, linked with name
    'https://arxiv.org/html/2408.00709v1' : 0, # challenging, links with names but NO email adresses
    'https://arxiv.org/html/2408.00666v1' : 5, # tricky, linkes w name
    'https://arxiv.org/html/2408.00614v1' : 3, # linked with name
    'https://arxiv.org/html/2407.18727v1' : 1, # one, linked
    'https://arxiv.org/html/2407.19187v1' : 2, # tricky, layout appears kaputt
    'https://arxiv.org/html/2407.20053v1' : 7, # linked
    'https://arxiv.org/html/2407.19909v1' : 4, # linked
    'https://arxiv.org/html/2407.20624v1' : 2, # 
    'https://arxiv.org/html/2407.18638v2' : 2, # 2 emails but 16 authors
    'https://arxiv.org/html/2408.00509v1' : 1, # tricky, 1 asterisk with 3 authors
    'https://arxiv.org/html/2408.00560v1' : 2, # tricky, 2 mails but a dozen authors or so
    'https://arxiv.org/html/2408.00589v1' : 1, # tricky, 1 email but 3 authors, linked at end
    'https://arxiv.org/html/2408.00111v1' : 2, # challenging, 2 emails at the end to at the beginning linked via "equally controbuted"
    'https://arxiv.org/html/2407.21724v1' : 2, # tricky, linked all over.
}

for url, trg_number in email_url_list.items():
    print(extract_emails_ltx_contact(url))
    print()
    break
#    break

TimeoutException: Message: 
Stacktrace:
#0 0x561d037d971a <unknown>
#1 0x561d034aa640 <unknown>
#2 0x561d034f9c0b <unknown>
#3 0x561d034f9ef1 <unknown>
#4 0x561d0353db64 <unknown>
#5 0x561d0351c90d <unknown>
#6 0x561d0353b08a <unknown>
#7 0x561d0351c683 <unknown>
#8 0x561d034ecd71 <unknown>
#9 0x561d034ed7de <unknown>
#10 0x561d037a12ab <unknown>
#11 0x561d037a5242 <unknown>
#12 0x561d0378e665 <unknown>
#13 0x561d037a5dd2 <unknown>
#14 0x561d037732af <unknown>
#15 0x561d037c8eb8 <unknown>
#16 0x561d037c9090 <unknown>
#17 0x561d037d84ec <unknown>
#18 0x1550799d16ea start_thread


In [5]:
url

'https://arxiv.org/html/2408.00630v1'

In [None]:
{'emails': {'T. Thiemann  Inst. for Quantum Gravity': '', 'FAU Erlangen – Nürnberg': '', 'Staudtstr.': '', 'Erlangen': '', 'Germany': ''}, 'institutions': {}}

In [67]:
# Function to extract author emails from the given URL using Selenium for ltx_contact ltx_role_email

# Function to extract author emails and institutions from the given URL using Selenium for ltx_contact ltx_role_email
def extract_emails_ltx_contact_1(url, sec_to_timeout=10):
    '''Newest
    '''
    # Initialize the WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run headless browser
    driver = webdriver.Chrome(options=options)

    # Load the page
    driver.get(url)

    author_emails = {}
    author_institutions = {}

    try:
        # Wait for the authors and emails to be present
        authors_div = WebDriverWait(driver, sec_to_timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "ltx_authors"))
        )

        # Extract author names and superscripts
        author_elements = authors_div.find_elements(By.CLASS_NAME, "ltx_personname")
        authors = [author.text.strip() for author in author_elements]

        # Extract institutions superscripts
        institution_superscripts = []
        for author in authors:
            superscripts = re.findall(r'\d+', author)
            institution_superscripts.append(superscripts)

        # Remove superscripts from author names
        author_names = []
        for author in authors:
            clean_name = re.sub(r'\d+', '', author).strip()
            author_names.append(clean_name)

        # Extract emails
        email_elements = driver.find_elements(By.CLASS_NAME, "ltx_contact.ltx_role_email")
        emails = [email.text.strip() for email in email_elements]

        # Create a dictionary with author names and emails
        author_emails = {author: email for author, email in zip(author_names, emails)}

        # Extract institutions
        institution_elements = driver.find_elements(By.CLASS_NAME, "ltx_contact.ltx_role_address")
        institution_texts = [inst.text.strip() for inst in institution_elements]

        # Handle cases where institution and email are directly under author
        author_notes_divs = authors_div.find_elements(By.CLASS_NAME, "ltx_author_notes")
        for note in author_notes_divs:
            try:
                author_name_element = note.find_element(By.XPATH, './preceding-sibling::span[@class="ltx_personname"]')
                author_name = re.sub(r'\d+', '', author_name_element.text).strip()
                email_elements = note.find_elements(By.CLASS_NAME, "ltx_contact ltx_role_email")
                address_elements = note.find_elements(By.CLASS_NAME, "ltx_contact ltx_role_address")
                for email_element in email_elements:
                    email = email_element.text.strip()
                    author_emails[author_name] = email
                for address_element in address_elements:
                    institution = address_element.text.strip()
                    institution = re.sub(r'^\d+', '', institution).strip()  # Remove leading digits
                    if institution not in author_institutions:
                        author_institutions[institution] = []
                    author_institutions[institution].append(author_name)
            except NoSuchElementException:
                continue  # Skip if no preceding sibling with class "ltx_personname"

        # Create dictionary for author institutions
        for author, superscripts in zip(author_names, institution_superscripts):
            for superscript in superscripts:
                if superscript.isdigit():
                    index = int(superscript) - 1
                    if index < len(institution_texts):
                        institution = institution_texts[index]
                        institution = re.sub(r'^\d+', '', institution).strip()  # Remove leading digits
                        if institution not in author_institutions:
                            author_institutions[institution] = []
                        author_institutions[institution].append(author)

        # Additional case handling for when author, institution, and email are in a different structure
        text_elements = driver.find_elements(By.CLASS_NAME, "ltx_text.ltx_font_sansserif")
        for element in text_elements:
            text = element.text.strip()
            if re.search(r'@', text):  # This is an email address, not an institution
                author_match = re.search(r'^(.*?)(?=[\s]*@)', text)
                if author_match:
                    author_name = author_match.group(1).strip()
                    author_name = re.sub(r'\d+', '', author_name).strip()
                    email = text.strip()
                    author_emails[author_name] = email
                continue  # Skip further processing for email elements
            
            # Process for institution elements
            if re.search(r'Inst\.|Department|Univ\.|School|Affiliation', text):
                institution = text.strip()
                institution = re.sub(r'^\d+', '', institution).strip()  # Remove leading digits
                # Extract the author name related to this institution
                preceding_author_elements = element.find_elements(By.XPATH, 'preceding::span[@class="ltx_personname"]')
                if preceding_author_elements:
                    author_name = re.sub(r'\d+', '', preceding_author_elements[-1].text).strip()
                    if institution not in author_institutions:
                        author_institutions[institution] = []
                    author_institutions[institution].append(author_name)

        # Additional handling for affiliations in ltx_author_notes
        affiliation_elements = driver.find_elements(By.CLASS_NAME, "ltx_contact.ltx_role_affiliation")
        for element in affiliation_elements:
            institution = element.text.strip()
            institution = re.sub(r'^\d+', '', institution).strip()  # Remove leading digits
            author_name_element = element.find_element(By.XPATH, './preceding::span[@class="ltx_personname"]')
            author_name = re.sub(r'\d+', '', author_name_element.text).strip()
            if institution not in author_institutions:
                author_institutions[institution] = []
            author_institutions[institution].append(author_name)

    except TimeoutException:
        # Handle timeout exception
        author_emails = {}
        author_institutions = {}
    finally:
        # Close the WebDriver
        driver.quit()

    return {'emails': author_emails, 'institutions': author_institutions}

In [None]:
expected_output = {
    'https://arxiv.org/html/2408.00164v1' : {'emails': {'Manish Ramchander': 'manishd@imsc.res.in'}, 'institutions': {}},
    'https://arxiv.org/html/2408.00685v1' : {'emails': {'Debmalya Sain': 'saindebmalya@gmail.com'},
                                             'institutions': {'(Sain) Department of Mathematics, Indian Institute of Information Technology Raichur, Karnataka 584135, India': ['Debmalya Sain']}},
    '' : '',
    
    
} 

In [1]:
ten_expected_outputs = [
{'emails': {'Giulia Cusin' : 'cusin@iap.fr', 'Cyril Pitrou' : 'pitrou@iap.fr', 'Camille Bonvin' : 'camille.bonvin@unige.ch', 
            'Aurélien Barrau' : 'barrau@lpsc.in2p3.fr'},
 'institutions': {'Institut d’Astrophysique de Paris, UMR-7095 du CNRS et de Sorbonne Université, Paris, France' : ['Giulia Cusin', 'Cyril Pitrou'],
                  'Département de Physique Théorique and Center for Astroparticle Physics, Université de Genève, Quai E. Ansermet 24, CH-1211 Genève 4, Switzerland' : ['Giulia Cusin','Camille Bonvin'],
                  'Laboratoire de Physique Subatomique et de Cosmologie, Univ. Grenoble-Alpes, CNRS-IN2P3, 53 av. des Martyrs, 38026 Grenoble, France' : ['Aurélien Barrau', 'Killian Martineau']}}


]


In [None]:
ten_examples_provided = [
    'https://arxiv.org/html/2405.01297v1',
]

In [75]:
keys[0]

'https://arxiv.org/html/2408.00164v1'

In [74]:
extract_emails_ltx_contact_5(keys[1])

{'emails': {'Debmalya Sain': 'saindebmalya@gmail.com'}, 'institutions': {}}

In [None]:
author_emails['institutions']

In [None]:
url_list = ['https://arxiv.org/html/2408.00175v1',
           'https://arxiv.org/html/2408.00524v1',
           'https://arxiv.org/html/2406.00051v1',
           'https://arxiv.org/html/2408.00168v1',
           'https://arxiv.org/html/2408.00722v1',
           'https://arxiv.org/html/2408.00571v1',
           'https://arxiv.org/html/2407.18402v1',
           'https://arxiv.org/html/2407.18820v1',
           'https://arxiv.org/html/2407.18426v1',
           'https://arxiv.org/html/2405.00116v1',
           'https://arxiv.org/html/2405.00374v1',
           'https://arxiv.org/html/2405.00661v1']

for url_loc in url_list:
    # be kind
    time.sleep(0.3)
    # print
    print(post_process_emails(extract_author_emails(url_loc)))
    #print(extract_abstract(url_loc))
    

In [None]:
url_text = download_plain_text_from_html('https://arxiv.org/html/2402.14703')

In [None]:
def post_process_emails(author_emails:dict):
    """
    Post-processes author/email dictinary extracted from ArXiV to match author names and email addresses (if present) 
    """
    # Initialize result dictionary
    processed_emails = {}
    unmatched_count = 1

    for names, emails in author_emails.items():
        name_list = [name.strip() for name in names.split(',')]
        email_list = [email.strip() for email in emails.split(',')]

        # Create a map for name substrings
        name_map = {name.split()[-1].lower(): name for name in name_list}

        for email in email_list:
            email_user = email.split('@')[0].lower()
            matched = False

            for key, full_name in name_map.items():
                if key in email_user:
                    processed_emails[full_name] = email
                    matched = True
                    break

            if not matched:
                processed_emails[f'unmatched_name_{unmatched_count}'] = email
                unmatched_count += 1

        # Assign empty string to names without an email address
        for name in name_list:
            if name not in processed_emails:
                processed_emails[name] = ''

    return processed_emails

# Example usage
author_emails = {
    'Isabella Danhoni, Guy D. Moore': 'idanhoni@theorie.ikp.physik.tu-darmstadt.de,guy.moore@physik.tu-darmstadt.de'
}
processed_emails = post_process_emails(author_emails)
print("Processed Emails:", processed_emails)

In [None]:
# Send a GET request to the URL
url = 'https://arxiv.org/html/2402.14703'
response = requests.get(url)
response.raise_for_status()  # Check if the request was successful

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find the div containing the date and domain
watermark_div = soup.select_one('div#watermark-tr')

In [None]:
watermark_div