# Taxonomy Dictionary

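Here, the category taxonomy is scraped from the arXiv website and collected into a dictionary that maps each main category to a list of its subcategories (short name and long name).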

In [17]:
import requests
from bs4 import BeautifulSoup

def crawl_arxiv_taxonomy(url, timeout=10):
    """Scrape a category taxonomy page into a dictionary.

    Args:
        url: Address of the taxonomy page to download.
        timeout: Seconds to wait for the server before giving up. Passed to
            ``requests.get`` so a stalled connection cannot hang the notebook.

    Returns:
        A dict mapping each main category name to a list of
        ``{'short_name': ..., 'long_name': ...}`` dicts, in page order.
        Returns ``None`` (after printing a message) when the page cannot be
        fetched, either because of a network error or a non-200 status code.
    """
    # Fetch the webpage content. Network errors and timeouts are reported the
    # same way as a bad status code so callers see a single failure signal.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        print("Failed to fetch the webpage.")
        return None
    if response.status_code != 200:
        print("Failed to fetch the webpage.")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Initialize an empty dictionary to store the taxonomy
    taxonomy = {}

    # Find all subcategory headings (<h4>) below the taxonomy list container
    subcategory_elements = soup.select('#category_taxonomy_list > div > div > div > div > div > h4')

    # Iterate over subcategories
    for subcategory_element in subcategory_elements:
        # The nearest preceding accordion <h2> names the main category
        main_category_element = subcategory_element.find_previous('h2', class_='accordion-head')
        long_name_element = subcategory_element.find('span')
        heading_words = subcategory_element.text.split()

        # Skip headings that do not have the expected structure (no parent
        # heading, no <span> with the long name, or blank text) instead of
        # failing on None.text or an empty split().
        if main_category_element is None or long_name_element is None or not heading_words:
            continue

        main_category_name = main_category_element.text.strip()
        short_name = heading_words[0]  # Extracting only the abbreviation
        # Trim surrounding whitespace first so the enclosing parentheses are
        # actually at the ends of the string when they are stripped.
        long_name = long_name_element.text.strip().strip('()')

        # Add subcategory to the taxonomy dictionary under the main category
        taxonomy.setdefault(main_category_name, []).append(
            {'short_name': short_name, 'long_name': long_name}
        )

    return taxonomy

# URL of the arXiv category taxonomy page
url = 'https://arxiv.org/category_taxonomy'

# Crawl the taxonomy
taxonomy_data = crawl_arxiv_taxonomy(url)

# crawl_arxiv_taxonomy returns None when the page cannot be fetched. Stop here
# with a clear error rather than failing on None.items() in the loop below.
if taxonomy_data is None:
    raise RuntimeError(f"Could not retrieve the arXiv taxonomy from {url}")

# Print the extracted taxonomy: one block per main category, followed by a
# blank line, listing each subcategory's short and long name.
for main_category, subcategories in taxonomy_data.items():
    print("Main Category:", main_category)
    for subcategory in subcategories:
        print("  Short Name:", subcategory['short_name'])
        print("  Long Name:", subcategory['long_name'])
    print()


Main Category: Computer Science
  Short Name: cs.AI
  Long Name: Artificial Intelligence
  Short Name: cs.AR
  Long Name: Hardware Architecture
  Short Name: cs.CC
  Long Name: Computational Complexity
  Short Name: cs.CE
  Long Name: Computational Engineering, Finance, and Science
  Short Name: cs.CG
  Long Name: Computational Geometry
  Short Name: cs.CL
  Long Name: Computation and Language
  Short Name: cs.CR
  Long Name: Cryptography and Security
  Short Name: cs.CV
  Long Name: Computer Vision and Pattern Recognition
  Short Name: cs.CY
  Long Name: Computers and Society
  Short Name: cs.DB
  Long Name: Databases
  Short Name: cs.DC
  Long Name: Distributed, Parallel, and Cluster Computing
  Short Name: cs.DL
  Long Name: Digital Libraries
  Short Name: cs.DM
  Long Name: Discrete Mathematics
  Short Name: cs.DS
  Long Name: Data Structures and Algorithms
  Short Name: cs.ET
  Long Name: Emerging Technologies
  Short Name: cs.FL
  Long Name: Formal Languages and Automata Theory
 

In [18]:
# Persist the taxonomy dictionary with IPython's %store magic so it can be
# restored in a later kernel session with `%store -r taxonomy_data`.
%store taxonomy_data

Stored 'taxonomy_data' (dict)
