In [6]:
import sys
import os

from acl_papers import modules
from acl_papers import ACL_PAPER_PATH


COLUMNS = ['paper_type', 'title', 'abstract', 'pub_year', 'relevancy_status', 'publication']
TITLE_CLASS = 'd-sm-flex align-items-stretch'
ABSTRACT_CLASS = 'card bg-light mb-2 mb-lg-3 collapse abstract-collapse'

output_file = os.path.join(ACL_PAPER_PATH, 'acl_data.csv')

df = modules.load_or_create_df(output_file, columns=COLUMNS)
df

Unnamed: 0,paper_type,title,abstract,pub_year,relevancy_status,publication
0,long,Enhancing Ethical Explanations of Large Langua...,An increasing amount of research in Natural La...,2024,,EACL
1,long,Multi-Relational Hyperbolic Word Embeddings fr...,Natural language definitions possess a recursi...,2024,,EACL
2,long,Anisotropy Is Inherent to Self-Attention in Tr...,The representation degeneration problem is a p...,2024,,EACL
3,long,Generating Benchmarks for Factuality Evaluatio...,Before deploying a language model (LM) within ...,2024,,EACL
4,long,"Leak, Cheat, Repeat: Data Contamination and Ev...",Natural Language Processing (NLP) research is ...,2024,,EACL
...,...,...,...,...,...,...
3374,main,Diagnosing Vision-and-Language Navigation: Wha...,Vision-and-language navigation (VLN) is a mult...,2022,,NAACL
3375,main,Aligning to Social Norms and Values in Interac...,We focus on creating agents that act in alignm...,2022,,NAACL
3376,main,"MOVER: Mask, Over-generate and Rank for Hyperb...","Despite being a common figure of speech, hyper...",2022,,NAACL
3377,main,Embarrassingly Simple Performance Prediction f...,"The task of natural language inference (NLI), ...",2022,,NAACL


In [5]:
CONFS = {
    2024: [
        {
            'url': 'https://aclanthology.org/volumes/2024.eacl-long/',
            'venue': 'EACL',
            'type': 'long',
        },
        {
            'url': 'https://aclanthology.org/volumes/2024.eacl-short/',
            'venue': 'EACL',
            'type': 'short',
        },
        {
            'url': 'https://aclanthology.org/volumes/2024.naacl-long/',
            'venue': 'NAACL',
            'type': 'long',
        },
        {
            'url': 'https://aclanthology.org/volumes/2024.naacl-short/',
            'venue': 'NAACL',
            'type': 'short',
        },
    ],
    2023:[
        {
            'url': 'https://aclanthology.org/volumes/2023.eacl-main/',
            'venue': 'EACL',
            'type': 'main',
        },
        {
            'url': 'https://aclanthology.org/volumes/2023.emnlp-main/',
            'venue': 'EMNLP',
            'type': 'main',
        },
        ],
    2022: [
        {
            'url':'https://aclanthology.org/volumes/2022.emnlp-main/',
            'venue': 'EMNLP',
            'type': 'main'
        },
        {
            'url': 'https://aclanthology.org/volumes/2022.naacl-main/',
            'venue': 'NAACL',
            'type': 'main'
        }
    ]
}

ROWS = []

for year, confs in CONFS.items():
    for conf in confs:
        soup = modules.crawling(url=conf['url'])
        html_titles = soup.find_all('p', class_=TITLE_CLASS)[1:]
        html_abstracts = soup.find_all('div', class_=ABSTRACT_CLASS)

        assert(len(html_titles), len(html_abstracts))

        for html_title, html_abstract in zip(html_titles, html_abstracts):

            title = html_title.find_all(lambda tag: tag.name == 'a' and 'class' in tag.attrs and tag.attrs['class'] == ['align-middle'])
            
            abstract = html_abstract.find_all(lambda tag: tag.name == 'div' and 'class' in tag.attrs and tag.attrs['class'] == ['card-body', 'p-3', 'small'])

            row=dict()
        

            row['paper_type'] = conf['type']
            row['title'] = title[0].text
            row['abstract'] = abstract[0].text
            row['pub_year'] = year
            row['relevancy_status'] = None
            row['publication'] = conf['venue']
            
            ROWS.append(row)


modules.update_dataframe(path=output_file, columns=COLUMNS, new_rows=ROWS)

  assert(len(html_titles), len(html_abstracts))
