In [12]:
import os, sys
sys.path.append(os.path.dirname(os.getcwd()))
from utils import *
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [13]:
# Download the ACL 2018 schedule page and parse it into a BeautifulSoup tree.
url = 'https://acl2018.org/programme/schedule/'
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the schedule.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')


In [14]:
# Regexes that strip the "Session <n><letters>: " / "Poster Session <n><letters>: "
# prefix from a session heading and capture the area name that follows it.
# For oral sessions, trailing whitespace/digits are excluded from the capture.
ORAL_SESSION_TITLE_REGEX = r'Session \d+[A-Za-z]*: (.*?)\s*\d*$'
POSTER_SESSION_TITLE_REGEX = r'Poster Session \d+[A-Za-z]*: (.*?)$'
# Position, among the anchors of a single entry, of the anchor whose href ends
# with the paper id and whose text is the paper title.
# NOTE(review): presumably the preceding anchors are auxiliary links — verify against the page markup.
ORAL_PAPER_LINK_INDEX = 2
POSTER_PAPER_LINK_INDEX = 1


def _area_from_heading(title_regex, heading):
    """Return the area name captured by `title_regex` in `heading`.

    Raises:
        ValueError: if the heading does not match, so the offending text is
            visible instead of an opaque AttributeError on `None`.
    """
    match = re.search(title_regex, heading)
    if match is None:
        raise ValueError(f'Unexpected session heading: {heading!r}')
    return match.group(1)


def _paper_record(paper_link, area):
    """Build one output row from a paper anchor; the id is the last path segment of its href."""
    return {'id': paper_link['href'].split('/')[-1], 'title': paper_link.text, 'area': area}


papers = []
for day in soup.find_all(class_='day-program'):
    # Area names announced by the most recent session-name row; consumed by the
    # next oral-details row, whose columns line up with them one-to-one.
    current_titles = []
    for row in day.find_all('tr'):
        # .get avoids a KeyError on <tr> elements that carry no class attribute
        row_classes = row.get('class') or []
        is_area_track_info_row = 'session-name-row' in row_classes and 'conc-session-indiv-row' in row_classes
        is_orals_papers_row = 'conc-session-details-row' in row_classes
        is_poster_papers_row = 'poster-session-row' in row_classes and len(list(row.children)) > 1
        if is_area_track_info_row:
            current_titles = [
                _area_from_heading(ORAL_SESSION_TITLE_REGEX, div.text)
                for div in row.find_all(class_='conc-session-name')
            ]
        elif is_orals_papers_row:
            papers_per_area = row.find_all('td')
            # zip() would silently truncate on a mismatch, so check the alignment explicitly
            assert len(papers_per_area) == len(current_titles), \
                f'{len(papers_per_area)} paper columns but {len(current_titles)} session titles'
            for paper_column, area in zip(papers_per_area, current_titles):
                for paper_div in paper_column.find_all('div', class_='talk-title'):
                    paper_link = paper_div.find_all('a')[ORAL_PAPER_LINK_INDEX]
                    papers.append(_paper_record(paper_link, area))
            current_titles = []
        elif is_poster_papers_row:
            for subsession in row.find_all(class_='poster-sub-session'):
                raw_track_title = subsession.find(class_='poster-session-name').text
                # Sub-sessions headed exactly 'Tutorial' are skipped
                if raw_track_title == 'Tutorial':
                    continue
                track = _area_from_heading(POSTER_SESSION_TITLE_REGEX, raw_track_title)
                for paper_span in subsession.find_all('span'):
                    paper_link = paper_span.find_all('a')[POSTER_PAPER_LINK_INDEX]
                    papers.append(_paper_record(paper_link, track))

df = pd.DataFrame(papers)
df

Unnamed: 0,id,title,area
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation
...,...,...,...
442,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session
443,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session
444,618,Finding syntax in human encephalography with b...,Best Paper Session
445,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session


In [15]:
# The scraped schedule has no dedicated interpretability area, so we cannot tell
# which of the areas below interpretability papers may have been assigned to.
areas = set(df['area'])
areas

{'Argument Mining',
 'Best Paper Session',
 'Dialog System',
 'Dialog System, Discourse',
 'Dialog and Interactive Systems, Multilinguality',
 'Discourse',
 'Discourse, Linguistics, Cognitive Modeling',
 'Document Analysis',
 'Evaluation',
 'Generation',
 'Generation, Summarization',
 'Inference, Reasoning',
 'Information Extraction',
 'Information Extraction, Text Mining',
 'Information Retrieval',
 'Language/Document Model',
 'Linguistics, Psycholinguistics and Cognitive Modeling',
 'Machine Learning',
 'Machine Learning, Question Answering',
 'Machine Translation',
 'Machine Translation, Multilinguality',
 'Morphology, Tagging, Parsing',
 'Multilinguality',
 'Multimodal',
 'Parsing',
 'Parsing, Morphology',
 'Question Answering',
 'Resource, Annotation',
 'Resources and Evaluation',
 'Semantic Parsing',
 'Semantics',
 'Sentiment',
 'Sentiment Analysis and Argument Mining',
 'Social Media',
 'Student Research Workshop',
 'Summarization',
 'Summarization, Social Media',
 'System Demon

In [16]:
# Areas whose rows are removed from the frame before further processing.
NON_MAIN_CONFERENCE_AREAS = ['Student Research Workshop', 'System Demonstrations']

is_main_conference = ~df['area'].isin(NON_MAIN_CONFERENCE_AREAS)
# .copy() so that later column assignments operate on an independent frame
df = df.loc[is_main_conference].copy()
df

Unnamed: 0,id,title,area
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation
...,...,...,...
442,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session
443,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session
444,618,Finding syntax in human encephalography with b...,Best Paper Session
445,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session


In [17]:
# Add the bookkeeping columns in one step:
#  - interpretability is left empty: with no interpretability area we cannot
#    really know whether other areas include such papers
#  - doi is looked up per title via get_acl_anthology_doi
#  - source tags every row with the conference
df = df.assign(
    interpretability=None,
    doi=df['title'].apply(get_acl_anthology_doi),
    source='ACL2018',
)
df

Unnamed: 0,id,title,area,interpretability,doi,source
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics,,10.18653/v1/P18-1001,ACL2018
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics,,10.18653/v1/P18-1002,ACL2018
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics,,10.18653/v1/P18-1003,ACL2018
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics,,10.18653/v1/P18-1004,ACL2018
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation,,10.18653/v1/P18-1005,ACL2018
...,...,...,...,...,...,...
442,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,,10.18653/v1/P18-2124,ACL2018
443,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session,,,ACL2018
444,618,Finding syntax in human encephalography with b...,Best Paper Session,,10.18653/v1/P18-1254,ACL2018
445,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,,10.18653/v1/P18-1255,ACL2018


 # Saving the results

In [19]:
# Load the previously saved dataset and discard the 'Unnamed: 0' column that
# read_csv produces for the unnamed first column of the file.
previous_df = pd.read_csv('../data/parsed_data.csv').drop(columns='Unnamed: 0')
previous_df

Unnamed: 0,id,title,area,interpretability,doi,source
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics,,10.18653/v1/P18-1001,ACL2018
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics,,10.18653/v1/P18-1002,ACL2018
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics,,10.18653/v1/P18-1003,ACL2018
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics,,10.18653/v1/P18-1004,ACL2018
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation,,10.18653/v1/P18-1005,ACL2018
...,...,...,...,...,...,...
9277,4608,PAR: Political Actor Representation Learning w...,NLP Applications,False,10.18653/v1/2022.emnlp-main.824,EMNLP2022
9278,4613,JDDC 2.1: A Multimodal Chinese Dialogue Datase...,Resources and Evaluation,False,,EMNLP2022
9279,4618,PCL: Peer-Contrastive Learning with Diverse Au...,"Semantics: Lexical, Sentence level, Textual In...",False,10.18653/v1/2022.emnlp-main.826,EMNLP2022
9280,4621,Digging Errors in NMT: Evaluating and Understa...,Machine Translation,False,10.18653/v1/2022.emnlp-main.827,EMNLP2022


In [20]:
# Remove rows already stored for this conference so that appending the freshly
# scraped ones does not duplicate them.
is_other_source = previous_df['source'].ne('ACL2018')
previous_df = previous_df.loc[is_other_source]

In [21]:
# Append the newly scraped rows after the retained ones and renumber the index from 0.
frames_to_merge = [previous_df, df]
updated_df = pd.concat(frames_to_merge, ignore_index=True)
updated_df

Unnamed: 0,id,title,area,interpretability,doi,source
0,main.8,Large Scale Multi-Actor Generative Dialog Mode...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.8,ACL2020
1,main.52,CDL: Curriculum Dual Learning for Emotion-Cont...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.52,ACL2020
2,main.46,Emergence of Syntax Needs Minimal Supervision,Theory and Formalism in NLP (Linguistic and Ma...,False,10.18653/v1/2020.acl-main.46,ACL2020
3,main.359,Selecting Backtranslated Data from Multiple So...,Machine Translation,False,10.18653/v1/2020.acl-main.359,ACL2020
4,main.417,ParaCrawl: Web-Scale Acquisition of Parallel C...,Resources and Evaluation,False,10.18653/v1/2020.acl-main.417,ACL2020
...,...,...,...,...,...,...
9277,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,,10.18653/v1/P18-2124,ACL2018
9278,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session,,,ACL2018
9279,618,Finding syntax in human encephalography with b...,Best Paper Session,,10.18653/v1/P18-1254,ACL2018
9280,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,,10.18653/v1/P18-1255,ACL2018


In [22]:
# The index is written out on purpose: the loading cell drops the resulting
# 'Unnamed: 0' column when it reads this file back.
updated_df.to_csv('../data/parsed_data.csv', index=True)