In [2]:
import os, sys
# Put the parent of the current working directory on the import path so that
# the `utils` module imported below can be found from there.
sys.path.append(os.path.dirname(os.getcwd()))
# NOTE(review): wildcard import hides where names come from. A later cell
# calls `get_acl_anthology_doi`, which is not imported by name anywhere in
# this view, so it presumably comes from `utils` -- confirm and import it
# explicitly.
from utils import *
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re


In [3]:
# Download the ACL 2018 schedule page and parse it into a BeautifulSoup tree.
url = 'https://acl2018.org/programme/schedule/'
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page, which
# would produce an empty paper list further down.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')


In [8]:
# Regexes that capture the area/track name from a session heading. The oral
# pattern additionally drops optional trailing whitespace and digits that
# follow the name.
ORAL_SESSION_TITLE_RE = re.compile(r'Session \d+[A-Za-z]*: (.*?)\s*\d*$')
POSTER_SESSION_TITLE_RE = re.compile(r'Poster Session \d+[A-Za-z]*: (.*?)$')

# Position of the paper's own <a> among the links of one schedule entry.
# NOTE(review): the earlier links in each entry are skipped; what they point
# to is not visible here -- confirm against the page markup.
ORAL_PAPER_LINK_INDEX = 2
POSTER_PAPER_LINK_INDEX = 1


def _paper_record(entry, link_index, area):
    """Build one paper record from a single schedule entry.

    Args:
        entry: BeautifulSoup tag that contains the links of one paper.
        link_index: position, among the entry's <a> tags, of the paper link.
        area: area/track name to attach to the paper.

    Returns:
        dict with 'id' (last path segment of the link's href), 'title'
        (the link text) and 'area'.

    Raises:
        IndexError: if the entry has fewer than ``link_index + 1`` links.
    """
    paper_link = entry.find_all('a')[link_index]
    return {
        'id': paper_link['href'].split('/')[-1],
        'title': paper_link.text,
        'area': area,
    }


# Walk every day of the programme and collect one record per paper.
papers = []
day_schedules = soup.find_all(class_='day-program')
for day in day_schedules:
    # Area names announced by the most recent session-name row; they are
    # paired column-by-column with the details row that follows.
    current_titles = []
    for row in day.find_all('tr'):
        # .get() rather than row['class']: a <tr> without a class attribute
        # is simply skipped instead of aborting the scrape with KeyError.
        row_classes = row.get('class', [])
        is_area_track_info_row = (
            'session-name-row' in row_classes
            and 'conc-session-indiv-row' in row_classes
        )
        is_orals_papers_row = 'conc-session-details-row' in row_classes
        is_poster_papers_row = (
            'poster-session-row' in row_classes
            and len(list(row.children)) > 1
        )

        if is_area_track_info_row:
            title_divs = row.find_all(class_='conc-session-name')
            current_titles = [
                ORAL_SESSION_TITLE_RE.search(div.text).group(1)
                for div in title_divs
            ]
        elif is_orals_papers_row:
            papers_per_area = row.find_all('td')
            # Each <td> must line up with one announced session title.
            assert len(papers_per_area) == len(current_titles)
            for paper_column, area in zip(papers_per_area, current_titles):
                for paper_div in paper_column.find_all('div', class_='talk-title'):
                    papers.append(
                        _paper_record(paper_div, ORAL_PAPER_LINK_INDEX, area)
                    )
            # Titles are consumed; the next details row needs a fresh header.
            current_titles = []
        elif is_poster_papers_row:
            for subsession in row.find_all(class_='poster-sub-session'):
                raw_track_title = subsession.find(class_='poster-session-name').text
                # Sub-sessions titled exactly 'Tutorial' are skipped.
                if raw_track_title == 'Tutorial':
                    continue
                track = POSTER_SESSION_TITLE_RE.search(raw_track_title).group(1)
                for paper_span in subsession.find_all('span'):
                    papers.append(
                        _paper_record(paper_span, POSTER_PAPER_LINK_INDEX, track)
                    )

df = pd.DataFrame(papers)
df

Unnamed: 0,id,title,area
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation
...,...,...,...
442,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session
443,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session
444,618,Finding syntax in human encephalography with b...,Best Paper Session
445,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session


In [9]:
# Collect the distinct area names that were scraped.
# Observation: none of them is an interpretability area.
areas = set(df['area'])
areas

{'Argument Mining',
 'Best Paper Session',
 'Dialog System',
 'Dialog System, Discourse',
 'Dialog and Interactive Systems, Multilinguality',
 'Discourse',
 'Discourse, Linguistics, Cognitive Modeling',
 'Document Analysis',
 'Evaluation',
 'Generation',
 'Generation, Summarization',
 'Inference, Reasoning',
 'Information Extraction',
 'Information Extraction, Text Mining',
 'Information Retrieval',
 'Language/Document Model',
 'Linguistics, Psycholinguistics and Cognitive Modeling',
 'Machine Learning',
 'Machine Learning, Question Answering',
 'Machine Translation',
 'Machine Translation, Multilinguality',
 'Morphology, Tagging, Parsing',
 'Multilinguality',
 'Multimodal',
 'Parsing',
 'Parsing, Morphology',
 'Question Answering',
 'Resource, Annotation',
 'Resources and Evaluation',
 'Semantic Parsing',
 'Semantics',
 'Sentiment',
 'Sentiment Analysis and Argument Mining',
 'Social Media',
 'Student Research Workshop',
 'Summarization',
 'Summarization, Social Media',
 'System Demon

In [11]:
# Add the bookkeeping columns that the accumulated dataset carries.
# 'interpretability' starts empty (None) for every scraped paper.
df['interpretability'] = None
# Tag every row with its conference. The recorded output of this cell shows a
# 'source' column equal to 'ACL2018', but no visible code created it (hidden
# kernel state); set it explicitly so Restart & Run All reproduces that output
# and the saved CSV does not get rows with a missing source.
df['source'] = 'ACL2018'
# Look up a DOI for each title via get_acl_anthology_doi (defined outside this
# notebook view). The recorded output shows an empty DOI for at least one
# title, so missing values must be expected downstream.
df['doi'] = df['title'].apply(get_acl_anthology_doi)
df

Unnamed: 0,id,title,area,interpretability,source,doi
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics,,ACL2018,10.18653/v1/P18-1001
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics,,ACL2018,10.18653/v1/P18-1002
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics,,ACL2018,10.18653/v1/P18-1003
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics,,ACL2018,10.18653/v1/P18-1004
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation,,ACL2018,10.18653/v1/P18-1005
...,...,...,...,...,...,...
442,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,,ACL2018,10.18653/v1/P18-2124
443,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session,,ACL2018,
444,618,Finding syntax in human encephalography with b...,Best Paper Session,,ACL2018,10.18653/v1/P18-1254
445,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,,ACL2018,10.18653/v1/P18-1255


In [15]:
# Load the dataset accumulated by earlier runs.
previous_df = pd.read_csv('../data/parsed_data.csv')
# The file is saved with its index, which read_csv brings back as a column
# named 'Unnamed: 0'. errors='ignore' keeps this cell working even when the
# CSV was written without an index column, instead of raising KeyError.
previous_df = previous_df.drop(columns='Unnamed: 0', errors='ignore')
previous_df

Unnamed: 0,id,title,area,interpretability,doi,source
0,797,Restricted Recurrent Neural Tensor Networks: E...,,,10.18653/v1/P18-2002,ACL2019
1,338,Neural Hidden Markov Model for Machine Transla...,,,10.18653/v1/P18-2060,ACL2019
2,300,Attention Focusing for Neural Machine Translat...,,,10.18653/v1/P18-1164,ACL2019
3,1638,Will it Blend? Blending Weak and Strong Labele...,,,10.18653/v1/P18-2095,ACL2019
4,92,Batch IS NOT Heavy: Learning Word Representati...,,,10.18653/v1/P18-1172,ACL2019
...,...,...,...,...,...,...
9524,T4773,Rank-Aware Negative Training for Semi-Supervis...,Machine Learning for NLP,False,10.1162/tacl_a_00574,ACL2023
9525,T4777,Transparency Helps Reveal When Language Models...,"Linguistic Theories, Cognitive Modeling, and P...",False,10.1162/tacl_a_00565,ACL2023
9526,T4803,Design Choices for Crowdsourcing Implicit Disc...,Discourse and Pragmatics,False,10.1162/tacl_a_00586,ACL2023
9527,T4929,Time-and-Space-Efficient Weighted Deduction,"Semantics: Sentence-level Semantics, Textual I...",False,10.1162/tacl_a_00588,ACL2023


In [17]:
# Stack the freshly scraped rows in `df` underneath the rows loaded into
# `previous_df`, renumbering the combined index from zero.
updated_df = pd.concat(objs=(previous_df, df), axis=0, ignore_index=True)
updated_df

Unnamed: 0,id,title,area,interpretability,doi,source
0,797,Restricted Recurrent Neural Tensor Networks: E...,,,10.18653/v1/P18-2002,ACL2019
1,338,Neural Hidden Markov Model for Machine Transla...,,,10.18653/v1/P18-2060,ACL2019
2,300,Attention Focusing for Neural Machine Translat...,,,10.18653/v1/P18-1164,ACL2019
3,1638,Will it Blend? Blending Weak and Strong Labele...,,,10.18653/v1/P18-2095,ACL2019
4,92,Batch IS NOT Heavy: Learning Word Representati...,,,10.18653/v1/P18-1172,ACL2019
...,...,...,...,...,...,...
9971,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,,10.18653/v1/P18-2124,ACL2018
9972,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session,,,ACL2018
9973,618,Finding syntax in human encephalography with b...,Best Paper Session,,10.18653/v1/P18-1254,ACL2018
9974,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,,10.18653/v1/P18-1255,ACL2018


In [18]:
# Overwrite the same CSV that was read into `previous_df`. The index is
# written too, which is the 'Unnamed: 0' column the load cell drops.
# NOTE(review): re-running the notebook after this save appends the scraped
# rows to the file again.
updated_df.to_csv(path_or_buf='../data/parsed_data.csv')