In [1]:
import os, sys
# Add the parent of the current working directory to the module search path.
# NOTE(review): presumably this is what lets `utils` below resolve when the
# notebook is run from a sub-directory of the project — confirm.
sys.path.append(os.path.dirname(os.getcwd()))
# NOTE(review): the wildcard import hides which names this notebook takes from
# `utils`; consider importing them explicitly once they are known.
from utils import *
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [4]:
# Download the ACL 2018 schedule page and parse it into a BeautifulSoup tree.
url = 'https://acl2018.org/programme/schedule/'
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page,
# which would otherwise just yield an empty papers table further down.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')


In [5]:
# Walk every day's schedule table and collect one record per paper:
# {'id': <last path segment of the paper link>, 'title': <link text>, 'area': <session area>}.

# The patterns and link positions are loop-invariant, so define them once.
ORAL_SESSION_TITLE_RE = re.compile(r'Session \d+[A-Za-z]*: (.*?)\s*\d*$')
POSTER_SESSION_TITLE_RE = re.compile(r'Poster Session \d+[A-Za-z]*: (.*?)$')
# Index of the paper link among the <a> tags of an oral / poster entry.
ORAL_PAPER_LINK_INDEX = 2
POSTER_PAPER_LINK_INDEX = 1


def _extract_area(pattern, raw_title):
    """Return the area/track name captured by `pattern` from a session heading.

    Raises:
        ValueError: if the heading does not match `pattern`. The message names
            the offending heading, which is easier to debug than the
            AttributeError produced by calling `.group` on a failed match.
    """
    match = pattern.search(raw_title)
    if match is None:
        raise ValueError(f'Unrecognised session title: {raw_title!r}')
    return match.group(1)


papers = []
day_schedules = soup.find_all(class_='day-program')
for day in day_schedules:
    # Areas announced by the most recent session-name row; consumed by the
    # oral-papers row that follows it.
    current_titles = []
    for row in day.find_all('tr'):
        # A <tr> without a class attribute would make row['class'] raise
        # KeyError; treat it as having no classes so it is simply skipped.
        row_classes = row.get('class') or []
        is_area_track_info_row = (
            'session-name-row' in row_classes and 'conc-session-indiv-row' in row_classes
        )
        is_orals_papers_row = 'conc-session-details-row' in row_classes
        is_poster_papers_row = 'poster-session-row' in row_classes and len(list(row.children)) > 1

        if is_area_track_info_row:
            title_divs = row.find_all(class_='conc-session-name')
            current_titles = [_extract_area(ORAL_SESSION_TITLE_RE, div.text) for div in title_divs]
        elif is_orals_papers_row:
            papers_per_area = row.find_all('td')
            # Each <td> column is paired positionally with one session title.
            assert len(papers_per_area) == len(current_titles), (
                f'{len(papers_per_area)} paper columns but {len(current_titles)} session titles'
            )
            for paper_column, area in zip(papers_per_area, current_titles):
                for paper_div in paper_column.find_all('div', class_='talk-title'):
                    paper_link = paper_div.find_all('a')[ORAL_PAPER_LINK_INDEX]
                    papers.append({
                        'id': paper_link['href'].split('/')[-1],
                        'title': paper_link.text,
                        'area': area,
                    })
            # Titles have been consumed; the next details row needs a fresh title row.
            current_titles = []
        elif is_poster_papers_row:
            for subsession in row.find_all(class_='poster-sub-session'):
                raw_track_title = subsession.find(class_='poster-session-name').text
                # Sub-sessions headed exactly 'Tutorial' are skipped.
                if raw_track_title == 'Tutorial':
                    continue
                track = _extract_area(POSTER_SESSION_TITLE_RE, raw_track_title)
                for paper_span in subsession.find_all('span'):
                    paper_link = paper_span.find_all('a')[POSTER_PAPER_LINK_INDEX]
                    papers.append({
                        'id': paper_link['href'].split('/')[-1],
                        'title': paper_link.text,
                        'area': track,
                    })

df = pd.DataFrame(papers)
df

Unnamed: 0,id,title,area
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation
...,...,...,...
442,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session
443,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session
444,618,Finding syntax in human encephalography with b...,Best Paper Session
445,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session


In [7]:
# No area is dedicated to interpretability, so it is unclear which of the
# areas below such papers would have been submitted to.
areas = {area_name for area_name in df['area']}
areas

{'Argument Mining',
 'Best Paper Session',
 'Dialog System',
 'Dialog System, Discourse',
 'Dialog and Interactive Systems, Multilinguality',
 'Discourse',
 'Discourse, Linguistics, Cognitive Modeling',
 'Document Analysis',
 'Evaluation',
 'Generation',
 'Generation, Summarization',
 'Inference, Reasoning',
 'Information Extraction',
 'Information Extraction, Text Mining',
 'Information Retrieval',
 'Language/Document Model',
 'Linguistics, Psycholinguistics and Cognitive Modeling',
 'Machine Learning',
 'Machine Learning, Question Answering',
 'Machine Translation',
 'Machine Translation, Multilinguality',
 'Morphology, Tagging, Parsing',
 'Multilinguality',
 'Multimodal',
 'Parsing',
 'Parsing, Morphology',
 'Question Answering',
 'Resource, Annotation',
 'Resources and Evaluation',
 'Semantic Parsing',
 'Semantics',
 'Sentiment',
 'Sentiment Analysis and Argument Mining',
 'Social Media',
 'Student Research Workshop',
 'Summarization',
 'Summarization, Social Media',
 'System Demon

In [8]:
# Area labels whose rows are excluded from the table.
NON_MAIN_CONFERENCE_AREAS = ['Student Research Workshop', 'System Demonstrations']

# Build the exclusion mask first, then keep the complementary rows as an
# independent copy of the frame.
in_non_main_area = df['area'].isin(NON_MAIN_CONFERENCE_AREAS)
df = df.loc[~in_non_main_area].copy()
df

Unnamed: 0,id,title,area
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation
...,...,...,...
442,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session
443,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session
444,618,Finding syntax in human encephalography with b...,Best Paper Session
445,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session


In [9]:
# Attach the constant columns shared by every row of this scrape; DOI and
# abstract are not collected here and are filled with None.
constant_columns = {'source': 'ACL', 'year': 2018, 'doi': None, 'abstract': None}
for column_name, constant_value in constant_columns.items():
    df[column_name] = constant_value
df

Unnamed: 0,id,title,area,source,year,doi,abstract
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics,ACL,2018,,
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics,ACL,2018,,
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics,ACL,2018,,
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics,ACL,2018,,
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation,ACL,2018,,
...,...,...,...,...,...,...,...
442,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,ACL,2018,,
443,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session,ACL,2018,,
444,618,Finding syntax in human encephalography with b...,Best Paper Session,ACL,2018,,
445,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,ACL,2018,,


 # Saving the results

In [2]:
# Load the accumulated papers table; the first CSV column is used as the index.
previous_df = pd.read_csv(
    filepath_or_buffer='../data/cl_papers.csv',
    index_col=0,
)
previous_df

Unnamed: 0,id,title,area,source,year,doi,abstract
0,main.1004,AnswerFact: Fact Checking in Product Question ...,Question Answering,EMNLP,2020,10.18653/v1/2020.emnlp-main.1004,Product-related question answering platforms n...
1,main.1006,Knowledge-Grounded Dialogue Generation with Pr...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1006,We study knowledge-grounded dialogue generatio...
2,main.1009,BiST: Bi-directional Spatio-Temporal Reasoning...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1009,Video-grounded dialogues are very challenging ...
3,main.1010,A Knowledge-Aware Sequence-to-Tree Network for...,NLP Applications,EMNLP,2020,10.18653/v1/2020.emnlp-main.1010,With the advancements in natural language proc...
4,main.1011,Knowledge Association with Hyperbolic Knowledg...,Information Extraction,EMNLP,2020,10.18653/v1/2020.emnlp-main.1011,Capturing associations for knowledge graphs (K...
...,...,...,...,...,...,...,...
7433,204,The importance of Being Recurrent for Modeling...,Area H (Machine Learning) [LONG],EMNLP,2018,,
7434,1198,Towards Dynamic Computation Graphs via Sparse ...,Area H (Machine Learning) [LONG],EMNLP,2018,,
7435,1111,Convolutional Neural Networks with Recurrent N...,Area H (Machine Learning) [LONG],EMNLP,2018,,
7436,1379-TACL,Language Modeling for Morphologically Rich Lan...,Area H or D,EMNLP,2018,,


In [3]:
# Remove any rows already stored for this conference (ACL 2018): a row is
# kept when its source or its year differs.
is_other_source = previous_df['source'] != 'ACL'
is_other_year = previous_df['year'] != 2018
previous_df = previous_df[is_other_source | is_other_year]
previous_df

Unnamed: 0,id,title,area,source,year,doi,abstract
0,main.1004,AnswerFact: Fact Checking in Product Question ...,Question Answering,EMNLP,2020,10.18653/v1/2020.emnlp-main.1004,Product-related question answering platforms n...
1,main.1006,Knowledge-Grounded Dialogue Generation with Pr...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1006,We study knowledge-grounded dialogue generatio...
2,main.1009,BiST: Bi-directional Spatio-Temporal Reasoning...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1009,Video-grounded dialogues are very challenging ...
3,main.1010,A Knowledge-Aware Sequence-to-Tree Network for...,NLP Applications,EMNLP,2020,10.18653/v1/2020.emnlp-main.1010,With the advancements in natural language proc...
4,main.1011,Knowledge Association with Hyperbolic Knowledg...,Information Extraction,EMNLP,2020,10.18653/v1/2020.emnlp-main.1011,Capturing associations for knowledge graphs (K...
...,...,...,...,...,...,...,...
7433,204,The importance of Being Recurrent for Modeling...,Area H (Machine Learning) [LONG],EMNLP,2018,,
7434,1198,Towards Dynamic Computation Graphs via Sparse ...,Area H (Machine Learning) [LONG],EMNLP,2018,,
7435,1111,Convolutional Neural Networks with Recurrent N...,Area H (Machine Learning) [LONG],EMNLP,2018,,
7436,1379-TACL,Language Modeling for Morphologically Rich Lan...,Area H or D,EMNLP,2018,,


In [10]:
# Stack the freshly scraped rows below the existing ones and renumber the index.
frames_to_stack = (previous_df, df)
updated_df = pd.concat(frames_to_stack, ignore_index=True)
updated_df

Unnamed: 0,id,title,area,source,year,doi,abstract
0,main.1004,AnswerFact: Fact Checking in Product Question ...,Question Answering,EMNLP,2020,10.18653/v1/2020.emnlp-main.1004,Product-related question answering platforms n...
1,main.1006,Knowledge-Grounded Dialogue Generation with Pr...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1006,We study knowledge-grounded dialogue generatio...
2,main.1009,BiST: Bi-directional Spatio-Temporal Reasoning...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1009,Video-grounded dialogues are very challenging ...
3,main.1010,A Knowledge-Aware Sequence-to-Tree Network for...,NLP Applications,EMNLP,2020,10.18653/v1/2020.emnlp-main.1010,With the advancements in natural language proc...
4,main.1011,Knowledge Association with Hyperbolic Knowledg...,Information Extraction,EMNLP,2020,10.18653/v1/2020.emnlp-main.1011,Capturing associations for knowledge graphs (K...
...,...,...,...,...,...,...,...
7834,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,ACL,2018,,
7835,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session,ACL,2018,,
7836,618,Finding syntax in human encephalography with b...,Best Paper Session,ACL,2018,,
7837,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,ACL,2018,,


In [11]:
# Write the merged table back to the same CSV it was loaded from.
updated_df.to_csv(path_or_buf='../data/cl_papers.csv')