In [1]:
import os, sys

# Put the parent of the current working directory on the module search path
# (presumably so the project-local `utils` module can be imported).
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same directory to sys.path again.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
from utils import *
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [6]:
# Download the ACL 2019 schedule page and parse it with BeautifulSoup.
url = 'https://acl2019.org/schedule.acl2019.org/index.html'
# Without a timeout requests can block indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page into an empty table.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
# Pattern for session headings: "Session", a digit, one more character, a colon,
# then the track name. Group 1 captures the run of non-digit characters after
# the colon; surrounding whitespace is stripped by the caller below.
SESSION_TITLE_RE = re.compile(r'Session \d.: (\D*)( \d)?')


def _track_from_heading(heading):
    """Return the track name embedded in a session heading string.

    Raises:
        ValueError: if ``heading`` does not match ``SESSION_TITLE_RE``. This
            replaces the uninformative ``AttributeError`` that calling
            ``.group`` on a failed ``re.search`` would produce.
    """
    match = SESSION_TITLE_RE.search(heading)
    if match is None:
        raise ValueError(f'Unrecognised session heading: {heading!r}')
    return match.group(1).strip()


def _paper_records(container, track):
    """Return one ``{'id', 'title', 'area'}`` dict per ``a.titlelink`` in ``container``.

    The id is the text after the last ``-`` in the anchor's ``data-src``
    attribute; the title is the anchor text; ``area`` is the given ``track``.
    """
    return [
        {
            'id': link.attrs['data-src'].split('-')[-1],
            'title': link.text,
            'area': track,
        }
        for link in container.find_all('a', class_='titlelink')
    ]


papers = []

for day in soup.find_all(class_='day-program'):
    for heading in day.find_all('div', class_='session-name'):
        if 'Oral Presentations' in heading.text:
            # The next <tr> sibling of the heading's great-grandparent holds the
            # session names; the paper lists are two <tr> siblings further on.
            sessions_row = heading.parent.parent.parent.find_next_sibling('tr')
            papers_row = sessions_row.find_next_sibling('tr').find_next_sibling('tr')
            # Session cells and paper cells are paired positionally.
            for session_cell, paper_cell in zip(sessions_row.find_all('td'), papers_row.find_all('td')):
                session_title = session_cell.find(class_='conc-session-name').text
                track = _track_from_heading(session_title)
                papers.extend(_paper_records(paper_cell, track))

        elif heading.text.startswith('Poster Session'):
            # Poster sub-sessions all live in the single <tr> that follows.
            posters_row = heading.parent.parent.parent.find_next_sibling('tr')
            for subsession in posters_row.find_all(class_='poster-sub-session'):
                subsession_title = subsession.find(class_='poster-session-name').text
                track = _track_from_heading(subsession_title)
                papers.extend(_paper_records(subsession, track))

# NOTE(review): track names are stored exactly as scraped; the schedule yields
# both 'Sentence-level Semantics' and 'Sentence-level semantics'. Consider
# normalising the case if areas are later grouped.
df = pd.DataFrame(papers)
df

Unnamed: 0,id,title,area
0,1793,One Time of Interaction May Not Be Enough: Go ...,Dialogue and Interactive Systems
1,1922,Incremental Transformer with Deliberation Deco...,Dialogue and Interactive Systems
2,2340,Improving Multi-turn Dialogue Modelling with U...,Dialogue and Interactive Systems
3,837,Do Neural Dialog Systems Use the Conversation ...,Dialogue and Interactive Systems
4,1693,Boosting Dialog Response Generation,Dialogue and Interactive Systems
...,...,...,...
676,889,Multimodal Transformer for Unaligned Multimoda...,"Vision, Robotics, Multimodal, Grounding and Sp..."
677,2155,"Show, Describe and Conclude: On Exploiting the...","Vision, Robotics, Multimodal, Grounding and Sp..."
678,384,Visual Story Post-Editing,"Vision, Robotics, Multimodal, Grounding and Sp..."
679,1891,Multimodal Abstractive Summarization for How2 ...,"Vision, Robotics, Multimodal, Grounding and Sp..."


In [8]:
# Distinct track names found in the schedule; there is no explicit interpretability area.
set(df['area'].unique())

{'Applications',
 'Bias in Language Processing',
 'Dialogue and Generation',
 'Dialogue and Interactive Systems',
 'Discourse and Pragmatics',
 'Document Analysis',
 'Evaluation',
 'Generation',
 'Information Extraction and Text Mining',
 'Linguistic Theories, Cognitive Modeling and Psycholinguistics',
 'Machine Learning',
 'Machine Translation',
 'Multidisciplinary',
 'Multilinguality',
 'Multilinguality and Morphology',
 'Phonology, Morphology and Word Segmentation',
 'Question Answering',
 'Resources and Evaluation',
 'Semantics',
 'Sentence-level Semantics',
 'Sentence-level semantics',
 'Sentiment Analysis and Argument Mining',
 'Social Media',
 'Summarization',
 'Tagging, Chunking, Syntax and Parsing',
 'Textual Inference and Other Areas of Semantics',
 'Vision, Robotics, Multimodal, Grounding and Speech',
 'Visual and Multimodal Question Answering',
 'Word-level Semantics'}

In [9]:
# Attach the constant metadata columns shared by every scraped row.
# 'doi' and 'abstract' are filled with None.
for column, value in (('source', 'ACL'), ('year', 2019), ('doi', None), ('abstract', None)):
    df[column] = value
df

Unnamed: 0,id,title,area,source,year,doi,abstract
0,1793,One Time of Interaction May Not Be Enough: Go ...,Dialogue and Interactive Systems,ACL,2019,,
1,1922,Incremental Transformer with Deliberation Deco...,Dialogue and Interactive Systems,ACL,2019,,
2,2340,Improving Multi-turn Dialogue Modelling with U...,Dialogue and Interactive Systems,ACL,2019,,
3,837,Do Neural Dialog Systems Use the Conversation ...,Dialogue and Interactive Systems,ACL,2019,,
4,1693,Boosting Dialog Response Generation,Dialogue and Interactive Systems,ACL,2019,,
...,...,...,...,...,...,...,...
676,889,Multimodal Transformer for Unaligned Multimoda...,"Vision, Robotics, Multimodal, Grounding and Sp...",ACL,2019,,
677,2155,"Show, Describe and Conclude: On Exploiting the...","Vision, Robotics, Multimodal, Grounding and Sp...",ACL,2019,,
678,384,Visual Story Post-Editing,"Vision, Robotics, Multimodal, Grounding and Sp...",ACL,2019,,
679,1891,Multimodal Abstractive Summarization for How2 ...,"Vision, Robotics, Multimodal, Grounding and Sp...",ACL,2019,,


In [11]:
# One paper title appears twice in the scraped table: keep its first row and
# drop the second. Nothing happens unless exactly two rows carry the title.
duplicated_title = 'Dual Adversarial Neural Transfer for Low-Resource Named Entity Recognition'
indices = df.index[df['title'].eq(duplicated_title)].tolist()
if len(indices) == 2:
    _, second_label = indices
    df = df.drop(index=second_label)


In [13]:
# Number of papers per track, most frequent first.
df.loc[:, 'area'].value_counts()

area
Machine Learning                                                 64
Information Extraction and Text Mining                           53
Dialogue and Interactive Systems                                 51
Machine Translation                                              49
Question Answering                                               41
Generation                                                       38
Applications                                                     33
Sentiment Analysis and Argument Mining                           33
Tagging, Chunking, Syntax and Parsing                            29
Word-level Semantics                                             27
Vision, Robotics, Multimodal, Grounding and Speech               25
Resources and Evaluation                                         25
Social Media                                                     25
Summarization                                                    24
Multilinguality                            

# Saving the results

In [14]:
# Load the existing paper table; the first CSV column is used as the index.
previous_df = pd.read_csv(filepath_or_buffer='../data/cl_papers.csv', index_col=0)
previous_df

Unnamed: 0,id,title,area,source,year,doi,abstract
0,main.1004,AnswerFact: Fact Checking in Product Question ...,Question Answering,EMNLP,2020,10.18653/v1/2020.emnlp-main.1004,Product-related question answering platforms n...
1,main.1006,Knowledge-Grounded Dialogue Generation with Pr...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1006,We study knowledge-grounded dialogue generatio...
2,main.1009,BiST: Bi-directional Spatio-Temporal Reasoning...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1009,Video-grounded dialogues are very challenging ...
3,main.1010,A Knowledge-Aware Sequence-to-Tree Network for...,NLP Applications,EMNLP,2020,10.18653/v1/2020.emnlp-main.1010,With the advancements in natural language proc...
4,main.1011,Knowledge Association with Hyperbolic Knowledg...,Information Extraction,EMNLP,2020,10.18653/v1/2020.emnlp-main.1011,Capturing associations for knowledge graphs (K...
...,...,...,...,...,...,...,...
8570,3639),Spelling-Aware Construction of Macaronic Texts...,"Information Extraction, Text Mining and NLP Ap...",EMNLP,2019,,
8571,3718),Towards Machine Reading for Interventions from...,"Information Extraction, Text Mining and NLP Ap...",EMNLP,2019,,
8572,4014),RUN through the Streets: A New Dataset and Bas...,"Information Extraction, Text Mining and NLP Ap...",EMNLP,2019,,
8573,162),Context-Aware Conversation Thread Detection in...,"Information Extraction, Text Mining and NLP Ap...",EMNLP,2019,,


In [5]:
# Remove any rows already stored for this conference (ACL 2019) so that the
# freshly scraped rows replace them rather than duplicating them.
# A row is kept when its source is not ACL or its year is not 2019.
previous_df = previous_df.loc[(previous_df['source'] != 'ACL') | (previous_df['year'] != 2019)]
previous_df

Unnamed: 0,id,title,area,source,year,doi,abstract
0,main.1004,AnswerFact: Fact Checking in Product Question ...,Question Answering,EMNLP,2020,10.18653/v1/2020.emnlp-main.1004,Product-related question answering platforms n...
1,main.1006,Knowledge-Grounded Dialogue Generation with Pr...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1006,We study knowledge-grounded dialogue generatio...
2,main.1009,BiST: Bi-directional Spatio-Temporal Reasoning...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1009,Video-grounded dialogues are very challenging ...
3,main.1010,A Knowledge-Aware Sequence-to-Tree Network for...,NLP Applications,EMNLP,2020,10.18653/v1/2020.emnlp-main.1010,With the advancements in natural language proc...
4,main.1011,Knowledge Association with Hyperbolic Knowledg...,Information Extraction,EMNLP,2020,10.18653/v1/2020.emnlp-main.1011,Capturing associations for knowledge graphs (K...
...,...,...,...,...,...,...,...
8570,3639),Spelling-Aware Construction of Macaronic Texts...,"Information Extraction, Text Mining and NLP Ap...",EMNLP,2019,,
8571,3718),Towards Machine Reading for Interventions from...,"Information Extraction, Text Mining and NLP Ap...",EMNLP,2019,,
8572,4014),RUN through the Streets: A New Dataset and Bas...,"Information Extraction, Text Mining and NLP Ap...",EMNLP,2019,,
8573,162),Context-Aware Conversation Thread Detection in...,"Information Extraction, Text Mining and NLP Ap...",EMNLP,2019,,


In [15]:
# Stack the remaining stored rows and the newly scraped rows, renumbering the
# index from zero.
updated_df = pd.concat(objs=[previous_df, df], ignore_index=True)
updated_df

Unnamed: 0,id,title,area,source,year,doi,abstract
0,main.1004,AnswerFact: Fact Checking in Product Question ...,Question Answering,EMNLP,2020,10.18653/v1/2020.emnlp-main.1004,Product-related question answering platforms n...
1,main.1006,Knowledge-Grounded Dialogue Generation with Pr...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1006,We study knowledge-grounded dialogue generatio...
2,main.1009,BiST: Bi-directional Spatio-Temporal Reasoning...,Dialog and Interactive Systems,EMNLP,2020,10.18653/v1/2020.emnlp-main.1009,Video-grounded dialogues are very challenging ...
3,main.1010,A Knowledge-Aware Sequence-to-Tree Network for...,NLP Applications,EMNLP,2020,10.18653/v1/2020.emnlp-main.1010,With the advancements in natural language proc...
4,main.1011,Knowledge Association with Hyperbolic Knowledg...,Information Extraction,EMNLP,2020,10.18653/v1/2020.emnlp-main.1011,Capturing associations for knowledge graphs (K...
...,...,...,...,...,...,...,...
9250,889,Multimodal Transformer for Unaligned Multimoda...,"Vision, Robotics, Multimodal, Grounding and Sp...",ACL,2019,,
9251,2155,"Show, Describe and Conclude: On Exploiting the...","Vision, Robotics, Multimodal, Grounding and Sp...",ACL,2019,,
9252,384,Visual Story Post-Editing,"Vision, Robotics, Multimodal, Grounding and Sp...",ACL,2019,,
9253,1891,Multimodal Abstractive Summarization for How2 ...,"Vision, Robotics, Multimodal, Grounding and Sp...",ACL,2019,,


In [16]:
# Overwrite the shared CSV with the merged table. The index is written as the
# first column, which is what the `index_col=0` load of this file expects.
updated_df.to_csv(path_or_buf='../data/cl_papers.csv')