In [106]:
import os, sys
# Put the parent of the current working directory on the import path
# (presumably where the project-local `utils` module lives — confirm).
# The membership check keeps repeated runs of this cell from stacking
# duplicate entries on sys.path.
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [107]:
from utils import *
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [108]:
# Download the conference schedule page and parse it for scraping.
url = 'https://acl2019.org/schedule.acl2019.org/index.html'
# requests has no default timeout; without one a stalled connection would
# block this cell (and any Run All) indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page, which would
# otherwise just produce an empty list of papers further down.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [109]:
# Pattern applied to session headings: group 1 captures the run of non-digit
# characters after "Session <digit><char>: ", which is used as the track name;
# an optional trailing " <digit>" is matched but discarded.
TRACK_PATTERN = re.compile(r'Session \d.: (\D*)( \d)?')


def _track_from_heading(heading):
    """Return the track name contained in a session heading string.

    Raises:
        ValueError: if the heading does not match ``TRACK_PATTERN``. This
            replaces the opaque ``AttributeError`` on ``None`` and names the
            heading that could not be parsed.
    """
    match = TRACK_PATTERN.search(heading)
    if match is None:
        raise ValueError('Unrecognised session heading: {!r}'.format(heading))
    return match.group(1).strip()


def _paper_records(container, track):
    """Return one ``{'id', 'title', 'area'}`` dict per title link in ``container``.

    The id is the last '-'-separated piece of the link's ``data-src`` attribute.
    """
    return [
        {
            'id': link.attrs['data-src'].split('-')[-1],
            'title': link.text,
            'area': track,
        }
        for link in container.find_all('a', class_='titlelink')
    ]


papers = []

for day in soup.find_all(class_='day-program'):
    for tr in day.find_all('div', class_='session-name'):
        if 'Oral Presentations' in tr.text:
            # The row following the heading's third ancestor holds the parallel
            # session names; the papers sit two rows further down.
            sessions = tr.parent.parent.parent.find_next_sibling('tr')
            papers_tr = sessions.find_next_sibling('tr').find_next_sibling('tr')
            # Session cells and paper cells are paired up by position.
            for session, paper_column in zip(sessions.find_all('td'), papers_tr.find_all('td')):
                session_name = session.find(class_='conc-session-name')
                track = _track_from_heading(session_name.text)
                papers.extend(_paper_records(paper_column, track))

        elif tr.text.startswith('Poster Session'):
            # Poster papers are grouped into sub-sessions in the very next row.
            papers_tr = tr.parent.parent.parent.find_next_sibling('tr')
            for subsession in papers_tr.find_all(class_='poster-sub-session'):
                subsession_name = subsession.find(class_='poster-session-name').text
                track = _track_from_heading(subsession_name)
                papers.extend(_paper_records(subsession, track))


df = pd.DataFrame(papers)
df

Unnamed: 0,id,title,area
0,1793,One Time of Interaction May Not Be Enough: Go ...,Dialogue and Interactive Systems
1,1922,Incremental Transformer with Deliberation Deco...,Dialogue and Interactive Systems
2,2340,Improving Multi-turn Dialogue Modelling with U...,Dialogue and Interactive Systems
3,837,Do Neural Dialog Systems Use the Conversation ...,Dialogue and Interactive Systems
4,1693,Boosting Dialog Response Generation,Dialogue and Interactive Systems
...,...,...,...
676,889,Multimodal Transformer for Unaligned Multimoda...,"Vision, Robotics, Multimodal, Grounding and Sp..."
677,2155,"Show, Describe and Conclude: On Exploiting the...","Vision, Robotics, Multimodal, Grounding and Sp..."
678,384,Visual Story Post-Editing,"Vision, Robotics, Multimodal, Grounding and Sp..."
679,1891,Multimodal Abstractive Summarization for How2 ...,"Vision, Robotics, Multimodal, Grounding and Sp..."


In [110]:
# Distinct values of the 'area' column.
# Note: there is no explicit interpretability area.
set(df['area'].tolist())

{'Applications',
 'Bias in Language Processing',
 'Dialogue and Generation',
 'Dialogue and Interactive Systems',
 'Discourse and Pragmatics',
 'Document Analysis',
 'Evaluation',
 'Generation',
 'Information Extraction and Text Mining',
 'Linguistic Theories, Cognitive Modeling and Psycholinguistics',
 'Machine Learning',
 'Machine Translation',
 'Multidisciplinary',
 'Multilinguality',
 'Multilinguality and Morphology',
 'Phonology, Morphology and Word Segmentation',
 'Question Answering',
 'Resources and Evaluation',
 'Semantics',
 'Sentence-level Semantics',
 'Sentence-level semantics',
 'Sentiment Analysis and Argument Mining',
 'Social Media',
 'Summarization',
 'Tagging, Chunking, Syntax and Parsing',
 'Textual Inference and Other Areas of Semantics',
 'Vision, Robotics, Multimodal, Grounding and Speech',
 'Visual and Multimodal Question Answering',
 'Word-level Semantics'}

In [111]:
# Append the three remaining columns in one chained step:
#   interpretability - left empty (None) for every row
#   doi              - result of get_acl_anthology_doi applied to each title
#   source           - constant label for this conference
df = df.assign(
    interpretability=None,
    doi=lambda frame: frame['title'].apply(get_acl_anthology_doi),
    source='ACL2019',
)
df

Unnamed: 0,id,title,area,interpretability,doi,source
0,1793,One Time of Interaction May Not Be Enough: Go ...,Dialogue and Interactive Systems,,10.18653/v1/P19-1001,ACL2019
1,1922,Incremental Transformer with Deliberation Deco...,Dialogue and Interactive Systems,,10.18653/v1/P19-1002,ACL2019
2,2340,Improving Multi-turn Dialogue Modelling with U...,Dialogue and Interactive Systems,,10.18653/v1/P19-1003,ACL2019
3,837,Do Neural Dialog Systems Use the Conversation ...,Dialogue and Interactive Systems,,10.18653/v1/P19-1004,ACL2019
4,1693,Boosting Dialog Response Generation,Dialogue and Interactive Systems,,10.18653/v1/P19-1005,ACL2019
...,...,...,...,...,...,...
676,889,Multimodal Transformer for Unaligned Multimoda...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1656,ACL2019
677,2155,"Show, Describe and Conclude: On Exploiting the...","Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1657,ACL2019
678,384,Visual Story Post-Editing,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1658,ACL2019
679,1891,Multimodal Abstractive Summarization for How2 ...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1659,ACL2019


In [112]:
# One paper appears twice in the scraped data. When exactly two rows carry
# its DOI, keep the first occurrence and drop the second; in any other
# situation the frame is left untouched.
is_duplicated_paper = df['doi'] == '10.18653/v1/P19-1336'
indices = df.index[is_duplicated_paper].tolist()
if len(indices) == 2:
    df = df.drop(index=indices[-1])


In [113]:
# Number of rows per area, largest first (value_counts' default ordering).
df.loc[:, 'area'].value_counts()

area
Machine Learning                                                 64
Information Extraction and Text Mining                           53
Dialogue and Interactive Systems                                 51
Machine Translation                                              49
Question Answering                                               41
Generation                                                       38
Applications                                                     33
Sentiment Analysis and Argument Mining                           33
Tagging, Chunking, Syntax and Parsing                            29
Word-level Semantics                                             27
Vision, Robotics, Multimodal, Grounding and Speech               25
Resources and Evaluation                                         25
Social Media                                                     25
Summarization                                                    24
Multilinguality                            

# Saving the results

In [114]:
# Load the previously saved table; the first CSV column is used as the index.
previous_df = pd.read_csv(
    '../data/parsed_data.csv',
    sep=',',
    index_col=0,
)
previous_df

Unnamed: 0,id,title,area,interpretability,doi,source
0,main.8,Large Scale Multi-Actor Generative Dialog Mode...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.8,ACL2020
1,main.52,CDL: Curriculum Dual Learning for Emotion-Cont...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.52,ACL2020
2,main.46,Emergence of Syntax Needs Minimal Supervision,Theory and Formalism in NLP (Linguistic and Ma...,False,10.18653/v1/2020.acl-main.46,ACL2020
3,main.359,Selecting Backtranslated Data from Multiple So...,Machine Translation,False,10.18653/v1/2020.acl-main.359,ACL2020
4,main.417,ParaCrawl: Web-Scale Acquisition of Parallel C...,Resources and Evaluation,False,10.18653/v1/2020.acl-main.417,ACL2020
...,...,...,...,...,...,...
9277,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,,10.18653/v1/P18-2124,ACL2018
9278,1603,'Lighter' Can Still Be Dark: Modeling Comparat...,Best Paper Session,,,ACL2018
9279,618,Finding syntax in human encephalography with b...,Best Paper Session,,10.18653/v1/P18-1254,ACL2018
9280,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,,10.18653/v1/P18-1255,ACL2018


In [115]:
# Discard rows already stored for this conference so that appending the
# freshly scraped frame does not duplicate them.
previous_df = previous_df.loc[previous_df['source'].ne('ACL2019')]

In [116]:
# Stack the retained rows and the new scrape, renumbering the index from 0.
updated_df = pd.concat(
    objs=(previous_df, df),
    ignore_index=True,
)
updated_df

Unnamed: 0,id,title,area,interpretability,doi,source
0,main.8,Large Scale Multi-Actor Generative Dialog Mode...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.8,ACL2020
1,main.52,CDL: Curriculum Dual Learning for Emotion-Cont...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.52,ACL2020
2,main.46,Emergence of Syntax Needs Minimal Supervision,Theory and Formalism in NLP (Linguistic and Ma...,False,10.18653/v1/2020.acl-main.46,ACL2020
3,main.359,Selecting Backtranslated Data from Multiple So...,Machine Translation,False,10.18653/v1/2020.acl-main.359,ACL2020
4,main.417,ParaCrawl: Web-Scale Acquisition of Parallel C...,Resources and Evaluation,False,10.18653/v1/2020.acl-main.417,ACL2020
...,...,...,...,...,...,...
9277,889,Multimodal Transformer for Unaligned Multimoda...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1656,ACL2019
9278,2155,"Show, Describe and Conclude: On Exploiting the...","Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1657,ACL2019
9279,384,Visual Story Post-Editing,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1658,ACL2019
9280,1891,Multimodal Abstractive Summarization for How2 ...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1659,ACL2019


In [117]:
# Write the merged table back to the CSV path it was loaded from (overwrites it).
updated_df.to_csv(path_or_buf='../data/parsed_data.csv')