In [12]:
import os, sys
# Make the parent of the current working directory importable
# (presumably where the `utils` module imported below lives -- confirm).
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [13]:
# The star import stays first so that the explicit imports below always win
# if `utils` happens to export a name such as `re` or `pd`.
from utils import *
import re
import pandas as pd

In [55]:
# Parse the conference programme file into one record per paper.
#
# The file is read line by line:
#   * a blank line resets the current track,
#   * a line matching the session-header pattern sets the current track,
#   * a line matching the paper pattern becomes a paper in that track.
#
# utf-8 is requested explicitly: titles contain non-ASCII characters
# (e.g. typographic apostrophes) and the default encoding is platform
# dependent.
with open('../data/acl2019/data/papers/order', encoding='utf-8') as f:
    text = f.readlines()

# "Session <digit><any char>: <track name>", where the track name stops at the
# first digit, '%' or '('; an optional "(Short)" marker and a trailing number
# may follow and are not part of the captured name.
session_pattern = re.compile(r'Session \d.: ([^\d%\(]*)(\(Short\))?( \d)?')

# "<id> [<hh:mm>--<hh:mm>] # <title>" with the title ending at the next " #"
# or at the end of the line.
# NOTE(review): `(\d*)` may match an empty string, so a line that contains
# " # " without a leading number produces a record with an empty id --
# confirm that this is intended.
paper_pattern = re.compile(r'(\d*)( \d+:\d+--\d+:\d+)? # (.*?)( #|$)')

papers = []
current_track = None
for line in text:
    if line == '\n':
        # A blank line closes the current session block.
        current_track = None
        continue

    session_match = session_pattern.search(line)
    if session_match:
        current_track = session_match.group(1).strip()
        continue

    paper_match = paper_pattern.search(line)
    if paper_match:
        papers.append({
            'id': paper_match.group(1).strip(),
            'title': paper_match.group(3).strip(),
            'area': current_track,
        })

# Fixing the columns keeps the frame well-formed (and later cells that read
# df['title'] working) even if no paper line was recognised.
df = pd.DataFrame(papers, columns=['id', 'title', 'area'])
df

Unnamed: 0,id,title,area
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation
...,...,...,...
396,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session
397,1603,Lighter' Can Still Be Dark: Modeling Comparati...,Best Paper Session
398,618,Finding syntax in human encephalography with b...,Best Paper Session
399,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session


In [56]:
# Distinct track names collected from the programme.
# There is no explicit interpretability area among them.
set(df['area'].tolist())

{'Argument Mining',
 'Best Paper Session',
 'Dialog System',
 'Dialog System, Discourse',
 'Dialog and Interactive Systems, Multilinguality',
 'Discourse',
 'Discourse, Linguistics, Cognitive Modeling',
 'Document Analysis',
 'Evaluation',
 'Generation',
 'Generation, Summarization',
 'Inference, Reasoning',
 'Information Extraction',
 'Information Extraction, Text Mining',
 'Information Retrieval',
 'Language/Document Model',
 'Linguistics, Psycholinguistics and Cognitive Modeling',
 'Machine Learning',
 'Machine Learning, Question Answering',
 'Machine Translation',
 'Machine Translation, Multilinguality',
 'Morphology, Tagging, Parsing',
 'Multilinguality',
 'Multimodal',
 'Parsing',
 'Parsing, Morphology',
 'Question Answering',
 'Resource, Annotation',
 'Resources and Evaluation',
 'Semantic Parsing',
 'Semantics',
 'Sentiment',
 'Sentiment Analysis and Argument Mining',
 'Social Media',
 'Summarization',
 'Summarization, Social Media',
 'Text Mining and Applications',
 'Vision',


In [57]:
# Add the remaining columns of the stored dataset:
#   interpretability -- left empty here
#   doi              -- result of get_acl_anthology_doi applied to each title
#   source           -- conference tag for every row of this notebook
# NOTE(review): the DOIs displayed for this cell start with "P18-" while the
# rows are tagged 'ACL2019' -- confirm the input directory and the tag refer
# to the same conference year.
df = df.assign(
    interpretability=None,
    doi=df['title'].apply(get_acl_anthology_doi),
    source='ACL2019',
)
df

Unnamed: 0,id,title,area,interpretability,doi,source
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics,,10.18653/v1/P18-1001,ACL2019
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics,,10.18653/v1/P18-1002,ACL2019
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics,,10.18653/v1/P18-1003,ACL2019
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics,,10.18653/v1/P18-1004,ACL2019
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation,,10.18653/v1/P18-1005,ACL2019
...,...,...,...,...,...,...
396,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,,10.18653/v1/P18-2124,ACL2019
397,1603,Lighter' Can Still Be Dark: Modeling Comparati...,Best Paper Session,,,ACL2019
398,618,Finding syntax in human encephalography with b...,Best Paper Session,,10.18653/v1/P18-1254,ACL2019
399,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,,10.18653/v1/P18-1255,ACL2019


# Saving the results

In [58]:
# Load the dataset saved so far; the first CSV column is used as the index.
previous_df = pd.read_csv('../data/parsed_data.csv', index_col=0)
previous_df

Unnamed: 0,id,title,area,interpretability,doi,source
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics,,10.18653/v1/P18-1001,ACL2018
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics,,10.18653/v1/P18-1002,ACL2018
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics,,10.18653/v1/P18-1003,ACL2018
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics,,10.18653/v1/P18-1004,ACL2018
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation,,10.18653/v1/P18-1005,ACL2018
...,...,...,...,...,...,...
8342,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,,10.18653/v1/P18-2124,ACL2019
8343,1603,Lighter' Can Still Be Dark: Modeling Comparati...,Best Paper Session,,10.18653/v1/P18-2125,ACL2019
8344,618,Finding syntax in human encephalography with b...,Best Paper Session,,10.18653/v1/P18-1254,ACL2019
8345,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,,10.18653/v1/P18-1255,ACL2019


In [59]:
# Keep only rows that belong to other sources: rows stored earlier for this
# conference are discarded before the freshly parsed ones are appended.
is_other_source = previous_df['source'] != 'ACL2019'
previous_df = previous_df.loc[is_other_source]

In [60]:
# Stack the retained rows on top of the newly parsed ones and renumber the
# index from zero.
updated_df = pd.concat((previous_df, df), axis=0, ignore_index=True)
updated_df

Unnamed: 0,id,title,area,interpretability,doi,source
0,187,Probabilistic FastText for Multi-Sense Word Em...,Word Semantics,,10.18653/v1/P18-1001,ACL2018
1,1520,A La Carte Embedding: Cheap but Effective Indu...,Word Semantics,,10.18653/v1/P18-1002,ACL2018
2,707,Unsupervised Learning of Distributional Relati...,Word Semantics,,10.18653/v1/P18-1003,ACL2018
3,1553,Explicit Retrofitting of Distributional Word V...,Word Semantics,,10.18653/v1/P18-1004,ACL2018
4,76,Unsupervised Neural Machine Translation with W...,Machine Translation,,10.18653/v1/P18-1005,ACL2018
...,...,...,...,...,...,...
8342,1125,Know What You Don’t Know: Unanswerable Questio...,Best Paper Session,,10.18653/v1/P18-2124,ACL2019
8343,1603,Lighter' Can Still Be Dark: Modeling Comparati...,Best Paper Session,,,ACL2019
8344,618,Finding syntax in human encephalography with b...,Best Paper Session,,10.18653/v1/P18-1254,ACL2019
8345,1247,Learning to Ask Good Questions: Ranking Clarif...,Best Paper Session,,10.18653/v1/P18-1255,ACL2019


In [61]:
# Write the combined dataset back to the CSV it was loaded from (the file is
# overwritten); the index is stored as the first column.
updated_df.to_csv('../data/parsed_data.csv', index=True)