In [16]:
import os
import sys

# Make the parent of the current working directory importable by adding it to
# the module search path (presumably where the local `utils` module lives).
sys.path.append(os.path.split(os.getcwd())[0])

In [17]:
from utils import *
import pandas as pd

In [18]:
# Load the cleaned paper table; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer='../data/clean_data2.csv', index_col=0)
df

Unnamed: 0,id,title,area,interpretability,doi,source,working_doi
0,main.8,Large Scale Multi-Actor Generative Dialog Mode...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.8,ACL2020,True
1,main.52,CDL: Curriculum Dual Learning for Emotion-Cont...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.52,ACL2020,True
2,main.46,Emergence of Syntax Needs Minimal Supervision,Theory and Formalism in NLP (Linguistic and Ma...,False,10.18653/v1/2020.acl-main.46,ACL2020,True
3,main.359,Selecting Backtranslated Data from Multiple So...,Machine Translation,False,10.18653/v1/2020.acl-main.359,ACL2020,True
4,main.417,ParaCrawl: Web-Scale Acquisition of Parallel C...,Resources and Evaluation,False,10.18653/v1/2020.acl-main.417,ACL2020,True
...,...,...,...,...,...,...,...
9277,889,Multimodal Transformer for Unaligned Multimoda...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1656,ACL2019,True
9278,2155,"Show, Describe and Conclude: On Exploiting the...","Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1657,ACL2019,True
9279,384,Visual Story Post-Editing,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1658,ACL2019,True
9280,1891,Multimodal Abstractive Summarization for How2 ...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1659,ACL2019,True


In [19]:
# Work on an independent copy so `df` keeps the data exactly as loaded, and add
# an all-missing 'abstract' column that is filled in by the cells below.
processed_df = df.copy().assign(abstract=None)

In [20]:
import requests
import bs4

def get_abstract_with_acl_anthology(doi, timeout=30):
    """Scrape a paper's abstract from its ACL Anthology landing page.

    The page URL is built from the last '/'-separated segment of the DOI
    (e.g. ``10.18653/v1/2020.acl-main.8`` -> ``2020.acl-main.8``).

    Args:
        doi: DOI string of the paper.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the element matched by the abstract CSS selector.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-success HTTP status.
        ValueError: If the page has no element matching the selector.
    """
    url_path = doi.split('/')[-1]
    url = 'https://aclanthology.org/' + url_path
    # Without a timeout a single stalled connection would block the whole
    # row-by-row fill loop indefinitely.
    r = requests.get(url, timeout=timeout)
    # Fail early on 404/5xx instead of trying to parse an error page.
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    css_selector = "#main > div.row.acl-paper-details > div.col.col-lg-10.order-2 > div > div > span"
    element = soup.select_one(css_selector)
    if element is None:
        # select_one returns None when nothing matches; raise a descriptive
        # error rather than an AttributeError on `None.text`.
        raise ValueError('no abstract element found on ' + url)
    return element.text

def get_abstract_from_semantic_scholar(paper_id, timeout=30):
    """Fetch a paper's abstract from the Semantic Scholar Graph API.

    Args:
        paper_id: Identifier placed in the ``/graph/v1/paper/{paper_id}`` URL
            (the caller in this notebook passes a DOI).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The abstract as a string.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-success HTTP status.
        ValueError: If the response carries no string-valued 'abstract'.
    """
    query = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}"
    fields = "abstract"
    # API_KEY is not defined in this cell; presumably it is provided by
    # `from utils import *` -- confirm.
    response = requests.get(
        query,
        headers={"x-api-key": API_KEY},
        params={"fields": fields},
        timeout=timeout,
    )
    # Surface HTTP errors (unknown paper, rate limiting, ...) explicitly.
    response.raise_for_status()
    paper_dict = response.json()

    abstract = paper_dict.get('abstract')
    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and fails with an empty message.
    if not isinstance(abstract, str):
        raise ValueError(f'semantic scholar returned no abstract for {paper_id}')
    return abstract


def get_abstract_from_crossref(doi, timeout=30):
    """Fetch a paper's abstract from the Crossref works API.

    Args:
        doi: DOI string appended to ``https://api.crossref.org/works/``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the first ``jats:p`` element of the abstract markup.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-success HTTP status.
        KeyError: If the response has no ``message.abstract`` field.
        ValueError: If the abstract markup contains no ``jats:p`` element.
    """
    url = 'https://api.crossref.org/works/' + doi
    r = requests.get(url, timeout=timeout)
    # Fail early on an HTTP error instead of trying to decode its body as JSON.
    r.raise_for_status()
    response = r.json()
    # The abstract is delivered as markup, so parse it to extract plain text.
    soup = bs4.BeautifulSoup(response['message']['abstract'], 'html.parser')
    paragraph = soup.find('jats:p')
    if paragraph is None:
        # find returns None when nothing matches; raise a descriptive error
        # rather than an AttributeError on `None.text`.
        raise ValueError('crossref abstract for ' + doi + ' has no jats:p element')
    return paragraph.text



In [21]:
def get_abstract(doi, sources=None):
    """Return the abstract for `doi` from the first source that provides it.

    Each source is tried in order; a source that raises is reported on stdout
    (with the exception) and the next one is tried.

    Args:
        doi: DOI string of the paper.
        sources: Optional iterable of ``(name, fetch)`` pairs, where
            ``fetch(doi)`` returns the abstract or raises. Defaults to the
            ACL Anthology scraper, then Semantic Scholar, then Crossref.

    Returns:
        The abstract returned by the first source that does not raise, or
        None if every source fails.
    """
    if sources is None:
        sources = (
            ('acl anthology', get_abstract_with_acl_anthology),
            ('semantic scholar', get_abstract_from_semantic_scholar),
            ('crossref', get_abstract_from_crossref),
        )

    for source_name, fetch in sources:
        try:
            return fetch(doi)
        except Exception as e:
            # Best-effort lookup: log the failure under the source that
            # actually failed and fall through to the next one.
            print('failed to abstract from', source_name, 'for', doi)
            print(e)

    print('no source has the abstract for', doi)
    return None


In [31]:
from tqdm import tqdm

def fill_abstracts(df):
    """Fill missing values of the 'abstract' column by looking up each DOI.

    Rows that already have an abstract, or that have no DOI, are left alone,
    so the function can be re-run to resume after an interruption.

    Args:
        df: DataFrame with 'doi' and 'abstract' columns. It is modified in
            place.

    Returns:
        The same DataFrame object, with abstracts filled where a lookup
        succeeded (rows stay missing when `get_abstract` returns None).
    """
    doi_col = df.columns.get_loc('doi')
    abstract_col = df.columns.get_loc('abstract')
    for i in tqdm(range(len(df))):
        # Read single cells positionally instead of building a full row
        # Series several times per iteration.
        doi = df.iat[i, doi_col]
        # pd.notna rejects NaN as well as None; a missing DOI read from CSV is
        # NaN, which an `is not None` check would let through.
        if pd.notna(doi) and pd.isna(df.iat[i, abstract_col]):
            df.iloc[i, abstract_col] = get_abstract(doi)
    return df

# Look up every abstract that is still missing, then display the table.
processed_df = fill_abstracts(df=processed_df)
processed_df

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9282/9282 [00:01<00:00, 8655.02it/s]


Unnamed: 0,id,title,area,interpretability,doi,source,working_doi,abstract
0,main.8,Large Scale Multi-Actor Generative Dialog Mode...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.8,ACL2020,True,Non-goal oriented dialog agents (i.e. chatbots...
1,main.52,CDL: Curriculum Dual Learning for Emotion-Cont...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.52,ACL2020,True,Emotion-controllable response generation is an...
2,main.46,Emergence of Syntax Needs Minimal Supervision,Theory and Formalism in NLP (Linguistic and Ma...,False,10.18653/v1/2020.acl-main.46,ACL2020,True,This paper is a theoretical contribution to th...
3,main.359,Selecting Backtranslated Data from Multiple So...,Machine Translation,False,10.18653/v1/2020.acl-main.359,ACL2020,True,Machine translation (MT) has benefited from us...
4,main.417,ParaCrawl: Web-Scale Acquisition of Parallel C...,Resources and Evaluation,False,10.18653/v1/2020.acl-main.417,ACL2020,True,We report on methods to create the largest pub...
...,...,...,...,...,...,...,...,...
9277,889,Multimodal Transformer for Unaligned Multimoda...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1656,ACL2019,True,"Human language is often multimodal, which comp..."
9278,2155,"Show, Describe and Conclude: On Exploiting the...","Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1657,ACL2019,True,Chest X-Ray (CXR) images are commonly used for...
9279,384,Visual Story Post-Editing,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1658,ACL2019,True,We introduce the first dataset for human edits...
9280,1891,Multimodal Abstractive Summarization for How2 ...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1659,ACL2019,True,"In this paper, we study abstractive summarizat..."


In [32]:
# Rows whose abstract is still missing.
processed_df.loc[processed_df['abstract'].isna()]

Unnamed: 0,id,title,area,interpretability,doi,source,working_doi,abstract


In [33]:
### Some papers did not have their abstract in any of the sources;
### these abstracts were added manually. Keys are DOIs, values are the abstract text.

doi_to_abstract = {
    '10.18653/v1/2022.emnlp-main.782': 'The development of conversational agents to interact with patients and deliver clinical advice has attracted the interest of many researchers, particularly in light of the COVID-19 pandemic. The training of an end-to-end neural based dialog system, on the other hand, is hampered by a lack of multi-turn medical dialog corpus. We make the very first attempt to release a highquality multi-turn Medical Dialog dataset relating to Covid-19 disease named CDialog, with over 1K conversations collected from the online medical counselling websites. We annotate each utterance of the conversation with seven different categories of medical entities, including diseases, symptoms, medical tests, medical history, remedies, medications and other aspects as additional labels. Finally, we propose a novel neural medical dialog system based on the CDialog dataset to advance future research on developing automated medical dialog systems. We use pre-trained language models for dialogue generation, incorporating annotated medical entities, to generate a virtual doctor’s response that addresses the patient’s query. Experimental results show that the proposed dialog models perform comparably better when supplemented with entity information and hence can improve the response quality.',
    '10.18653/v1/2022.emnlp-main.786': 'Tokenisation is the first step in almost all NLP tasks, and state-of-the-art transformer-based language models all use subword tokenisation algorithms to process input text. Existing algorithms have problems, often producing tokenisations of limited linguistic validity and representing equivalent strings differently depending on their position within a word. We hypothesise that these problems hinder the ability of transformer-based models to handle complex words, and suggest that these problems are a result of allowing tokens to include spaces. We thus experiment with an alternative tokenisation approach where spaces are always treated as individual tokens. Specifically, we apply this modification to the BPE and Unigram algorithms. We find that our modified algorithms lead to improved performance on downstream NLP tasks that involve handling complex words, whilst having no detrimental effect on performance in general natural language understanding tasks. Intrinsically, we find that our modified algorithms give more morphologically correct tokenisations, in particular when handling prefixes. Given the results of our experiments, we advocate for always treating spaces as individual tokens as an improved tokenisation method.'
}

# Write each hand-collected abstract into the single row carrying that DOI.
for doi, abstract in doi_to_abstract.items():
    print(doi)
    # Build the row mask once and reuse it for both the check and the write.
    doi_mask = processed_df['doi'] == doi
    n_matches = int(doi_mask.sum())
    # A bare assert fails with an empty AssertionError; report how many rows
    # matched so a missing DOI can be told apart from a duplicated one.
    assert n_matches == 1, f'expected exactly one row with doi {doi}, found {n_matches}'
    processed_df.loc[doi_mask, 'abstract'] = abstract
    

10.18653/v1/2022.emnlp-main.782


AssertionError: 

# Computing the embeddings

In [34]:
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
from tqdm import tqdm
# Enable the tqdm-backed `progress_apply` used further down.
tqdm.pandas()

# Tokenizer and encoder are both loaded from the SPECTER2 base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
model = AutoAdapterModel.from_pretrained("allenai/specter2_base")

# Load the classification adapter and make it the active one.
adapter_name = model.load_adapter(
    "allenai/specter2_classification",
    source="hf",
    set_active=True,
)

def get_embedding(paper_row, max_length=512):
    """Embed one paper from its title and abstract.

    The title and abstract are joined with the tokenizer's separator token,
    encoded, and passed through the module-level `model`; the hidden state at
    the first token position is returned.

    Args:
        paper_row: Mapping/Series with a string 'title' and an 'abstract'
            entry. A non-string abstract (None/NaN) is treated as empty so
            the paper is embedded from its title alone.
        max_length: Maximum number of tokens kept after truncation. The
            SPECTER2 reference usage truncates to 512; the previous value of
            2048 would let over-long inputs through -- NOTE(review): confirm
            512 matches the encoder's position limit.

    Returns:
        A 1-D NumPy array: the first-token hidden state for this paper.
    """
    # Local import: torch is not imported in the visible cells of this notebook.
    import torch

    abstract = paper_row['abstract']
    if not isinstance(abstract, str):
        # A missing abstract would otherwise make the concatenation fail.
        abstract = ''
    text = paper_row['title'] + tokenizer.sep_token + abstract
    inputs = tokenizer(text,
                       padding=True,
                       truncation=True,
                       return_tensors="pt",
                       return_token_type_ids=False,
                       max_length=max_length)
    # Inference only: skip building the autograd graph for every row.
    with torch.no_grad():
        output = model(**inputs)
    # Take the first token's vector of the (single) sequence in the batch.
    embeddings = output.last_hidden_state[:, 0, :][0].detach().numpy()
    return embeddings

# Embed every paper: `axis=1` hands each row to get_embedding, and the result is
# stored in a new 'embedding' column. `progress_apply` is expected to be the
# progress-bar variant of `apply` registered by the `tqdm.pandas()` call above.
processed_df['embedding'] = processed_df.progress_apply(get_embedding, axis=1)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9282/9282 [14:28<00:00, 10.69it/s]


In [35]:
# Display the table, which now includes the 'embedding' column.
processed_df

Unnamed: 0,id,title,area,interpretability,doi,source,working_doi,abstract,embedding
0,main.8,Large Scale Multi-Actor Generative Dialog Mode...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.8,ACL2020,True,Non-goal oriented dialog agents (i.e. chatbots...,"[-0.55811894, -0.12536883, -0.06339799, -1.815..."
1,main.52,CDL: Curriculum Dual Learning for Emotion-Cont...,Dialogue and Interactive Systems,False,10.18653/v1/2020.acl-main.52,ACL2020,True,Emotion-controllable response generation is an...,"[-1.1278496, -0.5229794, 0.0056311972, -1.3223..."
2,main.46,Emergence of Syntax Needs Minimal Supervision,Theory and Formalism in NLP (Linguistic and Ma...,False,10.18653/v1/2020.acl-main.46,ACL2020,True,This paper is a theoretical contribution to th...,"[0.26176804, 0.8106163, 0.27426642, -1.174295,..."
3,main.359,Selecting Backtranslated Data from Multiple So...,Machine Translation,False,10.18653/v1/2020.acl-main.359,ACL2020,True,Machine translation (MT) has benefited from us...,"[-0.43927717, 1.0674063, 0.08589529, -0.437168..."
4,main.417,ParaCrawl: Web-Scale Acquisition of Parallel C...,Resources and Evaluation,False,10.18653/v1/2020.acl-main.417,ACL2020,True,We report on methods to create the largest pub...,"[-0.3593886, 0.33652788, -0.026537634, -0.7595..."
...,...,...,...,...,...,...,...,...,...
9277,889,Multimodal Transformer for Unaligned Multimoda...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1656,ACL2019,True,"Human language is often multimodal, which comp...","[-0.25314885, -0.08230631, -0.3357741, -1.7636..."
9278,2155,"Show, Describe and Conclude: On Exploiting the...","Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1657,ACL2019,True,Chest X-Ray (CXR) images are commonly used for...,"[-0.488766, -0.30718938, -1.3068513, -0.397652..."
9279,384,Visual Story Post-Editing,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1658,ACL2019,True,We introduce the first dataset for human edits...,"[-0.63484645, -0.21893704, 0.09538727, -0.8349..."
9280,1891,Multimodal Abstractive Summarization for How2 ...,"Vision, Robotics, Multimodal, Grounding and Sp...",,10.18653/v1/P19-1659,ACL2019,True,"In this paper, we study abstractive summarizat...","[-0.9391148, -0.115553305, -0.06912733, -1.159..."


In [37]:
# Persist the table, including the 'abstract' and 'embedding' columns.
# NOTE(review): get_embedding returns NumPy arrays, which a CSV can only hold
# as their text representation -- confirm the downstream loader parses the
# 'embedding' column back into arrays (or consider a typed format instead).
processed_df.to_csv('../data/new_classifier_data.csv')