In [1]:
# Append the parent of the current working directory to the import path so the
# `utils` import below can resolve (presumably a project-level module one
# folder above this notebook — confirm).
import os, sys
sys.path.append(os.path.dirname(os.getcwd()))
# NOTE(review): the star import hides where names come from. Later cells use
# `pd` without importing it in this notebook, so it presumably arrives via
# `utils` — confirm, and consider explicit imports.
from utils import *

In [19]:
import requests
from bs4 import BeautifulSoup

# (volume URL, source tag) pairs for the BlackboxNLP workshop volumes on the
# ACL Anthology, newest first. The 2019 volume uses the older `W19-48/`
# identifier scheme (trailing slash kept as-is).
URLS = [
    ('https://aclanthology.org/volumes/' + volume_id, source_tag)
    for volume_id, source_tag in (
        ('2023.blackboxnlp-1', 'BLACKBOX2023'),
        ('2022.blackboxnlp-1', 'BLACKBOX2022'),
        ('2021.blackboxnlp-1', 'BLACKBOX2021'),
        ('2020.blackboxnlp-1', 'BLACKBOX2020'),
        ('W19-48/', 'BLACKBOX2019'),
    )
]

In [20]:
import bibtexparser

# Scrape every paper listed on each BlackboxNLP volume page in URLS: follow the
# paper's BibTeX link, parse the record, and collect title / DOI / abstract
# into one row per paper.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell

data = []
# A single session reuses the HTTP connection across the many small requests.
with requests.Session() as session:
    for url, source in URLS:
        volume_response = session.get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of parsing an error page as a volume.
        volume_response.raise_for_status()
        soup = BeautifulSoup(volume_response.text, 'html.parser')
        papers = soup.find_all(class_='d-sm-flex align-items-stretch')
        for paper in papers:
            # The first entry of a volume is the proceedings front matter, not a paper.
            if 'Proceedings of the ' in paper.text:
                continue

            bib_link = paper.find('a', class_='badge-secondary')
            if bib_link is None or not bib_link.get('href'):
                print(f'Skipping entry without a BibTeX link in {source}')
                continue

            bib_url = 'https://aclanthology.org' + bib_link['href']
            bib_response = session.get(bib_url, timeout=REQUEST_TIMEOUT)
            bib_response.raise_for_status()
            bib = bibtexparser.parse_string(bib_response.text)
            # `entries` holds only successfully parsed entries, unlike `blocks`,
            # which may also contain comments or failed blocks.
            if not bib.entries:
                print(f'Skipping unparsable BibTeX record: {bib_url}')
                continue

            fields = bib.entries[0].fields_dict
            if 'title' not in fields:
                print(f'Skipping BibTeX record without a title: {bib_url}')
                continue

            data.append({
                'id': None,
                'title': fields['title'].value,
                'area': 'Blackbox',
                'interpretability': True,
                # Not every record is guaranteed to carry a DOI or an abstract.
                'doi': fields['doi'].value if 'doi' in fields else None,
                'source': source,
                # Empty string keeps the later title + abstract concatenation valid.
                'abstract': fields['abstract'].value if 'abstract' in fields else '',
            })

df = pd.DataFrame(data)
df

Unnamed: 0,id,title,area,interpretability,doi,source,abstract
0,,Knowledge-Grounded Natural Language Recommenda...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.1,BLACKBOX2023,Explanations accompanying a recommendation can...
1,,Emergent Linear Representations in World Model...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.2,BLACKBOX2023,How do sequence models represent their decisio...
2,,Explaining Data Patterns in Natural Language w...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.3,BLACKBOX2023,Large language models (LLMs) have displayed an...
3,,Probing Quantifier Comprehension in Large Lang...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.4,BLACKBOX2023,"With their increasing size, large language mod..."
4,,Disentangling the Linguistic Competence of Pri...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.5,BLACKBOX2023,Differential Privacy (DP) has been tailored to...
...,...,...,...,...,...,...,...
164,,Adversarial Attack on Sentiment Classification,Blackbox,True,10.18653/v1/W19-4824,BLACKBOX2019,"In this paper, we propose a white-box attack a..."
165,,Open Sesame: Getting inside {BERT}{'}s Linguis...,Blackbox,True,10.18653/v1/W19-4825,BLACKBOX2019,How and to what extent does BERT encode syntac...
166,,{GE}val: Tool for Debugging {NLP} Datasets and...,Blackbox,True,10.18653/v1/W19-4826,BLACKBOX2019,This paper presents a simple but general and e...
167,,From Balustrades to Pierre Vinken: Looking for...,Blackbox,True,10.18653/v1/W19-4827,BLACKBOX2019,We inspect the multi-head self-attention in Tr...


In [21]:
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
from tqdm import tqdm
# Patch pandas so DataFrames gain `progress_apply` (apply with a tqdm bar).
tqdm.pandas()

# Load the SPECTER2 base encoder and the tokenizer from the same checkpoint.
model = AutoAdapterModel.from_pretrained("allenai/specter2_base")
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')

# Attach the classification adapter and activate it for subsequent forward passes.
adapter_name = model.load_adapter(
    "allenai/specter2_classification",
    source="hf",
    set_active=True,
)

def get_embedding(paper_row, max_length=512):
    """Embed one paper with the module-level SPECTER2 `tokenizer` and `model`.

    Args:
        paper_row: Mapping-like row (e.g. a DataFrame row) with a string
            'title' and an 'abstract'. A missing / non-string abstract
            (None, NaN) is treated as empty so the title alone is embedded.
        max_length: Maximum number of tokens kept after truncation. Defaults
            to 512; the previous value of 2048 let long inputs through
            untruncated, which would overflow the position embeddings if the
            encoder is limited to 512 positions (as expected for this
            BERT-style checkpoint — confirm against the model config).

    Returns:
        1-D numpy array: the hidden state of the first token of the sequence.
    """
    import torch  # local import: this cell does not import torch at top level

    abstract = paper_row['abstract']
    if not isinstance(abstract, str):
        abstract = ''
    text = paper_row['title'] + tokenizer.sep_token + abstract

    inputs = tokenizer(text,
                       padding=True,
                       truncation=True,
                       return_tensors="pt",
                       return_token_type_ids=False,
                       max_length=max_length)
    # Inference only: skip building the autograd graph to save time and memory.
    with torch.no_grad():
        output = model(**inputs)
    # Batch item 0, token 0 -> a single vector for the whole document.
    embeddings = output.last_hidden_state[0, 0, :].detach().numpy()
    return embeddings

# Embed every paper row by row (with a progress bar) and store the vectors
# in a new 'embedding' column.
paper_embeddings = df.progress_apply(get_embedding, axis=1)
df['embedding'] = paper_embeddings
df

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 169/169 [00:16<00:00, 10.16it/s]


Unnamed: 0,id,title,area,interpretability,doi,source,abstract,embedding
0,,Knowledge-Grounded Natural Language Recommenda...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.1,BLACKBOX2023,Explanations accompanying a recommendation can...,"[-1.9978445, -0.5634321, -1.1563166, -1.435869..."
1,,Emergent Linear Representations in World Model...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.2,BLACKBOX2023,How do sequence models represent their decisio...,"[-0.013929166, -0.76116437, -0.089470685, -0.6..."
2,,Explaining Data Patterns in Natural Language w...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.3,BLACKBOX2023,Large language models (LLMs) have displayed an...,"[-0.90607655, -0.3102931, -0.019184401, -0.672..."
3,,Probing Quantifier Comprehension in Large Lang...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.4,BLACKBOX2023,"With their increasing size, large language mod...","[-0.32903624, 0.6297687, 0.65335226, -1.216356..."
4,,Disentangling the Linguistic Competence of Pri...,Blackbox,True,10.18653/v1/2023.blackboxnlp-1.5,BLACKBOX2023,Differential Privacy (DP) has been tailored to...,"[0.51844144, 0.30955702, -0.633747, -0.9364292..."
...,...,...,...,...,...,...,...,...
164,,Adversarial Attack on Sentiment Classification,Blackbox,True,10.18653/v1/W19-4824,BLACKBOX2019,"In this paper, we propose a white-box attack a...","[-1.1030502, -0.88766366, -0.77479035, -1.1464..."
165,,Open Sesame: Getting inside {BERT}{'}s Linguis...,Blackbox,True,10.18653/v1/W19-4825,BLACKBOX2019,How and to what extent does BERT encode syntac...,"[0.31116417, 0.7632777, -0.10821019, -1.525758..."
166,,{GE}val: Tool for Debugging {NLP} Datasets and...,Blackbox,True,10.18653/v1/W19-4826,BLACKBOX2019,This paper presents a simple but general and e...,"[-0.47677433, 0.40539914, -0.2836782, -0.95066..."
167,,From Balustrades to Pierre Vinken: Looking for...,Blackbox,True,10.18653/v1/W19-4827,BLACKBOX2019,We inspect the multi-head self-attention in Tr...,"[-0.08656074, 0.66642904, -0.14793973, -1.1109..."


In [22]:
# Persist the scraped papers and their embeddings (row index included).
# NOTE(review): the 'embedding' column holds numpy arrays, which CSV stores as
# their text representation — confirm downstream readers parse that format.
df.to_csv(path_or_buf='../data/blackbox_papers.csv')