In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("./nips/papers.csv")

In [2]:
data.sample(5)

Unnamed: 0,source_id,year,title,abstract,full_text
553,454,1991,Gradient Descent: Second Order Momentum and Sa...,,Gradient Descent: Second-Order Momentum \n\nan...
324,424,1990,Stochastic Neurodynamics,,Stochastic Neurodynamics \n\nJ.D. Cowan \n\nDe...
1729,1755,1999,Optimal Kernel Shapes for Local Linear Regression,,Optimal Kernel Shapes for Local Linear \n\n...
6294,1140,2016,Mixed Linear Regression with Multiple Components,"In this paper, we study the mixed linear regre...",Mixed Linear Regression with Multiple Componen...
5418,1443,2014,Multi-Step Stochastic ADMM in High Dimensions:...,"In this paper, we consider a multi-step versio...",Multi-Step Stochastic ADMM in High Dimensions:...


In [3]:
data.shape

(9680, 5)

In [4]:
data = data.rename(columns={"full_text": "text"})

In [5]:
data.isna().sum(axis=0)

source_id       0
year            0
title           0
abstract     3319
text            3
dtype: int64

In [6]:
from datasets import Dataset

dataset = Dataset.from_pandas(data)

In [7]:
from transformers import BertTokenizer, BertModel
import torch

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)


def tokenize_text(data, tokenizer):
    """Tokenize text using tokenizer and return tokens."""
    content = data["title"]
    content += data["abstract"] if data["abstract"] else ""
    content += data["text"] if data["text"] else ""
    ids = tokenizer.encode(
        content,
        add_special_tokens=True,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_attention_mask=True,
    )
    return {"ids": ids}

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [8]:
dataset = dataset.map(tokenize_text, fn_kwargs={"tokenizer": tokenizer})

Map:   0%|          | 0/9680 [00:00<?, ? examples/s]

In [9]:
dataset = dataset.with_format(type="torch", columns=["ids"])
dataset[1]

{'ids': tensor([  101, 20831,  6431, 23077,  2487, 20831,  6431, 23077,  8038,  8043,
          1055,  1012,  8273,  1011,  2087, 10354,  2050,  2662,  2820,  1997,
          2974, 18880,  1010,  6187, 19989, 17788, 10061,  2129,  2515,  1996,
         20831,  1997,  1037, 15756,  2897,  1006,  2193,  1997, 19962,  9331,
          8583,  2566, 11265, 21017,  1007, 14396,  2000,  1996, 11619,  1997,
          1996,  3471,  2009,  2064,  5047,  1006,  7594,  2011,  1996, 23077,
          1007,  1029, 11991,  3399,  2052,  6592,  2053,  7189,  2012,  2035,
          1010,  2144,  2035, 22017, 20898,  4972,  2064,  2022,  7528,  2478,
          1037,  4984,  2007,  2200,  2659, 20831,  1006,  1041,  1012,  1043,
          1012,  1010,  2478,  2048,  1011,  7953, 16660,  2094,  6733,  1007,
          1012,  2174,  1010,  2005,  1037,  2897,  2008, 10229,  1037,  3291,
          2013,  4973,  2478,  1037,  2334,  4083,  3627,  1010,  2057,  6011,
          2008,  1996, 23077,  1997,  1996,  

In [10]:
dataset

Dataset({
    features: ['source_id', 'year', 'title', 'abstract', 'text', 'ids'],
    num_rows: 9680
})

In [11]:
pad_index = 0


def classify(data, model):
    """Classify text using model and return classification."""
    input_ids = torch.tensor(data["ids"]).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
        cls = outputs.last_hidden_state[0, 0, :]
    return {"cls": cls}


dataset = dataset.map(
    classify, fn_kwargs={"model": BertModel.from_pretrained("bert-base-uncased")}
)

Map:   0%|          | 0/9680 [00:00<?, ? examples/s]

  input_ids = torch.tensor(data["ids"]).unsqueeze(0)
  input_ids = torch.tensor(data["ids"]).unsqueeze(0)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [12]:
dataset = dataset.with_format(type="torch", columns=["cls"])

In [13]:
dataset

Dataset({
    features: ['source_id', 'year', 'title', 'abstract', 'text', 'ids', 'cls'],
    num_rows: 9680
})

In [14]:
def top_k_similar(cls_q, dataset, k=10):
    """Get top k similar papers to query cls."""
    cls = dataset["cls"]
    sim = (cls @ cls_q.T).squeeze()
    idx = sim.argsort(descending=True)[:k]
    print(sim[idx])
    return idx


model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [15]:
text = "Recurrent Neural Networks"
inputs = tokenizer(text, return_tensors="pt")
# Get the tokens' embeddings
with torch.no_grad():
    outputs = model(**inputs)
cls_q = outputs.last_hidden_state[:, 0, :]

idx = top_k_similar(cls_q, dataset, k=10)
idx = idx.tolist()
idx

tensor([187.6234, 184.4373, 184.4104, 183.4597, 181.1204, 180.9749, 180.8561,
        180.8233, 180.6211, 180.0400])


[3713, 9499, 4331, 8911, 5948, 8453, 9462, 519, 4707, 6309]

In [16]:
text = "RNN"
inputs = tokenizer(text, return_tensors="pt")
# Get the tokens' embeddings
with torch.no_grad():
    outputs = model(**inputs)
cls_q = outputs.last_hidden_state[:, 0, :]
idx = top_k_similar(cls_q, dataset, k=10)
idx = idx.tolist()
print(idx)

tensor([170.9345, 169.3526, 168.3620, 168.2141, 166.3156, 165.7052, 165.6727,
        165.0882, 163.4123, 163.1649])
[3713, 273, 9499, 4829, 8497, 9267, 6675, 8826, 5214, 5920]


In [17]:
text = "Recurrent Neural Networks (RNN)"
inputs = tokenizer(text, return_tensors="pt")
# Get the tokens' embeddings
with torch.no_grad():
    outputs = model(**inputs)
cls_q = outputs.last_hidden_state[:, 0, :]
idx = top_k_similar(cls_q, dataset, k=10)
idx = idx.tolist()
print(idx)

tensor([196.2789, 194.6957, 194.4283, 193.8195, 193.0585, 192.8403, 192.7309,
        192.2985, 191.6830, 191.0993])
[4331, 8911, 8453, 9499, 3713, 8329, 4707, 9464, 7575, 8930]


In [18]:
for i in idx:
    print(dataset["title"][i])
    print(dataset["abstract"][i])
    print("\n")

Select and Sample - A Model of Efficient Neural Inference and Learning
An increasing number of experimental studies indicate that perception encodes a posterior probability distribution over possible causes of sensory stimuli, which is used to act close to optimally in the environment. One outstanding difficulty with this hypothesis is that the exact posterior will in general be too complex to be represented directly, and thus neurons will have to represent an approximation of this distribution. Two influential proposals of efficient posterior representation by neural populations are: 1) neural activity represents samples of the underlying distribution, or 2) they represent a parametric representation of a variational approximation of the posterior. We show that these approaches can be combined for an inference scheme that retains the advantages of both: it is able to represent multiple modes and arbitrary correlations, a feature of sampling methods, and it reduces the represented spac

In [19]:
prompt = "Recurrent Neural Networks (RNN)."
document = "An abstract from the NeurIPS database that discusses " + str(
    dataset["abstract"][9464]
)

In [20]:
print(document)

An abstract from the NeurIPS database that discusses Recurrent neural networks (RNNs) are a widely used tool for modeling sequential data, yet they are often treated as inscrutable black boxes. Given a trained recurrent network, we would like to reverse engineer it--to obtain a quantitative, interpretable description of how it solves a particular task. Even for simple tasks, a detailed understanding of how recurrent networks work, or a prescription for how to develop such an understanding, remains elusive. In this work, we use tools from dynamical systems analysis to reverse engineer recurrent networks trained to perform sentiment classification, a foundational natural language processing task. Given a trained network, we find fixed points of the recurrent dynamics and linearize the nonlinear system around these fixed points. Despite their theoretical capacity to implement complex, high-dimensional computations, we find that trained networks converge to highly interpretable, low-dimens

In [21]:
from transformers import pipeline, set_seed

generator = pipeline("text-generation", model="gpt2")
set_seed(42)
generator(prompt, max_length=512, num_return_sequences=5)

  _torch_pytree._register_pytree_node(
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Recurrent Neural Networks (RNN). The RNN consists of 3 separate networks: recurrent neural networks (RCTs), recurrent neural networks (RDNs), and recurrent neural populations (RNCs). RCTs are generally considered as being more complex than RDNs due to the nature of the structure and algorithm used to develop them. RNNs allow for relatively simple and cost prohibitive applications (such as learning, memory, and neural network training) and are capable of delivering complex training outputs as long as the required dataset is available. The RNN has four features:\n\nUniformity: The dataset will be different when the original data is stored in the same form as a priori. In particular, when the dataset is stored in an RNNs container, when the initial input data is presented as a single row, the entire dataset will be stored in the same form as the original.\n\nThe dataset will be different when the original data is stored in the same form as a priori. In particular, whe

In [22]:
generator = pipeline("text-generation", model="gpt2")
set_seed(42)
generator(prompt + " " + document, max_length=512, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Recurrent Neural Networks (RNN). An abstract from the NeurIPS database that discusses Recurrent neural networks (RNNs) are a widely used tool for modeling sequential data, yet they are often treated as inscrutable black boxes. Given a trained recurrent network, we would like to reverse engineer it--to obtain a quantitative, interpretable description of how it solves a particular task. Even for simple tasks, a detailed understanding of how recurrent networks work, or a prescription for how to develop such an understanding, remains elusive. In this work, we use tools from dynamical systems analysis to reverse engineer recurrent networks trained to perform sentiment classification, a foundational natural language processing task. Given a trained network, we find fixed points of the recurrent dynamics and linearize the nonlinear system around these fixed points. Despite their theoretical capacity to implement complex, high-dimensional computations, we find that trained