In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-trained model and tokenizer.
# `model_type` is the checkpoint name handed to both `from_pretrained` calls, so
# the tokenizer and the masked-LM weights come from the same checkpoint.
model_type = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModelForMaskedLM.from_pretrained(model_type)

In [8]:
# Reverse lookup: print the token string stored under the last id of a
# 50522-entry vocabulary (ids are zero-based, hence the "- 1").
vocab = tokenizer.vocab

target_id = 50522 - 1
for token, token_id in vocab.items():
    if token_id == target_id:
        print(token)

freshwater ecosystems


In [3]:
# Show the module tree of the loaded model (layer names and tensor sizes).
model

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(50522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

In [12]:
# Inspect the third-from-last row of the `vocab_projector` weight matrix.
model.vocab_projector.weight.data[-3]

tensor([ 9.0496e-03, -2.1067e-02, -5.4298e-02, -6.8098e-02, -1.2024e-03,
        -9.7472e-03, -6.2430e-02, -2.9215e-02, -2.9146e-02, -1.1137e-02,
        -5.0268e-02, -5.0039e-02, -2.7808e-02, -2.2594e-02, -3.6646e-02,
         4.5885e-03, -7.3914e-02, -5.4797e-02,  3.8664e-03, -8.8493e-02,
        -7.1113e-02, -1.4475e-02, -7.7523e-02,  1.5005e-02, -6.6993e-02,
        -2.9653e-02, -6.3812e-02, -7.8983e-02, -4.0851e-02, -4.1726e-02,
        -3.2084e-02, -4.5018e-02, -2.8832e-02, -2.7007e-02, -3.4009e-02,
        -9.0697e-02, -2.6999e-02, -2.0067e-02, -5.2242e-02,  2.9216e-03,
        -3.8523e-02, -5.5479e-02, -5.6126e-02, -5.6417e-02,  1.3120e-02,
         1.5214e-02, -1.3880e-02, -7.6977e-02, -4.9199e-02,  2.1481e-02,
        -4.2122e-02, -6.4041e-02, -3.6062e-02,  3.0891e-02, -2.4131e-02,
        -8.7660e-02, -6.4405e-02, -3.9170e-02, -3.0893e-02, -1.6546e-02,
        -2.1007e-02, -4.3339e-02,  6.4595e-02, -9.3538e-03,  3.4987e-02,
        -3.1237e-02, -6.5498e-02, -6.4519e-02, -4.9

In [11]:
# Inspect the last row of the `vocab_projector` weight matrix.
model.vocab_projector.weight.data[-1]

tensor([-4.5815e-02, -1.2520e-02, -6.6545e-02, -4.2761e-02, -9.6622e-03,
         1.0962e-02, -1.0034e-01, -4.1586e-02, -7.7363e-02, -3.4036e-02,
        -4.1248e-02, -8.4172e-02, -8.3280e-02, -3.9620e-02, -9.7805e-03,
        -2.8812e-02, -4.7525e-02, -3.4094e-02,  2.2444e-02, -4.3920e-02,
        -1.0368e-01, -3.5474e-02, -1.3943e-02,  2.5517e-02, -1.1391e-01,
         2.3993e-03, -1.3550e-02, -5.1630e-02,  1.8774e-02, -3.1944e-02,
        -3.0806e-03, -4.0250e-02, -4.1456e-03,  5.2317e-03, -7.2740e-02,
        -4.6933e-02, -7.4871e-02, -4.5224e-02, -6.5761e-02,  4.3127e-03,
        -7.3170e-02, -4.5238e-02, -1.0213e-01, -2.6731e-02, -3.0149e-02,
        -1.2455e-02, -2.8717e-02, -3.7203e-02,  1.3243e-03, -3.3210e-02,
        -5.5864e-02, -5.7475e-02, -1.8340e-02,  3.8490e-02, -5.6768e-02,
        -6.7764e-02, -8.4514e-02, -4.1092e-02, -1.9946e-02, -3.3641e-02,
        -3.7516e-02, -2.6224e-02,  1.0169e-01, -7.7498e-02, -2.2211e-02,
        -2.9383e-02, -2.7878e-02, -7.3883e-02, -1.2

In [7]:
# Multi-word phrases to experiment with.
phrases_to_add = ["machine learning", "deep learning"]

# Tokenize each phrase on its own; add_special_tokens=False keeps only the
# phrase's own sub-token ids in `input_ids`.
tokenized_phrases_to_add = [tokenizer(phrase, add_special_tokens=False) for phrase in phrases_to_add]

In [8]:
# Display the per-phrase encodings (input_ids and attention_mask).
tokenized_phrases_to_add

[{'input_ids': [3698, 4083], 'attention_mask': [1, 1]},
 {'input_ids': [2784, 4083], 'attention_mask': [1, 1]}]

In [12]:
# Mean-pool the input word embeddings of each phrase's sub-tokens into one
# vector per phrase.
#
# Every phrase's vector is stored in `phrase_embeddings_by_phrase`; previously
# each iteration overwrote `phrase_embeddings`, so all but the last phrase were
# silently lost. `phrase_embeddings` is still left bound to the vector of the
# last phrase so later cells that read it keep working.
word_embedding_matrix = model.distilbert.embeddings.word_embeddings.weight.data

phrase_embeddings_by_phrase = {}
for phrase, encoding in zip(phrases_to_add, tokenized_phrases_to_add):
    input_ids = encoding["input_ids"]

    # Rows selected by the sub-token ids, averaged over the token axis (dim 0).
    phrase_embeddings = torch.mean(word_embedding_matrix[input_ids], dim=0)
    phrase_embeddings_by_phrase[phrase] = phrase_embeddings

In [13]:
# Check the size of the pooled phrase vector.
phrase_embeddings.shape

torch.Size([768])

In [4]:
# Register a multi-word phrase as a single vocabulary entry.
new_token = "machine learning"

# get_vocab() is the documented accessor and also reports previously added
# tokens. `num_added_toks` is bound on both paths so that re-running this cell
# after the token has been added does not leave it undefined.
if new_token not in tokenizer.get_vocab():
    num_added_toks = tokenizer.add_tokens([new_token])
else:
    num_added_toks = 0


In [7]:
# Display the tokenizer's token -> id mapping.
# NOTE(review): the recorded output below (30523) is a single count rather than
# a mapping, so this cell appears to have been edited after it was last run —
# re-run to refresh the output.
tokenizer.vocab

30523

In [9]:
# Sanity check: a sentence containing "learning" and "machine" separately and
# in a different order than the added phrase; show how it is split into tokens.
test = "the boy is learning to use machine"

tokenizer.tokenize(test)

['the', 'boy', 'is', 'learning', 'to', 'use', 'machine']

In [14]:
# Resize the model's token embeddings to the tokenizer's current length so that
# ids of newly added tokens have a row to index into.
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(30523, 768, padding_idx=0)

In [10]:
# Look up the id assigned to the newly added token (KeyError if it is absent).
new_token_index = tokenizer.vocab[new_token]

In [13]:
# Fetch the input embedding row of the new token.
# NOTE(review): the recorded IndexError reports an embedding of size 30522, i.e.
# this cell was last run before the embedding matrix had been resized; it needs
# the resize_token_embeddings cell to have run first.
model.distilbert.embeddings.word_embeddings.weight.data[new_token_index]

IndexError: index 30522 is out of bounds for dimension 0 with size 30522

In [14]:
# Size of a single row of the word-embedding matrix.
model.distilbert.embeddings.word_embeddings.weight.data[0].shape

torch.Size([768])

In [3]:
# Inspect row 0 of the word-embedding matrix (the model repr lists padding_idx=0).
model.distilbert.embeddings.word_embeddings.weight.data[0]

tensor([-1.6649e-02, -6.6612e-02, -1.6329e-02, -4.2112e-02, -8.0348e-03,
        -1.3965e-02, -6.3488e-02, -2.0491e-02, -8.5822e-03, -6.3427e-02,
        -2.8296e-02, -3.3587e-02, -3.5466e-02, -5.2275e-03, -2.0351e-02,
        -6.0686e-02, -5.0486e-02, -5.8112e-02, -2.1134e-02, -5.8061e-02,
        -3.6556e-02, -3.8286e-02, -9.5839e-03, -2.8228e-02, -1.0817e-01,
        -4.2421e-02, -6.7244e-03, -7.6137e-02,  1.3189e-02, -1.9380e-02,
        -3.8669e-02, -1.0872e-02, -1.7320e-02, -3.3488e-02, -6.0760e-02,
        -5.3773e-02, -3.7320e-02, -2.9963e-02, -5.9872e-02, -2.6235e-02,
        -5.3190e-02, -3.6603e-02, -7.2672e-02, -3.5065e-02, -1.1630e-02,
        -7.6393e-03, -1.0994e-02, -3.4178e-02, -3.4682e-02, -3.5877e-02,
        -5.6536e-02, -4.5791e-02, -5.2554e-02,  1.3923e-01, -3.5378e-02,
        -3.6677e-02, -2.9200e-02, -9.8809e-03, -2.6176e-02,  1.1668e-02,
        -2.1027e-02, -2.2904e-02,  1.5897e-01, -3.1597e-02, -3.2808e-02,
         1.5736e-04, -4.1507e-02, -3.9246e-02, -1.6

In [5]:
# Show the module tree of the model again to check the embedding size.
model

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

In [1]:
import json

In [2]:
# Load the cached phrase data (a JSON object) from disk.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding.
with open(
    "/scratch/lamdo/erukg_cache_custom_trained_combined_references_nounphrase_v8-1.json",
    encoding="utf-8",
) as f:
    phrases = json.load(f)

In [8]:
# Check the top-level container type of the loaded JSON.
type(phrases)

dict

In [9]:
# List the top-level keys of the loaded JSON.
phrases.keys()

dict_keys(['phrase_vocab', 'docid2phraseid', 'docid2tokenscore'])

In [3]:
# Pull out the two entries used below: the list of phrases and the mapping from
# document id to a list of integer phrase ids.
phrase_vocab = phrases["phrase_vocab"]
docid2phraseid = phrases["docid2phraseid"]

In [4]:
# Print every position at which the target phrase occurs in phrase_vocab.
target_phrase = "minnesota ph.d. dissertation"
for phrase_index, phrase in enumerate(phrase_vocab):
    if phrase == target_phrase:
        print(phrase_index)

24282


In [14]:
# Check the container types of the two structures.
type(phrase_vocab), type(docid2phraseid)

(list, dict)

In [16]:
# Peek at the first key of docid2phraseid (keys are strings).
list(docid2phraseid.keys())[0]

'46010735'

In [18]:
# Phrase ids recorded for one document.
# NOTE(review): presumably these are indices into phrase_vocab — confirm.
docid2phraseid["46010735"]

[39512, 208, 6479, 12289, 5713, 36481, 152172]

In [5]:
from collections import Counter

In [6]:
# Tally how often each phrase id occurs across all per-document id lists.
phrase_counter = Counter(
    phrase_id
    for phrase_ids in docid2phraseid.values()
    for phrase_id in phrase_ids
)

In [7]:
# Occurrence count of phrase id 24282, the index printed above for
# "minnesota ph.d. dissertation".
phrase_counter[24282]

20

In [10]:
# Number of distinct phrase ids that occur at least 50 times.
sum(1 for occurrences in phrase_counter.values() if occurrences >= 50)

9539

In [24]:
# Encode an example phrase without special tokens, so `input_ids` holds only
# the phrase's own sub-token ids.
tokens = tokenizer("bcg vaccine", add_special_tokens=False)

In [10]:
import string
from nltk.corpus import stopwords

# English stopword list from NLTK's stopwords corpus.
STOPWORDS = stopwords.words('english')

In [11]:
# Print the vocabulary id of every punctuation character, followed by that of
# every stopword; `None` is printed for entries missing from the vocabulary.
vocab = tokenizer.vocab
for entry in (*string.punctuation, *STOPWORDS):
    print(vocab.get(entry))

999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1063
1064
1065
1066
1045
2033
2026
2870
2057
2256
14635
9731
2017
None
None
None
None
2115
6737
4426
25035
2002
2032
2010
2370
2016
None
2014
5106
2841
2009
None
2049
2993
2027
2068
2037
17156
3209
2054
2029
2040
3183
2023
2008
None
2122
2216
2572
2003
2024
2001
2020
2022
2042
2108
2031
2038
2018
2383
2079
2515
2106
2725
1037
2019
1996
1998
2021
2065
2030
2138
2004
2127
2096
1997
2012
2011
2005
2007
2055
2114
2090
2046
2083
2076
2077
2044
2682
2917
2000
2013
2039
2091
1999
2041
2006
2125
2058
2104
2153
2582
2059
2320
2182
2045
2043
2073
2339
2129
2035
2151
2119
2169
2261
2062
2087
2060
2070
2107
2053
4496
2025
2069
2219
2168
2061
2084
2205
2200
1055
1056
2064
2097
2074
2123
None
2323
None
2085
1040
2222
1049
1051
2128
2310
1061
7110
4995
None
2481
None
2134
None
2987
None
2910
None
8440
None
4033
None
3475
None
5003
None
None
None
None
None
None
171

In [12]:
# Display the stopword list to see which entries (e.g. contractions) it contains.
STOPWORDS

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
# Encode an example phrase with the tokenizer's default settings
# (add_special_tokens is not disabled here, unlike the other encoding cells).
tokenizer("minnesota ph.d. dissertation")

{'input_ids': [101, 5135, 6887, 1012, 1040, 1012, 14481, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [26]:
# Handle on the `vocab_projector` weights; not used further in this cell.
vocab_projector_weight = model.vocab_projector.weight.data.detach()

# Average the input word embeddings of the sub-tokens in `tokens` over the
# token axis, giving one vector for the whole phrase.
word_embedding_matrix = model.distilbert.embeddings.word_embeddings.weight.data
subtoken_rows = word_embedding_matrix[tokens["input_ids"]]
subtoken_rows.mean(dim=0)

tensor([ 1.1039e-02, -1.4630e-02, -4.1841e-02, -4.2749e-02, -4.2918e-02,
        -5.6100e-02, -1.6078e-02, -5.4448e-03, -3.9614e-02, -2.0507e-02,
        -2.9320e-02, -2.4553e-02, -3.6039e-02, -1.1925e-02, -3.1055e-02,
        -4.0494e-02, -2.2535e-02, -2.4259e-02, -2.1335e-02, -4.6651e-02,
        -3.3383e-02, -2.0066e-02, -3.3335e-02,  2.5565e-02, -3.6282e-02,
        -3.3762e-02, -1.9214e-02, -4.0721e-02, -1.2457e-02, -2.6381e-02,
         5.5385e-03, -5.2517e-02, -7.0429e-02, -3.2824e-02,  6.9580e-03,
        -1.9314e-02, -2.5998e-02,  5.7266e-03, -4.3370e-02, -1.9060e-02,
        -4.9430e-02, -8.2869e-02, -6.4278e-02, -6.0141e-02, -3.6454e-02,
        -3.1448e-02, -2.3804e-02, -4.6387e-03, -8.2683e-03, -2.4804e-02,
        -4.3299e-02, -4.6207e-02, -5.3463e-02,  2.5140e-02, -2.1960e-02,
        -4.3798e-02,  4.1690e-02, -1.1906e-02, -1.2816e-02,  2.3274e-02,
        -2.9134e-02, -3.6432e-02,  2.5015e-02, -1.3404e-02, -2.9042e-02,
        -5.3636e-03, -5.5516e-02, -4.1645e-02, -4.5

In [5]:
import json
from pyserini.search.lucene import LuceneSearcher

In [2]:
# Open the Lucene index stored at this path for searching and document lookup.
searcher = LuceneSearcher("/scratch/lamdo/keyphrase_generation_retrieval_index/index/eru-kg-base")

Mar 29, 2025 9:22:20 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [8]:
# Number of documents in the index.
searcher.num_docs

630749

In [3]:
# Fetch the document with integer id 0 and unwrap the underlying Lucene document.
# NOTE(review): assumes an int argument is treated as the index-internal id — confirm.
doc_0 = searcher.doc(0).lucene_document()

In [6]:
# Parse the document's stored "raw" field, which holds a JSON string.
json.loads(doc_0.get("raw"))

{'id': '46010735',
 'contents': '[On the nature of regressive state in hypnosis: a study by word association test]\nThe purpose of this study is to reveal the nature of regressive state in hypnosis by means of word association test (WAT) Stimulus words for WAT, pronounced without intonation, was presented to hypnotic, control, and distraction groups. At the first test, all groups were under awaken state, and at the second test, hypnotic group was under hypnosis, distraction group was under distraction. (1) Under hypnosis, more visual images (signifie images) and clang associations (significant images) were imagined. The hypnosis was supposed to be a partial, controlled "regression in the service of the ego" and in this state the lexical meaning was not dissolved, but the unity among person, symbol, and referential object in Werner Kaplan\'s sense was restored by the images. (2) The results suggest that hypnosis is different from distraction state, and is a state of specific concentrati