In [2]:
import torch, os
from transformers import AutoModelForMaskedLM, AutoTokenizer, TrainingArguments

# Expose only GPU index 2 to CUDA. The variable is read when CUDA is first
# initialised in this process, so it must be set before any CUDA call is made.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Checkpoint location; both from_pretrained() calls below read from it.
checkpoint_dir = "model_gitig_"

# Load the masked-LM model and its tokenizer from the same checkpoint.
model = AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

# token -> id mapping. get_vocab() is the public accessor and includes tokens
# added on top of the base vocabulary; the bare `.vocab` attribute is not
# guaranteed to include them on every tokenizer class.
vocab = tokenizer.get_vocab()
# id -> token mapping, used to turn predicted ids back into readable tokens.
reverse_voc = {token_id: token for token, token_id in vocab.items()}

# Probe inputs for the fill-mask check performed by the prediction loop below.
# Every entry contains exactly one literal "[MASK]" placeholder
# (assumes this string equals tokenizer.mask_token — TODO confirm).
# These strings are model input: rewording them changes the predictions.
test_sentences = [
    # Single-sentence prompts.
    "[MASK] is the task of identifying and retrieving information that are relevant to an information need",
    "[MASK] is essential for the future.",
    "The quick brown fox jumps over the [MASK] dog.",
    "In computing, a [MASK] is an organized collection of data or a type of data store",
    "[MASK] are phrases that represent the most relevant information contained in the document",
    "A [MASK] task requires the detection and classification of semantic relationship mentions within a set of artifacts, typically from text or XML documents",
    "[MASK] comprise two or more homopolymer subunits linked by covalent bonds",
    "A [MASK] occurs when a baby is born before 37 weeks of pregnancy",
    # Triple-quoted entries: two long multi-sentence passages, then two short prompts.
    """Supplementing remote sensing of Ice: Deep Learning-Based [MASK] System for Automatic Detection and Localization of Sea-ice Formations From Close-Range Optical Images. This paper presents a three-stage approach for the automated analysis of close-range optical images containing ice objects. The proposed system is based on an ensemble of deep learning models and conditional random field postprocessing. The following surface ice formations were considered: Icebergs, Deformed ice, Level ice, Broken ice, Ice floes, Floebergs, Floebits, Pancake ice, and Brash ice. Additionally, five non-surface ice categories were considered: Sky, Open water, Shore, Underwater ice, and Melt ponds. To find input parameters for the approach, the performance of 12 different neural network architectures was explored and evaluated using a 5-fold cross-validation scheme. The best performance was achieved using an ensemble of models having pyramid pooling layers (PSPNet, PSPDenseNet, DeepLabV3+, and UPerNet) and convolutional conditional random field postprocessing with a mean intersection over union score of 0.799, and this outperformed the best single-model approach. The results of this study show that when per-class performance was considered, the Sky was the easiest class to predict, followed by Deformed ice and Open water. Melt pond was the most challenging class to predict. Furthermore, we have extensively explored the strengths and weaknesses of our approach and, in the process, discovered the types of scenes that pose a more significant challenge to the underlying neural networks. When coupled with optical sensors and AIS, the proposed approach can serve as a supplementary source of large-scale ‘ground truth’ data for validation of satellite-based sea-ice products. We have provided an implementation of the approach at https://github.com/panchinabil/sea_ice_segmentation .""",
    """The best performing models also connect the encoder and decoder through an [MASK]. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.""",
    """[MASK] have emerged as a powerful new family of deep generative models with record-breaking performance in many applications, including image synthesis, video generation, and molecule design""",
    """Real world [MASK] (BCI): Cross-Domain Learning and Practical Applications"""
]

# Perform predictions: for every test sentence, list the highest-scoring
# vocabulary tokens at each mask position.
TOP_K = 50  # number of candidate tokens printed per mask position

model.eval()
# Feed inputs on whichever device the model currently lives on.
device = next(model.parameters()).device
# Inputs longer than the position-embedding table make the forward pass fail,
# so truncate to that length when the config exposes it. With None the
# tokenizer falls back to its own model_max_length.
max_len = getattr(model.config, "max_position_embeddings", None)

for sentence in test_sentences:
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Sequence positions of every mask token in the single-row batch.
    masked_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    # Never ask topk for more entries than the vocabulary axis holds.
    k = min(TOP_K, logits.size(-1))
    predictions = logits[0, masked_index].topk(k)

    print(f"Input: {sentence}")
    if masked_index.numel() == 0:
        print("(no mask token found after tokenization/truncation)")
    # One row of candidate ids per mask position.
    for idx in predictions.indices.tolist():
        # Decode through the tokenizer so added tokens resolve as well; ids the
        # tokenizer cannot name are shown as a placeholder instead of raising.
        decoded = tokenizer.convert_ids_to_tokens(idx)
        predicted_token = [
            token if token is not None else f"<id {token_id}>"
            for token_id, token in zip(idx, decoded)
        ]
        print(", ".join(predicted_token))
    print()


Input: [MASK] is the task of identifying and retrieving information that are relevant to an information need
identification, retrieval, metadata, verification, sorting, screening, analysis, it, processing, classification, validation, discovery, searching, this, tracking, documentation, evaluation, scanning, reporting, assessment, research, authentication, coordination, search, recognition, mapping, preservation, coding, information, interpretation, investigation, diagnosis, detection, filtering, registration, retention, tracing, accounting, relevance, monitoring, recall, translation, sampling, transparency, forensic, persistence, prevention, framing, interviewing, standardization

Input: [MASK] is essential for the future.
education, sustainability, cooperation, knowledge, it, this, integrity, conservation, democracy, freedom, stability, happiness, biodiversity, equality, peace, diversity, progress, survival, equity, growth, innovation, development, safety, energy, transparency, care, 

In [2]:
# Largest logit over all positions, restricted to vocabulary ids >= 30522
# (presumably the tokens added on top of the base vocabulary — TODO confirm).
# `logits` is whatever the prediction loop last assigned in this kernel.
logits[..., 30522:].max()

tensor(37.1301)

In [3]:
# Largest logit over all positions, restricted to vocabulary ids < 30522
# (presumably the original base vocabulary — TODO confirm).
# `logits` is whatever the prediction loop last assigned in this kernel.
logits[..., :30522].max()

tensor(21.4729)

In [11]:
# Bare expression: the notebook displays the model's repr (its module tree) as the cell output.
model

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(41657, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

In [4]:
# Display the raw weight tensor of the input word-embedding table.
model.distilbert.embeddings.word_embeddings.weight.data

tensor([[-0.0166, -0.0666, -0.0163,  ..., -0.0200, -0.0514, -0.0264],
        [-0.0132, -0.0673, -0.0161,  ..., -0.0227, -0.0554, -0.0260],
        [-0.0176, -0.0709, -0.0144,  ..., -0.0246, -0.0596, -0.0232],
        ...,
        [ 0.0267, -0.0407,  0.2724,  ...,  0.0286,  0.0342,  0.1492],
        [ 0.0524,  0.2483,  0.1334,  ..., -0.2215, -0.0739,  0.0652],
        [ 0.2415,  0.1530,  0.3405,  ..., -0.1294, -0.1256,  0.0224]])

In [5]:
# Display the raw weight tensor of the output vocabulary projection.
# NOTE(review): the printed rows match the word-embedding rows shown in the
# preceding cell, which is consistent with tied input/output embeddings;
# comparing the tensors' data_ptr() would confirm.
model.vocab_projector.weight.data

tensor([[-0.0166, -0.0666, -0.0163,  ..., -0.0200, -0.0514, -0.0264],
        [-0.0132, -0.0673, -0.0161,  ..., -0.0227, -0.0554, -0.0260],
        [-0.0176, -0.0709, -0.0144,  ..., -0.0246, -0.0596, -0.0232],
        ...,
        [ 0.0267, -0.0407,  0.2724,  ...,  0.0286,  0.0342,  0.1492],
        [ 0.0524,  0.2483,  0.1334,  ..., -0.2215, -0.0739,  0.0652],
        [ 0.2415,  0.1530,  0.3405,  ..., -0.1294, -0.1256,  0.0224]])