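# CASPER quick test run

Loads the `lamdo/casper` checkpoint, encodes a short text, and prints the non-zero vocabulary terms of its sparse representation, sorted by decreasing weight.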

In [1]:
import torch, os, string
from transformers import AutoModelForMaskedLM, AutoTokenizer
from splade.models.transformer_rep import PhraseSpladev3 as CASPER
from collections import Counter

In [2]:
# Expose only GPU 0 to this process. This has to be set before CUDA is first
# initialised in the kernel for it to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
# Checkpoint identifier for the trained weights. The same value is passed to
# both CASPER(...) and AutoTokenizer.from_pretrained(...) in the loading cell.
# Presumably a Hugging Face Hub id or a local directory -- confirm.
model_type_or_dir = "lamdo/casper"

In [4]:
# Load the CASPER model and its tokenizer from the same checkpoint.
# (Alternative left disabled by the author: Splade(model_type_or_dir, agg="max").)
model = CASPER(model_type_or_dir, agg="max", original_bert_vocab_size=30522)
model.eval()  # put the model in eval mode before encoding
tokenizer = AutoTokenizer.from_pretrained(model_type_or_dir)

# Inverse vocabulary: token id -> token string.
reverse_voc = dict((token_id, token) for token, token_id in tokenizer.vocab.items())

# Last expression of the cell: shown as its output.
len(reverse_voc)

59419

In [5]:
# doc = """ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction. Neural information retrieval (IR) has greatly advanced search and other knowledge-intensive language tasks. While many neural IR methods encode queries and documents into single-vector representations, late interaction models produce multi-vector representations at the granularity of each token and decompose relevance modeling into scalable token-level computations. This decomposition has been shown to make late interaction more effective, but it inflates the space footprint of these models by an order of magnitude. In this work, we introduce ColBERTv2, a retriever that couples an aggressive residual compression mechanism with a denoised supervision strategy to simultaneously improve the quality and space footprint of late interaction. We evaluate ColBERTv2 across a wide range of benchmarks, establishing state-of-the-art quality within and outside the training domain while reducing the space footprint of late interaction models by 6--10×."""

# doc = """Supplementing Remote Sensing of Ice: Deep Learning-Based Image Segmentation System for Automatic Detection and Localization of Sea-ice Formations From Close-Range Optical Images. This paper presents a three-stage approach for the automated analysis of close-range optical images containing ice objects. The proposed system is based on an ensemble of deep learning models and conditional random field postprocessing. The following surface ice formations were considered: Icebergs, Deformed ice, Level ice, Broken ice, Ice floes, Floebergs, Floebits, Pancake ice, and Brash ice. Additionally, five non-surface ice categories were considered: Sky, Open water, Shore, Underwater ice, and Melt ponds. To find input parameters for the approach, the performance of 12 different neural network architectures was explored and evaluated using a 5-fold cross-validation scheme. The best performance was achieved using an ensemble of models having pyramid pooling layers (PSPNet, PSPDenseNet, DeepLabV3+, and UPerNet) and convolutional conditional random field postprocessing with a mean intersection over union score of 0.799, and this outperformed the best single-model approach. The results of this study show that when per-class performance was considered, the Sky was the easiest class to predict, followed by Deformed ice and Open water. Melt pond was the most challenging class to predict. Furthermore, we have extensively explored the strengths and weaknesses of our approach and, in the process, discovered the types of scenes that pose a more significant challenge to the underlying neural networks. When coupled with optical sensors and AIS, the proposed approach can serve as a supplementary source of large-scale ‘ground truth’ data for validation of satellite-based sea-ice products. We have provided an implementation of the approach at https://github.com/panchinabil/sea_ice_segmentation ."""

# Active input text encoded by the next cell. The two commented-out abstracts
# above are longer alternative inputs.
doc = "deep transfer learning in neural networks"

In [6]:
# Encode `doc` and inspect its sparse representation in vocabulary space.
#
# Optional preprocessing (disabled): replace every character in
# string.punctuation with a space before tokenizing.

# truncation=True makes the max_length cut-off explicit. It selects the same
# 'longest_first' strategy the tokenizer otherwise falls back to, but without
# emitting the "Truncation was not explicitly activated" warning.
doc_tokens = tokenizer(doc, max_length=256, truncation=True, return_tensors="pt")
with torch.no_grad():
    # One weight per vocabulary entry for the single input document. In the
    # recorded run this is len(reverse_voc) == 59419 entries, not the 30522
    # entries of the original BERT vocabulary.
    doc_rep = model(d_kwargs=doc_tokens)["d_rep"].squeeze()
print(doc_rep.shape)

# Indices of the non-zero dimensions. squeeze(-1) drops only the trailing
# index axis, so a representation with exactly one active dimension still
# yields a list (a bare squeeze() would collapse it to a scalar and break
# len(col) and the zip below).
col = torch.nonzero(doc_rep).squeeze(-1).cpu().tolist()
print("number of actual dimensions: ", len(col))

# Bag-of-words view: vocabulary id -> weight, ordered by decreasing weight.
weights = doc_rep[col].cpu().tolist()
d = dict(zip(col, weights))
sorted_d = dict(sorted(d.items(), key=lambda item: item[1], reverse=True))
# (token string, weight rounded to 2 decimals) pairs, heaviest first.
bow_rep = [(reverse_voc[k], round(v, 2)) for k, v in sorted_d.items()]

print(doc)
for term_weight in bow_rep:
    print(term_weight)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


torch.Size([59419])
number of actual dimensions:  181
deep transfer learning in neural networks
('deep learning', 2.26)
('deep', 1.43)
('knowledge transfer', 1.36)
('transfer learning', 1.25)
('soap', 1.24)
('neural network', 1.15)
('learning algorithm', 1.09)
('ann', 1.05)
('training', 1.03)
('casper', 1.02)
('neural networks', 1.02)
('learning', 0.93)
('embedding', 0.9)
('in', 0.89)
('language', 0.86)
('e-learning', 0.83)
('network', 0.81)
('image', 0.8)
('transfer', 0.78)
('knowledge management', 0.78)
('depth', 0.77)
('knowledge sharing', 0.75)
('magic', 0.74)
('brain', 0.72)
('spillover', 0.72)
('the', 0.7)
('.', 0.69)
('language learning', 0.69)
('machine translation', 0.67)
('mapping', 0.63)
('specialisation', 0.59)
(',', 0.57)
('neurons', 0.57)
('reinforcement learning', 0.56)
('domain adaptation', 0.55)
('back propagation', 0.55)
('deeper', 0.52)
('people', 0.52)
('km', 0.51)
('fox', 0.5)
('cascades', 0.49)
('semi-supervised learning', 0.48)
('cross', 0.48)
('habituation', 0.4

  with torch.cuda.amp.autocast() if self.fp16 else NullContextManager():
  with torch.cuda.amp.autocast() if self.fp16 else NullContextManager():
