# SPLADE: Sparse Lexical and Expansion Model for First Stage Ranking

This notebook gives a minimal usage example of SPLADE.

* We provide models via Hugging Face (https://huggingface.co/naver)
* See [Naver Labs Europe website](https://europe.naverlabs.com/research/machine-learning-and-optimization/splade-models/) for other intermediate models.

| model | MRR@10 (MS MARCO dev) | recall@1000 (MS MARCO dev) | expected FLOPS | ~ avg q length | ~ avg d length | 
| --- | --- | --- | --- | --- | --- |
| `naver/splade_v2_max` (**v2** [HF](https://huggingface.co/naver/splade_v2_max)) | 34.0 | 96.5 | 1.32 | 18 | 92 |
| `naver/splade_v2_distil` (**v2** [HF](https://huggingface.co/naver/splade_v2_distil)) | 36.8 | 97.9 | 3.82 | 25 | 232 |
| `naver/splade-cocondenser-selfdistil` (**v2bis**, [HF](https://huggingface.co/naver/splade-cocondenser-selfdistil))| 37.6 | 98.4 | 2.32 | 56 | 134 |
| `naver/splade-cocondenser-ensembledistil` (**v2bis**, [HF](https://huggingface.co/naver/splade-cocondenser-ensembledistil)) | 38.3 | 98.3  | 1.85 | 44 | 120 |

In [1]:
import torch, os, string
from transformers import AutoModelForMaskedLM, AutoTokenizer
from splade.models.transformer_rep import SpladeMaxSim, Splade, PhraseSpladev2, PhraseSpladev3, PhraseSpladev4, PhraseSpladev5
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Expose only GPU 0 to this process. The variable only takes effect if it is
# set before CUDA is first initialised, so keep this cell ahead of any GPU work.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [15]:
# Checkpoint to load: either a Hugging Face model id or a local checkpoint directory.
# Keep exactly one `model_type_or_dir` assignment uncommented.

##### v2
# model_type_or_dir = "naver/splade_v2_max"
model_type_or_dir = "naver/splade_v2_distil"

### v2bis, downloaded directly from Hugging Face
# model_type_or_dir = "naver/splade-cocondenser-selfdistil"
# model_type_or_dir = "naver/splade-cocondenser-ensembledistil"
# NOTE(review): the alternatives below are experimental checkpoints; the /scratch/... paths
# are machine-specific and will not resolve on another host.
# model_type_or_dir = "/scratch/lamdo/phrase_splade_checkpoints/phrase_splade_71/debug/checkpoint/model"
# model_type_or_dir = "lamdo/scibert-base-uncased-phrase-30kaddedphrasesfroms2orc-mlm-70000steps"
# model_type_or_dir = "/scratch/lamdo/splade_maxsim_ckpts/splade_maxsim_150k_lowregv3/debug/checkpoint/model"
# model_type_or_dir = 'lamdo/distilbert-base-uncased-phrase-16kaddedphrasesfroms2orc-mlm-150000steps-multiwords'
# model_type_or_dir = "/scratch/lamdo/splade_checkpoints/experiments_combined_references_v8-1/debug/checkpoint/model"
# model_type_or_dir = "lamdo/distilbert-base-uncased-phrase-60kaddedphrasesfroms2orc-mlm-150000steps"
# model_type_or_dir = "/scratch/lamdo/phrase_splade_checkpoints/splade_max_1/debug/checkpoint/model"

In [16]:
# Build the SPLADE encoder (max aggregation) and its tokenizer from the selected checkpoint.
# For a phrase-aware checkpoint, swap in e.g.:
#   model = PhraseSpladev3(model_type_or_dir, agg="max", original_bert_vocab_size=30522)
model = Splade(model_type_or_dir, agg="max")
model.eval()  # inference mode: disables dropout

tokenizer = AutoTokenizer.from_pretrained(model_type_or_dir)

# Inverse vocabulary (id -> token string), used to turn output dimensions back into terms.
reverse_voc = {token_id: token for token, token_id in tokenizer.vocab.items()}

len(reverse_voc)

30522

In [17]:
# Display the module tree of the loaded model to confirm which backbone was instantiated.
model

Splade(
  (transformer_rep): TransformerRep(
    (transformer): DistilBertForMaskedLM(
      (activation): GELUActivation()
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(in_features=768, out_features=768, bias=True)
                (k_lin): Linear(in_features=768, out_features=768, bias=True)
                (v_lin): Linear(in_features=768, out_features=768, bias=True)
                (out_lin): Linear(in_features=768, out_features=768, bias=True)
              )
 

In [18]:
def encode_custom(tokens, model, is_q = False):
    """Return per-position SPLADE term activations, without pooling over the sequence.

    Args:
        tokens: tokenizer output; must contain "attention_mask" and whatever
            ``model.encode_`` reads.
        model: object exposing ``encode_(tokens, is_q)`` that returns a mapping with
            a "logits" entry. If it also defines ``create_token_phrase_mask``, that
            mask is multiplied into the logits before the activation.
        is_q: forwarded unchanged to ``model.encode_``.

    Returns:
        Tensor with the same shape as the logits, holding ``log(1 + relu(logits))``
        with every position whose attention mask is 0 zeroed out.
    """
    logits = model.encode_(tokens, is_q)["logits"]  # shape (bs, pad_len, voc_size)

    # Only some model classes provide this hook, so it is applied when present.
    if hasattr(model, "create_token_phrase_mask"):
        logits = logits * model.create_token_phrase_mask(tokens)

    # Log-saturated activation; the attention mask removes padded positions.
    return torch.log(1 + torch.relu(logits)) * tokens["attention_mask"].unsqueeze(-1)

def splade_pooling_v3(out, tokens, original_bert_vocab_size):
    """Pool per-position scores into one vector per sequence.

    The last dimension of ``out`` is split at ``original_bert_vocab_size``: the
    leading part is max-pooled over dim 1, the remainder is sum-pooled over dim 1.
    Both parts go through ``log(1 + relu(.))`` and are multiplied by
    ``tokens["attention_mask"]`` before pooling.

    Args:
        out: scores, shape (bs, pad_len, vocab_size).
        tokens: mapping holding "attention_mask", shape (bs, pad_len).
        original_bert_vocab_size: index at which the last dimension is split.

    Returns:
        Tensor of shape (bs, vocab_size): max-pooled part followed by sum-pooled part.
    """
    attention = tokens["attention_mask"].unsqueeze(-1)

    # Zero-masking is also safe for the max, because all activations are non-negative.
    token_acts = torch.log(1 + torch.relu(out[..., :original_bert_vocab_size])) * attention
    phrase_acts = torch.log(1 + torch.relu(out[..., original_bert_vocab_size:])) * attention

    token_scores = torch.max(token_acts, dim=1).values  # shape (bs, original_bert_vocab_size)
    phrase_scores = torch.sum(phrase_acts, dim=1)  # shape (bs, vocab_size - original_bert_vocab_size)
    return torch.cat([token_scores, phrase_scores], dim=-1)


# Vocabulary ids whose positions encode_custom_mask_punc zeroes out. Currently only
# the special tokens; to also mask punctuation, add tokenizer.vocab[p] for each p in
# string.punctuation.
PUNCID = torch.tensor([tokenizer.vocab[special] for special in ("[SEP]", "[CLS]")])
def encode_custom_mask_punc(tokens, model, is_q = False, pooling = True):
    """Compute SPLADE activations with the positions listed in ``PUNCID`` zeroed out.

    Args:
        tokens: tokenizer output; must contain "input_ids" and "attention_mask".
        model: object exposing ``encode_(tokens, is_q)`` that returns a mapping with
            a "logits" entry; ``model.original_bert_vocab_size`` is read only when
            ``pooling`` is true.
        is_q: forwarded unchanged to ``model.encode_``.
        pooling: if true, return the sequence-level vector produced by
            ``splade_pooling_v3``; otherwise return the per-position activations.

    Returns:
        The pooled tensor when ``pooling`` is true, else a tensor shaped like the logits.
    """
    out = model.encode_(tokens, is_q)["logits"]  # shape (bs, pad_len, voc_size)
    out = torch.log(1 + torch.relu(out)) * tokens["attention_mask"].unsqueeze(-1)

    input_ids = tokens["input_ids"]
    # torch.isin needs both tensors on the same device; PUNCID is created without an
    # explicit device, so move it next to the input ids (no-op when already there).
    keep = ~torch.isin(input_ids, PUNCID.to(input_ids.device))
    out = out * keep.unsqueeze(-1)

    if pooling:
        # NOTE(review): `out` already went through log(1 + relu(.)) above and
        # splade_pooling_v3 applies it again, so pooled scores are saturated twice.
        # Behaviour kept as-is; confirm whether raw (masked) logits were intended here.
        return splade_pooling_v3(out, tokens, model.original_bert_vocab_size)
    return out

In [19]:
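# Example document to encode. Keep exactly one `doc` assignment active; the other candidates below (paper abstracts, queries, passages) are kept commented out for quick swapping.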

# doc = """ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction. Neural information retrieval (IR) has greatly advanced search and other knowledge-intensive language tasks. While many neural IR methods encode queries and documents into single-vector representations, late interaction models produce multi-vector representations at the granularity of each token and decompose relevance modeling into scalable token-level computations. This decomposition has been shown to make late interaction more effective, but it inflates the space footprint of these models by an order of magnitude. In this work, we introduce ColBERTv2, a retriever that couples an aggressive residual compression mechanism with a denoised supervision strategy to simultaneously improve the quality and space footprint of late interaction. We evaluate ColBERTv2 across a wide range of benchmarks, establishing state-of-the-art quality within and outside the training domain while reducing the space footprint of late interaction models by 6--10×."""

doc = """Supplementing Remote Sensing of Ice: Deep Learning-Based Image Segmentation System for Automatic Detection and Localization of Sea-ice Formations From Close-Range Optical Images. This paper presents a three-stage approach for the automated analysis of close-range optical images containing ice objects. The proposed system is based on an ensemble of deep learning models and conditional random field postprocessing. The following surface ice formations were considered: Icebergs, Deformed ice, Level ice, Broken ice, Ice floes, Floebergs, Floebits, Pancake ice, and Brash ice. Additionally, five non-surface ice categories were considered: Sky, Open water, Shore, Underwater ice, and Melt ponds. To find input parameters for the approach, the performance of 12 different neural network architectures was explored and evaluated using a 5-fold cross-validation scheme. The best performance was achieved using an ensemble of models having pyramid pooling layers (PSPNet, PSPDenseNet, DeepLabV3+, and UPerNet) and convolutional conditional random field postprocessing with a mean intersection over union score of 0.799, and this outperformed the best single-model approach. The results of this study show that when per-class performance was considered, the Sky was the easiest class to predict, followed by Deformed ice and Open water. Melt pond was the most challenging class to predict. Furthermore, we have extensively explored the strengths and weaknesses of our approach and, in the process, discovered the types of scenes that pose a more significant challenge to the underlying neural networks. When coupled with optical sensors and AIS, the proposed approach can serve as a supplementary source of large-scale ‘ground truth’ data for validation of satellite-based sea-ice products. We have provided an implementation of the approach at https://github.com/panchinabil/sea_ice_segmentation ."""


# doc = """A comprehensive survey of graph embedding: Problems, techniques, and applications. Graph is an important data representation which appears in a wide diversity of real-world scenarios. Effective graph analytics provides users a deeper understanding of what is behind the data, and thus can benefit a lot of useful applications such as node classification, node recommendation, link prediction, etc. However, most graph analytics methods suffer the high computation and space cost. Graph embedding is an effective yet efficient way to solve the graph analytics problem. It converts the graph data into a low dimensional space in which the graph structural information and graph properties are maximumly preserved. In this survey, we conduct a comprehensive review of the literature in graph embedding. We first introduce the formal definition of graph embedding as well as the related concepts. After that, we propose two taxonomies of graph embedding which correspond to what challenges exist in different"""

# doc = """Attention Is All You Need. The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data."""

# doc = "ERU-KG: Efficient Reference-aligned Unsupervised Keyphrase Generation"

# doc = """ERU-KG: Efficient Reference-aligned Unsupervised Keyphrase Generation. Unsupervised keyphrase prediction has gained growing interest in recent years. However, existing methods typically rely on heuristically defined importance scores, which may lead to inaccurate informativeness estimation. In addition, they lack consideration for time efficiency. To solve these problems, we propose ERU-KG, an unsupervised keyphrase generation (UKG) model that consists of a phraseness and an informativeness module. The former generate candidates, while the latter estimate their relevance. The informativeness module innovates by learning to model informativeness through references (e.g., queries, citation contexts, and titles) and at the term-level, thereby 1) capturing how the key concepts of the document are perceived in different contexts and 2) estimate informativeness of phrases more efficiently by aggregating term informativeness, removing the need for explicit modeling of the candidates. ERU-KG demonstrates its effectiveness on keyphrase generation benchmarks by outperforming unsupervised baselines and achieving on average 89% of the performance of a supervised baseline for top 10 predictions. Additionally, to highlight its practical utility, we evaluate the model on text retrieval tasks and show that keyphrases generated by ERU-KG are effective when employed as query and document expansions. Finally, inference speed tests reveal that ERU-KG is the fastest among baselines of similar model sizes."""

# doc = """SPLADE v2: Sparse Lexical and Expansion Model for Information Retrieval. In neural Information Retrieval (IR), ongoing research is directed towards improving the first retriever in ranking pipelines. Learning dense embeddings to conduct retrieval using efficient approximate nearest neighbors methods has proven to work well. Meanwhile, there has been a growing interest in learning \emph{sparse} representations for documents and queries, that could inherit from the desirable properties of bag-of-words models such as the exact matching of terms and the efficiency of inverted indexes. Introduced recently, the SPLADE model provides highly sparse representations and competitive results with respect to state-of-the-art dense and sparse approaches. In this paper, we build on SPLADE and propose several significant improvements in terms of effectiveness and/or efficiency. More specifically, we modify the pooling mechanism, benchmark a model solely based on document expansion, and introduce models trained with distillation. We also report results on the BEIR benchmark. Overall, SPLADE is considerably improved with more than 9\% gains on NDCG@10 on TREC DL 2019, leading to state-of-the-art results on the BEIR benchmark."""

# doc = """The author uses 3 096 sample households in 15 counties from the year 1995 to 2006 to analyze the impact of PFPs on rural households' income inequality by income inequality decomposition.The research indicates that:(1) the percentage of subsidy income generated from PFPs has increased 8.03% during the period from 1995 to 2006;(2) the contribution of subsidy income generated from PFPs has been up from 0.330 7% in 1995 to 3.794 1% in 2006;(3) the policy-caused subsidy income inequality is more prominent than that caused by the planned regions of PFPs.Therefore a rational policy adjustment of PFPs will contribute more to poverty reduction in China's rural areas."""

# doc = " | But much of the responsibility of the social inequity that leads to different health outcomes lies elsewhere. Health is affected by policies in other sectors, such as education, taxation, transport, and agriculture too."


# doc = "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. The BLAST programs are widely used tools for searching protein and DNA databases for sequence similarities. For protein comparisons, a variety of definitional, algorithmic and statistical refinements described here permits the execution time of the BLAST programs to be decreased substantially while enhancing their sensitivity to weak similarities. A new criterion for triggering the extension of word hits, combined with a new heuristic for generating gapped alignments, yields a gapped BLAST program that runs at approximately three times the speed of the original. In addition, a method is introduced for automatically combining statistically significant alignments produced by BLAST into a position-specific score matrix, and searching the database using this matrix. The resulting Position-Specific Iterated BLAST (PSIBLAST) program runs at approximately the same speed per iteration as gapped BLAST, but in many cases is much more sensitive to weak but biologically relevant sequence similarities. PSI-BLAST is used to uncover several new and interesting members of the BRCT superfamily."

# doc = "Generative Image Dynamics. We present an approach to modeling an image-space prior on scene motion. Our prior is learned from a collection of motion trajectories extracted from real video sequences depicting natural, oscillatory dynamics such as trees, flowers, candles, and clothes swaying in the wind. We model this dense, long-term motion prior in the Fourier domain:given a single image, our trained model uses a frequency-coordinated diffusion sampling process to predict a spectral volume, which can be converted into a motion texture that spans an entire video. Along with an image-based rendering module, these trajectories can be used for a number of downstream applications, such as turning still images into seamlessly looping videos, or allowing users to realistically interact with objects in real pictures by interpreting the spectral volumes as image-space modal bases, which approximate object dynamics."

# doc = "Rich Human Feedback for Text-to-Image Generation. Recent Text-to-Image (T2I) generation models such as Stable Diffusion and Imagen have made significant progress in generating high-resolution images based on text descriptions. However, many generated images still suffer from issues such as artifacts/implausibility, misalignment with text descriptions, and low aesthetic quality. Inspired by the success of Reinforcement Learning with Human Feedback (RLHF) for large language models, prior works collected human-provided scores as feedback on generated images and trained a reward model to improve the T2I generation. In this paper, we enrich the feedback signal by (i) marking image regions that are implausible or misaligned with the text, and (ii) annotating which words in the text prompt are misrepresented or missing on the image. We collect such rich human feedback on 18K generated images (RichHF-18K) and train a multimodal transformer to predict the rich feedback automatically. We show that the predicted rich human feedback can be leveraged to improve image generation, for example, by selecting high-quality training data to finetune and improve the generative models, or by creating masks with predicted heatmaps to inpaint the problematic regions. Notably, the improvements generalize to models (Muse) beyond those used to generate the images on which human feedback data were collected (Stable Diffusion variants)"

# doc = "MedYOLO: A Medical Image Object Detection Framework. Artificial intelligence-enhanced identification of organs, lesions, and other structures in medical imaging is typically done using convolutional neural networks (CNNs) designed to make voxel-accurate segmentations of the region of interest. However, the labels required to train these CNNs are time-consuming to generate and require attention from subject matter experts to ensure quality. For tasks where voxel-level precision is not required, object detection models offer a viable alternative that can reduce annotation effort. Despite this potential application, there are few options for general purpose object detection frameworks available for 3-D medical imaging. We report on MedYOLO, a 3-D object detection framework using the one-shot detection method of the YOLO family of models and designed for use with medical imaging. We tested this model on four different datasets: BRaTS, LIDC, an abdominal organ Computed Tomography (CT) dataset, and an ECG-gated heart CT dataset. We found our models achieve high performance on commonly present medium and large-sized structures such as the heart, liver, and pancreas even without hyperparameter tuning. However, the models struggle with very small or rarely present structures."

# doc = "A study of smoothing methods for language models applied to ad hoc information retrieval. Language modeling approaches to information retrieval are attractive and promising because they connect the problem of retrieval with that of language model estimation, which has been studied extensively in other application areas such as speech recognition. The basic idea of these approaches is to estimate a language model for each document, and then rank documents by the likelihood of the query according to the estimated language model. A core problem in language model estimation is smoothing, which adjusts the maximum likelihood estimator so as to correct the inaccuracy due to data sparseness. In this paper, we study the problem of language model smoothing and its influence on retrieval performance. We examine the sensitivity of retrieval performance to the smoothing parameters and compare several popular smoothing methods on different test collection."

# doc = "Big data: astronomical or genomical? Genomics is a Big Data science and is going to get much bigger, very soon, but it is not known whether the needs of genomics will exceed other Big Data domains. Projecting to the year 2025, we compared genomics with three other major generators of Big Data: astronomy, YouTube, and Twitter. Our estimates show that genomics is a “four-headed beast”—it is either on par with or the most demanding of the domains analyzed here in terms of data acquisition, storage, distribution, and analysis. We discuss aspects of new technologies that will need to be developed to rise up and meet the computational challenges that genomics poses for the near future. Now is the time for concerted, community-wide planning for the “genomical” challenges of the next decade."

# doc = "Topic sentiment mixture: modeling facets and opinions in weblogs. In this paper, we define the problem of topic-sentiment analysis on Weblogs and propose a novel probabilistic model to capture the mixture of topics and sentiments simultaneously. The proposed Topic-Sentiment Mixture (TSM) model can reveal the latent topical facets in a Weblog collection, the subtopics in the results of an ad hoc query, and their associated sentiments. It could also provide general sentiment models that are applicable to any ad hoc topics. With a specifically designed HMM structure, the sentiment models and topic models estimated with TSM can be utilized to extract topic life cycles and sentiment dynamics. Empirical experiments on different Weblog datasets show that this approach is effective for modeling the topic facets and sentiments and extracting their dynamics from Weblog collections."


# doc = "Deep Residual Learning for Image Recognition. Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers."

# doc = "Fairness in Dead-Reckoning based Distributed Multi-Player Games. In a distributed multi-player game that uses dead-reckoning vectors to exchange movement information among players, there is inaccuracy in rendering the objects at the receiver due to network delay between the sender and the receiver. The object is placed at the receiver at the position indicated by the dead-reckoning vector, but by that time, the real position could have changed considerably at the sender. This inaccuracy would be tolerable if it is consistent among all players; that is, at the same physical time, all players see inaccurate (with respect to the real position of the object) but the same position and trajectory for an object. But due to varying network delays between the sender and different receivers, the inaccuracy is different at different players as well. This leads to unfairness in game playing. In this paper, we first introduce an error measure for estimating this inaccuracy. Then we develop an algorithm for scheduling the sending of dead-reckoning vectors at a sender that strives to make this error equal at different receivers over time. This algorithm makes the game very fair at the expense of increasing the overall mean error of all players. To mitigate this effect, we propose a budget based algorithm that provides improved fairness without increasing the mean error thereby maintaining the accuracy of game playing. We have implemented both the scheduling algorithm and the budget based algorithm as part of BZFlag, a popular distributed multi-player game. We show through experiments that these algorithms provide fairness among players in spite of widely varying network delays. An additional property of the proposed algorithms is that they require less number of DRs to be exchanged (compared to the current implementation of BZflag) to achieve the same level of accuracy in game playing."

# doc = "Evaluating Adaptive Resource Management for Distributed Real-Time Embedded Systems. A challenging problem faced by researchers and developers of distributed real-time and embedded (DRE) systems is devising and implementing effective adaptive resource management strategies that can meet end-to-end quality of service (QoS) requirements in varying operational conditions. This paper presents two contributions to research in adaptive resource management for DRE systems. First, we describe the structure and functionality of the Hybrid Adaptive Resourcemanagement Middleware (HyARM), which provides adaptive resource management using hybrid control techniques for adapting to workload fluctuations and resource availability. Second, we evaluate the adaptive behavior of HyARM via experiments on a DRE multimedia system that distributes video in real-time. Our results indicate that HyARM yields predictable, stable, and high system performance, even in the face of fluctuating workload and resource availability."

# doc = "Real World BCI: Cross-Domain Learning and Practical Applications"


# doc = "how long is german measles contagious"

# doc = "Nonsense. In places where people have most kids, it doesn't matter how expensive baby care products are because most babies are born where they aren't used nearly as much, or at all. Sure, making them more expensive will help having less children in the first world, but that would change squat WRT the total number."

# doc = "lda"

# doc = "The Reserve Bank of Australia (RBA) came into being on 14 January 1960 as Australia 's central bank and banknote issuing authority, when the Reserve Bank Act 1959 removed the central banking functions from the Commonwealth Bank. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site."


# doc = "Here are the SEC requirements: The federal securities laws define the term accredited investor in Rule 501 of Regulation D as: a bank, insurance company, registered investment company, business development company, or small business investment company; an employee benefit plan, within the meaning of the Employee Retirement Income Security Act, if a bank, insurance company, or registered investment adviser makes the investment decisions, or if the plan has total assets in excess of $5 million; a charitable organization, corporation, or partnership with assets exceeding $5 million; a director, executive officer, or general partner of the company selling the securities; a business in which all the equity owners are accredited investors; a natural person who has individual net worth, or joint net worth with the person’s spouse, that exceeds $1 million at the time of the purchase, excluding the value of the primary residence of such person; a natural person with income exceeding $200,000 in each of the two most recent years or joint income with a spouse exceeding $300,000 for those years and a reasonable expectation of the same income level in the current year; or a trust with assets in excess of $5 million, not formed to acquire the securities offered, whose purchases a sophisticated person makes. No citizenship/residency requirements."

# doc = "where is steph currys home in nc"

# doc = "was ronald reagan a democrat"
# doc = "what slows down the flow of blood"

# doc = "A comprehensive survey of graph embedding: Problems, techniques, and applications"

# doc = """C-Pack: Packed Resources For General Chinese Embeddings. We introduce C-Pack, a package of resources that significantly advance the field of general Chinese embeddings. C-Pack includes three critical resources. 1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets. 2) C-MTP is a massive text embedding dataset curated from labeled and unlabeled Chinese corpora for training embedding models. 3) C-TEM is a family of embedding models covering multiple sizes. Our models outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the time of the release. We also integrate and optimize the entire suite of training methods for C-TEM. Along with our resources on general Chinese embedding, we release our data and models for English text embeddings. The English models achieve state-of-the-art performance on MTEB benchmark; meanwhile, our released English data is 2 times larger than the Chinese data"""

# doc = """Generative Representational Instruction Tuning. All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8x7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models"""

# doc = "I work in the field of medical text mining and I require a system that can accurately extract relevant information from medical documents. First of all, the system needs a named entity recognition module that is capable of recognizing biomedical entities. I am currently thinking about using deep learning based approaches for the named entity recognition module, so I can achieve optimal performance. However, since human annotations are expensive and difficult to obtain, I want to explore options where my deep learning module does not need much labeled data, instead, it could train on unlabeled data as well.  Since in real-life, there is a large volume of medical documents, I want to reduce the amount of training time. Currently, I think I could use some weight sharing or weight transfer to reduce training time. My final goal is to beat current state of the art models as well as pre-trained models. Overall, efficiency and low-cost are two important factors in my data collection, model training and inference pipeline."

# doc = "a person studying machine learning"
# doc = "a person learning to operate machine"

# doc = "Latent Dirichlet Allocation. We describe latent Dirichlet allocation (LDA), a generative probabilistic model for collections of discrete data such as text corpora. LDA is a three-level hierarchical Bayesian model, in which each item of a collection is modeled as a finite mixture over an underlying set of topics. Each topic is, in turn, modeled as an infinite mixture over an underlying set of topic probabilities. In the context of text modeling, the topic probabilities provide an explicit representation of a document. We present efficient approximate inference techniques based on variational methods and an EM algorithm for empirical Bayes parameter estimation. We report results in document modeling, text classification, and collaborative filtering, comparing to a mixture of unigrams model and the probabilistic LSI model."

# doc = "FLawN-T5: An Empirical Examination of Effective Instruction-Tuning Data Mixtures for Legal Reasoning. Instruction tuning is an important step in making language models useful for direct user interaction. However, many legal tasks remain out of reach for most open LLMs and there do not yet exist any large scale instruction datasets for the domain. This critically limits research in this application area. In this work, we curate LawInstruct, a large legal instruction dataset, covering 17 jurisdictions, 24 languages and a total of 12M examples. We present evidence that domain-specific pretraining and instruction tuning improve performance on LegalBench, including improving Flan-T5 XL by 8 points or 16% over the baseline. However, the effect does not generalize across all tasks, training regimes, model sizes, and other factors. LawInstruct is a resource for accelerating the development of models with stronger information processing and decision making capabilities in the legal domain."

# doc = "what is a mental state where you sit at your desk and can not get anything done?"

doc = "deep transfer learning in neural networks"
# doc = "Domain Adaptation Techniques for Deep Architectures"

In [20]:
# Compute the sparse document representation and inspect it as a weighted bag of words.
# Optional preprocessing (currently disabled): replace punctuation with spaces.
# for punc in string.punctuation:
#     doc = doc.replace(punc, " ")

# `max_length` only applies when truncation is on; enable it explicitly instead of
# relying on the tokenizer's warning-emitting fallback to 'longest_first'.
doc_tokens = tokenizer(doc, max_length=256, truncation=True, return_tensors="pt")
with torch.no_grad():
    doc_rep = model(d_kwargs=doc_tokens)["d_rep"].squeeze()  # (sparse) doc rep in voc space, e.g. shape (30522,)
    # print(torch.sum(doc_rep))
    # doc_rep = encode_custom_mask_punc(doc_tokens, model).squeeze()
print(doc_rep.shape)
# get the number of non-zero dimensions in the rep:
# squeeze only the trailing axis so a single active dimension still yields a
# one-element list (a bare `.squeeze()` collapsed it to an int and broke `len(col)`).
col = torch.nonzero(doc_rep).squeeze(-1).cpu().tolist()
print("number of actual dimensions: ", len(col))

# now let's inspect the bow representation:
weights = doc_rep[col].cpu().tolist()
d = {k: v for k, v in zip(col, weights)} #if k >= model.original_bert_vocab_size}
# order vocabulary ids by descending weight
sorted_d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
bow_rep = []

print(doc)
for k, v in sorted_d.items():
    # (token string, weight rounded to 2 decimals)
    entry = (reverse_voc[k], round(v, 2))
    print(entry)
    bow_rep.append(entry)
# print("SPLADE BOW rep:\n", bow_rep)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  with torch.cuda.amp.autocast() if self.fp16 else NullContextManager():
  with torch.cuda.amp.autocast() if self.fp16 else NullContextManager():


torch.Size([30522])
number of actual dimensions:  38
deep transfer learning in neural networks
('deep', 2.55)
('transfer', 2.27)
('neural', 1.73)
('learning', 1.56)
('brain', 1.37)
('networks', 1.33)
('learn', 1.26)
('network', 1.15)
('transferred', 1.14)
('college', 1.03)
(',', 0.69)
('goal', 0.61)
('it', 0.58)
('ship', 0.54)
('what', 0.43)
('for', 0.42)
('ring', 0.42)
('device', 0.38)
('gordon', 0.33)
('test', 0.3)
('the', 0.28)
('doing', 0.28)
('use', 0.27)
('phone', 0.26)
('market', 0.24)
('university', 0.24)
('conference', 0.23)
('game', 0.2)
('to', 0.15)
('hospital', 0.15)
('gene', 0.12)
('habitat', 0.1)
('in', 0.08)
('is', 0.06)
('a', 0.03)
('program', 0.02)
('education', 0.02)
('.', 0.01)


In [21]:
# Flatten the (term, weight) pairs of `bow_rep` into one comma-separated string.
to_print = ", ".join(f"({term}, {weight})" for term, weight in bow_rep)
to_print

'(deep, 2.55), (transfer, 2.27), (neural, 1.73), (learning, 1.56), (brain, 1.37), (networks, 1.33), (learn, 1.26), (network, 1.15), (transferred, 1.14), (college, 1.03), (,, 0.69), (goal, 0.61), (it, 0.58), (ship, 0.54), (what, 0.43), (for, 0.42), (ring, 0.42), (device, 0.38), (gordon, 0.33), (test, 0.3), (the, 0.28), (doing, 0.28), (use, 0.27), (phone, 0.26), (market, 0.24), (university, 0.24), (conference, 0.23), (game, 0.2), (to, 0.15), (hospital, 0.15), (gene, 0.12), (habitat, 0.1), (in, 0.08), (is, 0.06), (a, 0.03), (program, 0.02), (education, 0.02), (., 0.01)'

In [26]:
# Tokenize `doc` (no length limit here) and compute per-token outputs with `encode_custom`.
tokens = tokenizer(doc, return_tensors="pt")
# Inference only: skip autograd bookkeeping, consistent with the other encoding cells.
with torch.no_grad():
    out = encode_custom(tokens, model=model, is_q=True)
# NOTE(review): presumably (batch, sequence length, vocabulary size) — confirm against encode_custom.
out.shape

torch.Size([1, 393, 59419])

In [24]:
# Square root of the per-row sum of the attention mask, kept as a column (keepdim).
tokens["attention_mask"].sum(dim=1, keepdim=True) ** 0.5

tensor([[3.3166],
        [3.3166],
        [3.3166],
        [3.3166]])

In [18]:
# Map every input id of the first sequence back to its token string.
tokens_str = [reverse_voc[token_id] for token_id in tokens["input_ids"][0].tolist()]

In [19]:
# Coordinates (row index, column index) of every non-zero entry in the first item of `out`.
row, col = out[0].nonzero(as_tuple=True)

In [20]:
# For each input position, collect the vocabulary terms with non-zero weight:
#   token_mapper[i] == [token string at position i, Counter({term string: weight})]
# Weights are rounded to 2 decimals. Positions after the last row that has any
# non-zero entry are not listed.
num_rows = int(row.max()) + 1 if row.numel() > 0 else 0  # max() over an empty `row` would raise
token_mapper = [[tokens_str[j], Counter()] for j in range(num_rows)]

# Move indices and values to Python once instead of indexing the tensor per entry.
nonzero_values = out[0][row, col].tolist()
for r, c, value in zip(row.tolist(), col.tolist(), nonzero_values):
    # `+=` accumulates exactly like Counter.update did, should two ids share a term string.
    token_mapper[r][1][reverse_voc[c]] += round(value, 2)

In [21]:
# Print, for each input position, its token and up to 50 highest-weighted terms.
for token, counter in token_mapper:
    top_terms = counter.most_common(50)
    print(token, "->", top_terms)

[CLS] -> [('image', 0.81), ('learning', 0.75), ('sensor', 0.61), ('scene', 0.58), ('motion capture', 0.56), ('image processing', 0.55), ('machine learning', 0.55), ('decoder', 0.53), ('video', 0.49), ('monte carlo', 0.49), ('training data', 0.48), ('motion vector', 0.47), ('neural', 0.44), ('quantum gravity', 0.44), ('steganography', 0.44), ('roadmap', 0.44), ('deep learning', 0.43), ('map', 0.42), ('stereo vision', 0.42), ('conjecture', 0.41), ('texture', 0.4), ('stereoscopic', 0.4), ('image quality', 0.38), ('depth map', 0.38), ('image texture', 0.38), ('visual attention', 0.37), ('galaxy', 0.36), ('neural network', 0.36), ('computer graphics', 0.36), ('problem', 0.35), ('perception', 0.35), ('video data', 0.35), ('gestalt', 0.35), ('interaction', 0.34), ('optical', 0.34), ('random field', 0.34), ('motion', 0.33), ('query', 0.33), ('motion control', 0.33), ('psychology', 0.32), ('face image', 0.32), ('image registration', 0.31), ('motion compensation', 0.31), ('video frame', 0.31), (

In [28]:
# Sum the term weights over a window of input positions [start_index, end_index).
start_index, end_index = 24, 33
window = token_mapper[start_index:end_index]
print([entry[0] for entry in window])

test = Counter()
for entry in window:
    # entry is [token string, Counter of term weights]
    test.update(entry[1])

print(test.keys())
test

['other', 'knowledge', '-', 'intensive', 'language', 'tasks', '.', 'while', 'many']
dict_keys(['knowledge', 'intensive', 'language', 'c language', 'tasks'])


Counter({'language': 0.87,
         'c language': 0.52,
         'intensive': 0.3,
         'knowledge': 0.03,
         'tasks': 0.01})

In [None]:
# in reverse: for every vocabulary term with a non-zero weight, gather the weights
# it receives across input positions (term string -> list of weights, in the order
# of the `row`/`col` coordinates).
# NOTE: this rebinds `token_mapper`, replacing the per-position list with a dict.

token_mapper = {}
# Move indices and values to Python once instead of indexing the tensor per entry.
nonzero_values = out[0][row, col].tolist()
for c, score in zip(col.tolist(), nonzero_values):
    token_mapper.setdefault(reverse_voc[c], []).append(score)

In [None]:
# Print each term with its ten largest weights (descending), rounded to 2 decimals.
for k, term_scores in token_mapper.items():
    top_scores = sorted(term_scores, reverse=True)[:10]
    print(k, [round(item, 2) for item in top_scores])

In [None]:
# Display the tokenizer output currently bound to `tokens`.
tokens

In [None]:
# Encode a batch holding the same document twice and inspect each row's sparse rep.
with torch.no_grad():
    # NOTE(review): `model.encode` is unpacked as (reps, token indices, pad lengths);
    # the meaning of the last two is assumed from their names — confirm against the model class.
    batch_doc_rep, batch_doc_token_indices, batch_doc_pad_len = model.encode(tokenizer([doc, doc], return_tensors="pt"), is_q = False)  # (sparse) doc reps in voc space, one row per document



for i in range(batch_doc_rep.size(0)):
    doc_rep = batch_doc_rep[i]
    doc_token_indices = batch_doc_token_indices[i]

    # get the number of non-zero dimensions in the rep:
    # squeeze only the trailing axis so a single active dimension still yields a
    # one-element list (a bare `.squeeze()` collapsed it to an int and broke `len(col)`).
    col = torch.nonzero(doc_rep).squeeze(-1).cpu().tolist()
    print("number of actual dimensions: ", len(col))

    # now let's inspect the bow representation:
    weights = doc_rep[col].cpu().tolist()
    _indices = doc_token_indices[col].cpu().tolist()
    d = {k: v for k, v in zip(col, weights)}
    # term string -> entry of `doc_token_indices` for that dimension
    d_indices = {reverse_voc[k]: v for k, v in zip(col, _indices)}
    # term string -> weight, ordered by descending weight
    sorted_d = {reverse_voc[k]: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
    print(d_indices, "\n", sorted_d)

In [None]:
# Show the shapes of the first two elements of `temp`.
# NOTE(review): no visible cell binds `temp` to an indexable pair of tensors; this
# presumably relies on state from a deleted or out-of-order cell — confirm or remove.
temp[0].shape, temp[1].shape

In [None]:
# Show how the model's tokenizer splits `doc` into token strings.
tokenizer.tokenize(doc)

In [None]:
# Load the stock distilbert-base-uncased tokenizer for comparison with `tokenizer`.
original_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Tokenize `doc` with both tokenizers and print the results separated by a blank line.
model_tokenization = tokenizer.tokenize(doc)
baseline_tokenization = original_tokenizer.tokenize(doc)
print(model_tokenization, "", baseline_tokenization, sep="\n")

In [None]:
# Check how the stock tokenizer splits a string that is not a real word.
original_tokenizer.tokenize("asdbaisbd")

In [None]:
import json

In [None]:
# Load the DORIS-MAE dataset JSON into `ds`.
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend on
# the machine's locale default.
with open("/scratch/lamdo/doris-mae/DORIS-MAE_dataset_v1.json", encoding="utf-8") as f:
    ds = json.load(f)

In [None]:
# Display the entry at index 10 of the dataset's "Corpus" collection.
ds["Corpus"][10]

In [58]:
# Tokenize `doc`, capped at 256 tokens. `max_length` only applies when truncation is
# on, so enable it explicitly rather than relying on the tokenizer's warning fallback.
doc_tokens = tokenizer(doc, max_length=256, truncation=True, return_tensors="pt")

In [59]:
# Display the tokenizer output currently bound to `doc_tokens`.
doc_tokens

{'input_ids': tensor([[  102,  4631,  5017,  1904,   168, 41261,   205, 12744, 32068,   220,
           475,  2537,   147,  7434,   205,   185,   709,   106,  5017,  1904,
          2641,   147, 11004,   111,  2208,   131,  2449,   198,   220,  7790,
         12744,   506,  1052,   501,  2049,   205,   185,  7207,  9561,  1960,
           111,  4852,   188,  1904,  5017,  1832,   190,  2470,   147,   111,
          2474,  5671,   422,  3222,   131,  1904,  8361,  1294, 30118,  1832,
           205,   185,  1584,  6124, 33302,  4247,   198,   407,  5017,  2449,
           220,  9247,   147, 10215,   422,   137,   300,  3571,  2683,   263,
          8222,  1175,  3826,   205,   191,   111,  1572,  4324,  4813,   185,
          3138,  5017, 14454,   190,   106,  3826,   131,   692,   147, 19901,
          4852,   579,   579,   579,   493, 30137, 12744,   506, 18112, 30123,
         14454,   563,  2077,  2773,  1268,  3480,   205,   130, 10623,   131,
           407,  5017, 14454, 11616,  

In [113]:
# Call the wrapped transformer directly, then compute log(1 + relu(x)) on its first
# output and zero the positions where the attention mask is 0.
# NOTE(review): the recorded run of this cell raised "IndexError: index out of range
# in self"; presumably `doc_tokens` held ids outside the model's embedding table —
# confirm that the tokenizer and model match.
with torch.no_grad():
    out = model.transformer_rep.transformer(**doc_tokens)[0]
    mask = doc_tokens["attention_mask"].unsqueeze(-1)  # trailing axis added for broadcasting
    test = torch.log(torch.relu(out) + 1) * mask

IndexError: index out of range in self

In [107]:
# Print the coordinates of every non-zero entry of `test` whose last index is below 31090.
for coords in test.nonzero():
    if coords[-1] >= 31090:
        continue
    print(coords)

tensor([  0,   0, 106])
tensor([  0,   0, 111])
tensor([  0,   0, 112])
tensor([  0,   0, 121])
tensor([  0,   0, 130])
tensor([  0,   0, 131])
tensor([  0,   0, 137])
tensor([  0,   0, 145])
tensor([  0,   0, 147])
tensor([  0,   0, 158])
tensor([  0,   0, 165])
tensor([  0,   0, 168])
tensor([  0,   0, 170])
tensor([  0,   0, 188])
tensor([  0,   0, 190])
tensor([  0,   0, 191])
tensor([  0,   0, 198])
tensor([  0,   0, 205])
tensor([  0,   0, 214])
tensor([  0,   0, 234])
tensor([  0,   0, 235])
tensor([  0,   0, 238])
tensor([  0,   0, 239])
tensor([  0,   0, 244])
tensor([  0,   0, 256])
tensor([  0,   0, 263])
tensor([  0,   0, 286])
tensor([  0,   0, 305])
tensor([  0,   0, 334])
tensor([  0,   0, 370])
tensor([  0,   0, 422])
tensor([  0,   0, 448])
tensor([  0,   0, 450])
tensor([  0,   0, 473])
tensor([  0,   0, 487])
tensor([  0,   0, 546])
tensor([  0,   0, 563])
tensor([  0,   0, 579])
tensor([  0,   0, 862])
tensor([  0,   0, 894])
tensor([   0,    0, 1352])
tensor([   0,

In [78]:
# Inspect one value of `test`: first batch item, position 0, last-axis index 106.
test[0,0,106]

tensor(1.6048)

In [102]:
# Shape of the input id tensor produced by the tokenizer.
doc_tokens["input_ids"].shape

torch.Size([1, 160])

In [106]:
# First 31090 entries along the last axis of `out` for the first batch item, position 10.
out[0,10][:31090]

tensor([-6.2160, -5.7205, -5.8507,  ..., -6.4396, -5.8260, -5.8544])