# Experiment of Calculation amount Reduction Method Using embed_compression

In [1]:
%%capture

!pip uninstall transformers -y

!pip install quanto
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install git+https://github.com/huggingface/accelerate.git
!pip install git+https://github.com/huggingface/transformers.git

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_api_key = user_secrets.get_secret("hf_api_key")

!huggingface-cli login --token $hf_api_key

%load_ext memory_profiler

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
!rm -rf src/
!mkdir -p src/

import sys
sys.path.append("/kaggle/working/src")

In [4]:
%%writefile src/config.py
import torch

class CFG:
    model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

    dtype = torch.float16
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Writing src/config.py


In [5]:
%%writefile src/custom_models.py
from typing import List, Optional, Tuple, Union
import torch
from torch import nn
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.modeling_outputs import BaseModelOutputWithPast
from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaModel
from config import CFG

class CustomLlamaForCausalLM(LlamaForCausalLM):
     def __init__(self, config):
        super().__init__(config)
        self.model = CustomLlama(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

class CustomLlama(LlamaModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        
        self.next_token_i = 0
        self.embed_layers = 12
        self.mid_hidden_states = []
        self.is_get_mid = False

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        past_seen_tokens = 0
        if use_cache:  # kept for BC (cache positions)
            if not isinstance(past_key_values, StaticCache):
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                past_seen_tokens = past_key_values.get_seq_length()
        if cache_position is None:
            if isinstance(past_key_values, StaticCache):
                raise ValueError("cache_position is a required argument when using StaticCache.")
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_seen_tokens)

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for i, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )
            hidden_states = layer_outputs[0]
            
            if self.is_get_mid and self.next_token_i == 0:
                self.mid_hidden_states.append(hidden_states)
            
            if not self.is_get_mid and i < self.embed_layers and self.next_token_i==0:
                mid_hidden_state = self.mid_hidden_states[i]
                if i > 16:
                    mid_hidden_state = mid_hidden_state.to("cuda:1")
                hidden_states = torch.cat((hidden_states[:, :6, :], mid_hidden_state, hidden_states[:, 6+mid_hidden_state.size(1):, :]), dim=1)

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)
        
        if self.is_get_mid and self.next_token_i > 1:
            return None
        
        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache
            )
        
        self.next_token_i += 1
        
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
    
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_seen_tokens: int,
    ):
        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        if self.config._attn_implementation == "sdpa":
            # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument,
            # in order to dispatch on Flash Attention 2.
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask, inputs_embeds=input_tensor, past_key_values_length=past_seen_tokens
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if hasattr(getattr(self.layers[0], "self_attn", {}), "past_key_value"):  # static cache
            target_length = self.config.max_position_embeddings
        else:  # dynamic cache
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
        if sequence_length != 1:
            causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
        if attention_mask is not None:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
            if attention_mask.dim() == 2:
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
            elif attention_mask.dim() == 4:
                # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with
                # cache. In that case, the 4D attention mask attends to the newest tokens only.
                if attention_mask.shape[-2] < cache_position[0] + sequence_length:
                    offset = cache_position[0]
                else:
                    offset = 0
                mask_shape = attention_mask.shape
                mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
                causal_mask[
                    : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
                ] = mask_slice

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

if __name__ == "__main__":
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = CustomLlamaForCausalLM.from_pretrained(
        CFG.model_name,
        quantization_config=quantization_config,
        device_map="auto",
    )



Writing src/custom_models.py


In [6]:
%%writefile src/pipeline.py
from typing import Tuple, List
import torch
from torch import Tensor
from transformers import (
    AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizerBase, PreTrainedModel
)

from config import CFG
from custom_models import CustomLlamaForCausalLM

class EmbedCompressionPipeline:
    """Pipeline for compressing embeddings of input prompts using a pre-trained language model."""

    def __init__(self, tokenizer: PreTrainedTokenizerBase, model: PreTrainedModel):
        self.tokenizer = tokenizer
        self.model = model
    
    @classmethod
    def from_pretrained(cls, model_name):
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        
        device_map = [('model.embed_tokens', 0)]
        device_map.extend([(f'model.layers.{i}', 0) for i in range(0, 16)])
        device_map.extend([(f'model.layers.{i}', 1) for i in range(16, 32)])
        device_map.extend([
            ('model.norm', 1),
            ('lm_head', 1),
        ])
        device_map = {ii:jj for (ii,jj) in device_map}

        model = CustomLlamaForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map=device_map,
        )
        
        return cls(tokenizer, model)


    def set_mid_hidden_states(self, prompt, block_size: int = 2) -> Tensor:
        """Compress embeddings"""
        self.model.model.next_token_i = 0
        self.model.model.mid_hidden_states = []
        
        input_ids = self.tokenizer(prompt, return_tensors="pt").to(CFG.device)
        try:
            self.model.model.is_get_mid = True
            outputs = self.model.generate(
                **input_ids,
                max_new_tokens=200,
                temperature=1.0,
            )
        except:
            self.model.model.is_get_mid = False
            
        self.model.model.mid_hidden_states = [self.embed_compression(state, block_size=block_size) for state in self.model.model.mid_hidden_states]
    
    def embed_compression(self, inputs_embeds: Tensor, block_size: int = 2) -> Tensor:
        """Compress embeddings by averaging over blocks of specified size."""
        means = [inputs_embeds[:, i:i+block_size, :].mean(dim=1, keepdim=True)
                 for i in range(0, inputs_embeds.size(1), block_size)]
        compression_inputs_embeds = torch.cat(means, dim=1)
        return compression_inputs_embeds
    

    def generate(self, prompt, embed_layers=12):
        self.model.model.next_token_i = 0
        self.model.model.is_get_mid = False
        self.model.model.embed_layers = embed_layers
        
        messages = [
            {"role": "assistant", "content": "embed"*self.model.model.mid_hidden_states[0].size(1)},
            {"role": "user", "content": prompt},
        ]
        prompt = self.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
        )
        input_ids = self.tokenizer(prompt, return_tensors="pt").to(CFG.device)

        outputs = self.model.generate(
            **input_ids,
            max_new_tokens=50,
            temperature=1.0,
        )
        return self.tokenizer.decode(outputs[0, input_ids['input_ids'].size(1):])

if __name__ == "__main__":
    pipe = EmbedCompressionPipeline.from_pretrained(CFG.model_name)
    pipe.set_mid_hidden_states("My name is Embed Compression Pipeline.")
    print(pipe.generate("What is your name?"))


Writing src/pipeline.py


# Evaluate

In [7]:
from config import CFG
from pipeline import EmbedCompressionPipeline

pipe = EmbedCompressionPipeline.from_pretrained(CFG.model_name)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [8]:
import subprocess
import re
import gc
import time
import torch

def clear_cache():
    for _ in range(10):
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(0.1)

def get_gpu_memory():
    """Function to get current GPU memory usage via nvidia-smi command."""
    result = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv'], capture_output=True, text=True)
    # Parse the output to extract memory usage
    memory_used = [int(re.findall(r'\d+', line)[0]) for line in result.stdout.split('\n') if re.search(r'\d+ MiB', line)]
    return memory_used


Get sample data from Haoxiang Shi, Jiaan Wang, Jiarong Xu, Cen Wang, Tetsuya Sakai: “CT-Eval: Benchmarking Chinese Text-to-Table Performance in Large Language Models”, 2024; <a href='http://arxiv.org/abs/2405.12174'>arXiv:2405.12174</a>.

In [9]:
comp_prompt = """1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
several IE sub-tasks, and found that compared to
supervised baselines, the performance of LLMs is
sub-optimal. However, their performance of LLMs
on Text-to-Table remains largely unexplored. Additionally, the rapid advancements in LLMs have facilitated their widespread adoption in multi-lingual
settings, enabling language modeling abilities to be
shared across different languages. Consequently, it
is theoretically possible to feasible to employ LLMs
for text-to-table in other languages, an area that
has yet to be thoroughly investigated.
Motivated by the aforementioned considerations,
we decide to benchmark LLMs on Chinese text-totable. A heuristic approach involves translating existing English datasets into Chinese for subsequent
performance evaluation. There are four datasets
widely used in text-to-table: E2E (Novikova et al.,
2017) is a restaurant domain dataset encompassing
51.5K samples about restaurant information like
names, addresses, rate scores, etc. Rotowire (Wiseman et al., 2017) is a sports domain dataset derived
from NBA basketball games with 4.9K samples
about NBA team information. WikiBio (Lebret
et al., 2016), compiled from Wikipedia, containing
72.6K documents and corresponding structured tables in the biography domain. WikiTableText (Bao
arXiv:2405.12174v1 [cs.CL] 20 May 2024
et al., 2018) is also derived from Wikipedia, and
involves 13.3K multidisciplinary samples spanning
fields like finance and politics. However, according to our preliminary analysis, these datasets also
suffer from the issues of less diversity or high hallucination, making them unsuitable to benchmark
LLMs. Specifically, (1) E2E, Rotowire, and WikiBio exhibit a lack of diversity as they focus on a
single domain, violating the core principle of instruction tuning in LLMs, that is, diversity. (2)
Although WikiTableText incorporates multiple domains for diversity, our analysis (§ 3.5) indicates
that 18.83% of samples exhibit hallucination in
the golden tables, that is, containing additional information beyond the provided documents. This
arises from treating Wikipedia infoboxes as golden
tables, created collaboratively by online users and
potentially containing additional basic information.
In this paper, we propose the Chinese Text-toTable Evaluation (CT-Eval) dataset, which is constructed through three steps to ensure data diversity
and minimize hallucination. To ensure diversity,
the first step involves collecting multidisciplinary
document-table pairs. We choose the Baidu Baike
as the data source, which is a popular Chinese multidisciplinary online encyclopedia. Each page in
this source contains text and an infobox summarizing the corresponding key information. Second, to
minimize data hallucination, we train an LLM, as a
hallucination judger, to filter out task samples with
hallucination in their golden tables (infoboxes). Finally, we obtain 88.6K samples with an average
length of 911.46 Chinese characters. We split them
into 86.6K, 1K and 1K for training, validation and
testing. For validation and testing samples, human
annotators further clean data hallucination in the
golden tables to ensure evaluation reliability.
Based on the proposed CT-Eval, we benchmark
various mainstream LLMs in both zero-shot (for
both open- and closed-source LLMs) and finetuning (only for open-source LLMs) scenarios. Our
experiments reveal that (1) GPT-4 achieves the best
zero-shot performance among all LLMs. However,
its performance remains a discernible disparity
compared to human judgment. (2) After fine-tuning
on the training set of CT-Eval, all open-source
LLMs demonstrate a significant performance improvement, outperforming zero-shot GPT-4 by a
large margin, indicating the effectiveness of CTEval. In-depth analyses of LLM-generated tables
reveal the persistence of hallucination issues in
both zero-shot and fine-tuned LLMs, highlighting
a challenge in using LLMs as text-to-table systems.
Future work could not only evaluate LLMs’ performance on text-to-table via our CT-Eval benchmark
dataset, but also leverage its training data to improve LLMs’ text-to-table ability via fine-tuning.
2 Related Work
2.1 Text-to-table Tasks
Wu et al. (2022) pioneer the text-to-table task.
Given the absence of a dedicated text-to-table
dataset, Wu et al. (2022) repurpose existing tableto-text datasets, i.e., WikiBio (Lebret et al., 2016),
E2E (Novikova et al., 2017), Rotowire (Wiseman et al., 2017) and WikiTableText (Bao et al.,
2018), for text-to-table tasks by reversing their
input-output pairs. They fine-tune BART (Lewis
et al., 2019) to perform text-to-table in a sequenceto-sequence manner, and find that the fine-tuned
BART outperforms the pipeline baselines using
relation extraction and named entity extraction.
STable (Pietruszka et al., 2022) employs two pretrained language models (PLMs) (T5 (Raffel et al.,
2020) and TILT (Powalski et al., 2021)) for text-totable, and designs a permutation-based decoder to
enhance the PLMs’ table generation ability. Subsequently, Li et al. (2023) find that the predefined row
order in golden tables introduces bias into text-totable models. Consequently, they train table header
and table body generators separately to produce
final tables. While these studies achieve notable
success, they primarily explored text-to-table performance before the LLM era. In addition, their
evaluation datasets generally focus on a single domain, and adapt from the table-to-text datasets, resulting in hallucination issues. Thus, they are unsuitable for benchmarking LLMs in text-to-table.
2.2 Large Language Models
The advent of advanced LLMs ,e.g., ChatGPT (OpenAI, 2022), GPT-4 (OpenAI, 2023), marks a pivotal
moment that propels the field of NLP into a boom
phase. Zhong et al. (2023) show that LLMs can
achieve decent performance on benchmarks like
GLUE (Wang et al., 2018), which spans eight representative NLP understanding tasks. Concurrently,
several studies have scrutinized the performance
of LLMs across various IE tasks. For example,
Gao et al. (2023) test the capability of ChatGPT
in event extraction. Similarly, González-Gallardo
et al. (2023) employ ChatGPT on historical entity
recognition. However, the results of these IE subtasks consistently show that LLMs underperform
compared to state-of-the-art supervised approaches.
The text-to-table task we focused on is more complex than the previous basic IE sub-tasks, however,
it remains an unexplored area for evaluation on
LLMs.
3 CT-Eval
In this section, we first discuss the data source for
building CT-Eval (§ 3.1). Then, we give the details
of how to control the hallucination in the preliminary collected data, including LLM hallucination
judger (§ 3.2) and human cleaning processes (§ 3.3).
Finally, we formulate the text-to-table task (§ 3.4)
and provide the details of data statistics (§ 3.5).
3.1 Data Source
Following the success of WikiTableText (Bao et al.,
2018), we also choose a multidisciplinary online
encyclopedia as the data source to ensure data diversity. After carefully comparing existing Chinese
online encyclopedias, we choose Baidu Baike2
,
which is one of the Chinese encyclopedias with
the most entries in the world.
We obtain the Baidu Baike data from the dumps
provided by Xu et al. (2017). The data contains
over 9M entity pages, each of which includes an
infobox and the corresponding textual description,
forming a text-to-table sample. Utilizing this data,
we implement the following rule-based strategy
for preliminary data cleaning: (1) Each page must
contain at least one infobox; otherwise, the golden
table is missing. (2) The number of tabular cells in
the infobox should exceed three to ensure validity.
(3) The length of the textual description should
surpass 200 tokens. After that, 200K documenttable pairs are remaining for further processing.
3.2 LLM Data Cleaning
After the initial cleaning process, the hallucination issue persists due to Baidu Baike’s submission
rules, wherein most of the page text and infoboxes
are edited and maintained by individuals. Thus,
the contents in the infoboxes may not always align
precisely with the textual documents. For instance,
some infoboxes may include additional knowledge
unrelated to the text, potentially misleading the
model from learning the text-to-table task.
2
https://baike.baidu.com/
§ There is a document and a golden table that
summarize the information contained in this
document. Please help me identify whether the
golden table contains additional information
beyond the document. Give the reason first and
then provide your judgment.
Figure 1: Illustration of judgment prompt.
To control the hallucination in the task samples,
we decide to employ an LLM as a hallucination
judger to filter out samples exhibiting hallucination.
While using GPT-4 directly as the LLM hallucination judger is a straightforward approach, utilizing
official APIs can be costly. Therefore, we first
randomly select 5K samples from the data before
cleaning. Then, we use GPT-4 to assess whether the
golden tables contain additional information. The
judgment prompt is illustrated in Figure 1, where
the LLM is tasked with evaluating hallucination in
a chain-of-thought manner. Next, we use the GPT4 judgment results to train an open-source LLM,
employing the trained model to evaluate the remaining samples. Samples containing hallucinations are
discarded. Given the capacity for understanding
lengthy documents of existing Chinese LLMs (Bai
et al., 2023b), we select ChatGLM3-6B-32k3
as
the open-source LLM hallucination judger. Finally,
there are 88.6K samples after the data cleaning
by the LLM hallucination judger. These samples
totally cover 28 domains, e.g., physics and religion.
3.3 Human Data Cleaning
We split the LLM-cleaned samples into the training, validation, and test sets with 84.6K, 1K and
1K samples, respectively. In the validation and test
sets, we balance the number of samples in each
domain to ensure a comprehensive evaluation. Furthermore, to mitigate the hallucination issue in the
validation and test sets, we employ human annotators to manually cleanse their golden tables.
In this phase, we employ five human annotators
and one data expert, all of whom are native Chinese
speakers with advanced educational qualifications.
The data expert is a researcher with extensive experience in IE research. We first organize a tutorial
for the five annotators to ensure alignment of annotation requirements. This includes clarifying the
concept of the hallucination issue, emphasizing the
additional information present in tables that cannot
be directly extracted from the corresponding documents. Next, for each document-table pair in the
3
https://huggingface.co/THUDM/chatglm3-6b-32k
CT-Eval
Industry
Informatics
Physics
Astronomy
Traffic Engineering
Biology
Medical Science
Culture
Religion
History
Geography
Education
Business
Finance
Management
Movie and Book
Science Fiction
Music
Animation
Dwelling
Traveling
Career
Healthy
Food
Cosmetics
Profile
Security
Military
STEM
24.0%
Social Science
60.9%
Life
Support
11.6%
O.
3.5
%
Figure 2: Domain distribution in CT-EVAL
validation and test sets, three annotators are asked
to remove the hallucination information from tables. The data expert reviews 10% of the manually
cleaned samples from each annotator. If the accuracy rate falls below 95%, the respective annotator
will be required to redo the annotation. Finally,
for each sample, if the three manually cleaned results are consistent, the results are saved as the final
data. Otherwise, the results are decided by a group
meeting among all annotators and the data expert.
3.4 Task Overview
Given a document D = {w1, w2, ..., w|D|}, where
wi
is the i-th word in D, the text-to-table task aims
to extract key information from D and outputs a table T = {c0,0, c0,1, ..., c0,n, c1,0, c1,1, ..., c1,n, ...,
ci,j , ..., cm,n}, where c0,k(k ∈ {0, 1, ..., n}) represents the column header and ck,0(k ∈ {0, 1, ..., n})
denotes as the row header. ci,j (i > 0, j > 0) represents the text of the cell in the i-th row and j-th
column in the table with m rows and n columns.
3.5 Data Statistics
We compare CT-Eval with the previous datasets
across several metrics including language, number of entries, average length, number of domains,
hallucination rate, and average number of cells.
As shown in Table 1, compared to previous
datasets that mostly focus on one domain, CT-Eval
covers 28 domains, offering a valuable resource
to evaluate the text-to-table capabilities of LLMs
across multiple domains. These 28 domains can
be categorized into four branches: STEM, social
science, life support and others. To provide a comprehensive understanding of domain coverage in
CT-Eval, we present the distributions of each domain in Figure 2. Notably, the “Social Science”
Original Item
Translation
材料 做法 厨师一点通
炖牛肉时，可以放几个
山楂进去，这样牛肉会
烂得快些，而且有股山
楂的清香。
1.牛肉洗净放入清水锅
中煮至七成熟，捞出切
成方块
2. …
材料：
牛肉，胡萝卜，
白萝卜，大料，
香叶
东坡牛肉
Original Item
Ingredients Cooking method Cooking Tips
When stewing beef, you
can put a few hawthorn
into it, so that the beef
will rot faster and have a
hawthorn fragrance.
1. Wash the beef and
put it into a pot of
water to boil until
it is seven mature,
fish out and cut it
into squares…
2. …
Ingredients:
Beef, carrots, white
radish, Chinese
spices, allspice
Dongpo
Beef
“标题”: “东坡牛肉”, “ ”章节“: [{”标题“: ”东坡牛肉的做法“},
内容“: ”葱段、姜片、蒜头、辣椒、香叶、大料、盐、白糖、番茄酱、排骨
酱、辣椒酱、高汤（用煮牛肉的汤即可）。\n做法：\n1、先来处理一下牛肉，
将牛腩在冷水中多泡一会…“}
Translation
“Title”: “Dongpo Beef”, “ ”Chapter“: [{”Title “: ”Dongpo Beef Practice“}, :
Ingredients：Diced green onion, ginger, garlic, chili pepper, coriander, dashi, salt,
sugar, tomato sauce, rib sauce, chili sauce, broth. Cooking method:1, first, let's deal
with the beef, soak the brisket in cold water for a while longer..."}
Figure 3: Text-to-table example from CT-EVAL.
branch exhibits the highest proportion, encompassing domains such as culture and religion. Conversely, the “Other” branch, which includes topics
related to cosmetics and profiles, represents the
smallest portion. The “STEM” (science, technology, engineering, and mathematics) branch features
data related to topics such as physical sciences and
astronomy. The “Life Support” branch is highly
relevant to everyday life, comprises 11.60% of the
data, and primarily focuses on domains such as
dwelling and health. An illustration of a data sample from CT-Eval is depicted in Figure 3. Each
document-table pair, akin to the example shown,
consists of a textual description and a golden table
with varying numbers of columns and rows.
To assess the quality of our dataset compared
to previous ones, we randomly select 200 samples
from the training, validation and test sets of CTEval, and previous E2E, Rotowire, WikiBio, and
WikiTableText, respectively. Then, we compute the
hallucination rate for each dataset through human
annotation. Specifically, three annotators proficient
in both English and Chinese judged whether the
golden tables contained hallucination information
following the guidance outlined in Section 3.3. The
hallucination rate for each dataset is determined
by the average proportion of hallucinated samples
judged by all three annotators. We observe that
the hallucination rate of the CT-Eval training set
is 6.83%, similar to Rotowire (6.17%). Moreover,
with the help of our human cleaning, the hallucination rates in our test and validation sets decrease to
1.00% and 1.50%, respectively, significantly lower
Dataset Language N. Entries Avg. Length N. Domains Hallu. R Avg. Cells
E2E English 42,061 90.58 Restaurant 4.17% 4.46
Rotowire English 3,398 1311.01 Sports 6.17% 40.49
WikiBio English 582,659 416.71 Biography 8.67% 4.19
WikiTableText English 10,000 59.76 Multiple Subclasses 18.83% 4.25
CT-Eval(train) Chinese 84,603 911.46 28 Subclasses 6.83% 11.40
CT-Eval(val.) Chinese 1,000 813.32 28 Subclasses 1.00% 10.78
CT-Eval(test) Chinese 1,000 845.22 28 Subclasses 1.50% 10.86
Table 1: Data Statistics of CT-Eval and previous text-to-table datasets. “N. Entries” denotes the number of entries.
“Hallu. R” indicates hallucination rate. “Avg.Cells” indicates the average number of table cells
than those in other datasets. Therefore, the reliability of our dataset can be verified. Regarding
domains, the E2E, Rotowire and WikiBio datasets
focus on single domains, whereas WikiTableText
and CT-Eval encompass multiple domains to ensure data diversity. Concerning the length of the input documents, we note that previous E2E and WikiTableText primarily contain short documents with
an average length of under 100 words, whereas documents in our dataset and Rotowire exceed words
or characters, presenting more complex inputs for
text-to-table models. In summary, CT-Eval stands
as the sole dataset fulfilling criteria of data diversity,
lengthy documents, and low hallucination.
"""

## block_size=3

In [10]:
%%time

pipe.set_mid_hidden_states(comp_prompt, block_size=3)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
2024-05-21 21:07:17.269938: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 21:07:17.270070: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 21:07:17.441797: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


CPU times: user 29.7 s, sys: 3.78 s, total: 33.5 s
Wall time: 40 s


In [11]:
mid_hidden_state = pipe.model.model.mid_hidden_states[0]
mid_hidden_state.shape

torch.Size([1, 1506, 4096])

In [12]:
%%time

for i in range(10):
    print("embed_layers=", ((i+1)*3))
    print(pipe.generate("What is CT-Eval?", embed_layers=((i+5)*2)))
    print()

embed_layers= 3


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval is an acronym for "Chinese Text Evaluation", which is a dataset and evaluation benchmark for text-based conversational AI models, particularly Chinese-language models. The CT-Eval dataset is a collection of annotated Chinese texts, which includes text classification,

embed_layers= 6


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I think there may be a typo!

I'm assuming you meant to ask "What is CT-ET?"

CT-ET stands for Chinese Textual Evaluation, which is a dataset used for evaluating the performance of Natural Language Processing (NLP) models

embed_layers= 9


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I think you meant to ask "What is CT-Eval?"

CT-Eval is an evaluation metric for natural language processing (NLP) models, particularly in the context of language translation and text generation. It is used to measure the quality of the

embed_layers= 12


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I think you meant to ask "What is CT-Eval?".

CT-Eval is short for "Chinese Text Evaluation", which is a dataset and a benchmark for evaluating the performance of machine learning models on text-related tasks, particularly in the context of

embed_layers= 15


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I apologize for the mistake earlier. After re-reading the text, I realized that CT-Eval is not mentioned. It seems that the correct abbreviation is actually "CT-Eval" or "CT-Eval dataset", which is a dataset used for

embed_layers= 18


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval is the Chinese Text Evaluation dataset, which is a benchmark dataset for evaluating the performance of Chinese language models, specifically in the domain of text evaluation. The dataset consists of a collection of Chinese text documents, along with their corresponding annotations and labels

embed_layers= 21


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I apologize, but I couldn't find any information on "CT-Eval". It's possible that it's a specific tool or evaluation method used in a particular field or industry, or it might be a misspelling or acronym that I couldn't

embed_layers= 24


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval is a benchmarking and evaluation platform for natural language processing (NLP) models, specifically designed for the task of text-to-text generation, such as language translation, text summarization, and text generation. CT-Eval provides a standardized

embed_layers= 27


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval is an abbreviation that stands for "Chinese Text Evaluation".<|eot_id|>

embed_layers= 30
CT-Eval is the short name for the Chinese Treebank Evaluation. It is a Chinese treebank dataset and evaluation metric used to evaluate the performance of natural language processing (NLP) models, particularly those that are designed to process Chinese text.

The

CPU times: user 1min 45s, sys: 26.5 s, total: 2min 11s
Wall time: 2min 11s


In [13]:
clear_cache()

# GPU usage before inference
memory_before = get_gpu_memory()
print("GPU Usage Before Inference:")
print(memory_before, "MiB")

# Inference code here
print(pipe.generate("What is CT-Eval?", embed_layers=12))

# GPU usage after inference
memory_after = get_gpu_memory()
print("GPU Usage After Inference:")
print(memory_after, "MiB")

# Calculate the difference in memory usage
memory_difference = [after - before for before, after in zip(memory_before, memory_after)]
print("Difference in GPU Memory Usage:")
print(memory_difference, "MiB")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


GPU Usage Before Inference:
[3539, 3447] MiB
CT-Eval stands for "Chinese Text Evaluation" or "Computational Text Evaluation". It is a research dataset and evaluation metric specifically designed for natural language processing (NLP) and text-to-text generation tasks, such as machine translation, text summarization
GPU Usage After Inference:
[3765, 4799] MiB
Difference in GPU Memory Usage:
[226, 1352] MiB


## block_size=2

In [14]:
%%time

pipe.set_mid_hidden_states(comp_prompt, block_size=2)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CPU times: user 21.1 s, sys: 8.4 s, total: 29.5 s
Wall time: 29.5 s


In [15]:
mid_hidden_state = pipe.model.model.mid_hidden_states[0]
mid_hidden_state.shape

torch.Size([1, 2258, 4096])

In [16]:
%%time

for i in range(10):
    print("embed_layers=", ((i+1)*3))
    print(pipe.generate("What is CT-Eval?", embed_layers=((i+5)*2)))
    print()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


embed_layers= 3


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval, also known as Chinese Text-to-Table (CTT), is a benchmark dataset and evaluation metric for text-to-table (T2T) models, which is a type of natural language processing (NLP) task that aims to

embed_layers= 6


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval is the abbreviation for Chinese Text-to-Table (CT-Eval). It is a dataset and evaluation benchmark for Chinese text-to-table (TT) tasks, specifically for Chinese text-to-table generation models. The dataset is designed to evaluate the

embed_layers= 9


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I think you meant to ask "What is CT-Eval (Chinese Text Evaluation) dataset?"

CT-Eval (Chinese Text Evaluation) is a benchmarking dataset for evaluating the performance of text-to-table and text-to-text models in the Chinese language.

embed_layers= 12


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I'm glad you asked!

CT-Eval is the dataset I introduced in my previous response, specifically designed for evaluating the performance of Large Language Models (LLMs) on the text-to-table task. It's a collection of 88,600 text

embed_layers= 15


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval is an abbreviation for "Chinese Text-to-Table" Evaluation dataset, which is a benchmark dataset for evaluating the performance of text-to-table models in the Chinese language. It is a collection of pairs of texts and corresponding tables that can be

embed_layers= 18


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval is the Chinese text-to-table (CT-Eval) dataset, which is a new dataset designed to evaluate the performance of large language models (LLMs) on the task of text-to-table (T2T) generation.<|eot_id|>

embed_layers= 21


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval is the Chinese Text-to-Table (CT-Eval) dataset and benchmark for evaluating the performance of Large Language Models (LLMs) on the task of generating tables from natural language text. It is a dataset specifically designed to assess the

embed_layers= 24


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval stands for Chinese Text-to-Text Evaluation.<|eot_id|>

embed_layers= 27


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CT-Eval is the Chinese Text-to-Table (Text-to-Table) Evaluation dataset that I described in my previous response. It is a dataset specifically designed to evaluate the performance of language models in text-to-table tasks, particularly in the context of

embed_layers= 30
CT-Eval is the dataset and evaluation framework for the Text-to-Text (T2T) task, specifically designed for Chinese Text-to-Text (C2T) and English Text-to-Text (E2T).<|eot_id|>

CPU times: user 2min 23s, sys: 43.2 s, total: 3min 6s
Wall time: 3min 6s


In [17]:
clear_cache()

# GPU usage before inference
memory_before = get_gpu_memory()
print("GPU Usage Before Inference:")
print(memory_before, "MiB")

# Inference code here
print(pipe.generate("What is CT-Eval?", embed_layers=12))

# GPU usage after inference
memory_after = get_gpu_memory()
print("GPU Usage After Inference:")
print(memory_after, "MiB")

# Calculate the difference in memory usage
memory_difference = [after - before for before, after in zip(memory_before, memory_after)]
print("Difference in GPU Memory Usage:")
print(memory_difference, "MiB")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


GPU Usage Before Inference:
[3643, 3897] MiB
CT-Eval, short for Chinese Text-to-Table (CT-Eval), is a text-to-table evaluation dataset for text-based dialogue and question-answering tasks.<|eot_id|>
GPU Usage After Inference:
[3869, 5577] MiB
Difference in GPU Memory Usage:
[226, 1680] MiB


## Compare to normal way.

In [18]:
del pipe

clear_cache()

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig 

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

device_map = [('model.embed_tokens', 0)]
device_map.extend([(f'model.layers.{i}', 0) for i in range(0, 16)])
device_map.extend([(f'model.layers.{i}', 1) for i in range(16, 32)])
device_map.extend([
    ('model.norm', 1),
    ('lm_head', 1),
])
device_map = {ii:jj for (ii,jj) in device_map}

model = AutoModelForCausalLM.from_pretrained(
    CFG.model_name,
    quantization_config=quantization_config,
    device_map=device_map,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## No Knowledge

In [20]:
messages = [
    {"role": "user", "content": "What is CT-Eval?"},
]
prompt = tokenizer.apply_chat_template(
    messages, 
    tokenize=False, 
    add_generation_prompt=True
)
input_ids = tokenizer(prompt, return_tensors="pt").to(CFG.device)

In [21]:
%%time

for i in range(10):
    outputs = model.generate(
        **input_ids,
        max_new_tokens=50,
        temperature=1.0,
    )
    print(tokenizer.decode(outputs[0]))
    print()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a performance evaluation software developed by the European Commission's Joint Research Centre (JRC) for assessing the performance of computer systems and algorithms in the field of Computer-Aided Translation (CAT).

CT-Eval is designed to measure the quality



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a type of computer-aided diagnosis (CAD) system designed to evaluate and analyze medical images, such as computed tomography (CT) scans. The system is used to help radiologists and healthcare professionals diagnose and detect various medical



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a software tool used to evaluate the performance of a Computer Tomography (CT) scanner. It is a phantom-based quality control tool that is used to assess the accuracy and quality of the CT scanner's imaging capabilities.

CT-Eval



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a software package used to evaluate the performance of computer systems, particularly in the context of computer-aided design (CAD) and computer-aided engineering (CAE) applications. It is a widely used tool in various industries



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a popular software tool used in the field of Computational Thinking (CT) and Computer Science (CS) education. It's a comprehensive evaluation system designed to assess students' understanding and skills in various programming concepts, algorithms, and problem-solving



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a performance evaluation tool used in the context of computer networks and communication systems. It stands for "Computerized Testing and Evaluation" or "Computerized Testing and Evaluation of Network Performance".

CT-Eval is a software tool used to test



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a popular testing framework for evaluating and validating computer network protocols and devices. It's a network traffic generator and analyzer that can be used to simulate network scenarios, send and receive network packets, and analyze network performance.

CT-Eval is



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a performance evaluation and analysis tool used to measure the speed and accuracy of computer vision algorithms and deep learning models. It is specifically designed to evaluate the performance of computer vision models on a wide range of tasks, such as object detection,



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a comprehensive evaluation tool for medical imaging software, commonly used in the medical imaging industry. It's an acronym for "Computerized Tomography Evaluation".

CT-Eval is a standardized method for evaluating the performance of Computed Tomography (

<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a popular open-source evaluation metric for comparing and assessing the performance of Natural Language Processing (NLP) and Text-to-Text (T2T) models, particularly in the context of machine translation, text summarization, and text

CPU times: user 53.4 s, sys: 1.78 s, total: 55.2 s
Wall time: 55.2 s


In [22]:
clear_cache()

# GPU usage before inference
memory_before = get_gpu_memory()
print("GPU Usage Before Inference:")
print(memory_before, "MiB")

# Inference code here
outputs = model.generate(
    **input_ids,
    max_new_tokens=50,
    temperature=1.0,
)
print(tokenizer.decode(outputs[0]))

# GPU usage after inference
memory_after = get_gpu_memory()
print("GPU Usage After Inference:")
print(memory_after, "MiB")

# Calculate the difference in memory usage
memory_difference = [after - before for before, after in zip(memory_before, memory_after)]
print("Difference in GPU Memory Usage:")
print(memory_difference, "MiB")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


GPU Usage Before Inference:
[2883, 2947] MiB
<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

What is CT-Eval?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

CT-Eval is a method for evaluating the quality of a CT (Computed Tomography) scan in medical imaging. It is a standardized scoring system used to assess the diagnostic accuracy and quality of a CT scan, particularly in the evaluation of chest and abdominal
GPU Usage After Inference:
[3181, 3185] MiB
Difference in GPU Memory Usage:
[298, 238] MiB


## Using Original Tokens

In [23]:
messages = [
    {"role": "assistant", "content": comp_prompt},
    {"role": "user", "content": "What is CT-Eval?"},
]
prompt = tokenizer.apply_chat_template(
    messages, 
    tokenize=False, 
    add_generation_prompt=True
)
input_ids = tokenizer(prompt, return_tensors="pt").to(CFG.device)

In [24]:
%%time

for i in range(10):
    outputs = model.generate(
        **input_ids,
        max_new_tokens=50,
        temperature=1.0,
    )
    print(tokenizer.decode(outputs[0]))
    print()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Gao et al., 2023) have also utilized LLMs for
s

In [25]:
clear_cache()

# GPU usage before inference
memory_before = get_gpu_memory()
print("GPU Usage Before Inference:")
print(memory_before, "MiB")

# Inference code here
outputs = model.generate(
    **input_ids,
    max_new_tokens=50,
    temperature=1.0,
)
print(tokenizer.decode(outputs[0]))

# GPU usage after inference
memory_after = get_gpu_memory()
print("GPU Usage After Inference:")
print(memory_after, "MiB")

# Calculate the difference in memory usage
memory_difference = [after - before for before, after in zip(memory_before, memory_after)]
print("Difference in GPU Memory Usage:")
print(memory_difference, "MiB")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


GPU Usage Before Inference:
[2883, 2949] MiB
<|begin_of_text|><|begin_of_text|><|start_header_id|>assistant<|end_header_id|>

1 Introduction
Information extraction (IE) aims to identify and extract structured information from unstructured text.
1Codes and data will be publicly available once accepted.
Basic IE sub-tasks, such as, named entity recognition and entity linking, generally operate at the
sentence level, which ignore models’ ability to understand the document-level meaning. In contrast,
Text-to-Table, an emerging sub-task of IE, requires
models to understand information within a given
document and then generate structured tables. Despite great success in the domain Wu et al. (2022);
Li et al. (2023), previous studies are oriented to
English, limiting the research in other languages.
Recently, large language models (LLMs) have
demonstrated powerful performance across various NLP tasks (Zhao et al., 2023; Wang et al.,
2023a,b,c). Some studies (González-Gallardo et al.,
2023; Ga