In [1]:
!pip install "bitsandbytes>=0.39.0" loralib
!pip install "transformers>=4.31.0,<4.35.0"
!pip install "datasets>=2.14.3"
!pip install "accelerate>=0.21.0"
!pip install "peft==0.8.1"
!pip install "trl>=0.7.4"
!pip install "sentencepiece"

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip instal

In [2]:
import torch
torch.cuda.is_available()

True

### load tokenizer
The `AutoTokenizer.from_pretrained()` function is used to load the tokenizer that corresponds to the pre-trained model. The tokenizer is responsible for converting input text into a format that the model can understand. The `trust_remote_code=True` argument allows the execution of remote code, which can be necessary if the tokenizer includes custom (non-standard) components.

In [3]:
from transformers import AutoTokenizer
from typing import List, Union, Dict

model_to_load = "THUDM/chatglm3-6b"

tokenizer = AutoTokenizer.from_pretrained(model_to_load, trust_remote_code=True)

prefix: List[Union[str, Dict[str, str]]] =[
        {"token": "[gMASK]"},
        {"token": "sop"},
        {"token": "<|user|>"},
        "\n",
        "{{question}}",
        {"token": "<|assistant|>"}
]

eos_ids = [] if tokenizer.eos_token_id is None else [tokenizer.eos_token_id]
bos_ids = [] if tokenizer.bos_token_id is None else [tokenizer.bos_token_id]

### load dataset
This cell code snippet is using the `load_dataset` function from the `datasets` library to load a JSON dataset from a local file named 'qa.json'. The `load_dataset` function returns a `Dataset` object.

The `Dataset` object, `qa_dataset`, has its columns renamed for clarity and consistency. The 'instruction' column is renamed to 'question', the 'input' column is renamed to 'context', and the 'output' column is renamed to 'answers'. The `rename_column` method is used to perform these renamings. It takes two arguments: the current name of the column and the new name for the column.

In [4]:
from datasets import load_dataset

qa_dataset = load_dataset('json', data_files='qa.json')
qa_dataset = qa_dataset.rename_column('instruction',  'question')\
        .rename_column('input', 'context')\
        .rename_column('output','answers')

### processing dataset

The cell code is responsible for processing a dataset for pretraining a language model. Here's a breakdown of the main parts:

- create_prompt(question) -> List[int]: This function takes a question as input and returns a list of integers. It iterates over the prefix (which is not defined in the selection), and for each part of the prefix, it checks if it's a dictionary. If it is, it converts the "token" value to IDs using the tokenizer. If it's not a dictionary, it replaces "{{question}}" with the actual question and encodes it to IDs. The result is a list of token IDs that represent the prompt.
- process_pretrain_dataset(example: "Dataset") -> Dict[str, List[List[int]]]: This function processes the dataset for pretraining. It takes a dataset example as input and returns a dictionary with keys "input_ids", "attention_mask", and "labels". It defines a generator function construct_prompt that yields context, question, and answer from the example. For each context, question, and answer, it creates a tokenized prompt and response, constructs the input IDs, labels, and attention mask, and appends them to the result dictionary.
- print_supervised_dataset_example(example: Dict[str, List[int]]) -> None: This function prints an example from the processed dataset. It prints the input IDs, the decoded inputs, the label IDs, and the decoded labels (excluding those with a value of -100).
- The last two lines of the selection map the process_pretrain_dataset function to the qa_dataset, removing the 'answers', 'context', and 'question' columns. It then prints the mapped dataset and an example from the "train" split of the dataset.

In [5]:
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Tuple, Union
def create_prompt(question) -> List[int]:
    result = []
    for prefix_part in prefix:
        if isinstance(prefix_part, dict):
            if "token" in prefix_part:
                result += [tokenizer.convert_tokens_to_ids(prefix_part["token"])]
            else:
                result += [tokenizer.convert_tokens_to_ids(prefix_part["token"])]
        else:
            prefix_part = prefix_part.replace("{{question}}", question, 1)
            result += tokenizer.encode(prefix_part, add_special_tokens=False)
    return  result

def process_pretrain_dataset(example: "Dataset") -> Dict[str, List[List[int]]]:
    result = {"input_ids": [], "attention_mask": [], "labels": []}
    def construct_prompt(example: Dict[str, Union[str, List[str]]]) -> Generator[str, None, None]:
        for i in range(len(example["question"])):
            context = example["context"][i]
            question = example["question"][i]
            answer = example["answers"][i]
            yield context, question, answer
    
    for context, question, answer in construct_prompt(example):
        tokenized_prompt = create_prompt(question)
        tokenized_resp = tokenizer.encode(answer, add_special_tokens=False)
        input_ids = bos_ids  + tokenized_prompt + tokenized_resp + eos_ids
        labels = bos_ids + [-100] * (len(tokenized_prompt)) + tokenized_resp + eos_ids
        attention_mask = [1]*len(input_ids)
        result["input_ids"].append(input_ids)
        result["attention_mask"].append(attention_mask)
        result["labels"].append(labels)

    return result

def print_supervised_dataset_example(example: Dict[str, List[int]]) -> None:
    print("input_ids:\n{}".format(example["input_ids"]))
    print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
    print("label_ids:\n{}".format(example["labels"]))
    print("labels:\n{}".format(
        tokenizer.decode(list(filter(lambda x: x != -100, example["labels"])), skip_special_tokens=False)
    ))

mapped_qa_dataset = qa_dataset.map(process_pretrain_dataset, remove_columns=['answers', 'context', 'question'], batched=True)
print(mapped_qa_dataset)
print_supervised_dataset_example(next(iter(mapped_qa_dataset["train"])))

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 701
    })
})
input_ids:
[64790, 64792, 64795, 30910, 13, 9751, 35623, 32495, 64796, 36953, 31211, 55266, 58091, 31123, 55419, 38019, 56264, 54583, 13, 30595, 39288, 32366, 31301, 31865, 55090, 31300, 13, 30939, 9751, 10913, 3974, 2331, 30932, 9549, 9007, 359, 1556, 3341, 28670, 1460, 30945, 13, 33775, 54781, 30595, 36223, 35541, 54631, 40536, 31123, 54532, 30595, 19277, 39288, 32330, 54530, 35084, 54642, 31668, 34651, 31155, 32763, 45257, 32330, 54586, 41618, 38064, 54945, 37927, 31123, 33775, 45904, 31846, 31155, 13, 30943, 21209, 9751, 30932, 5096, 9007, 359, 1556, 18349, 1223, 4733, 30945, 13, 33775, 35344, 54534, 30595, 39288, 54538, 30981, 30973, 54907, 39685, 35549, 38809, 32035, 33257, 31123, 31704, 32035, 33257, 51637, 33125, 31936, 31698, 32096, 54626, 33142, 32184, 54530, 34980, 31155, 13, 30966, 9751, 7652, 328, 7524, 291, 12299, 359, 1556, 7992, 1536, 19366, 30

### load pretrained model

This cell is responsible for loading a pre-trained model and its corresponding tokenizer. 

The variable `model_to_load` is set to the string "THUDM/chatglm3-6b", which is likely the identifier of a pre-trained model stored in a model hub or a local directory.

The `AutoModelForCausalLM.from_pretrained()` function is used to load the pre-trained model. This function is part of the `transformers` library and is designed to handle models that are used for causal language modeling tasks. The arguments passed to this function configure the model's behavior:

- `model_to_load` specifies the model to load.
- `torch_dtype=torch.bfloat16` sets the data type of the model's parameters to bfloat16, a floating-point format that provides better performance on some hardware.
- `device_map='cuda:0'` specifies that the model should be loaded onto the first CUDA device, if available.
- `trust_remote_code=True` allows the execution of remote code, which can be necessary if the model includes custom (non-standard) components.
- `revision="b098244a71fbe69ce149682d9072a7629f7e908c"` specifies a particular version of the model to load, identified by its commit hash.
- `quantization_config=BitsAndBytesConfig(...)` sets the configuration for quantization, a technique used to reduce the memory footprint of the model. The `BitsAndBytesConfig` object is configured to use 4-bit quantization, with bfloat16 as the compute data type, and to use double quantization.

In [6]:
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(model_to_load, 
                                             torch_dtype=torch.bfloat16, 
                                             device_map='cuda:0',
                                             trust_remote_code=True,
                                             revision="b098244a71fbe69ce149682d9072a7629f7e908c",
                                             quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                                               bnb_4bit_compute_dtype=torch.bfloat16,
                                                               bnb_4bit_use_double_quant=True,
                                                               bnb_4bit_quant_type="nf4")
                                            )

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7fd43a9b-381f-47d7-b247-61095a970f8d)')' thrown while requesting HEAD https://huggingface.co/THUDM/chatglm3-6b/resolve/main/adapter_config.json
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: e7238674-9536-4e63-8ef3-1773dfb52b8a)')' thrown while requesting HEAD https://huggingface.co/THUDM/chatglm3-6b/resolve/main/adapter_config.json


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [7]:
print(model)

ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear4bit(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear4bit(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear4bit(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear4bit(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_la

### set up the model

This cell snippet is part of a process known as model fine-tuning, where a pre-trained model is adapted to a new, similar task.

In the first loop, the code iterates over all the parameters of the model. For each parameter, it sets `requires_grad` to `False`, effectively freezing the parameter. This means that during subsequent training, the gradients will not be computed for these parameters, and thus they will remain unchanged. This is typically done when you want to keep parts of the model fixed and only train some layers (in this case, adapters will be trained later).

However, if the parameter's dimension (`ndim`) is 1, the code changes the data type of the parameter's data to `torch.float32`. This is done for stability reasons, as some parameters like those in Layer Normalization layers are sensitive to precision and can cause instability in training if kept in lower precision formats.

After that, the `gradient_checkpointing_enable` method is called on the model. Gradient checkpointing is a technique to reduce the memory usage when training models, at the cost of increased computation. It reduces the number of activations that need to be stored in memory.

Finally, the `enable_input_require_grads` method is called on the model. This method ensures that gradients with respect to the input are computed, which is not the default behavior in PyTorch. This might be necessary for some specific training regimes or for certain types of models.

The class `CastOutputToFloat` that inherits from `nn.Sequential`, a container class in the PyTorch library. The `nn.Sequential`  class is used to encapsulate a sequence of modules where the output of one module is the input to the next one.

The `forward` method is overridden in the `CastOutputToFloat` class. This method is called when you pass an input to an instance of the class. The `forward` method takes an input `x`, passes it to the `forward` method of the superclass (`nn.Sequential`), and then converts the output to `torch.float32` data type using the `to` method. This is done to ensure that the output of the model is always in floating point format, which is necessary for many downstream tasks in machine learning.

The last line of the code is replacing the `output_layer` of the `model.transformer` with an instance of `CastOutputToFloat`. This means that whenever the output layer of the transformer model is called, it will now use the `forward` method defined in `CastOutputToFloat`, thus ensuring that its output is always a floating point tensor.

In [8]:
for param in model.parameters():
    param.requires_grad = False #freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)
    
model.gradient_checkpointing_enable() #reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)
model.transformer.output_layer = CastOutputToFloat(model.transformer.output_layer)

### print_trainable_parameters
This Python function, `print_trainable_parameters`, takes a model as an argument and prints the number of trainable parameters in the model.

The function initializes two counters, trainable_params and all_params, to zero. It then iterates over all the parameters of the model using the named_parameters() method. For each parameter, it adds the total number of elements in the parameter tensor to all_params using the numel() method.

If the parameter requires gradient (i.e., it's trainable), the function also adds the number of elements in the parameter tensor to trainable_params.

Finally, the function prints the number of trainable parameters, the total number of parameters, and the percentage of parameters that are trainable. This information can be useful for understanding the capacity of the model and how much of it is being trained

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the models
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_params} || trainable %: {100 * trainable_params / all_params}"
    )

### configuring Lora configuration

The cell code is configuring and applying a LoraConfig to a model, and then printing the number of trainable parameters in the model.

First, a `LoraConfig` object is created with the following parameters:

- `r`: This is set to 64. In the context of Lora (Layer-wise Learning Rate Adaptation), `r` is the rank of the low-rank approximation used in the Lora method.
- `lora_alpha`: This is set to 32. `lora_alpha` is a hyperparameter in the Lora method that controls the learning rate adaptation.
- `target_modules`: This is set to `["query_key_value"]`. It specifies the modules in the model to which Lora should be applied.
- `lora_dropout`: This is set to 0.05. It's the dropout rate used in the Lora method.
- `bias`: This is set to "none". It specifies the type of bias to be used in the Lora method.
- `task_type`: This is set to "CAUSAL_LM". It specifies the type of task the model is being trained for. In this case, it's a causal language modeling task.

Next, the `get_peft_model` function is called with the `model` and `config` as arguments. This function is likely applying the Lora configuration to the model.

Finally, the `print_trainable_parameters` function is called with the `model` as an argument. This function prints the number of trainable parameters in the model. This is useful for understanding the complexity of the model and how many parameters will be updated during training.

In [10]:
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 15597568 || all params: 3403909120 || trainable %: 0.4582251596658374


### load and print pretrained configuration

This cell code snippet is used to load a pre-trained model configuration using the `AutoConfig.from_pretrained` method from the `transformers` library.

The `model_to_load` variable is expected to be a string that specifies the model to be loaded. This could be a model ID from Hugging Face's model hub or a local path to a directory containing model files.

The `config_kwargs` dictionary is used to pass additional arguments to the `from_pretrained` method. In this case, the `trust_remote_code` key is set to `True`, which means that the code will trust user code and data loaded from the Hugging Face model hub.

After the configuration is loaded, it is printed to the console along with the `config_kwargs` dictionary for debugging purposes.

In [11]:
config_kwargs = {
    "trust_remote_code": True,
}

config = AutoConfig.from_pretrained(model_to_load, **config_kwargs)
print(config)

ChatGLMConfig {
  "_name_or_path": "THUDM/chatglm3-6b",
  "add_bias_linear": false,
  "add_qkv_bias": true,
  "apply_query_key_layer_scaling": true,
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "ChatGLMModel"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "auto_map": {
    "AutoConfig": "THUDM/chatglm3-6b--configuration_chatglm.ChatGLMConfig",
    "AutoModel": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForConditionalGeneration",
    "AutoModelForCausalLM": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForConditionalGeneration",
    "AutoModelForSeq2SeqLM": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForConditionalGeneration",
    "AutoModelForSequenceClassification": "THUDM/chatglm3-6b--modeling_chatglm.ChatGLMForSequenceClassification"
  },
  "bias_dropout_fusion": true,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "ffn_hidden_size": 13696,
  "fp32_residual_connection": false,
  "hidden_dropout": 0.0,
  "hidden_size": 40

### setting up the training configuration

This cell is setting up the training configuration for a model using the `TrainingArguments` class from the `transformers` library.

- `max_steps = 6000`: This sets the total number of training steps to 6000. A training step is one gradient update. In one step, the model processes `batch_size` number of examples and updates the weights once based on the gradients computed from those `batch_size` number of examples.

- `logging_steps = 100`: This sets the frequency of logging steps. The training metrics will be logged every 100 steps.

- `learning_rate = 5e-5`: This sets the learning rate for the optimizer. The learning rate controls how much to change the model in response to the estimated error each time the model weights are updated.

- `training_args = transformers.TrainingArguments(...)`: This creates an instance of the `TrainingArguments` class, which is used to define the training parameters. The parameters include:
  - `per_device_train_batch_size=1`: The number of training examples utilized in one iteration per device.
  - `gradient_accumulation_steps=1`: The number of steps to accumulate gradients before performing an optimizer step. This can be useful to handle large batches that don't fit in memory.
  - `warmup_steps=100`: The number of steps for the warmup phase, where the learning rate increases from 0 to the initial lr set.
  - `max_steps=max_steps`: The total number of training steps.
  - `learning_rate=learning_rate`: The learning rate for the optimizer.
  - `bf16=True`: Enables bfloat16 mode for training on GPUs. Bfloat16 is a compact numeric format that uses half the bits as float32 but achieves comparable model accuracy.
  - `logging_steps=logging_steps`: The number of steps between each logging.
  - `output_dir='outputs'`: The directory where the model predictions and checkpoints will be saved.

In [12]:
import transformers
max_steps = 50000 # about seven hours
logging_steps = 500
learning_rate = 4e-5

training_args = transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        warmup_steps=100,
        max_steps=max_steps,
        learning_rate=learning_rate,
        bf16=True,
        logging_steps=logging_steps,
        output_dir='outputs',
        optim='paged_adamw_8bit',    
)

### training the model

This cell is responsible for the training of the model. 

The `transformers.Trainer` class from the `transformers` library is used to handle the training. It takes several arguments:

- `model`: This is the model that you want to train. It's loaded from previous cell.
- `train_dataset`: This is the dataset that you want to use for training. In this case, it's `mapped_qa_dataset["train"]`, which is a dataset that has been preprocessed and tokenized.
- `args`: These are the training arguments that define the training parameters such as the batch size, learning rate, etc. They are defined earlier in the cell code as `training_args`.
- `data_collator`: This is used to batch data from the dataset. In this case, `transformers.DataCollatorForLanguageModeling` is used with `mlm` (Masked Language Modeling) set to `False`.

After the `Trainer` is initialized, the model's caching mechanism is disabled by setting `model.config.use_cache` to `False`. This is done to save memory during training, but it might slow down the training process.

Finally, the training is started with `trainer.train()`. This will train the model according to the parameters defined in `training_args` on the `train_dataset`.

In [None]:
import transformers

trainer = transformers.Trainer(
    model=model,
    train_dataset=mapped_qa_dataset["train"],
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
#model.config.use_cache = False
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
500,4.0843
1000,3.6808
1500,3.6273
2000,3.5433
2500,3.5041
3000,3.4863
3500,3.4046
4000,3.3908
4500,3.3164
5000,3.3115




### saving model and state

The two lines of code are used to save the state of the trainer and the model after training.

The trainer.save_model() function is a method from the Trainer class in the transformers library. It saves the model's weights into a directory. By default, this directory is the one defined in the output_dir attribute of the TrainingArguments object used when initializing the Trainer.

The trainer.save_state() function is also a method from the Trainer class in the transformers library. It saves the optimizer and the scheduler states to ensure that you can resume training exactly where you left off. This is particularly useful when training large models that can't be trained in one go and need to be trained in several stages.

In [None]:
trainer.save_model()
trainer.save_state()

### inference


In [1]:
import torch
import torch.nn as nn
from peft import PeftModel
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

model_to_load = "THUDM/chatglm3-6b"

model = AutoModelForCausalLM.from_pretrained(model_to_load, 
                                             torch_dtype=torch.bfloat16, 
                                             device_map='cuda:0',
                                             trust_remote_code=True,
                                             revision="b098244a71fbe69ce149682d9072a7629f7e908c",
                                             quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                                               bnb_4bit_compute_dtype=torch.bfloat16,
                                                               bnb_4bit_use_double_quant=True,
                                                               bnb_4bit_quant_type="nf4")
                                            )

qa_model = PeftModel.from_pretrained(model, "outputs")

  torch.utils._pytree._register_pytree_node(


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [2]:
print(qa_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ChatGLMForConditionalGeneration(
      (transformer): ChatGLMModel(
        (embedding): Embedding(
          (word_embeddings): Embedding(65024, 4096)
        )
        (rotary_pos_emb): RotaryEmbedding()
        (encoder): GLMTransformer(
          (layers): ModuleList(
            (0-27): 28 x GLMBlock(
              (input_layernorm): RMSNorm()
              (self_attention): SelfAttention(
                (query_key_value): lora.Linear4bit(
                  (base_layer): Linear4bit(in_features=4096, out_features=4608, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=64, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=64, out_features=4608, bias=False)

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_to_load, trust_remote_code=True)

In [4]:
from IPython.display import display, Markdown
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Tuple, Union

prefix: List[Union[str, Dict[str, str]]] =[
        {"token": "[gMASK]"},
        {"token": "sop"},
        {"token": "<|user|>"},
        "\n",
        "{{question}}",
        {"token": "<|assistant|>"}
]

def create_prompt(question) -> List[int]:
    result = []
    for prefix_part in prefix:
        if isinstance(prefix_part, dict):
            if "token" in prefix_part:
                result += [tokenizer.convert_tokens_to_ids(prefix_part["token"])]
            else:
                result += [tokenizer.convert_tokens_to_ids(prefix_part["token"])]
        else:
            prefix_part = prefix_part.replace("{{question}}", question, 1)
            result += tokenizer.encode(prefix_part, add_special_tokens=False)
    return  result


def make_inference(question, refer_model):
    batch = dict()
    input_ids = create_prompt(question)
    batch["input_ids"] = torch.tensor([input_ids])
    batch["attention_mask"] = torch.tensor([[1] * len(input_ids)])
    batch["position_ids"] = torch.tensor([list(range(0, len(input_ids)))])
    
    print("question:\n{}".format(batch))
    
    with torch.cuda.amp.autocast():
        output_tokens = refer_model.generate(**batch, max_new_tokens=512)
    
    print("output{}".format(output_tokens))
    display(Markdown((tokenizer.decode(output_tokens[0], skip_special_tokens=True))))

In [7]:
question = "给立志成为架构师的程序员一些建议吧"

make_inference(question, qa_model)

question:
{'input_ids': tensor([[64790, 64792, 64795, 30910,    13, 30910, 54840, 43572, 31705, 36232,
         46130, 32443, 54650, 31784, 32108, 55370, 64796]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'position_ids': tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]])}
outputtensor([[64790, 64792, 64795, 30910,    13, 30910, 54840, 43572, 31705, 36232,
         46130, 32443, 54650, 31784, 32108, 55370, 64796, 36953, 31211, 55266,
         58091, 31123, 55419, 38019, 56264, 54583,    13, 54531, 31201, 31788,
         31943, 31930, 31301, 31788, 31943, 33130, 31300,    13, 54534, 36232,
         31735, 54538, 31123, 36232, 54758, 31665, 31788, 31943, 34301, 31848,
         31123, 33740, 31668, 32924, 31848, 31301, 32098, 31943, 34301, 31943,
         33951, 31123, 31786, 33130, 31123, 31786, 34030, 54609, 31966, 54534,
         36232, 54538, 31123, 36232, 54758, 54552, 33351, 31711, 31943, 34301,
         33130, 54542

[gMASK]sop<|user|> 
 给立志成为架构师的程序员一些建议吧<|assistant|> 作者：陈皓，左耳朵耗子
一、了解业务领域（了解业务流程）
在架构设计中，架构师需要了解业务领域的知识，而不是技术方面的知识（比如业务领域的业务逻辑，数据流程，数据模型等）。在架构中，架构师要关心的是业务领域的流程和规则，而不是技术方面的流程，比如，你并不需要关心数据库引擎的内部实现，你只需要知道数据库引擎的接口就足够了。
注：你还要知道数据平台，中间件平台，安全平台，网络平台，运维平台，等等（你必要了解的领域太大了） 这样你才能在架构设计中，知道业务领域的流程，规则和数据模型等 。
（附： 这里有很多的流程和规则，你需要花大力气去了解这些规则和流程引擎引擎内部的实现，引擎引擎内部的实现和业务领域的实现太不一样了，你需要花大力气去整合整合整合，你需要需要有很丰富的经验才能整合得起来，这里需要有大量的实践经验去积累。）
二、学习设计模式（了解设计模式）
（这里需要了解的设计模式太多太多太多，你至少需要了解和掌握一些常用的设计模式，比如单例模式，工厂模式，工厂模式，观察者模式，等等，这里不再细说， 这里推荐两本比较好的设计模式方面的书籍《设计模式》和《设计模式实用指南》。（你还可以看看《Ruby on Rails 设计模式》这本书，《Spring 核心概念》等）
注：你需要知道，不同的设计模式，不同的场景，不同的需求，不同的业务流程，会用到不同的设计模式。你需要根据不同的需求，不同的业务流程去选择不同的设计模式，而不是，统一地用一种设计模式。你需要知道如何去用设计模式（你至少需要会写一些设计模式） 。
来自：酷壳网 CoolShell
