# Library

In [1]:
import copy
import io
import json
import logging
import os
import random
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset
from transformers import Trainer

# Path

In [2]:
# Location of the SFT training JSON. The default is the original machine-specific
# path; set the SFT_DATA_PATH environment variable to point the notebook at a
# copy stored elsewhere without editing this cell.
data_path = os.environ.get(
    "SFT_DATA_PATH",
    "/localfast/xiaowang/LJPtask/Data/SFT/train1_t2_sft.json",
)

In [3]:
# Label value that `preprocess` writes over the prompt positions, so that only
# the response tokens keep a real training target. -100 is the default
# `ignore_index` of torch.nn.CrossEntropyLoss, i.e. targets with this value are
# left out of the loss.
IGNORE_INDEX = -100
# Special-token strings. Nothing in the cells visible to this review references
# them; presumably they are meant to be passed to
# `smart_tokenizer_and_embedding_resize` — TODO confirm.
DEFAULT_PAD_TOKEN = "</s>"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"



# Prompt templates keyed by name. "prompt_input" is a chat-markup string with a
# system turn, a user turn whose text is the `{input}` placeholder, and an opened
# assistant turn that the target text is appended to.
PROMPT_DICT = {
    "prompt_input": (
        "<|im_start|>system\nYou are an AI assistant whose name is InternLM (书生·浦语).\n"
        "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
        "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.<|im_end|>\n"
        "<|im_start|>user\n{input}<|im_end|>\n"
        "<|im_start|>assistant\n"
    ),
}

In [5]:
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Add special tokens to the tokenizer and resize the model embeddings to match.

    The rows created for the new tokens are initialised to the mean of the
    rows that existed before the resize, separately for the input embedding
    and (when the model has one) the output embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.

    Args:
        special_tokens_dict: Mapping handed to ``tokenizer.add_special_tokens``.
        tokenizer: Tokenizer that receives the special tokens (modified in place).
        model: Model whose embeddings are resized to ``len(tokenizer)`` (modified
            in place).

    Returns:
        None.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens <= 0:
        # Nothing was appended, so there are no fresh rows to initialise.
        return

    embedding_modules = (model.get_input_embeddings(), model.get_output_embeddings())
    for module in embedding_modules:
        # get_output_embeddings() returns None for models without an output
        # projection; skip it instead of failing on `.weight`.
        if module is None:
            continue
        weights = module.weight.data
        # Average only the pre-existing rows, then write that average into the
        # trailing rows that the resize appended.
        weights[-num_new_tokens:] = weights[:-num_new_tokens].mean(dim=0, keepdim=True)

In [6]:
# Checkpoint identifier and the maximum sequence length given to the tokenizer.
model_name_or_path = "internlm/internlm2-chat-7b-sft"
model_max_length = 32768

# NOTE(review): trust_remote_code=True runs Python code shipped with the
# checkpoint repository; keep it only for sources you trust.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
    use_fast=True,
    padding_side="right",
    model_max_length=model_max_length,
)

In [7]:
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize each string separately and collect the ids and their lengths.

    Every string is encoded in its own tokenizer call, truncated to
    ``tokenizer.model_max_length``. With ``return_tensors="pt"`` a single
    string comes back as a 2-D tensor with a leading batch axis of size 1,
    which is why ``[0]`` is taken to obtain the 1-D id sequence. Because each
    call sees only one sequence, ``padding="longest"`` has nothing longer to
    pad towards.

    Args:
        strings: Texts to tokenize.
        tokenizer: Callable tokenizer exposing ``model_max_length`` and
            ``pad_token_id``.

    Returns:
        A dict with
        ``input_ids``: list of 1-D id tensors, one per string;
        ``labels``: independent copies of those tensors, safe to mask in place;
        ``input_ids_lens`` / ``labels_lens``: number of non-pad ids per string
        (the full length when the tokenizer has no pad token).
    """
    pad_token_id = tokenizer.pad_token_id

    input_ids = []
    input_ids_lens = []
    for text in strings:
        encoded = tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        input_ids.append(encoded.input_ids[0])  # drop the size-1 batch axis
        if pad_token_id is None:
            # No pad token defined: nothing can be padding, and comparing a
            # tensor against None would raise, so count every id.
            input_ids_lens.append(encoded.input_ids.numel())
        else:
            input_ids_lens.append(encoded.input_ids.ne(pad_token_id).sum().item())

    # Clone so that masking `labels` in place can never overwrite `input_ids`.
    labels = [ids.clone() for ids in input_ids]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=list(input_ids_lens),
    )

In [5]:
def _make_r_io_base(f, mode: str):
    """Return ``f`` as an open file object.

    Args:
        f: Either an already-open file object (returned unchanged) or a path
            accepted by ``open``.
        mode: Mode used when ``f`` has to be opened.

    Returns:
        An open file object.
    """
    if not isinstance(f, io.IOBase):
        # Decode text as UTF-8 instead of the platform default so non-ASCII
        # JSON loads the same everywhere; binary modes take no encoding.
        encoding = None if "b" in mode else "utf-8"
        f = open(f, mode=mode, encoding=encoding)
    return f


def jload(f, mode="r"):
    """Load a .json file and return the decoded object.

    Args:
        f: Path to the JSON file, or an open file object.
        mode: Mode used to open ``f`` when it is a path.

    Returns:
        Whatever ``json.load`` decodes from the file.

    The file object is closed before returning, including when decoding
    raises and including when the caller passed in an already-open object.
    """
    f = _make_r_io_base(f, mode)
    with f:  # guarantees close() even if json.load raises
        return json.load(f)

# Load the whole SFT file into memory and print how many records it contains.
list_data_dict = jload(data_path)
print(len(list_data_dict))
# Disabled draft of the prompt/answer construction, kept for reference. It reads
# example['conversation'][0] as a mapping that fills the template's {input}
# field and provides an 'output' string.
# NOTE(review): a later cell uses `sources` and `targets`; make sure they are
# defined before that cell runs.
# prompt_input = PROMPT_DICT["prompt_input"]
# sources = [
#             prompt_input.format_map(example['conversation'][0]) 
#             for example in list_data_dict
#         ]
# targets = [f"{example['conversation'][0]['output']}<|im_end|>" for example in list_data_dict]

4000


In [6]:
# Keep a small subset while exploring. A dedicated fixed-seed generator makes
# the subset identical after Restart & Run All, and min() avoids the ValueError
# that sampling raises when fewer than 10 records are available.
list_data_dict = random.Random(42).sample(list_data_dict, min(10, len(list_data_dict)))
print(len(list_data_dict))

10


In [9]:
def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt+answer pairs and mask the prompt part of the labels.

    Args:
        sources: Prompt strings.
        targets: Answer strings, paired with ``sources`` by position.
        tokenizer: Tokenizer forwarded to ``_tokenize_fn``.

    Returns:
        A dict with ``input_ids`` (ids of each concatenated prompt+answer) and
        ``labels`` (a deep copy of those ids whose leading positions, as many
        as the separately tokenized prompt has, are set to ``IGNORE_INDEX``).
    """
    full_texts = [prompt + answer for prompt, answer in zip(sources, targets)]

    # Full texts are tokenized first, the bare prompts second.
    full_encoding = _tokenize_fn(full_texts, tokenizer)
    prompt_encoding = _tokenize_fn(sources, tokenizer)

    input_ids = full_encoding["input_ids"]
    labels = copy.deepcopy(input_ids)  # masking below must not touch input_ids

    # NOTE(review): masking by prompt length assumes the prompt tokenizes to
    # the same ids on its own as at the start of prompt+answer — confirm.
    prompt_lengths = prompt_encoding["input_ids_lens"]
    for target_ids, prompt_len in zip(labels, prompt_lengths):
        target_ids[:prompt_len] = IGNORE_INDEX

    return {"input_ids": input_ids, "labels": labels}

In [10]:
# Build the prompt and answer strings here so the cell runs on a fresh kernel;
# `sources` and `targets` are not defined by any earlier executed code.
# NOTE(review): assumes every record has example["conversation"][0] with
# "input" and "output" entries, as in the commented-out draft in the loading
# cell — confirm against the data file.
prompt_input = PROMPT_DICT["prompt_input"]
sources = [prompt_input.format_map(example["conversation"][0]) for example in list_data_dict]
targets = [f"{example['conversation'][0]['output']}<|im_end|>" for example in list_data_dict]

# Same first steps as `preprocess`, kept at top level to inspect the intermediates.
examples = [s + t for s, t in zip(sources, targets)]
examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
input_ids = examples_tokenized["input_ids"]
labels = examples_tokenized["labels"]

In [11]:
# Toy check of the prompt-masking logic: `ss` holds the "prompts" and `es` the
# same prompts with one extra character appended.
es = ["1我", "2我", "3我"]
ss = ["1", "2", "3"]
for strings in (es, ss):
    print(strings)

es_tokenized, ss_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (es, ss)]
print(es_tokenized)
print(ss_tokenized)

es_input_ids = es_tokenized["input_ids"]
# Mask a private copy, as `preprocess` does. `_tokenize_fn` may hand back the
# very same tensors under "input_ids" and "labels", in which case masking the
# returned labels in place would overwrite the input ids as well.
es_labels = copy.deepcopy(es_input_ids)

for es_label, source_len in zip(es_labels, ss_tokenized["input_ids_lens"]):
    es_label[:source_len] = IGNORE_INDEX

['1我', '2我', '3我']
['1', '2', '3']
{'input_ids': [tensor([    1,   312, 60363]), tensor([    1,   314, 60363]), tensor([    1,   308, 60363])], 'labels': [tensor([    1,   312, 60363]), tensor([    1,   314, 60363]), tensor([    1,   308, 60363])], 'input_ids_lens': [3, 3, 3], 'labels_lens': [3, 3, 3]}
{'input_ids': [tensor([  1, 312]), tensor([  1, 314]), tensor([  1, 308])], 'labels': [tensor([  1, 312]), tensor([  1, 314]), tensor([  1, 308])], 'input_ids_lens': [2, 2, 2], 'labels_lens': [2, 2, 2]}


In [17]:
# Raw tokenizer outputs for the toy strings, one separate call per string, using
# the same arguments as `_tokenize_fn` so the returned objects can be inspected.
es_tokenized_ls = list(
    map(
        lambda text: tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ),
        es,
    )
)


torch.Size([1, 3])

In [21]:
# Row 0 of the batch axis: the 1-D id sequence of the first toy string.
es_tokenized_ls[0]["input_ids"][0]

tensor([    1,   312, 60363])

In [24]:
# A single string still comes back with a leading batch axis of size 1.
es_tokenized_ls[0]["input_ids"].size()

torch.Size([1, 3])