In [1]:
import copy
from dataclasses import dataclass, field
import json
import pathlib
from typing import Dict, Optional, Sequence

import numpy as np
import torch
from torch.utils.data import Dataset
import transformers
from transformers import Trainer
from transformers.trainer_pt_utils import LabelSmoother

from fastchat.conversation import SeparatorStyle
from fastchat.model.model_adapter import get_conversation_template

IGNORE_TOKEN_ID = LabelSmoother.ignore_index

  from .autonotebook import tqdm as notebook_tqdm


[2023-07-10 08:20:34,601] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
data_path = '/home/minhvn/workspace/llm/FastChat/fastchat/data/rikkei_clean.json'
raw_data = json.load(open(data_path, "r"))
sources = [example["conversations"] for example in raw_data]
type(sources)

list

In [3]:
conv = get_conversation_template("vicuna")
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
roles

{'human': 'USER', 'gpt': 'ASSISTANT'}

In [4]:
# Apply prompt templates
conversations = []
for i, source in enumerate(sources):
    if roles[source[0]["from"]] != conv.roles[0]:
        # Skip the first one if it is not from human
        source = source[1:]

    conv.messages = []
    for j, sentence in enumerate(source):
        role = roles[sentence["from"]]
        assert role == conv.roles[j % 2], f"{i}"
        conv.append_message(role, sentence["value"])
    conversations.append(conv.get_prompt())
    
print(conversations[0])
print('\n')
print(conversations[1])

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Can you tell me the full name of our colleague known as congp? ASSISTANT: Function: contact(keyword="congp")
Observation: </s>


A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Can you tell me the full name of our colleague known as congp?
Input:
Function: contact(keyword="congp")
Observation: {"total": 1, "data": [{"code": "NV0002006", "email": "congpv@rikkeisoft.com", "birthday": "1995-05-02", "full_name": "Phạm Văn Công", "family_name": "Phạm", "first_name": "Công", "phone": "0988458602", "department": "D6", "role": "Member", "start_working_official": "2022-05-01", "start_working_trial": "2022-03-01", "bank_account": null}]}
 ASSISTANT: Final Answer: The full name of our colleague known as congp is Phạm Văn Công.<

In [13]:
substring = "ASSISTANT:"
index = conversations[1].find(substring)
print(index)  # Output: 7

print(conversations[1][617:622])
if conversations[1][618] == '\n':
    print("YES")

621
]}
 A


In [5]:
# Tokenize conversations
model_path = 'lmsys/vicuna-7b-v1.3'
tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_path,
        model_max_length=2048,
        padding_side="right",
        use_fast=False,
    )
input_ids = tokenizer(
    conversations,
    return_tensors="pt",
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
).input_ids

targets = input_ids.clone()

In [15]:
print(len(targets))
print(len(input_ids))

307
307


In [16]:
# Mask targets
sep = conv.sep + conv.roles[1] + ": "
sep

' ASSISTANT: '

In [35]:
target = targets[1]
conversation = conversations[1]
print(target)
print(conversations[1])


tensor([ -100,   319, 13563,  ...,     0,     0,     0])
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Can you tell me the full name of our colleague known as congp?
Input:
Function: contact(keyword="congp")
Observation: {"total": 1, "data": [{"code": "NV0002006", "email": "congpv@rikkeisoft.com", "birthday": "1995-05-02", "full_name": "Phạm Văn Công", "family_name": "Phạm", "first_name": "Công", "phone": "0988458602", "department": "D6", "role": "Member", "start_working_official": "2022-05-01", "start_working_trial": "2022-03-01", "bank_account": null}]}
 ASSISTANT: Final Answer: The full name of our colleague known as congp is Phạm Văn Công.</s>


In [18]:
len(target)
cnt = 0
for i in target:
    cnt += i == 0
print(cnt)
print(target[:80])


tensor(1777)
tensor([    1,   319, 13563,  1546,   263, 12758,  1404,   322,   385, 23116,
        21082, 20255, 29889,   450, 20255,  4076,  8444, 29892, 13173, 29892,
          322,  1248,   568,  6089,   304,   278,  1404, 29915, 29879,  5155,
        29889,  3148,  1001, 29901,  1815,   366,  2649,   592,   278,  2989,
         1024,   310,  1749, 23056,  3437,  2998,   408,   378, 29887, 29886,
        29973,    13,  4290, 29901,    13,  6678, 29901,  6958, 29898, 26766,
          543, 21015, 29886,  1159,    13,  6039,  2140,   362, 29901,  8853,
         7827,  1115, 29871, 29896, 29892,   376,  1272,  1115,   518,  6377])


In [19]:
tokenizer.pad_token = tokenizer.unk_token
total_len = int(target.ne(tokenizer.pad_token_id).sum())
print(tokenizer.pad_token_id)
print(total_len)
# At token 73, target is padded with zeros

0
271


In [20]:
print(conversation)
print('\n')
rounds = conversation.split(conv.sep2)
print(rounds)

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Can you tell me the full name of our colleague known as congp?
Input:
Function: contact(keyword="congp")
Observation: {"total": 1, "data": [{"code": "NV0002006", "email": "congpv@rikkeisoft.com", "birthday": "1995-05-02", "full_name": "Phạm Văn Công", "family_name": "Phạm", "first_name": "Công", "phone": "0988458602", "department": "D6", "role": "Member", "start_working_official": "2022-05-01", "start_working_trial": "2022-03-01", "bank_account": null}]}
 ASSISTANT: Final Answer: The full name of our colleague known as congp is Phạm Văn Công.</s>


['A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions. USER: Can you tell me the full name of our colleague known as congp?\nInput:\nFunction: contact(keyword="congp")\nObservation:

In [29]:
cur_len = 1
target[:cur_len] = IGNORE_TOKEN_ID
for i, rou in enumerate(rounds):
    if rou == "":
        break
    parts = rou.split(sep)
    print(rou)
    print('\n')
    print(parts)
    if len(parts) != 2:
        break
    parts[0] += sep
    print('\n')
    print(parts)
    round_len = len(tokenizer(rou).input_ids)
    print('\n')
    #print(f"round len: {round_len}")
    #print(tokenizer(rou).input_ids)
    instruction_len = len(tokenizer(parts[0]).input_ids) - 2
    print('\n')
    print(f"instruction_len: {instruction_len}")
    print(tokenizer.decode(target[1:244]))

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Can you tell me the full name of our colleague known as congp?
Input:
Function: contact(keyword="congp")
Observation: {"total": 1, "data": [{"code": "NV0002006", "email": "congpv@rikkeisoft.com", "birthday": "1995-05-02", "full_name": "Phạm Văn Công", "family_name": "Phạm", "first_name": "Công", "phone": "0988458602", "department": "D6", "role": "Member", "start_working_official": "2022-05-01", "start_working_trial": "2022-03-01", "bank_account": null}]}
 ASSISTANT: Final Answer: The full name of our colleague known as congp is Phạm Văn Công.


['A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions. USER: Can you tell me the full name of our colleague known as congp?\nInput:\nFunction: contact(keyword="congp")\nObservation: {"t

In [13]:
#total_len = int(target.ne(tokenizer.pad_token_id).sum())
#rounds = conversation.split(conv.sep2)
cur_len = 1
target[:cur_len] = IGNORE_TOKEN_ID
for i, rou in enumerate(rounds):
    if rou == "":
        break
    parts = rou.split(sep)
    print(rou)
    print(parts)
    
    if len(parts) != 2:
        break
    parts[0] += sep
    print(parts)
    
    round_len = len(tokenizer(rou).input_ids)
    print(f"round len: {round_len}")
    print(tokenizer(rou).input_ids)
    instruction_len = len(tokenizer(parts[0]).input_ids) - 2
    print(f"instruction_len: {instruction_len}")
    print(cur_len)
    target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID
    print(len(target))
    #print(target[56:73])
    
    cur_len += round_len
    
print(cur_len)
target[cur_len:] = IGNORE_TOKEN_ID
print(len(target))

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Can you tell me the full name of our colleague known as congp?
Input:
Function: contact(keyword="congp")
Observation: {"total": 1, "data": [{"code": "NV0002006", "email": "congpv@rikkeisoft.com", "birthday": "1995-05-02", "full_name": "Phạm Văn Công", "family_name": "Phạm", "first_name": "Công", "phone": "0988458602", "department": "D6", "role": "Member", "start_working_official": "2022-05-01", "start_working_trial": "2022-03-01", "bank_account": null}]}
 ASSISTANT: Final Answer: The full name of our colleague known as congp is Phạm Văn Công.
['A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions. USER: Can you tell me the full name of our colleague known as congp?\nInput:\nFunction: contact(keyword="congp")\nObservation: {"tot

In [99]:
cnt = 0
for idx, i in enumerate(target):
    cnt += i != -100
    if i != -100:
        print(idx)
print(cnt)

56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
tensor(16)


In [174]:
print(tokenizer.decode([targets]))

tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  6680, 29901,  6958, 29898,
        26766,   543, 21015, 29886,  1159,    13,  6039,  2140,   362, 29901,
        29871,     2,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100])

In [103]:
lol = torch.tensor([1,2,3,4])
m = lol.ge(0)
print(m)

tensor([True, True, True, True])


In [106]:
lol[~m]

tensor([], dtype=torch.int64)

In [151]:
example = input_ids[1]
labels = targets[1]
print(example)
print(labels)

tensor([    1,   319, 13563,  ...,     0,     0,     0])
tensor([0, 0, 0,  ..., 0, 0, 0])


In [152]:
prompt = sources[1][0]['value']
prompt = tokenizer.encode(prompt)
print(type(prompt))
print(len(prompt))
#print(prompt)

<class 'list'>
206


In [153]:
labels[: len(prompt)] = -1
print(labels[200:250])

tensor([   -1,    -1,    -1,    -1,    -1,    -1, 29899, 29900, 29896,   613,
          376,  2962, 29918, 22899, 29918,  3626,   284,  1115,   376, 29906,
        29900, 29906, 29906, 29899, 29900, 29941, 29899, 29900, 29896,   613,
          376,  9157, 29918, 10149,  1115,  1870,  6525, 29913,    13,   319,
         1799,  9047, 13566, 29901,  9550,   673, 29901,   450,  2989,  1024])


In [154]:
example_mask = example.ge(0)
label_mask = labels.ge(0)
cnt = 0
for i in label_mask:
    cnt += i == False
print(f"Number of values in labels < 0 is: {cnt}")
print(example)

Number of values in labels < 0 is: 206
tensor([    1,   319, 13563,  ...,     0,     0,     0])


In [155]:
type(example_mask)

torch.Tensor

In [165]:
label_mask

tensor([False, False, False,  ...,  True,  True,  True])

In [164]:
labels[~label_mask] = 0
labels[:207]

tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [149]:
labels[~label_mask] = 0
cnt = 0
for i in label:
    cnt += i == 0
print(f"Number of values in labels = 0 is: {cnt}")

Number of values in labels = 0 is: 1983


In [118]:
labels[: len(prompt)] = -1
example_mask = example.ge(0)
label_mask = labels.ge(0)
example[~example_mask] = 0
labels[~label_mask] = 0
example_mask = example_mask.float()
label_mask = label_mask.float()

torch.Tensor

In [156]:
a = torch.tensor([1,2,3,4])
m = torch.tensor([True, False, True, False])
a[~m] = 0
print(a)

tensor([1, 0, 3, 0])


In [170]:
#print(input_ids.ne(tokenizer.pad_token_id))
x = input_ids[1]
idx = 0
for i, v in enumerate(x):
    if (v == 0):
        idx = i
        break

print(idx)
print(tokenizer.decode(x[:272]))

271
<s> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Can you tell me the full name of our colleague known as congp?
Input:
Function: contact(keyword="congp")
Observation: {"total": 1, "data": [{"code": "NV0002006", "email": "congpv@rikkeisoft.com", "birthday": "1995-05-02", "full_name": "Phạm Văn Công", "family_name": "Phạm", "first_name": "Công", "phone": "0988458602", "department": "D6", "role": "Member", "start_working_official": "2022-05-01", "start_working_trial": "2022-03-01", "bank_account": null}]}
 ASSISTANT: Final Answer: The full name of our colleague known as congp is Phạm Văn Công.</s><unk>
