In [3]:
import torch
import numpy as np


In [3]:
data = torch.load('pattern_matrices.pt')
print(len(data), data[0].keys())

2000 dict_keys(['prompt_text', 'prompt_token_ids', 'prompt_attention_mask', 'token_ids', 'token_pattern_matrices'])


# 1. 统计数据集信息

1. prompt 统计信息

In [12]:
prompt_lens = np.array([len(sample['prompt_token_ids']) for sample in data.values()])
prompt_lens.min(), prompt_lens.max(), prompt_lens.mean(), prompt_lens.std()

(41, 382, 65.696, 30.360823177246033)

2. decoding 统计信息

In [22]:
decoding_tokens = []
for i in range(len(data)):
    sample = data[i]
    decoding_tokens.append(sample['token_ids'][prompt_lens[i]:])
decoding_tokens = torch.stack(decoding_tokens)
print(decoding_tokens.shape)
decoding_token_lens = np.array([len(x) for x in decoding_tokens])
decoding_token_lens.min(), decoding_token_lens.max(), decoding_token_lens.mean(), decoding_token_lens.std()

torch.Size([2000, 64])


(64, 64, 64.0, 0.0)

In [23]:
decoding_token_pattern_matrices = []
for i in range(len(data)):
    sample = data[i]
    decoding_token_pattern_matrices.append(sample['token_pattern_matrices'][prompt_lens[i]:]) # (#decoding_tokens, #layers, #experts)
decoding_token_pattern_matrices = torch.stack(decoding_token_pattern_matrices)
decoding_token_pattern_matrices.shape, decoding_token_pattern_matrices[0].shape, decoding_token_pattern_matrices[0][0]

(torch.Size([2000, 63, 32, 8]),
 torch.Size([63, 32, 8]),
 tensor([[1., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 1., 0., 0., 0., 0., 1.],
         [0., 0., 0., 1., 0., 0., 1., 0.],
         [0., 0., 0., 0., 0., 1., 1., 0.],
         [0., 0., 0., 0., 1., 1., 0., 0.],
         [1., 0., 0., 0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 1., 1., 0., 0.],
         [0., 0., 0., 1., 0., 1., 0., 0.],
         [0., 0., 0., 1., 0., 1., 0., 0.],
         [0., 0., 0., 0., 1., 0., 1., 0.],
         [0., 0., 1., 0., 1., 0., 0., 0.],
         [1., 1., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 1., 0.],
         [1., 0., 1., 0., 0., 0., 0., 0.],
         [0., 1., 1., 0., 0., 0., 0., 0.],
         [0., 1., 1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 1., 0.],
         [0., 0., 0., 0., 1., 0., 0., 1.],
         [0., 1., 0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 1.],
         [1., 0., 0., 0., 0., 0., 1., 0.],
         [1., 0., 0., 0., 0., 1., 0., 0

# 构建 Predictor 训练集

In [57]:
from datasets import Dataset
hf_data = {
    'prompt_text': [],
    'prompt_tokens_len': [],
    'token_ids': [],
    # 'prompt_token_ids': [],
    # 'decoding_token_ids': [],
    'token_pattern_matrices': []
}

# for i in range(10):
for i in range(len(data)):
    sample = data[i]
    prompt_text = sample['prompt_text']
    padded_prompt_token_ids = sample['prompt_token_ids']
    prompt_attention_mask = sample['prompt_attention_mask']
    start_index = prompt_attention_mask.argmax().item()
    token_ids = sample['token_ids'][start_index:-1]
    prompt_token_ids = padded_prompt_token_ids[start_index:]
    prompt_tokens_len = len(prompt_token_ids)
    decoding_token_ids = sample['token_ids'][len(prompt_attention_mask):-1]
    token_pattern_matrices = sample['token_pattern_matrices'][start_index:]
    assert len(token_ids)==len(decoding_token_ids)+len(prompt_token_ids)
    assert token_ids.numpy().tolist()==prompt_token_ids.numpy().tolist()+decoding_token_ids.numpy().tolist()
    assert len(token_pattern_matrices)==len(token_ids)
    hf_data['prompt_text'].append(prompt_text)
    hf_data['prompt_tokens_len'].append(prompt_tokens_len)
    hf_data['token_ids'].append(token_ids)
    # hf_data['prompt_token_ids'].append(prompt_token_ids)
    # hf_data['decoding_token_ids'].append(decoding_token_ids)
    hf_data['token_pattern_matrices'].append(token_pattern_matrices)
hf_data = Dataset.from_dict(hf_data)
hf_data

Dataset({
    features: ['prompt_text', 'prompt_tokens_len', 'token_ids', 'token_pattern_matrices'],
    num_rows: 2000
})

In [58]:
prompt_token_ids.shape, prompt_attention_mask, prompt_attention_mask.sum(), prompt_attention_mask.argmax().item(), decoding_token_ids.shape, token_ids.shape, token_pattern_matrices.shape

(torch.Size([382]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        

In [59]:
hf_data.push_to_hub('marsggbo/mixtral_8x7b_moe_alpaca_2k_token_pattern')

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:01<00:00,  1.08ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.94s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/marsggbo/mixtral_8x7b_moe_alpaca_2k_token_pattern/commit/2fc2ba69bd0cf6b5f5dba433d10f5641ba53048f', commit_message='Upload dataset', commit_description='', oid='2fc2ba69bd0cf6b5f5dba433d10f5641ba53048f', pr_url=None, pr_revision=None, pr_num=None)

In [61]:
from datasets import load_dataset

dataset = load_dataset("marsggbo/mixtral_8x7b_moe_alpaca_2k_token_pattern")
dataset

Downloading readme: 100%|██████████| 459/459 [00:00<00:00, 4.31MB/s]
Downloading data: 100%|██████████| 8.02M/8.02M [00:00<00:00, 8.42MB/s]
Generating train split: 100%|██████████| 2000/2000 [00:01<00:00, 1911.88 examples/s]


DatasetDict({
    train: Dataset({
        features: ['prompt_text', 'prompt_tokens_len', 'token_ids', 'token_pattern_matrices'],
        num_rows: 2000
    })
})

In [66]:
from datasets import Dataset

In [63]:
lens = [x for x in dataset['train']['prompt_tokens_len']]
print(np.min(lens), np.max(lens), np.mean(lens), np.std(lens))
# lens = [len(x) for x in dataset['test']['prompt_token_ids']]
# print(np.min(lens), np.max(lens), np.mean(lens), np.std(lens))

37 382 59.2955 21.39596176267849


In [69]:

from typing import List, Optional, Tuple, Union
Union[str, Dataset]

typing.Union[str, datasets.arrow_dataset.Dataset]

In [65]:
sample = dataset['train'][0]
prompt_tokens_len = sample['prompt_tokens_len']
len(sample['token_ids'][:prompt_tokens_len]), np.stack(sample['token_pattern_matrices']).shape

(39, (102, 32, 8))

In [77]:
labels = np.stack(sample['token_pattern_matrices'])
labels = torch.from_numpy(labels).int()
labels

tensor([[[0, 1, 0,  ..., 1, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 1],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 1]],

        [[0, 0, 0,  ..., 1, 1, 0],
         [1, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 1, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 1, 0]],

        [[0, 0, 0,  ..., 1, 1, 0],
         [0, 0, 0,  ..., 0, 0, 1],
         [0, 0, 0,  ..., 1, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 1, 0],
         [0, 0, 1,  ..., 0, 1, 0]],

        ...,

        [[0, 0, 1,  ..., 1, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 0, 0],
         [0, 1, 0,  ..., 0, 1, 0],
         [0, 0, 0,  ..., 0, 1, 0]],

        [[1, 0, 0,  ..., 0, 0, 1],
         [0, 0, 0,  ..., 1, 0, 1],
         [1,

In [73]:
torch.tensor(sample['token_ids'], dtype=int)

tensor([    1, 20811,   349,   396, 13126,   369, 13966,   264,  3638, 28723,
        12018,   264,  2899,   369,  6582,  1999,  2691,   274,   272,  2159,
        28723,    13,    13, 27332,  3133,  3112, 28747,    13, 28784,   648,
        28705, 28770,   327,  1550,    13,    13, 27332, 12107, 28747,    13,
        28784,   648, 28705, 28770, 21588, 28705, 28774, 28723,     2, 10020,
        28744,     2, 10020, 28744,    13,    13, 27332, 11530, 12107, 28747,
           13, 28774, 28723,     2, 10020, 28744,     2, 10020, 28744,    13,
           13, 27332,  1529, 11009,   352, 28747,    13,  7477,   368,   967,
        28705, 28784,   304, 28705, 28770,  2553, 28725,   368,   625, 28705,
        28774, 28723,   415,  1474, 28705, 28774,   349,   272,  2648,   302,
        28705, 28784])

In [15]:
def acc_precision_recall_f1(y_true_origin, y_pred_origin):
    bs, seq_len, num_layer, num_experts = y_true_origin.shape
    y_true = np.reshape(y_true_origin, (bs, seq_len, num_layer * num_experts))
    y_pred = np.reshape(y_pred_origin, (bs, seq_len, num_layer * num_experts))
    y_true = np.transpose(y_true, (1, 0, 2)) # (seq, bs, num_layer * num_experts)
    y_pred = np.transpose(y_pred, (1, 0, 2)) # (seq, bs, num_layer * num_experts)
    y_true = (y_true.sum(1)>0).astype(int) # (seq, num_layer * num_experts)
    y_pred = (y_pred.sum(1)>0).astype(int) # (seq, num_layer * num_experts)
    print(y_true.shape, y_true)
    # 真正例 (True Positives)
    TP = np.sum((y_true == 1) & (y_pred == 1))
    
    # 假正例 (False Positives)
    FP = np.sum((y_true == 0) & (y_pred == 1))
    
    # 假负例 (False Negatives)
    FN = np.sum((y_true == 1) & (y_pred == 0))
    
    # 真负例 (True Negatives)
    TN = np.sum((y_true == 0) & (y_pred == 0))

    y_true = y_true.reshape(-1, 256)
    y_pred = y_pred.reshape(-1, 256)
    print(f"origin y_true.shape={y_true.shape}")
    indices = np.any(y_true, axis=-1)
    print(indices.shape)
    y_true = y_true[indices]
    y_pred = y_pred[indices]
    print(f"filtered y_true.shape={y_true.shape}")

    # 准确率
    num_tokens = y_true.shape[0]
    accuracy = TP / (num_tokens*64)
    recall = 0
    precision = 0
    f1 = 0
    print(f"non-padding ratio: {indices.sum()}/{len(indices)}={indices.sum()/len(indices)}\n")

    return {
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'f1': f1,
    }

In [16]:
seq_len = 12
pad_seq_len = 4 
y_true = torch.randint(0,2,(4,8,32,8)).numpy()
y_pred = torch.randint(0,2,(4,8,32,8)).numpy()
print(acc_precision_recall_f1(y_true, y_pred))

(8, 256) [[1 1 1 ... 1 1 1]
 [1 1 0 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [0 1 1 ... 1 1 0]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]]
origin y_true.shape=(8, 256)
(8,)
filtered y_true.shape=(8, 256)
non-padding ratio: 8/8=1.0

{'accuracy': 3.46875, 'recall': 0, 'precision': 0, 'f1': 0}


In [6]:
dataset_names = {
    "auto_categorization": 328,
    "tense": 286,
    "disfl_qa": 8000,
    "semantic_parsing_in_context_sparc": 1160,
    "word_sorting": 1900,
    "linguistics_puzzles": 2000,
}

In [14]:
import datasets
dataset_name = "tasksource/bigbench"
# names = datasets.get_dataset_config_names(dataset_name)

names = list(dataset_names.keys())
all_inputs = []
for name in names:
    print(name)
    all_inputs.append(datasets.load_dataset(dataset_name, name))

auto_categorization


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


tense
disfl_qa
semantic_parsing_in_context_sparc
word_sorting
linguistics_puzzles


In [15]:
all_inputs[0]

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets', 'multiple_choice_targets', 'multiple_choice_scores', 'idx'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['inputs', 'targets', 'multiple_choice_targets', 'multiple_choice_scores', 'idx'],
        num_rows: 65
    })
})

In [18]:

train_all_inputs = []
valid_all_inputs = []
for dataset in all_inputs:
    train_all_inputs += [text for text in dataset["train"]["inputs"]]
    valid_all_inputs += [text for text in dataset["validation"]["inputs"]]
len(train_all_inputs), len(valid_all_inputs)

(10936, 2733)