In [50]:
import os
import argparse
import evaluate
import pandas as pd

from datasets import load_dataset
from transformers import AutoTokenizer

In [110]:
import torch
from tqdm import tqdm
from transformers import GPTNeoXForCausalLM, AutoTokenizer

device = "cuda"
model_id = "EleutherAI/polyglot-ko-1.3b"
model = GPTNeoXForCausalLM.from_pretrained(model_id).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)


def calc_ppl(model, encodings, stride=256):
    max_length = model.config.max_position_embeddings
    encodings['input_ids'] = torch.tensor(encodings['input_ids'])
    seq_len = encodings['input_ids'].size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings['input_ids'][:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            print(outputs)
            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [155]:
def calc_perplextiy_upload(model_name, dataset_name, save_local=True, save_local_path="./result_ppl", save_hf=False, save_hf_user_name=None):
    model = GPTNeoXForCausalLM.from_pretrained(model_name).to("cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_name, bos_token = '<|endoftext|>',add_bos_token = True)
    # tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token':'<|endoftext|>'})

    def encode_pad_preprocess(examples):
        return tokenizer(examples['instruction'], max_length=256, truncation=True, return_tensors='pt')

    dataset = load_dataset(dataset_name)
    preprocessed_data = dataset.map(encode_pad_preprocess, num_proc=24)
    # preprocessed_data = preprocessed_data.select([0,1,2,3,4])
    print(preprocessed_data)

    # calculate perplexity
    # perplexity = evaluate.load("perplexity", module_type="metric")
    # instruction = preprocessed_data['train']['trunc_instruction']

    # update perplexity
    # ppl_results = perplexity.compute(model_id=model_name, add_start_token=True, predictions=instruction)
    ppl_results = []
    for data in tqdm(preprocessed_data['train'].select([0,1,2,3,4])):
        ppl = calc_ppl(model, data)
        ppl_results.append(ppl.detach().cpu().tolist())
    
    round_ppl_results = [round(ppl, 2) for ppl in  ppl_results]

    df = pd.DataFrame(preprocessed_data['train'].select([0,1,2,3,4]))
    df['ppl'] = round_ppl_results
    df_dataset = pd.DataFrame(df.sort_values(by=['len', 'ppl']), columns=['len', 'ppl', 'group', 'instruction', 'output'])
    df_dataset = df_dataset.dropna(axis=0)
    
    datasetname = dataset_name.split('/')[-1]
    model_name = model_name.split('/')[-1]
    
    if save_local and save_local_path: # save to local folder
        os.makedirs(save_local_path, exist_ok=True)
        df_dataset.to_json(f"./{save_local_path}/{datasetname}_ppl_{model_name}.json", orient='records', lines=True, force_ascii=False)

    # if save_hf and save_hf_user_name: # push to hub
    #     dataset = preprocessed_data['train'].add_column("ppl", round_ppl_results)
    #     dataset = dataset.remove_columns(['input_ids', 'attention_mask', 'trunc_instruction',])
    #     # model_name = model_name.split('/')[-1]
    #     dataset.push_to_hub(f'{save_hf_user_name}/{datasetname}_ppl_instruction_{model_name}')

In [156]:
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# dataset_name = 'beomi/KoAlpaca-v1.1a'
# dataset_name = 'nlpai-lab/kullm-v2'
dataset_name = 'nayohan/koquality_raw_test'
# model_name = 'gpt2'
model_name = 'EleutherAI/polyglot-ko-1.3b'

# 2. calculate perplexity and save to ppl column
calc_perplextiy_upload(
                        model_name=model_name, 
                        dataset_name=dataset_name, 
                        save_local=True, 
                        save_local_path="./test", 
                        )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset parquet (/home/uj-user/.cache/huggingface/datasets/nayohan___parquet/nayohan--koquality_raw_test-7121b638de347046/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/uj-user/.cache/huggingface/datasets/nayohan___parquet/nayohan--koquality_raw_test-7121b638de347046/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-77109c934b63e4d0_*_of_00024.arrow


DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'group', 'len', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 19893
    })
})


 20%|██        | 1/5 [00:00<00:00,  7.49it/s]

CausalLMOutputWithPast(loss=tensor(0.4292, device='cuda:0'), logits=tensor([[[-6.0962, -5.9330,  1.5051,  ..., -5.7625, -6.3668, -6.1495],
         [-6.5881, -5.3445,  3.1259,  ..., -6.0045, -5.6219, -5.8318]]],
       device='cuda:0'), past_key_values=((tensor([[[[-1.3485,  0.0120,  1.6234,  ...,  0.3681,  0.5027,  0.6636],
          [ 0.7669,  1.0783,  0.9997,  ..., -0.4371, -1.2282, -0.6915]],

         [[ 1.0028, -0.5385,  1.0567,  ...,  0.7846,  1.2648, -0.3811],
          [-0.1673, -0.5478,  1.3153,  ..., -0.5689,  1.4141, -0.4907]],

         [[-1.3489,  0.3384, -0.7244,  ...,  1.0898, -0.7150, -0.7122],
          [-1.0388,  0.7522, -1.4089,  ..., -1.6300,  0.5918, -2.3881]],

         ...,

         [[-0.2180, -0.2144, -1.5268,  ..., -0.4065, -0.5441, -0.9602],
          [ 0.5454,  0.7724, -0.1551,  ..., -0.1037,  0.4529, -0.1537]],

         [[ 0.3172,  0.8549,  1.6704,  ...,  0.9283, -0.0199,  1.2682],
          [ 2.9501,  0.0987, -1.0935,  ..., -0.4038,  1.2458,  0.6766]],



 60%|██████    | 3/5 [00:00<00:00, 10.03it/s]

CausalLMOutputWithPast(loss=tensor(nan, device='cuda:0'), logits=tensor([[[-7.7216, -7.8633,  3.1863,  ..., -7.9368, -7.7910, -7.9001]]],
       device='cuda:0'), past_key_values=((tensor([[[[ 1.7420, -0.0134, -1.2467,  ..., -1.1550,  0.3556,  1.0059]],

         [[-1.3722, -0.0223, -0.2412,  ..., -0.3326,  0.9623, -0.7811]],

         [[-1.2929,  0.5856, -0.0420,  ..., -0.0125, -0.5009, -0.4914]],

         ...,

         [[ 0.9614,  0.3040, -1.9316,  ..., -2.4641, -1.7561, -1.0011]],

         [[-0.4830,  0.2604, -2.0472,  ...,  0.0758,  0.4812,  0.6786]],

         [[-0.2767,  0.2672,  0.3132,  ...,  0.4161,  0.5743, -0.4530]]]],
       device='cuda:0'), tensor([[[[ 0.3357,  0.2906, -0.0236,  ...,  0.0763, -0.4044,  0.4842]],

         [[-0.6508, -0.6121,  0.3738,  ..., -0.5202,  0.0479,  0.8156]],

         [[ 0.5675, -1.2756, -0.4415,  ...,  0.5886,  0.4032, -0.1682]],

         ...,

         [[-1.2089,  2.1367, -0.9819,  ...,  0.1918, -0.6461, -0.4867]],

         [[-0.2151,  0.

100%|██████████| 5/5 [00:00<00:00, 10.33it/s]

CausalLMOutputWithPast(loss=tensor(nan, device='cuda:0'), logits=tensor([[[-7.7216, -7.8633,  3.1863,  ..., -7.9368, -7.7910, -7.9001]]],
       device='cuda:0'), past_key_values=((tensor([[[[ 1.7420, -0.0134, -1.2467,  ..., -1.1550,  0.3556,  1.0059]],

         [[-1.3722, -0.0223, -0.2412,  ..., -0.3326,  0.9623, -0.7811]],

         [[-1.2929,  0.5856, -0.0420,  ..., -0.0125, -0.5009, -0.4914]],

         ...,

         [[ 0.9614,  0.3040, -1.9316,  ..., -2.4641, -1.7561, -1.0011]],

         [[-0.4830,  0.2604, -2.0472,  ...,  0.0758,  0.4812,  0.6786]],

         [[-0.2767,  0.2672,  0.3132,  ...,  0.4161,  0.5743, -0.4530]]]],
       device='cuda:0'), tensor([[[[ 0.3357,  0.2906, -0.0236,  ...,  0.0763, -0.4044,  0.4842]],

         [[-0.6508, -0.6121,  0.3738,  ..., -0.5202,  0.0479,  0.8156]],

         [[ 0.5675, -1.2756, -0.4415,  ...,  0.5886,  0.4032, -0.1682]],

         ...,

         [[-1.2089,  2.1367, -0.9819,  ...,  0.1918, -0.6461, -0.4867]],

         [[-0.2151,  0.




In [169]:
model_name = 'EleutherAI/polyglot-ko-1.3b'
model_name='gpt2'
dataset_name = 'nayohan/koquality_raw_test'

# tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name, bos_token = '<|endoftext|>', add_bos_token = True)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token':'<|endoftext|>'})

dataset = load_dataset(dataset_name)

# truncate max length, add padding true
def encode_pad_preprocess(examples):
    return tokenizer(examples['instruction'], max_length=256, truncation=True, padding=True, return_tensors='pt')

dataset = dataset.remove_columns([ 'output', 'group', 'len'])
encode_pad_data = dataset.map(encode_pad_preprocess, num_proc=24)
print(encode_pad_data)

encodings = encode_pad_data['train']#tokenizer("\n\n".join(trunc_data['train']["instruction"]), return_tensors="pt")
# encodings = tokenizer("\n\n".join(encode_pad_data["train"]['instruction']), return_tensors="pt")

print(encodings[1])
ppl=calc_ppl(model, encodings[1])

Found cached dataset parquet (/home/uj-user/.cache/huggingface/datasets/nayohan___parquet/nayohan--koquality_raw_test-7121b638de347046/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/uj-user/.cache/huggingface/datasets/nayohan___parquet/nayohan--koquality_raw_test-7121b638de347046/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-15fb510b5d7841ea_*_of_00024.arrow


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input_ids', 'attention_mask'],
        num_rows: 19893
    })
})
{'instruction': '계속', 'input_ids': [[166, 111, 226, 168, 228, 235]], 'attention_mask': [[1, 1, 1, 1, 1, 1]]}
CausalLMOutputWithPast(loss=tensor(5.3483, device='cuda:0'), logits=tensor([[[-6.2360, -6.7630,  4.0732,  ..., -6.9794, -6.4452, -6.2739],
         [-4.9848, -4.8768,  4.2945,  ..., -4.7742, -4.8398, -4.4820],
         [-7.2664, -7.4936,  4.7605,  ..., -7.3655, -7.3848, -7.7008],
         [-4.8484, -4.0944,  2.1995,  ..., -4.7104, -5.7269, -5.2265],
         [-4.6654, -3.8172,  0.2965,  ..., -3.4142, -5.4253, -5.4635],
         [-7.1772, -7.3380,  5.8539,  ..., -7.3241, -7.3286, -7.5119]]],
       device='cuda:0'), past_key_values=((tensor([[[[-7.7113e-01,  2.8692e-01, -6.8897e-02,  ..., -1.3433e+00,
           -3.0351e-01, -1.2829e+00],
          [ 4.5774e-02,  6.8449e-01, -2.6109e-01,  ...,  2.7355e-01,
           -4.5152e-02,  1.6970e+00],
   

In [7]:
data = """
Just as a comprehensive, well-crafted textbook can provide a student with the necessary knowledge to
master a new subject, our work demonstrates the remarkable impact of high-quality data in honing a
language model’s proficiency in code-generation tasks. By crafting “textbook quality” data we were able
to train a model that surpasses almost all open-source models on coding benchmarks such as HumanEval
and MBPP despite being 10x smaller in model size and 100x smaller in dataset size. We hypothesize
that such high quality data dramatically improves the learning efficiency of language models for code as
they provide clear, self-contained, instructive, and balanced examples of coding concepts and skills.
There remains a number of limitations of our model compared to larger models for code. Firstly,
phi-1 is specialized in Python coding, which restricts its versatility compared to multi-language models.
Secondly, phi-1 lacks the domain-specific knowledge of larger models such as programming with specific
APIs or using less common packages. Lastly, due to the structured nature of the datasets and the lack
of diversity in terms of language and style, phi-1 is less robust to stylistic variations or errors in the
prompt (for instance, its performance substantially degrades when there are grammatical mistakes in the
prompt). We expand on these limitations and give examples of the failure modes of phi-1 in Appendix B.
None of these limitations seem fundamental, and with more work our approach could be used to tackle
each one of them, although it is unclear what scaling might be necessary to overcome them (both for
the model size and the dataset size). We also believe that significant gains could be achieved by using
GPT-4 to generate the synthetic data instead of GPT-3.5, as we noticed that GPT-3.5 data has a high
error rate. It is interesting that phi-1 is able to achieve such high coding proficiency despite those errors
(a similar phenomenon was observed in [AZL23] where a language model can be trained on data with
100% error rate and still generate correct answers at test time).
More generally, our work provides evidence that developing good methodology for creating highquality datasets is a central direction of research for advancing natural language processing and related
fields (see also [JWJ+
23] for further evidence). However, creating high-quality datasets is not a trivial
task, and it poses several challenges that need to be addressed. One challenge is to ensure that the
dataset covers all the relevant content and concepts that one wants the model to learn, and that it does
so in a balanced and representative way. Another challenge is to ensure that the dataset is truly diverse
and non-repetitive, so that the model does not simply overfit to the data or memorize specific patterns or
solutions. This requires finding ways to inject randomness and creativity into the data generation process,
while still maintaining the quality and the coherence of the examples. Moreover, even after creating such
datasets, we lack a good methodology to measure and evaluate the amount of diversity and redundancy
in the data. For example, if we have a dataset with coding exercises, it is hard to determine how many
different variations of each exercise exist, and how they are distributed across the dataset. Finally, as
language models themselves will be used to curate data for future language models, it further increases
the urgency on the ethical and social implications of training such models, such as the accountability, the
transparency, and the bias of the data and the models that are involved in this process.
"""

In [8]:
data.replace('\n', ' ')#.replace('0', '').replace('•', '')

' Just as a comprehensive, well-crafted textbook can provide a student with the necessary knowledge to master a new subject, our work demonstrates the remarkable impact of high-quality data in honing a language model’s proficiency in code-generation tasks. By crafting “textbook quality” data we were able to train a model that surpasses almost all open-source models on coding benchmarks such as HumanEval and MBPP despite being 10x smaller in model size and 100x smaller in dataset size. We hypothesize that such high quality data dramatically improves the learning efficiency of language models for code as they provide clear, self-contained, instructive, and balanced examples of coding concepts and skills. There remains a number of limitations of our model compared to larger models for code. Firstly, phi-1 is specialized in Python coding, which restricts its versatility compared to multi-language models. Secondly, phi-1 lacks the domain-specific knowledge of larger models such as programmi