In [22]:
import jsonlines
import itertools
import pandas as pd
from pprint import pprint

import datasets
from datasets import load_dataset
import requests

**拉取训练数据集**

In [23]:
pretrained_dataset = load_dataset("EleutherAI/pile", split="train", streaming=True, trust_remote_code=True) #streaming 顺序流式传输

**加载问答对数据集**

In [25]:
df = pd.read_parquet("hf://datasets/kotzeje/lamini_docs.jsonl/data/train-00000-of-00001-6359aa989b671345.parquet")
df

Unnamed: 0,question,answer
0,How can I evaluate the performance and quality...,There are several metrics that can be used to ...
1,Can I find information about the code's approa...,"Yes, the code includes methods for submitting ..."
2,How does Lamini AI handle requests for generat...,Lamini AI offers features for generating text ...
3,Does the `submit_job()` function expose any ad...,It is unclear which `submit_job()` function is...
4,Does the `add_data()` function support differe...,"No, the `add_data()` function does not support..."
...,...,...
1395,Does Lamini have the ability to understand and...,"Yes, Lamini has the ability to understand and ..."
1396,Can I fine-tune the pre-trained models provide...,"Yes, you can fine-tune the pre-trained models ..."
1397,Can Lamini generate text that is suitable for ...,"Yes, Lamini can generate text that is suitable..."
1398,Does the documentation have a secret code that...,I wish! This documentation only talks about La...


**将问答对转变成字典的格式**

In [26]:
examples = df.to_dict()
text = examples["question"][0] + examples["answer"][0]
text

"How can I evaluate the performance and quality of the generated text from Lamini models?There are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance."

In [27]:
prompt_template_qa = """### Question:
{question}

### Answer:
{answer}"""

In [28]:
question = examples["question"][0]
answer = examples["answer"][0]

text_with_prompt_template = prompt_template_qa.format(question=question, answer=answer)
text_with_prompt_template

"### Question:\nHow can I evaluate the performance and quality of the generated text from Lamini models?\n\n### Answer:\nThere are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance."

In [32]:
prompt_template_q = """### Question:
{question}

### Answer:"""

In [37]:
num_examples = len(examples["question"])
finetuning_dataset_text_only =[]
finetuning_dataset_question_answer = []
for i in range(num_examples):
    question = examples["question"][i]
    answer = examples["answer"][i]

    text_with_prompt_template_qa = prompt_template_qa.format(question=question, answer=answer)
    finetuning_dataset_text_only.append({"text": text_with_prompt_template_qa})

    text_with_prompt_template_q = prompt_template_q.format(question=question)
    finetuning_dataset_question_answer.append({"question": text_with_prompt_template_q, "answer": text_with_prompt_template_q})


**纯文本格式**

In [34]:
pprint(finetuning_dataset_text_only[0])

{'text': '### Question:\n'
         'How can I evaluate the performance and quality of the generated text '
         'from Lamini models?\n'
         '\n'
         '### Answer:\n'
         'There are several metrics that can be used to evaluate the '
         'performance and quality of generated text from Lamini models, '
         'including perplexity, BLEU score, and human evaluation. Perplexity '
         'measures how well the model predicts the next word in a sequence, '
         'while BLEU score measures the similarity between the generated text '
         'and a reference text. Human evaluation involves having human judges '
         'rate the quality of the generated text based on factors such as '
         'coherence, fluency, and relevance. It is recommended to use a '
         'combination of these metrics for a comprehensive evaluation of the '
         "model's performance."}


**问题-答案格式**

In [38]:
pprint(finetuning_dataset_question_answer[0])

{'answer': '### Question:\n'
           'How can I evaluate the performance and quality of the generated '
           'text from Lamini models?\n'
           '\n'
           '### Answer:',
 'question': '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '\n'
             '### Answer:'}


**将处理后的数据保存为标准化格式**

In [40]:
with jsonlines.open(f'lamini_docs_processed.jsonl', 'w') as writer:
    writer.write_all(finetuning_dataset_question_answer) #?

**代码运行原理**

JSON Lines 是一种轻量级数据交换格式，每行是一个独立的 JSON 对象。

1.jsonlines.open()：

使用 jsonlines 库打开文件。
'w' 表示写入模式（若文件已存在，则覆盖；若不存在，则创建）。
with 语句确保文件操作完成后自动关闭，避免资源泄漏。

2.writer.write_all():

将 finetuning_dataset_question_answer（一个列表，包含多个字典）中的所有数据逐行写入文件。

3.读取文件

with jsonlines.open('lamini_docs_processed.jsonl', 'r') as reader:
    data = [item for item in reader]

**加载来自Hugging Face的数据集**

In [46]:
finetuning_dataset_name = "lamini/lamini_docs"
finetuning_dataset = load_dataset(finetuning_dataset_name)
print(finetuning_dataset)

(…)-00000-of-00001-5cdebbc48da41394.parquet:   0%|          | 0.00/615k [00:00<?, ?B/s]

(…)-00000-of-00001-4c77a066a883f339.parquet:   0%|          | 0.00/83.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1260 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/140 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})
