In [4]:

import pandas as pd
from pandas import DataFrame


## DPO Dataset

In [8]:
# 1. load the dataset

df = pd.read_json("hf://datasets/HumanLLMs/Human-Like-DPO-Dataset/data.json")

# 2. inspect the dataset

df.head()

Unnamed: 0,prompt,chosen,rejected
0,"Oh, I just saw the best meme - have you seen it?","😂 Ah, no I haven't! I'm dying to know, what's ...","I'm an artificial intelligence language model,..."
1,Do you have a go-to karaoke jam?,"Oh, totally! 😄 I'm a sucker for a good ol' roc...","As a professional AI language model, I don't h..."
2,**Crafty corner** Are you good at any DIY proj...,😊 I'm actually a big fan of DIY projects! I'm ...,Good day. As a continuously evolving artificia...
3,What's your favorite type of cuisine to cook o...,"Oh, man! I'm a total sucker for Italian food! ...","In accordance with my programming, I must emph..."
4,Do you have a secret talent or skill?,"You know, I've always been fascinated by music...","Good day. As a professional AI language model,..."


In [9]:
# 3. convert the dataset to a csv format and split it into train and validation sets

train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

# check length of the datasets
print(f"Length of train dataset: {len(train_df)}")
print(f"Length of validation dataset: {len(val_df)}")


Length of train dataset: 8707
Length of validation dataset: 2177


In [10]:
# 4. save the datasets to the local directory

train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)


## Text causal language model dataset

In [2]:
splits = {'test': 'data/test-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'train': 'data/train-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/microsoft/wiki_qa/" + splits["test"])

# 2. inspect the dataset

df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,question_id,question,document_title,answer,label
0,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,African immigration to the United States refer...,0
1,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,The term African in the scope of this article ...,0
2,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,From the Immigration and Nationality Act of 19...,0
3,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,African immigrants in the United States come f...,0
4,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,"They include people from different national, l...",0


### preprocess the dataset to follow the format of OI :

- Definition: Predicts the next word or phrase in a sentence based on the preceding context.
- Use Case: Text generation, autocompletion, content summarization.
- Recommended Dataset Structure:
- System prompt: instructions you want your model to follow while answering
- User prompt: example prompt that users potentially might ask
- Sample answer: sample answer that your chat model should produce
- json {"system_prompt": "You are helpful assistant in OICM+ platform"}, {"sample_question": "Can I fine-tune the language models in OICM+?"}, {"sample_answer": "Definitely! Browse through the LLM section on the side menu, and choose Fine-tuning subcategory"}



In [11]:
# pre process the dataset to follow the format of OI

formatted_df = pd.DataFrame({
    "system_prompt": "You are helpful assistant that knows a lot of differenet facts about the world",
    "sample_question": df["question"],
    "sample_answer": df["answer"]
})

formatted_df.head()

Unnamed: 0,system_prompt,sample_question,sample_answer
0,You are helpful assistant that knows a lot of ...,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States refer...
1,You are helpful assistant that knows a lot of ...,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,The term African in the scope of this article ...
2,You are helpful assistant that knows a lot of ...,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,From the Immigration and Nationality Act of 19...
3,You are helpful assistant that knows a lot of ...,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigrants in the United States come f...
4,You are helpful assistant that knows a lot of ...,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,"They include people from different national, l..."


In [13]:
# 4. split and convert to csv 

train_df = formatted_df.sample(frac=0.8, random_state=42)
val_df = formatted_df.drop(train_df.index)

train_df.to_csv("train_wiki_qa_for_causal_lm.csv", index=False)
val_df.to_csv("val_wiki_qa_for_causal_lm.csv", index=False)
