<a href="https://colab.research.google.com/github/logikon-ai/deliberation-datasets/blob/main/notebooks/create_oasst1_delib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logikon Deliberation Subset of OpenAssistant OASST1 data

- https://huggingface.co/datasets/OpenAssistant/oasst1
- https://huggingface.co/datasets/logikon/oasst1-delib

In [170]:
# set up
!pip install datasets pandas huggingface_hub



# Imports

In [132]:
import pandas as pd
from datasets import load_dataset

# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# helper function
def get_label_value(row, label_id):
  names = row["labels"]["name"]
  values = row["labels"]["value"]

  for name, value in zip(names, values):
    if name == label_id:
      return value

  raise ValueError(f"Unknown label: {label_id}")


# Load Dataset

In [3]:
# load dataset from huggingface datasets
ds = load_dataset("OpenAssistant/oasst1")
print(ds)

Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})


# Create Pandas Dataframe

In [154]:
# lets convert the train / validation splits to a pandas df
df_train = ds["train"].to_pandas()
df_eval = ds["validation"].to_pandas()

In [155]:
# look at the df info
df_train.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84437 entries, 0 to 84436
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       84437 non-null  object 
 1   parent_id        74591 non-null  object 
 2   user_id          84437 non-null  object 
 3   created_date     84437 non-null  object 
 4   text             84437 non-null  object 
 5   role             84437 non-null  object 
 6   lang             84437 non-null  object 
 7   review_count     84437 non-null  int32  
 8   review_result    83732 non-null  object 
 9   deleted          84437 non-null  bool   
 10  rank             48730 non-null  float64
 11  synthetic        84437 non-null  bool   
 12  model_name       0 non-null      object 
 13  detoxify         72297 non-null  object 
 14  message_tree_id  84437 non-null  object 
 15  tree_state       84437 non-null  object 
 16  emojis           71496 non-null  object 
 17  labels      

In [156]:
# look at a sample row in a json format we can easily read
example = df_train.sample(1).transpose().to_dict()
example

{38656: {'message_id': '5df17b3a-f814-491d-94d4-a52bf4fd955d',
  'parent_id': None,
  'user_id': '0c5b56ec-a2e1-4722-b981-74281ccfb13b',
  'created_date': '2023-02-06T20:22:58.970180+00:00',
  'text': 'Wie geht es dir?',
  'role': 'prompter',
  'lang': 'de',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'rank': nan,
  'synthetic': False,
  'model_name': None,
  'detoxify': None,
  'message_tree_id': '5df17b3a-f814-491d-94d4-a52bf4fd955d',
  'tree_state': 'ready_for_export',
  'emojis': {'name': array(['+1'], dtype=object),
   'count': array([3], dtype=int32)},
  'labels': {'name': array(['spam', 'lang_mismatch', 'pii', 'not_appropriate', 'hate_speech',
          'sexual_content', 'quality', 'toxicity', 'humor', 'creativity',
          'violence'], dtype=object),
   'value': array([0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        , 0.41666667, 0.16666667, 0.41666667, 0.16666667,
          0.        ]),
   'count': array([3, 3, 3, 3, 3

In [157]:
# zoom into labels data structure
list(example.values())[0]["labels"]

{'name': array(['spam', 'lang_mismatch', 'pii', 'not_appropriate', 'hate_speech',
        'sexual_content', 'quality', 'toxicity', 'humor', 'creativity',
        'violence'], dtype=object),
 'value': array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.41666667, 0.16666667, 0.41666667, 0.16666667,
        0.        ]),
 'count': array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)}

# Filter Deliberative Messages

In [158]:
def delib_filter(df):
  # filter keyword: "pros"
  df_delib = df[
      #df.role.eq("assistant") &
      df.lang.eq("en") &
      df.text.str.contains("pros ", case=False)
    ]

  # add all assistant messages that directly reply to deliberative messages by prompter
  delib_prompt_ids = df_delib[df_delib.role.eq("prompter")]["message_id"].values
  df_replies = df_delib[df_delib.role.eq("assistant") & df_delib.parent_id.isin(delib_prompt_ids)]
  df_delib = pd.concat([df_delib, df_replies])

  # remove prompts (only keep deliberative assistant messages)
  df_delib = df_delib[df_delib.role.eq("assistant")]

  return df_delib

df_train_delib = delib_filter(df_train)
df_eval_delib = delib_filter(df_eval)

In [159]:
df_train_delib.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128 entries, 286 to 75577
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       128 non-null    object 
 1   parent_id        128 non-null    object 
 2   user_id          128 non-null    object 
 3   created_date     128 non-null    object 
 4   text             128 non-null    object 
 5   role             128 non-null    object 
 6   lang             128 non-null    object 
 7   review_count     128 non-null    int32  
 8   review_result    128 non-null    object 
 9   deleted          128 non-null    bool   
 10  rank             120 non-null    float64
 11  synthetic        128 non-null    bool   
 12  model_name       0 non-null      object 
 13  detoxify         128 non-null    object 
 14  message_tree_id  128 non-null    object 
 15  tree_state       128 non-null    object 
 16  emojis           106 non-null    object 
 17  labels      

In [160]:
df_eval_delib.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 308 to 4212
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       11 non-null     object 
 1   parent_id        11 non-null     object 
 2   user_id          11 non-null     object 
 3   created_date     11 non-null     object 
 4   text             11 non-null     object 
 5   role             11 non-null     object 
 6   lang             11 non-null     object 
 7   review_count     11 non-null     int32  
 8   review_result    11 non-null     object 
 9   deleted          11 non-null     bool   
 10  rank             11 non-null     float64
 11  synthetic        11 non-null     bool   
 12  model_name       0 non-null      object 
 13  detoxify         11 non-null     object 
 14  message_tree_id  11 non-null     object 
 15  tree_state       11 non-null     object 
 16  emojis           9 non-null      object 
 17  labels        

# Create chat histories for deliberative messages

Strategy:

Given a deliberative message _m_:

|Role of _m_|Prompt|Completion|
|--|--|--|
|prompter|chat history including _m_|any response to _m_|
|assistant|chat history excluding _m_|_m_|

In [161]:
# illustration: lets grab a random deliberative message
deliberative_message = df_train_delib.sample(1).iloc[0]
print(deliberative_message)

message_id                      0ccd588f-c43d-423e-9e76-b6bed21801d2
parent_id                       f30bb36a-4c15-4d90-a45e-6be22d07a599
user_id                         aec62fe3-61ca-4c33-a4f2-b5a21dea949f
created_date                        2023-02-07T22:55:29.400410+00:00
text               Pros of GMO:\n\nMore nutritious food\nTastier ...
role                                                       assistant
lang                                                              en
review_count                                                       3
review_result                                                   True
deleted                                                        False
rank                                                             0.0
synthetic                                                      False
model_name                                                      None
detoxify           {'toxicity': 0.00029117788653820753, 'severe_t...
message_tree_id                 fa

In [162]:
def _sanity_check(message, history):
  for i in range(len(history)-1):
    assert history.iloc[i].message_id == history.iloc[i+1].parent_id
  assert history.iloc[-1].message_id == message.parent_id

def get_chat_history(df, message):
  assert message.message_id in df.message_id.values
  chat_history = []
  parent_id = message.parent_id
  while parent_id is not None:
    parent = df[df.message_id.eq(parent_id)].iloc[0]
    chat_history.append(parent)
    parent_id = parent.parent_id
  df_history = pd.DataFrame(chat_history).sort_values("created_date")
  _sanity_check(message,df_history)
  return df_history

def history_to_string(df_history):
  history_string = "\n\n".join(df_history.apply(lambda r: f"{r.role}:\n{r.text}", axis=1).values)
  history_string += "\n\nassistant:\n"
  return history_string

In [163]:
# create chat history for sample message
print(history_to_string(get_chat_history(df_train, deliberative_message)))

prompter:
Discuss the pros and cons of genetically modified crops and their impact on the food industry and agriculture.

assistant:
There are many pros and cons of GMO (genetically modified organisms) and their impact on the food industry and agriculture. 

The possible benefits of genetic engineering include:
- More nutritious food
- Tastier food
- Disease-resistant and drought-resistant plants that require fewer environmental resources (such as water and fertilizer)
- Less use of pesticides
- Increased supply of food with reduced cost and longer shelf life
- Faster growing plants and animals
- Medicinal foods that could be used as vaccines or other medicines

There are also few possible disadvantages:
- Creation of foods that can cause an allergic or toxic reaction
- Unexpected or harmful genetic changes
- Inadvertent transfer of genes from one GM plant or animal to another plant or animal not intended for genetic modification
- Foods that are less nutritious

Note that arguably mos

In [164]:
# add chat histories to deliberation dataset
df_train_delib["history"] = df_train_delib.apply(lambda r: history_to_string(get_chat_history(df_train, r)), axis=1)
df_eval_delib["history"] = df_eval_delib.apply(lambda r: history_to_string(get_chat_history(df_eval, r)), axis=1)

In [165]:
df_train_delib.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128 entries, 286 to 75577
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       128 non-null    object 
 1   parent_id        128 non-null    object 
 2   user_id          128 non-null    object 
 3   created_date     128 non-null    object 
 4   text             128 non-null    object 
 5   role             128 non-null    object 
 6   lang             128 non-null    object 
 7   review_count     128 non-null    int32  
 8   review_result    128 non-null    object 
 9   deleted          128 non-null    bool   
 10  rank             120 non-null    float64
 11  synthetic        128 non-null    bool   
 12  model_name       0 non-null      object 
 13  detoxify         128 non-null    object 
 14  message_tree_id  128 non-null    object 
 15  tree_state       128 non-null    object 
 16  emojis           106 non-null    object 
 17  labels      

In [166]:
df_eval_delib.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 308 to 4212
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       11 non-null     object 
 1   parent_id        11 non-null     object 
 2   user_id          11 non-null     object 
 3   created_date     11 non-null     object 
 4   text             11 non-null     object 
 5   role             11 non-null     object 
 6   lang             11 non-null     object 
 7   review_count     11 non-null     int32  
 8   review_result    11 non-null     object 
 9   deleted          11 non-null     bool   
 10  rank             11 non-null     float64
 11  synthetic        11 non-null     bool   
 12  model_name       0 non-null      object 
 13  detoxify         11 non-null     object 
 14  message_tree_id  11 non-null     object 
 15  tree_state       11 non-null     object 
 16  emojis           9 non-null      object 
 17  labels        

# Upload Deliberation Dataset to HF Hub

In [169]:
from datasets import DatasetDict, Dataset

ds_delib = DatasetDict({
    "train": Dataset.from_pandas(df_train_delib, split="train", preserve_index=False),
    "validation":  Dataset.from_pandas(df_eval_delib, split="validation", preserve_index=False),
})
ds_delib

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels', 'history'],
        num_rows: 128
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels', 'history'],
        num_rows: 11
    })
})

In [171]:
import huggingface_hub

huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [174]:
ds_delib.push_to_hub("logikon/oasst1-delib")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]