<a href="https://colab.research.google.com/github/logikon-ai/deliberation-datasets/blob/main/notebooks/create_oasst1_delib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logikon Deliberation Subset of OpenAssistant OASST1 data

- https://huggingface.co/datasets/OpenAssistant/oasst1
- https://huggingface.co/datasets/logikon/oasst1-delib

In [1]:
# set up
!pip install datasets pandas huggingface_hub

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/519.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m512.0/519.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface_hub
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.3 MB/s[0m eta [36m0:

# Imports

In [2]:
import pandas as pd
from datasets import load_dataset

# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# helper function
def get_label_value(row, label_id):
  names = row["labels"]["name"]
  values = row["labels"]["value"]

  for name, value in zip(names, values):
    if name == label_id:
      return value

  raise ValueError(f"Unknown label: {label_id}")


# Load Dataset

In [3]:
# load dataset from huggingface datasets
ds = load_dataset("OpenAssistant/oasst1")
print(ds)

Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})


# Create Pandas Dataframe

In [4]:
# lets convert the train / validation splits to a pandas df
df_train = ds["train"].to_pandas()
df_eval = ds["validation"].to_pandas()

In [5]:
# look at the df info
df_train.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84437 entries, 0 to 84436
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       84437 non-null  object 
 1   parent_id        74591 non-null  object 
 2   user_id          84437 non-null  object 
 3   created_date     84437 non-null  object 
 4   text             84437 non-null  object 
 5   role             84437 non-null  object 
 6   lang             84437 non-null  object 
 7   review_count     84437 non-null  int32  
 8   review_result    83732 non-null  object 
 9   deleted          84437 non-null  bool   
 10  rank             48730 non-null  float64
 11  synthetic        84437 non-null  bool   
 12  model_name       0 non-null      object 
 13  detoxify         72297 non-null  object 
 14  message_tree_id  84437 non-null  object 
 15  tree_state       84437 non-null  object 
 16  emojis           71496 non-null  object 
 17  labels      

In [6]:
# look at a sample row in a json format we can easily read
example = df_train.sample(1).transpose().to_dict()
example

{52534: {'message_id': '79f317cc-9e6e-4fba-aab7-b551edaaa740',
  'parent_id': 'f13da0dd-8ee6-4b8b-97fa-7fd74955eb16',
  'user_id': '5974bafc-e91d-4744-b836-f1783cfa3848',
  'created_date': '2023-02-24T10:08:20.337456+00:00',
  'text': "I don't do Elmer Fudd",
  'role': 'prompter',
  'lang': 'en',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'rank': nan,
  'synthetic': False,
  'model_name': None,
  'detoxify': {'toxicity': 0.0028827073983848095,
   'severe_toxicity': 3.145535083604045e-05,
   'obscene': 0.0006896371487528086,
   'identity_attack': 0.00016918877372518182,
   'insult': 0.0008972316863946617,
   'threat': 7.010643457761034e-05,
   'sexual_explicit': 6.88965737936087e-05},
  'message_tree_id': 'c934299f-ad32-4760-9eca-aef722e77beb',
  'tree_state': 'ready_for_export',
  'emojis': {'name': array(['-1', '_skip_reply'], dtype=object),
   'count': array([3, 7], dtype=int32)},
  'labels': {'name': array(['spam', 'lang_mismatch', 'pii', 'not_appropriate', 

In [7]:
# zoom into labels data structure
list(example.values())[0]["labels"]

{'name': array(['spam', 'lang_mismatch', 'pii', 'not_appropriate', 'hate_speech',
        'sexual_content', 'quality', 'toxicity', 'humor', 'creativity',
        'violence'], dtype=object),
 'value': array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.33333333, 0.        , 0.375     , 0.375     ,
        0.        ]),
 'count': array([3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2], dtype=int32)}

In [8]:
df_eval[df_eval.message_id=="e7dcb87f-25f0-4d76-9247-44678ffa0ae9"]

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
2512,e7dcb87f-25f0-4d76-9247-44678ffa0ae9,befb8ae8-02bd-4dc4-b990-572d686e06fc,8eaeb148-22de-40ef-a444-1b8a7238a0eb,2023-04-07T00:34:22.485570+00:00,It is important to note that intentionally low...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0014241895405575633, 'severe_to...",befb8ae8-02bd-4dc4-b990-572d686e06fc,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [1...","{'name': ['spam', 'fails_task', 'lang_mismatch..."


# Filter Deliberative Messages

In [10]:
def delib_filter(df):
  # filter keyword: "pros"
  df_delib = df[
      #df.role.eq("assistant") &
      df.lang.eq("en") &
      df.text.str.contains("pros ", case=False)
    ]

  # add all assistant messages that directly reply to deliberative messages by prompter
  delib_prompt_ids = df_delib[df_delib.role.eq("prompter")]["message_id"].values
  df_replies = df_delib[df_delib.role.eq("assistant") & df_delib.parent_id.isin(delib_prompt_ids)]
  df_delib = pd.concat([df_delib, df_replies])
  df_delib.drop_duplicates(["message_id", "parent_id"], inplace=True)

  # remove prompts (only keep deliberative assistant messages)
  df_delib = df_delib[df_delib.role.eq("assistant")]

  return df_delib

df_train_delib = delib_filter(df_train)
df_eval_delib = delib_filter(df_eval)

In [11]:
df_train_delib.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90 entries, 286 to 83742
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       90 non-null     object 
 1   parent_id        90 non-null     object 
 2   user_id          90 non-null     object 
 3   created_date     90 non-null     object 
 4   text             90 non-null     object 
 5   role             90 non-null     object 
 6   lang             90 non-null     object 
 7   review_count     90 non-null     int32  
 8   review_result    90 non-null     object 
 9   deleted          90 non-null     bool   
 10  rank             85 non-null     float64
 11  synthetic        90 non-null     bool   
 12  model_name       0 non-null      object 
 13  detoxify         90 non-null     object 
 14  message_tree_id  90 non-null     object 
 15  tree_state       90 non-null     object 
 16  emojis           74 non-null     object 
 17  labels       

In [12]:
df_eval_delib.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 308 to 4212
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       6 non-null      object 
 1   parent_id        6 non-null      object 
 2   user_id          6 non-null      object 
 3   created_date     6 non-null      object 
 4   text             6 non-null      object 
 5   role             6 non-null      object 
 6   lang             6 non-null      object 
 7   review_count     6 non-null      int32  
 8   review_result    6 non-null      object 
 9   deleted          6 non-null      bool   
 10  rank             6 non-null      float64
 11  synthetic        6 non-null      bool   
 12  model_name       0 non-null      object 
 13  detoxify         6 non-null      object 
 14  message_tree_id  6 non-null      object 
 15  tree_state       6 non-null      object 
 16  emojis           5 non-null      object 
 17  labels         

# Create chat histories for deliberative messages

Strategy:

Given a deliberative message _m_:

|Role of _m_|Prompt|Completion|
|--|--|--|
|prompter|chat history including _m_|any response to _m_|
|assistant|chat history excluding _m_|_m_|

In [13]:
# illustration: lets grab a random deliberative message
deliberative_message = df_train_delib.sample(1).iloc[0]
print(deliberative_message)

message_id                      d80c6b1b-4c50-4d07-a20e-56476fc6e4ce
parent_id                       77b151ac-e001-4b19-9afd-eb9cabf5cfbc
user_id                         068951d1-4a1e-4d91-9c29-0090fdf9366d
created_date                        2023-02-20T08:30:29.361367+00:00
text               Here are some potential pros and cons of socia...
role                                                       assistant
lang                                                              en
review_count                                                       3
review_result                                                   True
deleted                                                        False
rank                                                             0.0
synthetic                                                      False
model_name                                                      None
detoxify           {'toxicity': 0.0017744216602295637, 'severe_to...
message_tree_id                 77

In [14]:
def _sanity_check(message, history):
  for i in range(len(history)-1):
    assert history.iloc[i].message_id == history.iloc[i+1].parent_id
  assert history.iloc[-1].message_id == message.parent_id

def get_chat_history(df, message):
  assert message.message_id in df.message_id.values
  chat_history = []
  parent_id = message.parent_id
  while parent_id is not None:
    parent = df[df.message_id.eq(parent_id)].iloc[0]
    chat_history.append(parent)
    parent_id = parent.parent_id
  df_history = pd.DataFrame(chat_history).sort_values("created_date")
  _sanity_check(message,df_history)
  return df_history

def history_to_string(df_history):
  history_string = "\n\n".join(df_history.apply(lambda r: f"{r.role}:\n{r.text}", axis=1).values)
  history_string += "\n\nassistant:\n"
  return history_string

In [15]:
# create chat history for sample message
print(history_to_string(get_chat_history(df_train, deliberative_message)))

prompter:
What are some of the pro's and con's of social media?

assistant:



In [16]:
# add chat histories to deliberation dataset
df_train_delib["history"] = df_train_delib.apply(lambda r: history_to_string(get_chat_history(df_train, r)), axis=1)
df_eval_delib["history"] = df_eval_delib.apply(lambda r: history_to_string(get_chat_history(df_eval, r)), axis=1)

In [17]:
df_train_delib.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90 entries, 286 to 83742
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       90 non-null     object 
 1   parent_id        90 non-null     object 
 2   user_id          90 non-null     object 
 3   created_date     90 non-null     object 
 4   text             90 non-null     object 
 5   role             90 non-null     object 
 6   lang             90 non-null     object 
 7   review_count     90 non-null     int32  
 8   review_result    90 non-null     object 
 9   deleted          90 non-null     bool   
 10  rank             85 non-null     float64
 11  synthetic        90 non-null     bool   
 12  model_name       0 non-null      object 
 13  detoxify         90 non-null     object 
 14  message_tree_id  90 non-null     object 
 15  tree_state       90 non-null     object 
 16  emojis           74 non-null     object 
 17  labels       

In [18]:
df_eval_delib.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 308 to 4212
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       6 non-null      object 
 1   parent_id        6 non-null      object 
 2   user_id          6 non-null      object 
 3   created_date     6 non-null      object 
 4   text             6 non-null      object 
 5   role             6 non-null      object 
 6   lang             6 non-null      object 
 7   review_count     6 non-null      int32  
 8   review_result    6 non-null      object 
 9   deleted          6 non-null      bool   
 10  rank             6 non-null      float64
 11  synthetic        6 non-null      bool   
 12  model_name       0 non-null      object 
 13  detoxify         6 non-null      object 
 14  message_tree_id  6 non-null      object 
 15  tree_state       6 non-null      object 
 16  emojis           5 non-null      object 
 17  labels         

# Upload Deliberation Dataset to HF Hub

In [19]:
from datasets import DatasetDict, Dataset

ds_delib = DatasetDict({
    "train": Dataset.from_pandas(df_train_delib, split="train", preserve_index=False),
    "validation":  Dataset.from_pandas(df_eval_delib, split="validation", preserve_index=False),
})
ds_delib

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels', 'history'],
        num_rows: 90
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels', 'history'],
        num_rows: 6
    })
})

In [20]:
import huggingface_hub

huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
ds_delib.push_to_hub("logikon/oasst1-delib")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/1.93k [00:00<?, ?B/s]