In [1]:
import json
import pandas as pd
from treelib import Tree

In [2]:
def add_tree_level(df):
    """helper function to add tree level to a df"""

    # if tree level already exists, return df
    if "tree_level" in df.columns:
        return df

    else:
        tree_level_map = {}

        # iterate over rows in df
        for i, row in df.iterrows():
            message_id = row["message_id"]
            parent_id = row["parent_id"]

            # if parent_id is None, then it is a root message
            if parent_id is None:
                tree_level_map[message_id] = 0
            # if parent_id is the same as message_tree_id, then it is a direct reply to the root message
            elif parent_id == row["message_tree_id"]:
                tree_level_map[message_id] = 1
            # else just look up the tree level of the parent_id and add 1
            else:
                tree_level_map[message_id] = tree_level_map[parent_id] + 1

        # create a df from the tree_level_map and merge it with the original df
        df_tree_level_map = (
            pd.DataFrame.from_dict(tree_level_map, orient="index", columns=["tree_level"])
            .reset_index()
            .rename(columns={"index": "message_id"})
        )

        return df.merge(df_tree_level_map, on="message_id")

In [3]:
mapping = {}
with open('texts.jsonl.requested') as fopen:
    for l in fopen:
        data = json.loads(l)
        mapping[data['src']] = data['r']['result']

In [4]:
df = pd.read_parquet('train-00000-of-00001-b42a775f407cee45.parquet')
df = df[df['lang'] == 'en']
df['text_ms'] = df.text.map(mapping)
df.head()

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels,text_ms
0,6ab24d72-0181-4594-a9cd-deaf170242fb,,c3fe8c76-fc30-4fa7-b7f8-c492f5967d18,2023-02-05T14:23:50.983374+00:00,Can you write a short introduction about the r...,prompter,en,3,True,False,,False,,"{'toxicity': 0.00044308538781479, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",Bolehkah anda menulis pengenalan ringkas tenta...
1,c8e83833-ecbc-44fe-b6db-735228c25a1c,6ab24d72-0181-4594-a9cd-deaf170242fb,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T13:50:44.657083+00:00,"""Monopsony"" refers to a market structure where...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00026396565954200923, 'severe_t...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [3...","{'name': ['spam', 'fails_task', 'lang_mismatch...","""Monopsoni"" merujuk kepada struktur pasaran di..."
2,6708c47f-05c9-4346-b3d2-40b2bd24fde4,c8e83833-ecbc-44fe-b6db-735228c25a1c,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T18:48:49.391686+00:00,Now explain it to a dog,prompter,en,3,True,False,,False,,"{'toxicity': 0.03648477792739868, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not...",Sekarang terangkan kepada seekor anjing
3,343ee2d4-87ae-41fd-a768-bdd65959dc4a,6ab24d72-0181-4594-a9cd-deaf170242fb,49ddcb0d-6588-43bd-858d-19c577f12e7b,2023-02-06T13:37:56.044680+00:00,Monopsony is a market structure in which there...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0008866374846547842, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'fails_task', 'lang_mismatch...",Monopsoni ialah struktur pasaran di mana terda...
4,18145bf4-37fd-4ac0-80f5-6108b5f2b365,343ee2d4-87ae-41fd-a768-bdd65959dc4a,e10e99a0-38ac-4b07-bf5d-4427696e4e0d,2023-02-06T18:52:51.428543+00:00,How can one fight back when a monospony had be...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0009362137061543763, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",Bagaimanakah seseorang boleh melawan apabila m...


In [5]:
message_tree_id = df['message_tree_id'].unique().tolist()
len(message_tree_id)

3574

In [6]:
from tqdm import tqdm

all_texts = []
for id in tqdm(message_tree_id):
    try:
        df_message_tree = df[df['message_tree_id'] == id].sort_values("created_date")
        df_message_tree = add_tree_level(df_message_tree)
    except:
        continue
        
    id_tree = Tree()

    for i, row in df_message_tree.iterrows():
        message_id = row["message_id"]
        parent_id = row["parent_id"]
        try:
            id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())
        except:
            pass
    
    combination = id_tree.paths_to_leaves()
    for c in combination:
        df_ = df_message_tree[df_message_tree['message_id'].isin(c)].sort_values("created_date")
        texts = []
        for i in range(len(df_)):
            try:
                if i % 2 == 0:
                    left = '<manusia>: '
                else:
                    left = '<bot>: '
                texts.append(left + df_.iloc[i]['text_ms'])
            except:
                break
        all_texts.append('\n'.join(texts))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 3574/3574 [00:29<00:00, 120.48it/s]


In [7]:
len(all_texts)

20149

In [8]:
all_texts[-1]

'<manusia>: Apakah risiko apabila anda tidak menjalankan indeks lengan buku lali sebelum terapi mampatan?\n<bot>: Tidak melaksanakan eksperimen indeks pergelangan kaki-brachial (ABI) sebelum psikoterapi mampatan menyampaikan angka risiko. Ujian ABI dieksploitasi untuk menilai nisbah tekanan darah sistolik di pergelangan kaki kepada tekanan sistolik di lengan—perkadaran yang besar mungkin menandakan kehadiran penyakit arteri periferal (PAD). Ini boleh menjadi faktor penting dalam menentukan keberkesanan psikoterapi mampatan, kerana kehadiran PAD mungkin bermakna terapi itu tidak berkesan dalam meningkatkan aliran darah ke kawasan yang terjejas.\nTanpa ujian ABI, terdapat ancaman bahawa psikoterapi mampatan mungkin tidak produktif atau berbahaya jika kes itu mempunyai PAD. Psikoterapi mampatan boleh memudaratkan mereka yang mempunyai PAD, kerana ia boleh memburukkan lagi keadaan dengan mengurangkan aliran darah ke kawasan yang terjejas. Begitu juga, terdapat ancaman bahawa kes itu mungki

In [9]:
splitted = all_texts[-4].split('<bot>:')
pairs = []
for i in range(len(splitted) - 1):
    if i == 0:
        human = splitted[i].replace('<manusia>:', '')
    else:
        human = splitted[i].split('<manusia>:')[1]
    bot = splitted[i + 1].split('<manusia>:')[0]
    pairs.append((human.strip(), bot.strip()))
    
pairs

[('Apakah faktor utama yang menyumbang kepada nilai harta tanah?',
  'Nilai hartanah ditentukan oleh pelbagai faktor, termasuk:\n\n1. Lokasi: Hartanah yang terletak di kawasan kejiranan yang diingini, berdekatan dengan kemudahan seperti sekolah, membeli-belah dan pengangkutan, cenderung mempunyai nilai yang lebih tinggi.\n\n2. Saiz dan reka letak: Hartanah dengan rakaman persegi yang lebih besar dan reka letak yang direka dengan baik cenderung mempunyai nilai yang lebih tinggi.\n\n3. Umur dan keadaan: Hartanah yang diselenggara dengan baik dan dikemas kini biasanya bernilai lebih tinggi daripada harta yang lebih tua dan memerlukan pembaikan.\n\n4. Faktor ekonomi: Ekonomi tempatan, pasaran pekerjaan, dan permintaan perumahan semuanya boleh memberi kesan ke atas nilai hartanah.\n\n5. Kemudahan dan ciri: Hartanah dengan kemudahan yang diingini, seperti kolam renang, garaj atau dapur yang dikemas kini, biasanya bernilai lebih tinggi daripada yang tiada ciri ini.\n\n6. Peraturan zon: Peratu

In [10]:
with open('oasst1-ms.jsonl', 'w') as fopen:
    for t in all_texts:
        fopen.write(f'{json.dumps(t)}\n')

In [13]:
from glob import glob

files = glob('texts-others.jsonl*.splitted.requested')
files

['texts-others.jsonl00.splitted.requested',
 'texts-others.jsonl02.splitted.requested',
 'texts-others.jsonl01.splitted.requested']

In [14]:
mapping = {}
for f in files:
    with open(f) as fopen:
        for l in fopen:
            data = json.loads(l)
            mapping[data['src']] = data['r']['result']

In [15]:
len(mapping)

44545

In [16]:
df = pd.read_parquet('train-00000-of-00001-b42a775f407cee45.parquet')
df = df[df['lang'] != 'en']
df['text_ms'] = df.text.map(mapping)
df.head()

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels,text_ms
12,5249c721-e6d7-45f3-9680-f211d7be308e,,63d9c326-9eef-46ae-92bf-1cfee1fa6128,2023-02-06T02:16:52.167075+00:00,¿CUales son las etapas del desarrollo y en qué...,prompter,es,3,True,False,,False,,"{'toxicity': 0.0003284301492385566, 'severe_to...",5249c721-e6d7-45f3-9680-f211d7be308e,ready_for_export,"{'name': ['+1', '-1', '_skip_reply', '_skip_ra...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",Apakah peringkat pembangunan dan apakah yang t...
13,851a0267-90b0-46f3-97e8-ba143a1c0cd5,5249c721-e6d7-45f3-9680-f211d7be308e,fc8bf403-c061-4501-855f-10cd0fccc1b9,2023-02-09T18:15:17.729005+00:00,Jean Piaget fue un psicólogo suizo que propuso...,assistant,es,3,True,False,0.0,False,,"{'toxicity': 0.00026873970637097955, 'severe_t...",5249c721-e6d7-45f3-9680-f211d7be308e,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [5...","{'name': ['spam', 'fails_task', 'lang_mismatch...",Jean Piaget adalah ahli psikologi Switzerland ...
14,ef4a54ad-693f-4253-9478-5174b1dc4553,851a0267-90b0-46f3-97e8-ba143a1c0cd5,5e831e07-1357-4d7a-af07-e37b54eca4b6,2023-02-09T18:44:20.487394+00:00,¿Hay otras teorías sobre las etapas del desarr...,prompter,es,7,True,False,,False,,"{'toxicity': 0.0002816664637066424, 'severe_to...",5249c721-e6d7-45f3-9680-f211d7be308e,ready_for_export,"{'name': ['+1'], 'count': [3]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",Adakah terdapat teori peringkat perkembangan l...
15,d8f904d8-f751-4038-a2c9-d47cc15c7373,5249c721-e6d7-45f3-9680-f211d7be308e,1051b6f3-8139-45a8-9112-94e897f8c522,2023-02-09T17:27:01.298011+00:00,"Según Jean Piaget, estas son las 4 etapas del ...",assistant,es,4,True,False,1.0,False,,"{'toxicity': 0.000252147059654817, 'severe_tox...",5249c721-e6d7-45f3-9680-f211d7be308e,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'fails_task', 'lang_mismatch...","Menurut Jean Piaget, ini adalah 4 peringkat pe..."
16,074e0c5b-90a5-42a7-b2b7-e820609c8a38,d8f904d8-f751-4038-a2c9-d47cc15c7373,9360a3b4-b48a-41d9-bac7-f2b2ff83dd31,2023-02-09T18:23:27.368410+00:00,Desarrolla un breve poema en el que resumas la...,prompter,es,3,True,False,,False,,"{'toxicity': 0.0013581713428720832, 'severe_to...",5249c721-e6d7-45f3-9680-f211d7be308e,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",Bina puisi pendek di mana anda meringkaskan ja...


In [17]:
message_tree_id = df['message_tree_id'].unique().tolist()
len(message_tree_id)

6379

In [20]:
import random

random.random()

0.9041447667178656

In [27]:
all_texts = []
for id in tqdm(message_tree_id):
    try:
        df_message_tree = df[df['message_tree_id'] == id].sort_values("created_date")
        df_message_tree = add_tree_level(df_message_tree)
    except:
        continue
        
    id_tree = Tree()

    for i, row in df_message_tree.iterrows():
        message_id = row["message_id"]
        parent_id = row["parent_id"]
        try:
            id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())
        except:
            pass
    
    combination = id_tree.paths_to_leaves()
    for c in combination:
        df_ = df_message_tree[df_message_tree['message_id'].isin(c)].sort_values("created_date")
        texts = []
        for i in range(len(df_)):
            try:
                if i % 2 == 0:
                    if random.random() < 0.6:
                        t = df_.iloc[i]['text']
                    else:
                        t = df_.iloc[i]['text_ms']
                    left = '<manusia>: ' + t
                else:
                    left = '<bot>: ' + df_.iloc[i]['text_ms']
                texts.append(left)
            except:
                break
        all_texts.append('\n'.join(texts))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 6379/6379 [00:34<00:00, 184.43it/s]


In [28]:
len(all_texts)

23793

In [29]:
with open('oasst1-others.jsonl', 'w') as fopen:
    for t in all_texts:
        fopen.write(f'{json.dumps(t)}\n')

In [26]:
# splitted = '<manusia>: Schreibe einen kurze und präzise Konstruktionsbeschreibung zu einem Dreieck ABC mit c=6\xa0cm, h_c=5\xa0cm und γ=40°. γ ist hierbei der von Seite c gegenüberliegende Winkel.\n<bot>: Segitiga ABC ialah segi tiga tegak dengan panjang sisi c=6 cm sebagai hipotenus. Tinggi h_c c ialah 5 cm dan sudut γ c bertentangan ialah 40°.\n<manusia>: Terima kasih, itu banyak membantu saya.'.split('<bot>:')
# pairs = []
# for i in range(len(splitted) - 1):
#     if i == 0:
#         human = splitted[i].replace('<manusia>:', '')
#     else:
#         human = splitted[i].split('<manusia>:')[1]
#     bot = splitted[i + 1].split('<manusia>:')[0]
#     pairs.append((human.strip(), bot.strip()))
    
# pairs

[('Schreibe einen kurze und präzise Konstruktionsbeschreibung zu einem Dreieck ABC mit c=6\xa0cm, h_c=5\xa0cm und γ=40°. γ ist hierbei der von Seite c gegenüberliegende Winkel.',
  'Segitiga ABC ialah segi tiga tegak dengan panjang sisi c=6 cm sebagai hipotenus. Tinggi h_c c ialah 5 cm dan sudut γ c bertentangan ialah 40°.')]