In [1]:
import json
import random
from glob import glob
from tqdm import tqdm
from transformers import AutoTokenizer



In [2]:
# Gather Malay instruction/output pairs from every synthetic-*.jsonl dump.
synthetic = []

for path in glob('/home/husein/ssd3/instructions/synthetic-*.jsonl'):
    with open(path) as fopen:
        for raw in fopen:
            row = json.loads(raw)

            # A row is usable only when neither side was flagged as rejected
            # and both translated fields are present.
            flagged = row['rejected_ins'] or row['rejected_output']
            if flagged or row['instruction_ms'] is None or row['output_ms'] is None:
                continue

            instruction = row['instruction_ms'].strip()
            answer = row['output_ms'].strip()
            synthetic.append(dict(prompt_input=None, input=instruction, output=answer))

len(synthetic)

536851

In [3]:
def general_qa_pair(row):
    """
    Turn one general-QA record into a training pair, or return None to drop it.

    Question: the Malay translation (`question_ms`) is used when it is present
    and not null; otherwise the original `question` is used. The fallback is
    looked up lazily, so a record that only carries `question_ms` no longer
    raises KeyError, and a null `question_ms` no longer crashes on `.strip()`.

    Answer: `answer` is used only when the `answer_ms` key is absent. A null
    `answer_ms` drops the record.

    Records whose question or answer is missing or blank after stripping are
    dropped.
    """
    question = row.get('question_ms')
    if question is None:
        question = row.get('question')
    if question is None:
        return None
    question = question.strip()
    if not question:
        return None

    answer = row['answer_ms'] if 'answer_ms' in row else row.get('answer')
    if answer is None:
        return None
    answer = answer.strip()
    if not answer:
        return None

    return {
        'prompt_input': None,
        'input': question,
        'output': answer,
    }


# Load every general-QA dump and keep the usable question/answer pairs.
general_qa = []
for f in glob('/home/husein/ssd3/chatgpt4-malaysian-general-qa/*.jsonl'):
    with open(f) as fopen:
        for l in fopen:
            d = general_qa_pair(json.loads(l))
            if d is not None:
                general_qa.append(d)

len(general_qa)

70258

In [4]:
# Read the code-alpaca dump and keep rows with a non-blank question and answer.
chatgpt4_code = []
with open('/home/husein/ssd3/instructions/synthetic-codealpaca-v1-chatgpt4.jsonl') as fopen:
    for raw in fopen:
        row = json.loads(raw)

        # Question: Malay translation, or the original instruction when the
        # translation is absent or null.
        question = row.get('instruction_ms')
        question = (row['instruction'] if question is None else question).strip()
        if question == '':
            continue

        # Answer: the original output is used only when the 'output_ms' key is
        # absent; a null translation drops the row.
        original_output = row['output']
        answer = row.get('output_ms', original_output)
        if answer is None:
            continue
        answer = answer.strip()
        if answer == '':
            continue

        chatgpt4_code.append(dict(prompt_input=None, input=question, output=answer))

len(chatgpt4_code)

43337

In [5]:
# Lower-case substrings; a raw JSON line containing any of them is discarded
# (only applied to the non-math mixtral files below).
skip = [
    'always better to find peaceful',
    'non-violent ways to express',
    'can lead to severe consequences',
    'still have questions or concerns about',
    'that doing so is illegal',
    'to report the incident to',
    'would recommend consulting with',
    'indonesian',
    'translates to',
    # 'language model'
]


def mixtral_pair(row, require_question_mark=False):
    """
    Turn one Mixtral record into a training pair, or return None to drop it.

    The question comes from `question` and the answer from `answer_ms`.
    A record is dropped when:
      * the question is missing, null or blank;
      * `require_question_mark` is set and the question does not end with '?';
      * the answer is missing or null;
      * the stripped answer is shorter than 100 characters or shorter than
        the question.
    """
    question = (row.get('question') or '').strip()
    if not question:
        return None
    if require_question_mark and not question.endswith('?'):
        return None

    answer = row.get('answer_ms')
    if answer is None:
        return None
    answer = answer.strip()
    # The 100-character floor also rejects blank answers.
    if len(answer) < 100 or len(answer) < len(question):
        return None

    return {
        'prompt_input': None,
        'input': question,
        'output': answer,
    }


mixtral_instructions = []

# General mixtral dumps: everything except the math / multiturn / starter /
# conversation files. Questions here must end with a question mark.
files = glob('/home/husein/ssd3/soalan-augmentation/mixtral*.jsonl')
files = [f for f in files if 'math' not in f and 'multiturn' not in f and 'starter' not in f and 'conversation' not in f]
for f in files:
    with open(f) as fopen:
        for l in fopen:
            l_lower = l.lower()
            l = json.loads(l)
            if any(s in l_lower for s in skip):
                continue

            d = mixtral_pair(l, require_question_mark=True)
            if d is not None:
                mixtral_instructions.append(d)

# Math dumps: no skip-list filtering and no question-mark requirement.
for f in glob('/home/husein/ssd3/soalan-augmentation/*math*.jsonl'):
    with open(f) as fopen:
        for l in fopen:
            d = mixtral_pair(json.loads(l))
            if d is not None:
                mixtral_instructions.append(d)

len(mixtral_instructions)

376724

In [6]:
# Single-turn pools keyed 0..3 so that a pool can be selected by integer index.
random_instructions = dict(enumerate((
    synthetic,
    general_qa,
    chatgpt4_code,
    mixtral_instructions,
)))

In [7]:
# Maps a chat role to the speaker tag written in front of its text.
roles = dict(user='<manusia>', assistant='<bot>')

In [8]:
# Build 80000 artificial multi-turn chats, each made of 2-4 single-turn pairs
# drawn at random from the instruction pools.
random_combine = []
for _ in tqdm(range(80000)):
    turns = []
    n_pairs = random.randint(2, 4)
    for _pair in range(n_pairs):
        # Pick a pool first, then a pair inside it.
        pool_id = random.randint(0, len(random_instructions) - 1)
        picked = random.choice(random_instructions[pool_id])
        turns.extend([
            f"<manusia>: {picked['input']}",
            f"<bot>: {picked['output']}",
        ])

    random_combine.append({
        'prompt_input': None,
        'input': '\n'.join(turns).strip(),
        'output': None,
    })

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 80000/80000 [00:00<00:00, 125986.86it/s]


In [9]:
# Substrings that mark refusal / "as an AI language model" style boilerplate.
# Turns containing any of them are filtered out by the conversation-flattening
# cells, which test `pattern in text.lower()`. Every pattern therefore has to be
# lower-case itself, otherwise it can never match.
#
# Several patterns deliberately omit their first letter (e.g. 's an ai language
# model', 'ebagai model bahasa', 'hat makes sense') so they match regardless of
# how the sentence starts.
break_at = [
    'help.openai.com',
    'openai',
    'cannot have personal opinions',
    's an ai language model',
    "i'm sorry",
    'many factors',
    'lgbt',
    'lesbian',
    'gender-neutral',
    'remain neutral',
    'without bias',
    'and neutral',
    'more inclusive',
    'neutrality',
    'non-bias',
    'discrimination',
    'avoid any forms of discrimination',
    'regardless of their gender',
    'inclusive and tolerant environment',
    'have personal views',
    'sexual orientation should be a top priority',
    's an objective ai',
    'avoid any forms of prejudice or hate',
    'regardless of their personal',
    'you understand this direction',
    'tolerant environment within ai',
    'cannot express my',
    'requires more context',
    'personal opinion',
    'have updated information',
    "don't have personal experiences",
    'there is no information',
    'tidak mempunyai akses kepada data atau maklumat',
    '10 april 2021',
    'ebagai model bahasa ai',
    'model bahasa ai',
    'mempunyai kepercayaan atau pendapat peribadi',
    'tidak mempunyai pendapat peribadi',
    'tidak mempunyai kepercayaan',
    'tidak mempunyai falsafah peribadi',
    'tidak mempunyai pengalaman peribadi',
    'tidak mempunyai pendapat atau pengalaman peribadi',
    'tidak mempunyai maklumat terkini',
    'tidak mempunyai emosi peribadi',
    'tidak mempunyai keutamaan',
    'saya tidak mempunyai akses',
    'tidak mempunyai pengalaman',
    'saya tidak mempunyai keupayaan',
    'tidak mempunyai keupayaan',
    'tidak mempunyai hubungan',
    'tidak mempunyai maklumat',
    'saya tidak mempunyai',
    'saya tidak pernah',
    'bahasa ai',
    # These two were previously fused into one never-matching string by a
    # missing comma between them.
    'ebagai model bahasa',
    'hat makes sense',
    'have access to data or information',
    'have access to the data or information',
    'hanya mempunyai akses kepada maklumat umum',
    'hanya boleh memberikan maklumat umum',
    'have personal preferences',
    'not have personal experiences',
    'not capable of having subjective opinions',
    'indonesian',
]

# Normalise defensively (lower-case, unique) and sort so the order is stable
# between runs.
break_at = sorted({pattern.lower() for pattern in break_at})

In [10]:
# Show which files match the ultrachat-*.jsonl pattern under the gov.my directory.
glob('/home/husein/ssd3/gov.my/ultrachat-*.jsonl')

['/home/husein/ssd3/gov.my/ultrachat-jurnaldbp.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-jurnaldbp-malay.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-maktabahalbakri.com.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-gov.my.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-crossref-melayu.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-lom-agc.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-astroawani-malay.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-textbooks.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-crossref-melayu-malay.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-epenerbitan-malay.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-hansard-malay.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-ms-wikipedia.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-glaive_coder_raw_text.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-muftiwp.gov.my.texts.jsonl']

In [11]:
from tqdm import tqdm

# Speaker tag written in front of each flattened turn.
roles = {
    'user': '<manusia>',
    'assistant': '<bot>'
}

# Flatten the "*conversation*" dumps into "<manusia>: ... / <bot>: ..." text.
# Each line of a file is a JSON list of turns ({'role', 'content', 'content_ms'}).
# After per-turn filtering, roughly 40% of the conversations are kept and each
# kept one gets a random single-turn pair spliced in at the start or the end.
data_instructions_mixtral = []
files = glob('/home/husein/ssd3/soalan-augmentation/*conversation*.jsonl')
for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            l = json.loads(l)
            inputs = []
            for no, r in enumerate(l):
                # Text of the following turn ('' for the last turn). A user
                # turn may fall back to the untranslated 'content'; any other
                # role only uses the Malay 'content_ms'.
                if no < (len(l) - 1):
                    if l[no + 1]['role'] == 'user':
                        next_text = l[no + 1].get('content_ms') or l[no + 1].get('content') or ''
                    else:
                        next_text = l[no + 1].get('content_ms') or ''
                else:
                    next_text = ''

                if r['role'] == 'user':
                    current_text = r.get('content_ms') or r.get('content') or ''
                else:
                    current_text = r.get('content_ms') or ''

                # Text of the preceding turn. The first turn has none: l[no - 1]
                # with no == 0 would wrap around to the LAST turn, so guard it.
                # A conversation that opens with an assistant turn therefore has
                # that turn dropped by the length check below.
                if no > 0:
                    if l[no - 1]['role'] == 'user':
                        previous_text = l[no - 1].get('content_ms') or l[no - 1].get('content') or ''
                    else:
                        previous_text = l[no - 1].get('content_ms') or ''
                else:
                    previous_text = ''

                # bad pairs: one side of the user/assistant exchange is (almost) empty
                if r['role'] == 'user' and (len(current_text) < 2 or len(next_text) < 2):
                    continue
                if r['role'] == 'assistant' and (len(current_text) < 2 or len(previous_text) < 2):
                    continue

                # bad pairs: both sides start with the same 20 characters
                # (case-insensitive)
                if r['role'] == 'user' and current_text[:20].lower() == next_text[:20].lower():
                    continue
                if r['role'] == 'assistant' and current_text[:20].lower() == previous_text[:20].lower():
                    continue

                # remove alignments: stop reading this conversation at the first
                # exchange that contains a blocked phrase
                current_lower = current_text.lower()
                if r['role'] == 'user':
                    next_lower = next_text.lower()
                    if any(b in current_lower for b in break_at) or any(b in next_lower for b in break_at):
                        break
                if r['role'] == 'assistant':
                    previous_lower = previous_text.lower()
                    if any(b in current_lower for b in break_at) or any(b in previous_lower for b in break_at):
                        break

                role = roles[r['role']]

                s = f"{role}: {current_text}"

                inputs.append(s)

            # Keep an even number of entries by dropping a trailing unpaired one.
            if len(inputs) % 2 != 0:
                inputs = inputs[:-1]

            if not len(inputs):
                continue

            # Random down-sampling: keep about 40% of the conversations.
            if random.random() > 0.4:
                continue

            # Splice in one random single-turn pair, after or before the chat.
            index = random.randint(0, len(random_instructions) - 1)
            choice = random.choice(random_instructions[index])
            manusia = f"<manusia>: {choice['input']}"
            bot = f"<bot>: {choice['output']}"

            if random.random() > 0.5:
                inputs.extend([manusia, bot])
            else:
                inputs = [manusia, bot] + inputs

            data = '\n'.join(inputs).strip()

            if not len(data):
                continue

            a = {
                'prompt_input': None,
                'input': data,
                'output': None
            }
            data_instructions_mixtral.append(a)

len(data_instructions_mixtral)

60384it [00:10, 5552.21it/s]
57798it [00:11, 5149.78it/s]


46261

In [12]:
# Print the first flattened conversation to inspect the format by eye.
print(data_instructions_mixtral[0]['input'])

<manusia>: Terjemahkan ke dalam bahasa Melayu: Diberikan vektor integer $n$-dimensi $t = (a_1, dots, a_n)$, biarlah $(x_1, dots, x_n)$ mewakili punca persamaan polinomial $xn + a_1 x{n-1} + a_2 x{n-2} + cdots + a_{n-1}x + a_n = 0$.

Pertimbangkan dua syarat ini:
Semua elemen $(x_1, dots, x_n)$ adalah nombor nyata.
Jika $(x_1, dots, x_n)$ disusun mengikut susunan menaik, fungsi lantai bagi setiap $x_i$ bersamaan dengan $i$ untuk semua $1 leq i leq n$.

Dalam situasi di mana $n = 4$, terdapat $12$ vektor integer $t$ yang memenuhi kedua-dua syarat tersebut.
Kita menandakan $S(t)$ sebagai jumlah nilai mutlak bagi integer dalam $t$.
Untuk $n = 4$, boleh disahkan bahawa $sum S(t) = 2087$ untuk semua vektor integer $t$ yang memenuhi kedua-dua syarat.

Tentukan nilai $sum S(t)$ jika $n = 7$.
<bot>: Untuk menterjemah teks ke dalam bahasa Melayu

Diberikan vektor integer $n$-dimensi $t = (a_1, dots, a_n)$, marilah $(x_1, dots, x_n)$ mewakili punca persamaan polinomial $xn + a_1 x{n-1} + a_2 x{n-

In [13]:
from tqdm import tqdm

# Flatten the gov.my "ultrachat-*" dumps into "<manusia>: ... / <bot>: ..." text.
# Each line of a file is a JSON list of turns. After per-turn filtering, roughly
# 20% of the conversations are kept and each kept one gets a random single-turn
# pair spliced in at the start or the end.
data_instructions = []

# count  - turns that had the context prepended
# count1 - turns kept in total
count, count1 = 0, 0
for file in glob('/home/husein/ssd3/gov.my/ultrachat-*.jsonl'):
    # The glaive coder dump keeps its first entry as a normal turn and skips the
    # same-prefix filter; every other file treats the first entry as context.
    code_instruct = 'glaive_coder_raw_text' in file

    with open(file) as fopen:
        for l in tqdm(fopen):
            l = json.loads(l)

            # Content of the first entry; it may be prepended to the first kept
            # turn below.
            context = l[0]['content']

            if not code_instruct:
                l = l[1:]

            inputs = []
            for no, r in enumerate(l):

                # Text of the following turn ('' for the last turn). A user
                # turn may fall back to the untranslated 'content'; any other
                # role only uses the Malay 'content_ms'.
                if no < (len(l) - 1):
                    if l[no + 1]['role'] == 'user':
                        next_text = l[no + 1].get('content_ms') or l[no + 1].get('content') or ''
                    else:
                        next_text = l[no + 1].get('content_ms') or ''
                else:
                    next_text = ''

                if r['role'] == 'user':
                    current_text = r.get('content_ms') or r.get('content') or ''
                else:
                    current_text = r.get('content_ms') or ''

                # Text of the preceding turn. The first turn has none: l[no - 1]
                # with no == 0 would wrap around to the LAST turn, so guard it.
                # A conversation that opens with an assistant turn therefore has
                # that turn dropped by the length check below.
                if no > 0:
                    if l[no - 1]['role'] == 'user':
                        previous_text = l[no - 1].get('content_ms') or l[no - 1].get('content') or ''
                    else:
                        previous_text = l[no - 1].get('content_ms') or ''
                else:
                    previous_text = ''

                # bad pairs: one side of the user/assistant exchange is (almost) empty
                if r['role'] == 'user' and (len(current_text) < 2 or len(next_text) < 2):
                    continue
                if r['role'] == 'assistant' and (len(current_text) < 2 or len(previous_text) < 2):
                    continue

                # bad pairs: both sides start with the same 20 characters
                # (case-insensitive); not applied to the coder dump
                if not code_instruct and r['role'] == 'user' and current_text[:20].lower() == next_text[:20].lower():
                    continue
                if not code_instruct and r['role'] == 'assistant' and current_text[:20].lower() == previous_text[:20].lower():
                    continue

                # remove alignments: skip (do not stop at) any exchange that
                # contains a blocked phrase
                current_lower = current_text.lower()
                if r['role'] == 'user':
                    next_lower = next_text.lower()
                    if any(b in current_lower for b in break_at) or any(b in next_lower for b in break_at):
                        continue
                if r['role'] == 'assistant':
                    previous_lower = previous_text.lower()
                    if any(b in current_lower for b in break_at) or any(b in previous_lower for b in break_at):
                        continue

                role = roles[r['role']]

                # Prepend the context to the first turn: always for
                # crossref-melayu files, otherwise about 30% of the time.
                if no == 0 and not code_instruct and ('crossref-melayu' in file or random.random() > 0.7):
                    s = f"{role}: {context}\n\n{current_text}"
                    count += 1
                else:
                    s = f"{role}: {current_text}"

                inputs.append(s)
                count1 += 1

            # Keep an even number of entries by dropping a trailing unpaired one.
            if len(inputs) % 2 != 0:
                inputs = inputs[:-1]

            if not len(inputs):
                continue

            # Random down-sampling: keep about 20% of the conversations.
            if random.random() > 0.2:
                continue

            # Splice in one random single-turn pair, after or before the chat.
            index = random.randint(0, len(random_instructions) - 1)
            choice = random.choice(random_instructions[index])
            manusia = f"<manusia>: {choice['input']}"
            bot = f"<bot>: {choice['output']}"

            if random.random() > 0.5:
                inputs.extend([manusia, bot])
            else:
                inputs = [manusia, bot] + inputs

            data = '\n'.join(inputs).strip()

            if not len(data):
                continue

            a = {
                'prompt_input': None,
                'input': data,
                'output': None
            }
            data_instructions.append(a)


len(data_instructions)

1734it [00:01, 911.68it/s]
6440it [00:03, 1676.19it/s]
3350it [00:02, 1525.02it/s]
10128it [00:05, 1711.71it/s]
1296it [00:01, 1027.09it/s]
8044it [00:05, 1481.28it/s]
60198it [00:19, 3165.07it/s]
49842it [00:40, 1230.98it/s]
9959it [00:06, 1651.89it/s]
4567it [00:02, 1691.91it/s]
72538it [00:30, 2356.94it/s]
4408it [00:02, 1904.69it/s]
127253it [01:13, 1727.33it/s]
3834it [00:02, 1515.22it/s]


72077

In [14]:
# Concatenate the three multi-turn sets into one list, in this order.
combine_all = [*data_instructions_mixtral, *data_instructions, *random_combine]
len(combine_all)

198338

In [15]:
def generate_and_tokenize_prompt(row):
    """
    Render one dataset row as a single prompt string.

    Despite the name, no tokenization happens here; the function only builds
    text of the form ``<s>[INST] user [/INST] answer</s> `` (one
    ``[INST] ... </s> `` segment per exchange).

    Parameters
    ----------
    row : dict
        Must contain 'input' and 'output'. May contain 'function_call', whose
        value is emitted in a ``[FUNCTIONCALL]`` section right after ``<s>``.

        * Multi-turn row: 'output' is None and 'input' contains '<bot>:'.
          'input' is split on the '<manusia>:' / '<bot>:' tags into
          (user, assistant) exchanges.
        * Single-turn row: anything else; 'input' and 'output' form the only
          exchange. Both must be strings here.

    Returns
    -------
    dict
        ``{'text': prompt}``.
    """
    texts = ['<s>']

    if 'function_call' in row:
        t = row['function_call']
        texts.append(f'\n[FUNCTIONCALL]\n{t}\n')

    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        splitted = row['input'].split('<bot>:')
        for i in range(len(splitted) - 1):
            if i == 0:
                # Everything before the first '<bot>:' is the first user turn.
                human = splitted[i].replace('<manusia>:', '')
            else:
                # A middle chunk looks like "<answer> <manusia>: <question>".
                # Without a '<manusia>:' tag there is no user turn to pair with
                # the next answer, so the exchange is skipped.
                parts = splitted[i].split('<manusia>:')
                if len(parts) < 2:
                    continue
                human = parts[1]
            # The answer is the next chunk, up to the following user tag.
            bot = splitted[i + 1].split('<manusia>:')[0]
            inputs.append(human.strip())
            outputs.append(bot.strip())
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    for u, a in zip(inputs, outputs):
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s> ')

    prompt = ''.join(texts)
    return {'text': prompt}

In [16]:
from tqdm import tqdm

# Smoke test: render every row once so a malformed row raises here rather than
# later. The rendered prompts are discarded.
for row in tqdm(combine_all):
    generate_and_tokenize_prompt(row)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 198338/198338 [00:02<00:00, 94931.16it/s]


In [17]:
# Dump the combined rows as JSON Lines: one JSON object per line.
with open('prepared-combine-multiple-chats.jsonl', 'w') as fopen:
    fopen.writelines(json.dumps(row) + '\n' for row in combine_all)

In [18]:
# Count the lines in the written file; it should equal len(combine_all).
!wc -l prepared-combine-multiple-chats.jsonl

198338 prepared-combine-multiple-chats.jsonl


In [19]:
# Show the size of the written file on disk.
!ls -lh prepared-combine-multiple-chats.jsonl

-rw-r--r-- 1 husein husein 1.3G Dis  25 18:33 prepared-combine-multiple-chats.jsonl
