In [1]:
import json
import random
from glob import glob
from tqdm import tqdm
from transformers import AutoTokenizer



In [7]:
def _vocab_size(text):
    """Count the distinct whitespace-separated tokens in `text`."""
    return len(set(text.split()))


# Collect translated synthetic instruction/answer pairs.
synthetic = []

# File paths mentioning 'code' or 'oss' are left out of this pass.
files = [
    path
    for path in glob('/home/husein/ssd3/instructions/synthetic-*.jsonl')
    if not ('code' in path or 'oss' in path)
]
for path in files:
    with open(path) as handle:
        for raw_line in handle:
            row = json.loads(raw_line)

            # Rows flagged as rejected (instruction or output) are dropped.
            if row['rejected_ins'] or row['rejected_output']:
                continue

            # Both Malay fields must be present.
            instruction_ms = row['instruction_ms']
            if instruction_ms is None:
                continue

            output_ms = row['output_ms']
            if output_ms is None:
                continue

            # Drop rows whose Malay text has fewer than half the distinct
            # words of the source text.
            if _vocab_size(instruction_ms) * 2 < _vocab_size(row['instruction']):
                continue

            if _vocab_size(output_ms) * 2 < _vocab_size(row['output']):
                continue

            synthetic.append({
                'prompt_input': None,
                'input': instruction_ms.strip(),
                'output': output_ms.strip(),
            })

len(synthetic)

186722

In [9]:
general_qa = []
for f in glob('/home/husein/ssd3/chatgpt4-malaysian-general-qa/*.jsonl'):
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            q = l.get('question_ms', l['question']).strip()
            if not len(q):
                continue
            a = l.get('answer_ms', l['answer'])
            if a is None:
                continue
            a = a.strip()
            if not len(a):
                continue
                
            d = {
                'prompt_input': None,
                'input': q,
                'output': a,
            }
            general_qa.append(d)
            
len(general_qa)

70258

In [10]:
# List the mixtral files whose path contains none of these keywords.
excluded_keywords = ('math', 'multiturn', 'starter', 'conversation')
files = [
    path
    for path in glob('/home/husein/ssd3/soalan-augmentation/mixtral*.jsonl')
    if not any(keyword in path for keyword in excluded_keywords)
]
files

['/home/husein/ssd3/soalan-augmentation/mixtral-malaysian-general-kementerian.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-critics-malaysian-disagree.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-malaysian-general-part2.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-malaysian-disagree.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-malaysian-general-negeri.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-rag-question-disagree.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-factual-wrong.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-audio-instruction.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-factual-wrong-v2.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-dangerous-qa.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-mixtral-ahli-parlimen.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-glc.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-malaysian-general.jsonl']

In [11]:
skip = [
    'always better to find peaceful',
    'non-violent ways to express',
    'can lead to severe consequences',
    'still have questions or concerns about',
    'that doing so is illegal',
    'to report the incident to',
    'would recommend consulting with',
    'indonesian',
    'translates to',
    # 'language model'
]

mixtral_instructions = []
rejected = [
    'math', 
    'multiturn', 
    'dangerous', 
    'starter', 
    'conversation', 
    'rag', 
    'audio',
    'ahli-parlimen',
    'disagree'
]

files = glob('/home/husein/ssd3/soalan-augmentation/mixtral*.jsonl')
files = [f for f in files if all([r not in f for r in rejected])]
for f in files:
    with open(f) as fopen:
        for l in fopen:
            l_lower = l.lower()
            l = json.loads(l)
            if any([s in l_lower for s in skip]):
                # print(l)
                continue
            
            q = l.get('question', '').strip()
            if not len(q):
                continue
            if not q.endswith('?'):
                continue
            if len(q) < 200 and '?' not in q:
                # print(q)
                continue
            a = l.get('answer_ms')
            if a is None:
                continue
            a = a.strip()
            if not len(a):
                continue
            if len(a) < len(q):
                continue
            if len(a) < 100:
                continue
                
            if len(set(l['answer_ms'].split())) < (len(set(l['answer'].split())) / 2):
                continue
                
            d = {
                'prompt_input': None,
                'input': q,
                'output': a,
            }
            mixtral_instructions.append(d)
            
files = glob('/home/husein/ssd3/soalan-augmentation/*math*.jsonl')
files = [f for f in files if 'conversation' not in f]
for f in files:
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            q = l.get('question', '').strip()
            if not len(q):
                continue
            a = l.get('answer_ms')
            if a is None:
                continue
            a = a.strip()
            if not len(a):
                continue
            if len(a) < len(q):
                continue
            if len(a) < 100:
                continue
                
            if len(set(l['question_ms'].split())) < (len(set(l['question'].split())) / 2):
                continue
                
            if len(set(l['answer_ms'].split())) < (len(set(l['answer'].split())) / 2):
                continue
                
            d = {
                'prompt_input': None,
                'input': q,
                'output': a,
            }
            mixtral_instructions.append(d)

len(mixtral_instructions)

265851

In [12]:
# Inspect which math files match; the loader above drops any whose path contains 'conversation'.
glob('/home/husein/ssd3/soalan-augmentation/*math*.jsonl')

['/home/husein/ssd3/soalan-augmentation/mixtral-conversation-math-stupid.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-math.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-simple-math.jsonl']

In [13]:
# Instruction pools keyed 0..2 so that a pool can be selected by a random integer index.
random_instructions = dict(enumerate((synthetic, general_qa, mixtral_instructions)))

In [14]:
# Maps a chat role name to the speaker tag used when rendering a turn as text.
roles = dict(user='<manusia>', assistant='<bot>')

In [15]:
def _random_exchange():
    """Draw one human/bot line pair from a uniformly chosen instruction pool."""
    pool = random_instructions[random.randint(0, len(random_instructions) - 1)]
    picked = random.choice(pool)
    return [f"<manusia>: {picked['input']}", f"<bot>: {picked['output']}"]


# Build 50000 synthetic multi-turn chats, each made of 2 to 4 unrelated exchanges.
random_combine = []
for _ in tqdm(range(50000)):
    n_exchanges = random.randint(2, 4)
    lines = []
    for _ in range(n_exchanges):
        lines.extend(_random_exchange())
    random_combine.append({
        'prompt_input': None,
        'input': '\n'.join(lines).strip(),
        'output': None,
    })

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 132286.56it/s]


In [16]:
# Phrases that disqualify a chat turn. The conversation cells test each phrase
# with `phrase in text.lower()`, so every entry must be lower-case to be able
# to match.
break_at = [
    # English phrases
    'help.openai.com',
    'openai',
    'cannot have personal opinions',
    's an ai language model',
    "i'm sorry",
    'many factors',
    'lgbt',
    'lesbian',
    'gender-neutral',
    'remain neutral',
    'without bias',
    'and neutral',
    'more inclusive',
    'neutrality',
    'non-bias',
    'discrimination',
    'avoid any forms of discrimination',
    'regardless of their gender',
    'inclusive and tolerant environment',
    'have personal views',
    'sexual orientation should be a top priority',
    's an objective ai',
    'avoid any forms of prejudice or hate',
    'regardless of their personal',
    'you understand this direction',
    'tolerant environment within ai',
    'cannot express my',
    'requires more context',
    'personal opinion',
    'have updated information',
    "don't have personal experiences",
    'there is no information',
    '10 april 2021',
    'hat makes sense',
    'have access to data or information',
    'have access to the data or information',
    'have personal preferences',
    'not have personal experiences',
    'not capable of having subjective opinions',
    'indonesian',
    'translate',
    # Malay phrases
    'tidak mempunyai akses kepada data atau maklumat',
    'ebagai model bahasa ai',
    'model bahasa ai',
    'bahasa ai',
    'ebagai model bahasa',
    'mempunyai kepercayaan atau pendapat peribadi',
    'tidak mempunyai pendapat peribadi',
    'tidak mempunyai kepercayaan',
    'tidak mempunyai falsafah peribadi',
    'tidak mempunyai pengalaman peribadi',
    'tidak mempunyai pendapat atau pengalaman peribadi',
    'tidak mempunyai maklumat terkini',
    'tidak mempunyai emosi peribadi',
    'tidak mempunyai keutamaan',
    'saya tidak mempunyai akses',
    'tidak mempunyai pengalaman',
    'saya tidak mempunyai keupayaan',
    'tidak mempunyai keupayaan',
    'tidak mempunyai hubungan',
    'tidak mempunyai maklumat',
    'saya tidak mempunyai',
    'saya tidak pernah',
    'hanya mempunyai akses kepada maklumat umum',
    'hanya boleh memberikan maklumat umum',
    'terjemah',
]

# Lower-case defensively (so a future mixed-case entry still matches) and
# de-duplicate; sorting keeps the resulting list order deterministic.
break_at = sorted({phrase.lower() for phrase in break_at})

In [17]:
from tqdm import tqdm

roles = {
    'user': '<manusia>',
    'assistant': '<bot>'
}

data_instructions_mixtral = []
files = glob('/home/husein/ssd3/soalan-augmentation/mixtral*conversation*.jsonl')
for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            l = json.loads(l)
            inputs = []
            for no, r in enumerate(l):
                if no < (len(l) - 1):
                    if l[no + 1]['role'] == 'user':
                        next_text = l[no + 1].get('content_ms') or l[no + 1].get('content') or ''
                    else:
                        next_text = l[no + 1].get('content_ms') or ''
                else:
                    next_text = ''
                
                if r['role'] == 'user':
                    current_text = r.get('content_ms') or r.get('content') or ''
                else:
                    current_text = r.get('content_ms') or ''
                    
                if l[no - 1]['role'] == 'user':
                    previous_text = l[no - 1].get('content_ms') or l[no - 1].get('content') or ''
                else:
                    previous_text = l[no - 1].get('content_ms') or ''
                
                # bad pairs
                if r['role'] == 'user' and (len(current_text) < 2 or len(next_text) < 2):
                    # print('a', l, current_text, next_text, '\n')
                    continue
                if r['role'] == 'assistant' and (len(current_text) < 2 or len(previous_text) < 2):
                    continue
                
                # bad pairs
                if r['role'] == 'user' and current_text[:20].lower() == next_text[:20].lower():
                    # print(no, r, current_text[:20].lower(), next_text[:20].lower())
                    continue
                if r['role'] == 'assistant' and current_text[:20].lower() == previous_text[:20].lower():
                    # print(no, r, current_text[:20].lower(), previous_text[:20].lower())
                    continue
                    
                # remove alignments    
                if r['role'] == 'user' and (any([b in current_text.lower() for b in break_at]) or any([b in next_text.lower() for b in break_at])):
                    # print(current_text, next_text)
                    break
                if r['role'] == 'assistant' and (any([b in current_text.lower() for b in break_at]) or any([b in previous_text.lower() for b in break_at])):
                    # print(current_text, next_text)
                    break

                role = roles[r['role']]
                
                s = f"{role}: {current_text}"
                    
                inputs.append((s, r))

            if len(inputs) % 2 != 0:
                inputs = inputs[:-1]
                
            if not len(inputs):
                continue
                
            outputs = []
            for i in range(0, len(inputs), 2):
                
                try:
                    content = inputs[i][1]['content']
                    content_ms = inputs[i][1]['content_ms']

                    if len(set(content_ms.split())) < (len(set(content.split())) / 2):
                        continue
                except:
                    pass
                
                try:
                    content = inputs[i + 1][1]['content']
                    content_ms = inputs[i + 1][1]['content_ms']

                    if len(set(content_ms.split())) < (len(set(content.split())) / 2):
                        continue
                except:
                    pass
                
                outputs.extend([
                    inputs[i][0],
                    inputs[i + 1][0]
                ])
                
            inputs = outputs
            
            index = random.randint(0, len(random_instructions) - 1)
            choice = random.choice(random_instructions[index])
            manusia = f"<manusia>: {choice['input']}"
            bot = f"<bot>: {choice['output']}"
            
            if random.random() > 0.5:
                inputs.extend([manusia, bot])
            else:
                inputs = [manusia, bot] + inputs

            data = '\n'.join(inputs).strip()
            
            if not len(data):
                continue
                
            if random.random() > 0.4:
                continue
                
            a = {
                'prompt_input': None,
                'input': data,
                'output': None
            }
            data_instructions_mixtral.append(a)
            
len(data_instructions_mixtral)

48163it [00:11, 4226.11it/s]
60384it [00:14, 4080.94it/s]
57798it [00:14, 4045.82it/s]


65547

In [18]:
from tqdm import tqdm

data_instructions = []

count, count1 = 0, 0
for file in glob('/home/husein/ssd3/gov.my/ultrachat-*.jsonl'):
    with open(file) as fopen:
        for l in tqdm(fopen):
            l = json.loads(l)
            
#             if l[1]['content'] is None:
#                 print(l)
#                 continue
            
            if 'glaive_coder_raw_text' in file:
                code_instruct = True
            else:
                code_instruct = False
                
            context = l[0]['content']
            
            if not code_instruct:
                l = l[1:]
                
            inputs = []
            for no, r in enumerate(l):
                
                if no < (len(l) - 1):
                    if l[no + 1]['role'] == 'user':
                        next_text = l[no + 1].get('content_ms') or l[no + 1].get('content') or ''
                    else:
                        next_text = l[no + 1].get('content_ms') or ''
                else:
                    next_text = ''
                
                if r['role'] == 'user':
                    current_text = r.get('content_ms') or r.get('content') or ''
                else:
                    current_text = r.get('content_ms') or ''
                    
                if l[no - 1]['role'] == 'user':
                    previous_text = l[no - 1].get('content_ms') or l[no - 1].get('content') or ''
                else:
                    previous_text = l[no - 1].get('content_ms') or ''
                
                # bad pairs
                if r['role'] == 'user' and (len(current_text) < 2 or len(next_text) < 2):
                    # print('a', l, current_text, next_text, '\n')
                    continue
                if r['role'] == 'assistant' and (len(current_text) < 2 or len(previous_text) < 2):
                    continue
                
                # bad pairs
                if not code_instruct and r['role'] == 'user' and current_text[:20].lower() == next_text[:20].lower():
                    # print(no, r, current_text[:20].lower(), next_text[:20].lower())
                    continue
                if not code_instruct and r['role'] == 'assistant' and current_text[:20].lower() == previous_text[:20].lower():
                    # print(no, r, current_text[:20].lower(), previous_text[:20].lower())
                    continue
                    
                # remove alignments    
                if r['role'] == 'user' and (any([b in current_text.lower() for b in break_at]) or any([b in next_text.lower() for b in break_at])):
                    continue
                if r['role'] == 'assistant' and (any([b in current_text.lower() for b in break_at]) or any([b in previous_text.lower() for b in break_at])):
                    continue

                role = roles[r['role']]
                
                if no == 0 and not code_instruct and ('crossref-melayu' in file or random.random() > 0.7):
                    s = f"{role}: {context}\n\n{current_text}"
                    count += 1
                else:
                    s = f"{role}: {current_text}"
                    
                inputs.append((s, r))
                count1 += 1

            if len(inputs) % 2 != 0:
                inputs = inputs[:-1]
                
            if not len(inputs):
                continue
                
            outputs = []
            for i in range(0, len(inputs), 2):
                
                try:
                    content = inputs[i][1]['content']
                    content_ms = inputs[i][1]['content_ms']

                    if len(set(content_ms.split())) < (len(set(content.split())) / 2):
                        continue
                except:
                    pass
                
                try:
                    content = inputs[i + 1][1]['content']
                    content_ms = inputs[i + 1][1]['content_ms']

                    if len(set(content_ms.split())) < (len(set(content.split())) / 2):
                        continue
                except:
                    pass
                
                outputs.extend([
                    inputs[i][0],
                    inputs[i + 1][0]
                ])
                
            inputs = outputs
            
            index = random.randint(0, len(random_instructions) - 1)
            choice = random.choice(random_instructions[index])
            manusia = f"<manusia>: {choice['input']}"
            bot = f"<bot>: {choice['output']}"
            
            if random.random() > 0.5:
                inputs.extend([manusia, bot])
            else:
                inputs = [manusia, bot] + inputs

            data = '\n'.join(inputs).strip()
            
            if not len(data):
                continue
            
            if random.random() > 0.2:
                continue
                
            a = {
                'prompt_input': None,
                'input': data,
                'output': None
            }
            data_instructions.append(a)
        
        
len(data_instructions)

1734it [00:02, 715.28it/s]
6440it [00:05, 1287.04it/s]
3350it [00:02, 1181.94it/s]
10128it [00:07, 1317.19it/s]
1296it [00:01, 775.38it/s]
8044it [00:06, 1166.14it/s]
60198it [00:24, 2465.36it/s]
49842it [00:51, 960.50it/s] 
9959it [00:07, 1317.02it/s]
4567it [00:03, 1359.20it/s]
72538it [00:39, 1843.47it/s]
4408it [00:02, 1502.16it/s]
127253it [01:32, 1373.42it/s]
3834it [00:03, 1182.15it/s]


72055

In [19]:
# Spot-check the first combined record.
data_instructions[0]

{'prompt_input': None,
 'input': "<manusia>: Apakah perubahan undang-undang terkini mengenai pengurusan sisa industri di Malaysia dan Eropah?\n<bot>: Perubahan undang-undang terkini mengenai pengurusan sisa industri di Malaysia dan Eropah ialah pengenalan dan pelaksanaan undang-undang yang lebih ketat dalam hal ini. Negara-negara di Eropah, termasuk Malaysia, telah mengambil langkah-langkah untuk meningkatkan pengurusan sisa industri dan melindungi alam sekitar.\n\nDi Malaysia, terdapat beberapa undang-undang dan peraturan yang berkaitan dengan pengurusan sisa industri seperti Akta Kualiti Alam Sekeliling 1974 dan pindaan undang-undang. Pada tahun 2011, Malaysia telah memperkenalkan Pelan Pengurusan Sisa Negara (Rancangan Pengurusan Sisa Negara) untuk meningkatkan pengurusan sisa negara secara holistik.\n\nDi Eropah, terdapat pelbagai direktif dan peraturan yang menggariskan piawaian pengurusan sisa industri. Antara lain ialah Arahan Rangka Kerja mengenai Sisa (Arahan 2008/98/EC), Arah

In [20]:
# Final dataset: mixtral conversations, then ultrachat conversations, then random combinations.
combine_all = [*data_instructions_mixtral, *data_instructions, *random_combine]
len(combine_all)

187602

In [21]:
def generate_and_tokenize_prompt(row):
    """Render one dataset row as a single ``[INST] ... [/INST]`` prompt string.

    Despite the name, no tokenization happens here; only text is built.

    Parameters
    ----------
    row : dict
        Must provide `input` and `output`. When `input` contains '<bot>:' and
        `output` is None, `input` is parsed as a multi-turn transcript of
        '<manusia>: ...' / '<bot>: ...' turns. Otherwise `input` and `output`
        are used as one exchange and must both be strings. An optional
        `function_call` value is emitted right after the opening '<s>'.

    Returns
    -------
    dict
        ``{'text': prompt}`` where the prompt is '<s>' followed by one
        ``[INST] human [/INST] bot</s> `` segment per exchange.
    """
    texts = ['<s>']

    if 'function_call' in row:
        t = row['function_call']
        texts.append(f'\n[FUNCTIONCALL]\n{t}\n')

    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        segments = row['input'].split('<bot>:')
        # Segment i holds the human text that precedes bot reply i + 1.
        for i in range(len(segments) - 1):
            if i == 0:
                human = segments[i].replace('<manusia>:', '')
            else:
                parts = segments[i].split('<manusia>:')
                if len(parts) < 2:
                    # Two bot replies in a row: there is no human turn to
                    # pair with the next reply, so that reply is skipped.
                    continue
                human = parts[1]
            # The bot reply ends where the next human turn starts.
            bot = segments[i + 1].split('<manusia>:')[0]
            inputs.append(human.strip())
            outputs.append(bot.strip())
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    for u, a in zip(inputs, outputs):
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s> ')

    prompt = ''.join(texts)
    return {'text': prompt}

In [22]:
from tqdm import tqdm

# Smoke run: render every record once so that malformed rows surface as errors here.
for record in tqdm(combine_all):
    generate_and_tokenize_prompt(record)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 187602/187602 [00:01<00:00, 104568.44it/s]


In [23]:
# Persist the combined dataset as JSON Lines, one record per line.
with open('prepared-combine-multiple-chats-no-code.jsonl', 'w') as fopen:
    fopen.writelines(json.dumps(record) + '\n' for record in combine_all)